116 files changed, 8455 insertions, 5907 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 31610ea73aec..9b72dcf1cd25 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,4 +7,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
-           compression.o delayed-ref.o relocation.o
+           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 44ea5b92e1ba..f66fc9959733 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -288,7 +288,7 @@ int btrfs_acl_chmod(struct inode *inode)
                return 0;
        acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
-        if (IS_ERR(acl) || !acl)
+        if (IS_ERR_OR_NULL(acl))
                return PTR_ERR(acl);
        clone = posix_acl_clone(acl, GFP_KERNEL);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 57c3bb2884ce..93b1aa932014 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -22,6 +22,7 @@
 #include "extent_map.h"
 #include "extent_io.h"
 #include "ordered-data.h"
+#include "delayed-inode.h"
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -152,20 +153,34 @@ struct btrfs_inode {
        unsigned ordered_data_close:1;
        unsigned orphan_meta_reserved:1;
        unsigned dummy_inode:1;
+        unsigned in_defrag:1;
        /*
         * always compress this one file
         */
        unsigned force_compress:4;
+        struct btrfs_delayed_node *delayed_node;
        struct inode vfs_inode;
 };
+extern unsigned char btrfs_filetype_table[];
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
        return container_of(inode, struct btrfs_inode, vfs_inode);
 }
+static inline u64 btrfs_ino(struct inode *inode)
+{
+        u64 ino = BTRFS_I(inode)->location.objectid;
+        if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+                ino = inode->i_ino;
+        return ino;
+}
 static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 {
        i_size_write(inode, size);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 41d1d7c70e29..bfe42b03eaf9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -125,9 +125,10 @@ static int check_compressed_csum(struct inode *inode,
                kunmap_atomic(kaddr, KM_USER0);
                if (csum != *cb_sum) {
-                        printk(KERN_INFO "btrfs csum failed ino %lu "
+                        printk(KERN_INFO "btrfs csum failed ino %llu "
                               "extent %llu csum %u "
-                               "wanted %u mirror %d\n", inode->i_ino,
+                               "wanted %u mirror %d\n",
+                               (unsigned long long)btrfs_ino(inode),
                               (unsigned long long)disk_start,
                               csum, *cb_sum, cb->mirror_num);
                        ret = -EIO;
@@ -332,7 +333,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        struct compressed_bio *cb;
        unsigned long bytes_left;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-        int page_index = 0;
+        int pg_index = 0;
        struct page *page;
        u64 first_byte = disk_start;
        struct block_device *bdev;
@@ -366,8 +367,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        /* create and submit bios for the compressed pages */
        bytes_left = compressed_len;
-        for (page_index = 0; page_index < cb->nr_pages; page_index++) {
+        for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
-                page = compressed_pages[page_index];
+                page = compressed_pages[pg_index];
                page->mapping = inode->i_mapping;
                if (bio->bi_size)
                        ret = io_tree->ops->merge_bio_hook(page, 0,
@@ -432,7 +433,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                                     struct compressed_bio *cb)
 {
        unsigned long end_index;
-        unsigned long page_index;
+        unsigned long pg_index;
        u64 last_offset;
        u64 isize = i_size_read(inode);
        int ret;
@@ -456,13 +457,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
        end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
        while (last_offset < compressed_end) {
-                page_index = last_offset >> PAGE_CACHE_SHIFT;
+                pg_index = last_offset >> PAGE_CACHE_SHIFT;
-                if (page_index > end_index)
+                if (pg_index > end_index)
                        break;
                rcu_read_lock();
-                page = radix_tree_lookup(&mapping->page_tree, page_index);
+                page = radix_tree_lookup(&mapping->page_tree, pg_index);
                rcu_read_unlock();
                if (page) {
                        misses++;
@@ -476,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                if (!page)
                        break;
-                if (add_to_page_cache_lru(page, mapping, page_index,
+                if (add_to_page_cache_lru(page, mapping, pg_index,
                                                                GFP_NOFS)) {
                        page_cache_release(page);
                        goto next;
@@ -560,7 +561,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
        unsigned long compressed_len;
        unsigned long nr_pages;
-        unsigned long page_index;
+        unsigned long pg_index;
        struct page *page;
        struct block_device *bdev;
        struct bio *comp_bio;
@@ -613,10 +614,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-        for (page_index = 0; page_index < nr_pages; page_index++) {
+        for (pg_index = 0; pg_index < nr_pages; pg_index++) {
-                cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+                cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
                                                              __GFP_HIGHMEM);
-                if (!cb->compressed_pages[page_index])
+                if (!cb->compressed_pages[pg_index])
                        goto fail2;
        }
        cb->nr_pages = nr_pages;
@@ -634,8 +635,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        comp_bio->bi_end_io = end_compressed_bio_read;
        atomic_inc(&cb->pending_bios);
-        for (page_index = 0; page_index < nr_pages; page_index++) {
+        for (pg_index = 0; pg_index < nr_pages; pg_index++) {
-                page = cb->compressed_pages[page_index];
+                page = cb->compressed_pages[pg_index];
                page->mapping = inode->i_mapping;
                page->index = em_start >> PAGE_CACHE_SHIFT;
@@ -702,8 +703,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        return 0;
 fail2:
-        for (page_index = 0; page_index < nr_pages; page_index++)
+        for (pg_index = 0; pg_index < nr_pages; pg_index++)
-                free_page((unsigned long)cb->compressed_pages[page_index]);
+                free_page((unsigned long)cb->compressed_pages[pg_index]);
        kfree(cb->compressed_pages);
 fail1:
@@ -945,7 +946,7 @@ void btrfs_exit_compress(void)
 int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
                              unsigned long total_out, u64 disk_start,
                              struct bio_vec *bvec, int vcnt,
-                              unsigned long *page_index,
+                              unsigned long *pg_index,
                              unsigned long *pg_offset)
 {
        unsigned long buf_offset;
@@ -954,7 +955,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
        unsigned long working_bytes = total_out - buf_start;
        unsigned long bytes;
        char *kaddr;
-        struct page *page_out = bvec[*page_index].bv_page;
+        struct page *page_out = bvec[*pg_index].bv_page;
        /*
         * start byte is the first byte of the page we're currently
@@ -995,11 +996,11 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
                /* check if we need to pick another page */
                if (*pg_offset == PAGE_CACHE_SIZE) {
-                        (*page_index)++;
+                        (*pg_index)++;
-                        if (*page_index >= vcnt)
+                        if (*pg_index >= vcnt)
                                return 0;
-                        page_out = bvec[*page_index].bv_page;
+                        page_out = bvec[*pg_index].bv_page;
                        *pg_offset = 0;
                        start_byte = page_offset(page_out) - disk_start;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 51000174b9d7..a12059f4f0fd 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -37,7 +37,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
                              unsigned long total_out, u64 disk_start,
                              struct bio_vec *bvec, int vcnt,
-                              unsigned long *page_index,
+                              unsigned long *pg_index,
                              unsigned long *pg_offset);
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84d7ca1fe0ba..b0e18d986e0a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,11 +38,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *src_buf);
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int level, int slot);
-static int setup_items_for_insert(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root, struct btrfs_path *path,
-                        struct btrfs_key *cpu_key, u32 *data_size,
-                        u32 total_data, u32 total_size, int nr);
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -107,7 +102,7 @@ void btrfs_free_path(struct btrfs_path *p)
 {
        if (!p)
                return;
-        btrfs_release_path(NULL, p);
+        btrfs_release_path(p);
        kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -117,7 +112,7 @@ void btrfs_free_path(struct btrfs_path *p)
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
-noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_path *p)
 {
        int i;
@@ -1328,7 +1323,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                ret = -EAGAIN;
                /* release the whole path */
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                /* read the blocks */
                if (block1)
@@ -1475,7 +1470,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
                                return 0;
                        }
                        free_extent_buffer(tmp);
-                        btrfs_release_path(NULL, p);
+                        btrfs_release_path(p);
                        return -EIO;
                }
        }
@@ -1494,7 +1489,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);
-        btrfs_release_path(NULL, p);
+        btrfs_release_path(p);
        ret = -EAGAIN;
        tmp = read_tree_block(root, blocknr, blocksize, 0);
@@ -1563,7 +1558,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
                }
                b = p->nodes[level];
                if (!b) {
-                        btrfs_release_path(NULL, p);
+                        btrfs_release_path(p);
                        goto again;
                }
                BUG_ON(btrfs_header_nritems(b) == 1);
@@ -1753,7 +1748,7 @@ done:
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0)
-                btrfs_release_path(root, p);
+                btrfs_release_path(p);
        return ret;
 }
@@ -3026,7 +3021,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
                                    struct btrfs_file_extent_item);
                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        path->keep_locks = 1;
        path->search_for_split = 1;
@@ -3216,7 +3211,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
                        struct btrfs_path *path,
                        u32 new_size, int from_end)
 {
-        int ret = 0;
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
@@ -3314,12 +3308,11 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);
-        ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
-        return ret;
+        return 0;
 }
 /*
@@ -3329,7 +3322,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct btrfs_path *path,
                      u32 data_size)
 {
-        int ret = 0;
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
@@ -3394,12 +3386,11 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);
-        ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
-        return ret;
+        return 0;
 }
 /*
@@ -3559,11 +3550,10 @@ out:
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 */
-static noinline_for_stack int
+int setup_items_for_insert(struct btrfs_trans_handle *trans,
-setup_items_for_insert(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, struct btrfs_path *path,
-                      struct btrfs_root *root, struct btrfs_path *path,
+                           struct btrfs_key *cpu_key, u32 *data_size,
-                      struct btrfs_key *cpu_key, u32 *data_size,
+                           u32 total_data, u32 total_size, int nr)
-                      u32 total_data, u32 total_size, int nr)
 {
        struct btrfs_item *item;
        int i;
@@ -3647,7 +3637,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
        ret = 0;
        if (slot == 0) {
-                struct btrfs_disk_key disk_key;
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
        }
@@ -3949,7 +3938,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
        else
                return 1;
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ret;
@@ -4073,7 +4062,7 @@ find_next_key:
                        sret = btrfs_find_next_key(root, path, min_key, level,
                                                  cache_only, min_trans);
                        if (sret == 0) {
-                                btrfs_release_path(root, path);
+                                btrfs_release_path(path);
                                goto again;
                        } else {
                                goto out;
@@ -4152,7 +4141,7 @@ next:
                                btrfs_node_key_to_cpu(c, &cur_key, slot);
                        orig_lowest = path->lowest_level;
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        path->lowest_level = level;
                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
                                                0, 0);
@@ -4229,7 +4218,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 again:
        level = 1;
        next = NULL;
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        path->keep_locks = 1;
@@ -4285,7 +4274,7 @@ again:
                        goto again;
                if (ret < 0) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto done;
                }
@@ -4324,7 +4313,7 @@ again:
                        goto again;
                if (ret < 0) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto done;
                }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f4b81de3ae2..332323e19dd1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
+#include <linux/rwsem.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
@@ -33,6 +34,7 @@
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
+#include "ioctl.h"
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -105,6 +107,12 @@ struct btrfs_ordered_sum;
 /* For storing free space cache */
 #define BTRFS_FREE_SPACE_OBJECTID -11ULL
+/*
+ * The inode number assigned to the special inode for storing
+ * free ino cache
+ */
+#define BTRFS_FREE_INO_OBJECTID -12ULL
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
@@ -187,7 +195,6 @@ struct btrfs_mapping_tree {
        struct extent_map_tree map_tree;
 };
-#define BTRFS_UUID_SIZE 16
 struct btrfs_dev_item {
        /* the internal btrfs device id */
        __le64 devid;
@@ -294,7 +301,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
                sizeof(struct btrfs_stripe) * (num_stripes - 1);
 }
-#define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN       (1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC         (1ULL << 1)
@@ -510,6 +516,12 @@ struct btrfs_extent_item_v0 {
 /* use full backrefs for extent pointers in the block */
 #define BTRFS_BLOCK_FLAG_FULL_BACKREF   (1ULL << 8)
+/*
+ * this flag is only used internally by scrub and may be changed at any time
+ * it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER         (1ULL << 48)
 struct btrfs_tree_block_info {
        struct btrfs_disk_key key;
        u8 level;
@@ -740,12 +752,12 @@ struct btrfs_space_info {
         */
        unsigned long reservation_progress;
-        int full:1;             /* indicates that we cannot allocate any more
+        unsigned int full:1;    /* indicates that we cannot allocate any more
                                   chunks for this space */
-        int chunk_alloc:1;      /* set if we are allocating a chunk */
+        unsigned int chunk_alloc:1;     /* set if we are allocating a chunk */
-        int force_alloc;        /* set if we need to force a chunk alloc for
+        unsigned int force_alloc;       /* set if we need to force a chunk
-                                   this space */
+                                           alloc for this space */
        struct list_head list;
@@ -830,9 +842,6 @@ struct btrfs_block_group_cache {
        u64 bytes_super;
        u64 flags;
        u64 sectorsize;
-        int extents_thresh;
-        int free_extents;
-        int total_bitmaps;
        unsigned int ro:1;
        unsigned int dirty:1;
        unsigned int iref:1;
@@ -847,9 +856,7 @@ struct btrfs_block_group_cache {
        struct btrfs_space_info *space_info;
        /* free space cache stuff */
-        spinlock_t tree_lock;
+        struct btrfs_free_space_ctl *free_space_ctl;
-        struct rb_root free_space_offset;
-        u64 free_space;
        /* block group cache stuff */
        struct rb_node cache_node;
@@ -869,6 +876,7 @@ struct btrfs_block_group_cache {
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_delayed_root;
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
        u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -895,7 +903,10 @@ struct btrfs_fs_info {
        /* logical->physical extent mapping */
        struct btrfs_mapping_tree mapping_tree;
-        /* block reservation for extent, checksum and root tree */
+        /*
+         * block reservation for extent, checksum, root tree and
+         * delayed dir index item
+         */
        struct btrfs_block_rsv global_block_rsv;
        /* block reservation for delay allocation */
        struct btrfs_block_rsv delalloc_block_rsv;
@@ -1022,6 +1033,7 @@ struct btrfs_fs_info {
         * for the sys_munmap function call path
         */
        struct btrfs_workers fixup_workers;
+        struct btrfs_workers delayed_workers;
        struct task_struct *transaction_kthread;
        struct task_struct *cleaner_kthread;
        int thread_pool_size;
@@ -1062,6 +1074,11 @@ struct btrfs_fs_info {
        /* all metadata allocations go through this cluster */
        struct btrfs_free_cluster meta_alloc_cluster;
+        /* auto defrag inodes go here */
+        spinlock_t defrag_inodes_lock;
+        struct rb_root defrag_inodes;
+        atomic_t defrag_running;
        spinlock_t ref_cache_lock;
        u64 total_ref_cache_size;
@@ -1077,8 +1094,21 @@ struct btrfs_fs_info {
        void *bdev_holder;
+        /* private scrub information */
+        struct mutex scrub_lock;
+        atomic_t scrubs_running;
+        atomic_t scrub_pause_req;
+        atomic_t scrubs_paused;
+        atomic_t scrub_cancel_req;
+        wait_queue_head_t scrub_pause_wait;
+        struct rw_semaphore scrub_super_lock;
+        int scrub_workers_refcnt;
+        struct btrfs_workers scrub_workers;
        /* filesystem state */
        u64 fs_state;
+        struct btrfs_delayed_root *delayed_root;
 };
 /*
@@ -1088,9 +1118,6 @@ struct btrfs_fs_info {
 struct btrfs_root {
        struct extent_buffer *node;
-        /* the node lock is held while changing the node pointer */
-        spinlock_t node_lock;
        struct extent_buffer *commit_root;
        struct btrfs_root *log_root;
        struct btrfs_root *reloc_root;
@@ -1107,6 +1134,16 @@ struct btrfs_root {
        spinlock_t accounting_lock;
        struct btrfs_block_rsv *block_rsv;
+        /* free ino cache stuff */
+        struct mutex fs_commit_mutex;
+        struct btrfs_free_space_ctl *free_ino_ctl;
+        enum btrfs_caching_type cached;
+        spinlock_t cache_lock;
+        wait_queue_head_t cache_wait;
+        struct btrfs_free_space_ctl *free_ino_pinned;
+        u64 cache_progress;
+        struct inode *cache_inode;
        struct mutex log_mutex;
        wait_queue_head_t log_writer_wait;
        wait_queue_head_t log_commit_wait[2];
@@ -1162,12 +1199,49 @@ struct btrfs_root {
        struct rb_root inode_tree;
        /*
+         * radix tree that keeps track of delayed nodes of every inode,
+         * protected by inode_lock
+         */
+        struct radix_tree_root delayed_nodes_tree;
+        /*
         * right now this just gets used so that a root has its own devid
         * for stat.  It may be used for more later
         */
        struct super_block anon_super;
 };
+struct btrfs_ioctl_defrag_range_args {
+        /* start of the defrag operation */
+        __u64 start;
+        /* number of bytes to defrag, use (u64)-1 to say all */
+        __u64 len;
+        /*
+         * flags for the operation, which can include turning
+         * on compression for this one defrag
+         */
+        __u64 flags;
+        /*
+         * any extent bigger than this will be considered
+         * already defragged.  Use 0 to take the kernel default.
+         * Use 1 to say every single extent must be rewritten
+         */
+        __u32 extent_thresh;
+        /*
+         * which compression method to use if turning on compression
+         * for this defrag operation.  If unspecified, zlib will
+         * be used
+         */
+        __u32 compress_type;
+        /* spare for later */
+        __u32 unused[4];
+};
 /*
 * inode items have the data typically returned from stat and store other
 * info about object characteristics.  There is one for every file and dir in
@@ -1265,6 +1339,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_CLEAR_CACHE         (1 << 13)
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 #define BTRFS_MOUNT_ENOSPC_DEBUG         (1 << 15)
+#define BTRFS_MOUNT_AUTO_DEFRAG         (1 << 16)
 #define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
@@ -1440,26 +1515,12 @@ static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
        return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
 }
-static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
-                                             struct btrfs_chunk *c, int nr,
-                                             u64 val)
-{
-        btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
-}
 static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
                                         struct btrfs_chunk *c, int nr)
 {
        return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
 }
-static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
-                                             struct btrfs_chunk *c, int nr,
-                                             u64 val)
-{
-        btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
-}
 /* struct btrfs_block_group_item */
 BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
                         used, 64);
@@ -1517,14 +1578,6 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
        return (struct btrfs_timespec *)ptr;
 }
-static inline struct btrfs_timespec *
-btrfs_inode_otime(struct btrfs_inode_item *inode_item)
-{
-        unsigned long ptr = (unsigned long)inode_item;
-        ptr += offsetof(struct btrfs_inode_item, otime);
-        return (struct btrfs_timespec *)ptr;
-}
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
@@ -1875,33 +1928,6 @@ static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
        return (u8 *)ptr;
 }
-static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
-{
-        unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
-        return (u8 *)ptr;
-}
-static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
-{
-        unsigned long ptr = offsetof(struct btrfs_header, csum);
-        return (u8 *)ptr;
-}
-static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
-{
-        return NULL;
-}
-static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
-{
-        return NULL;
-}
-static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
-{
-        return NULL;
-}
 static inline int btrfs_is_leaf(struct extent_buffer *eb)
 {
        return btrfs_header_level(eb) == 0;
@@ -2055,22 +2081,6 @@ static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
        return sb->s_fs_info;
 }
-static inline int btrfs_set_root_name(struct btrfs_root *root,
-                                      const char *name, int len)
-{
-        /* if we already have a name just free it */
-        kfree(root->name);
-        root->name = kmalloc(len+1, GFP_KERNEL);
-        if (!root->name)
-                return -ENOMEM;
-        memcpy(root->name, name, len);
-        root->name[len] = '\0';
-        return 0;
-}
 static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
 {
        if (level == 0)
@@ -2099,6 +2109,13 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 }
 /* extent-tree.c */
+static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
+                                                 int num_items)
+{
+        return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+                3 * num_items;
+}
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
@@ -2108,12 +2125,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num, int reserved);
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          u64 objectid, u64 offset, u64 bytenr);
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr);
@@ -2290,10 +2304,12 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
                       int start_slot, int cache_only, u64 *last_ret,
                       struct btrfs_key *progress);
-void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p,
+                               struct extent_buffer *held);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2305,13 +2321,12 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
        return btrfs_del_items(trans, root, path, path->slots[0], 1);
 }
+int setup_items_for_insert(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, struct btrfs_path *path,
+                           struct btrfs_key *cpu_key, u32 *data_size,
+                           u32 total_data, u32 total_size, int nr);
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, void *data, u32 data_size);
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root,
-                            struct btrfs_path *path,
-                            struct btrfs_key *cpu_key, u32 *data_size,
-                            int nr);
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path,
@@ -2357,8 +2372,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
                      *item);
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
                         btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_search_root(struct btrfs_root *root, u64 search_start,
-                      u64 *found_objectid);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 int btrfs_set_root_node(struct btrfs_root_item *item,
@@ -2368,7 +2381,7 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, const char *name,
-                          int name_len, u64 dir,
+                          int name_len, struct inode *dir,
                          struct btrfs_key *location, u8 type, u64 index);
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
@@ -2413,12 +2426,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 offset);
 int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-/* inode-map.c */
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *fs_root,
-                             u64 dirid, u64 *objectid);
-int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
 /* inode-item.c */
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
@@ -2463,8 +2470,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                       struct bio *bio, u64 file_start, int contig);
-int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
-                          u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
@@ -2472,8 +2477,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct btrfs_path *path,
                        u64 isize);
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
-                             u64 end, struct list_head *list);
+                             struct list_head *list, int search_commit);
 /* inode.c */
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
@@ -2502,8 +2507,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               u32 min_type);
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
-                                   int sync);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
@@ -2520,7 +2523,6 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
-void btrfs_put_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2531,10 +2533,8 @@ void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                         struct btrfs_root *root, int *was_new);
-int btrfs_commit_write(struct file *file, struct page *page,
-                       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
-                                    size_t page_offset, u64 start, u64 end,
+                                    size_t pg_offset, u64 start, u64 end,
                                    int create);
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
@@ -2566,12 +2566,16 @@ extern const struct dentry_operations btrfs_dentry_operations;
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+                      struct btrfs_ioctl_defrag_range_args *range,
+                      u64 newer_than, unsigned long max_pages);
 /* file.c */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+                           struct inode *inode);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                            int skip_pinned);
-int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
                       u64 start, u64 end, u64 *hint_byte, int drop_cache);
@@ -2591,10 +2595,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 /* sysfs.c */
 int btrfs_init_sysfs(void);
 void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
-int btrfs_sysfs_add_root(struct btrfs_root *root);
-void btrfs_sysfs_del_root(struct btrfs_root *root);
-void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 /* xattr.c */
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -2637,4 +2637,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
                              u64 *bytes_to_reserve);
 void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
                              struct btrfs_pending_snapshot *pending);
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+                    struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_pause(struct btrfs_root *root);
+int btrfs_scrub_pause_super(struct btrfs_root *root);
+int btrfs_scrub_continue(struct btrfs_root *root);
+int btrfs_scrub_continue_super(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_root *root);
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+                         struct btrfs_scrub_progress *progress);
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
new file mode 100644
index 000000000000..01e29503a54b
--- /dev/null
+++ b/fs/btrfs/delayed-inode.c
@@ -0,0 +1,1695 @@
+/*
+ * Copyright (C) 2011 Fujitsu.  All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/slab.h>
+#include "delayed-inode.h"
+#include "disk-io.h"
+#include "transaction.h"
+/*
+ * NOTE(review): BTRFS_DELAYED_WRITEBACK is not referenced in this part of
+ * the file; presumably it is an item-count threshold like
+ * BTRFS_DELAYED_BACKGROUND -- confirm against its users.
+ */
+#define BTRFS_DELAYED_WRITEBACK         400
+/*
+ * Once delayed_root->items drops below this value, tasks sleeping on
+ * delayed_root->wait are woken up (see __btrfs_remove_delayed_item()).
+ */
+#define BTRFS_DELAYED_BACKGROUND        100
+/* slab cache that struct btrfs_delayed_node objects are allocated from */
+static struct kmem_cache *delayed_node_cache;
+/*
+ * Create the slab cache that delayed nodes are allocated from.
+ *
+ * Returns 0 on success, or -ENOMEM if the cache could not be created.
+ */
+int __init btrfs_delayed_inode_init(void)
+{
+        struct kmem_cache *cache;
+        cache = kmem_cache_create("delayed_node",
+                                  sizeof(struct btrfs_delayed_node), 0,
+                                  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                  NULL);
+        delayed_node_cache = cache;
+        return cache ? 0 : -ENOMEM;
+}
+/* Destroy the delayed node slab cache, if it was ever created. */
+void btrfs_delayed_inode_exit(void)
+{
+        if (!delayed_node_cache)
+                return;
+        kmem_cache_destroy(delayed_node_cache);
+}
+/*
+ * Put a freshly allocated delayed node into its initial state: bound to
+ * @root / @inode_id, zero references, no pending items, on no list and with
+ * no metadata space reserved.
+ */
+static inline void btrfs_init_delayed_node(
+                                struct btrfs_delayed_node *delayed_node,
+                                struct btrfs_root *root, u64 inode_id)
+{
+        /* identity */
+        delayed_node->root = root;
+        delayed_node->inode_id = inode_id;
+        /* reference count and locking */
+        atomic_set(&delayed_node->refs, 0);
+        mutex_init(&delayed_node->mutex);
+        /* pending items */
+        delayed_node->ins_root = RB_ROOT;
+        delayed_node->del_root = RB_ROOT;
+        delayed_node->count = 0;
+        delayed_node->index_cnt = 0;
+        delayed_node->inode_dirty = 0;
+        delayed_node->bytes_reserved = 0;
+        /* list membership */
+        INIT_LIST_HEAD(&delayed_node->n_list);
+        INIT_LIST_HEAD(&delayed_node->p_list);
+        delayed_node->in_list = 0;
+}
+/*
+ * Return 1 if @item2 directly follows @item1: both are dir index items of the
+ * same object and the offset of @item2 is the offset of @item1 plus one.
+ * Return 0 otherwise.
+ */
+static inline int btrfs_is_continuous_delayed_item(
+                                        struct btrfs_delayed_item *item1,
+                                        struct btrfs_delayed_item *item2)
+{
+        struct btrfs_key *first = &item1->key;
+        struct btrfs_key *second = &item2->key;
+        if (first->type != BTRFS_DIR_INDEX_KEY)
+                return 0;
+        if (first->objectid != second->objectid)
+                return 0;
+        if (first->type != second->type)
+                return 0;
+        return first->offset + 1 == second->offset;
+}
+/* Return the delayed root of the filesystem that @root belongs to. */
+static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
+                                                        struct btrfs_root *root)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        return fs_info->delayed_root;
+}
+/*
+ * Return the delayed node of @inode with a reference taken for the caller,
+ * allocating and registering a new node if none exists yet.
+ *
+ * The node is looked for first in the pointer cached in the btrfs inode and
+ * then in root->delayed_nodes_tree (under root->inode_lock).  Returns an
+ * ERR_PTR() if allocating the node or preloading the radix tree fails.
+ */
+static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
+                                                        struct inode *inode)
+{
+        struct btrfs_delayed_node *node;
+        struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+        struct btrfs_root *root = btrfs_inode->root;
+        u64 ino = btrfs_ino(inode);
+        int ret;
+again:
+        /* fast path: the node is already cached in the inode */
+        node = ACCESS_ONCE(btrfs_inode->delayed_node);
+        if (node) {
+                atomic_inc(&node->refs);        /* can be accessed */
+                return node;
+        }
+        spin_lock(&root->inode_lock);
+        node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+        if (node) {
+                /*
+                 * The cached pointer was set after the unlocked check above;
+                 * retry so that the fast path takes the reference.
+                 */
+                if (btrfs_inode->delayed_node) {
+                        spin_unlock(&root->inode_lock);
+                        goto again;
+                }
+                btrfs_inode->delayed_node = node;
+                atomic_inc(&node->refs);        /* can be accessed */
+                atomic_inc(&node->refs);        /* cached in the inode */
+                spin_unlock(&root->inode_lock);
+                return node;
+        }
+        spin_unlock(&root->inode_lock);
+        /* no node yet: allocate one outside of the spinlock */
+        node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+        if (!node)
+                return ERR_PTR(-ENOMEM);
+        btrfs_init_delayed_node(node, root, ino);
+        atomic_inc(&node->refs);        /* cached in the btrfs inode */
+        atomic_inc(&node->refs);        /* can be accessed */
+        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+        if (ret) {
+                kmem_cache_free(delayed_node_cache, node);
+                return ERR_PTR(ret);
+        }
+        spin_lock(&root->inode_lock);
+        ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+        if (ret == -EEXIST) {
+                /*
+                 * An entry for this inode number was inserted while the lock
+                 * was dropped: discard our node and start over.
+                 */
+                kmem_cache_free(delayed_node_cache, node);
+                spin_unlock(&root->inode_lock);
+                radix_tree_preload_end();
+                goto again;
+        }
+        btrfs_inode->delayed_node = node;
+        spin_unlock(&root->inode_lock);
+        radix_tree_preload_end();
+        return node;
+}
+/*
+ * Must be called with delayed_node->mutex held.
+ *
+ * A node that is not yet queued is added to both the node list and the
+ * prepared list, and the lists take a reference on it.  A node that is
+ * already queued is moved to the tail of the prepared list if it is on that
+ * list, or added to the prepared list if @mod is set.
+ */
+static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
+                                     struct btrfs_delayed_node *node,
+                                     int mod)
+{
+        spin_lock(&root->lock);
+        if (!node->in_list) {
+                list_add_tail(&node->n_list, &root->node_list);
+                list_add_tail(&node->p_list, &root->prepare_list);
+                atomic_inc(&node->refs);        /* inserted into list */
+                root->nodes++;
+                node->in_list = 1;
+        } else if (!list_empty(&node->p_list)) {
+                list_move_tail(&node->p_list, &root->prepare_list);
+        } else if (mod) {
+                list_add_tail(&node->p_list, &root->prepare_list);
+        }
+        spin_unlock(&root->lock);
+}
+/*
+ * Must be called with delayed_node->mutex held.
+ *
+ * Take a queued node off the node list (and off the prepared list, if it is
+ * on it) and drop the reference that the lists held.  Does nothing for a node
+ * that is not queued.
+ */
+static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
+                                       struct btrfs_delayed_node *node)
+{
+        spin_lock(&root->lock);
+        if (!node->in_list)
+                goto unlock;
+        root->nodes--;
+        atomic_dec(&node->refs);        /* not in the list */
+        list_del_init(&node->n_list);
+        if (!list_empty(&node->p_list))
+                list_del_init(&node->p_list);
+        node->in_list = 0;
+unlock:
+        spin_unlock(&root->lock);
+}
+/*
+ * Return the first node on the node list of @delayed_root with a reference
+ * taken for the caller, or NULL if the list is empty.
+ */
+struct btrfs_delayed_node *btrfs_first_delayed_node(
+                        struct btrfs_delayed_root *delayed_root)
+{
+        struct btrfs_delayed_node *first = NULL;
+        spin_lock(&delayed_root->lock);
+        if (!list_empty(&delayed_root->node_list)) {
+                first = list_entry(delayed_root->node_list.next,
+                                   struct btrfs_delayed_node, n_list);
+                atomic_inc(&first->refs);
+        }
+        spin_unlock(&delayed_root->lock);
+        return first;
+}
+/*
+ * Return the node that follows @node on the node list, with a reference taken
+ * for the caller, or NULL if @node is the last one.  If @node is no longer on
+ * the list, the first node of the list (if any) is returned instead.
+ */
+struct btrfs_delayed_node *btrfs_next_delayed_node(
+                                                struct btrfs_delayed_node *node)
+{
+        struct btrfs_delayed_root *delayed_root;
+        struct btrfs_delayed_node *next = NULL;
+        struct list_head *pos = NULL;
+        delayed_root = node->root->fs_info->delayed_root;
+        spin_lock(&delayed_root->lock);
+        if (node->in_list) {
+                if (!list_is_last(&node->n_list, &delayed_root->node_list))
+                        pos = node->n_list.next;
+        } else if (!list_empty(&delayed_root->node_list)) {
+                /* not in the list: restart from the head */
+                pos = delayed_root->node_list.next;
+        }
+        if (pos) {
+                next = list_entry(pos, struct btrfs_delayed_node, n_list);
+                atomic_inc(&next->refs);
+        }
+        spin_unlock(&delayed_root->lock);
+        return next;
+}
+/*
+ * Drop one reference on @delayed_node (a NULL pointer is ignored).
+ *
+ * Before the reference is dropped the node is (re)queued on the delayed
+ * root's lists if it still has items (count != 0), or taken off them
+ * otherwise; @mod is passed through to btrfs_queue_delayed_node().  When the
+ * last reference goes away the node is deleted from the root's radix tree and
+ * freed.
+ */
+static void __btrfs_release_delayed_node(
+                                struct btrfs_delayed_node *delayed_node,
+                                int mod)
+{
+        struct btrfs_delayed_root *delayed_root;
+        if (!delayed_node)
+                return;
+        delayed_root = delayed_node->root->fs_info->delayed_root;
+        mutex_lock(&delayed_node->mutex);
+        if (delayed_node->count)
+                btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
+        else
+                btrfs_dequeue_delayed_node(delayed_root, delayed_node);
+        mutex_unlock(&delayed_node->mutex);
+        if (atomic_dec_and_test(&delayed_node->refs)) {
+                struct btrfs_root *root = delayed_node->root;
+                spin_lock(&root->inode_lock);
+                /*
+                 * Re-check under inode_lock: a radix tree lookup in
+                 * btrfs_get_or_create_delayed_node() takes its reference
+                 * under the same lock, so the count may have gone up again.
+                 */
+                if (atomic_read(&delayed_node->refs) == 0) {
+                        radix_tree_delete(&root->delayed_nodes_tree,
+                                          delayed_node->inode_id);
+                        kmem_cache_free(delayed_node_cache, delayed_node);
+                }
+                spin_unlock(&root->inode_lock);
+        }
+}
+/* Drop a reference on @node, releasing it with mod = 0. */
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+{
+        __btrfs_release_delayed_node(node, 0);
+}
+/*
+ * Remove the first node from the prepared list of @delayed_root and return it
+ * with a reference taken for the caller, or return NULL if the list is empty.
+ */
+struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+                                        struct btrfs_delayed_root *delayed_root)
+{
+        struct btrfs_delayed_node *prepared = NULL;
+        struct list_head *head_entry;
+        spin_lock(&delayed_root->lock);
+        if (!list_empty(&delayed_root->prepare_list)) {
+                head_entry = delayed_root->prepare_list.next;
+                list_del_init(head_entry);
+                prepared = list_entry(head_entry, struct btrfs_delayed_node,
+                                      p_list);
+                atomic_inc(&prepared->refs);
+        }
+        spin_unlock(&delayed_root->lock);
+        return prepared;
+}
+/*
+ * Drop a reference on @node, releasing it with mod = 1 so that a node which
+ * still has items is put back on the prepared list.
+ */
+static inline void btrfs_release_prepared_delayed_node(
+                                        struct btrfs_delayed_node *node)
+{
+        __btrfs_release_delayed_node(node, 1);
+}
+/*
+ * Allocate a delayed item with room for @data_len bytes of payload after the
+ * structure.  The item starts with one reference and is not attached to any
+ * delayed node.  Returns NULL if the allocation fails.
+ */
+struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+{
+        struct btrfs_delayed_item *item;
+        item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
+        if (!item)
+                return NULL;
+        atomic_set(&item->refs, 1);
+        item->delayed_node = NULL;
+        item->block_rsv = NULL;
+        item->bytes_reserved = 0;
+        item->ins_or_del = 0;
+        item->data_len = data_len;
+        return item;
+}
+/*
+ * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * @root: the rb-tree of delayed items to search
+ * @key:  the key to look up
+ * @prev: if not NULL, used to store the item preceding @key when no exact
+ *        match is found
+ * @next: if not NULL, used to store the item following @key when no exact
+ *        match is found
+ *
+ * Returns the item whose key equals @key, or NULL if there is none.
+ *
+ * Note: @prev and @next are only written when no exact match is found; each
+ * is set to NULL if the tree has no such neighbour.
+ */
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
+                                struct rb_root *root,
+                                struct btrfs_key *key,
+                                struct btrfs_delayed_item **prev,
+                                struct btrfs_delayed_item **next)
+{
+        struct rb_node *node, *prev_node = NULL;
+        struct btrfs_delayed_item *delayed_item = NULL;
+        int ret = 0;
+        node = root->rb_node;
+        while (node) {
+                delayed_item = rb_entry(node, struct btrfs_delayed_item,
+                                        rb_node);
+                prev_node = node;
+                ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
+                if (ret < 0)
+                        node = node->rb_right;
+                else if (ret > 0)
+                        node = node->rb_left;
+                else
+                        return delayed_item;
+        }
+        /*
+         * No exact match.  prev_node/delayed_item/ret describe the last node
+         * visited (prev_node is NULL only if the tree is empty).
+         */
+        if (prev) {
+                if (!prev_node)
+                        *prev = NULL;
+                else if (ret < 0)
+                        /* last visited key is smaller than @key */
+                        *prev = delayed_item;
+                else if ((node = rb_prev(prev_node)) != NULL) {
+                        *prev = rb_entry(node, struct btrfs_delayed_item,
+                                         rb_node);
+                } else
+                        *prev = NULL;
+        }
+        if (next) {
+                if (!prev_node)
+                        *next = NULL;
+                else if (ret > 0)
+                        /* last visited key is larger than @key */
+                        *next = delayed_item;
+                else if ((node = rb_next(prev_node)) != NULL) {
+                        *next = rb_entry(node, struct btrfs_delayed_item,
+                                         rb_node);
+                } else
+                        *next = NULL;
+        }
+        return NULL;
+}
+/*
+ * Exact-match lookup of @key among the pending insertion items of
+ * @delayed_node.  Returns NULL if there is no such item.
+ */
+struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+                                        struct btrfs_delayed_node *delayed_node,
+                                        struct btrfs_key *key)
+{
+        return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+                                           NULL, NULL);
+}
+/*
+ * Exact-match lookup of @key among the pending deletion items of
+ * @delayed_node.  Returns NULL if there is no such item.
+ */
+struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
+                                        struct btrfs_delayed_node *delayed_node,
+                                        struct btrfs_key *key)
+{
+        return __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
+                                           NULL, NULL);
+}
+/*
+ * Find the pending insertion item of @delayed_node whose key equals @key or,
+ * failing that, the item that follows @key.  Returns NULL if neither exists.
+ */
+struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
+                                        struct btrfs_delayed_node *delayed_node,
+                                        struct btrfs_key *key)
+{
+        struct btrfs_delayed_item *found, *successor;
+        found = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+                                            NULL, &successor);
+        return found ? found : successor;
+}
+/*
+ * Find the pending deletion item of @delayed_node whose key equals @key or,
+ * failing that, the item that follows @key.  Returns NULL if neither exists.
+ */
+struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item(
+                                        struct btrfs_delayed_node *delayed_node,
+                                        struct btrfs_key *key)
+{
+        struct btrfs_delayed_item *found, *successor;
+        found = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
+                                            NULL, &successor);
+        return found ? found : successor;
+}
+/*
+ * Link @ins into the insertion or deletion tree of @delayed_node, as selected
+ * by @action, and account for it in the node's and the delayed root's item
+ * counters.  Any other @action value is a BUG().
+ *
+ * Returns 0 on success, or -EEXIST if the tree already holds an item with the
+ * same key.
+ */
+static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
+                                    struct btrfs_delayed_item *ins,
+                                    int action)
+{
+        struct rb_root *root;
+        struct rb_node **link;
+        struct rb_node *parent = NULL;
+        struct btrfs_delayed_item *cur;
+        int cmp;
+        if (action == BTRFS_DELAYED_DELETION_ITEM)
+                root = &delayed_node->del_root;
+        else if (action == BTRFS_DELAYED_INSERTION_ITEM)
+                root = &delayed_node->ins_root;
+        else
+                BUG();
+        /* walk down to the leaf position where @ins belongs */
+        link = &root->rb_node;
+        while (*link) {
+                parent = *link;
+                cur = rb_entry(parent, struct btrfs_delayed_item, rb_node);
+                cmp = btrfs_comp_cpu_keys(&cur->key, &ins->key);
+                if (!cmp)
+                        return -EEXIST;
+                link = cmp < 0 ? &parent->rb_right : &parent->rb_left;
+        }
+        rb_link_node(&ins->rb_node, parent, link);
+        rb_insert_color(&ins->rb_node, root);
+        ins->delayed_node = delayed_node;
+        ins->ins_or_del = action;
+        /* keep index_cnt one past the largest dir index queued for insertion */
+        if (action == BTRFS_DELAYED_INSERTION_ITEM &&
+            ins->key.type == BTRFS_DIR_INDEX_KEY &&
+            ins->key.offset >= delayed_node->index_cnt)
+                delayed_node->index_cnt = ins->key.offset + 1;
+        delayed_node->count++;
+        atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
+        return 0;
+}
+/*
+ * Add @item to the insertion tree of @node.  Returns 0 on success or -EEXIST
+ * if an item with the same key is already queued for insertion.
+ */
+static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
+                                              struct btrfs_delayed_item *item)
+{
+        return __btrfs_add_delayed_item(node, item,
+                                        BTRFS_DELAYED_INSERTION_ITEM);
+}
+/*
+ * Add @item to the deletion tree of @node.  Returns 0 on success or -EEXIST
+ * if an item with the same key is already queued for deletion.
+ */
+static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
+                                             struct btrfs_delayed_item *item)
+{
+        return __btrfs_add_delayed_item(node, item,
+                                        BTRFS_DELAYED_DELETION_ITEM);
+}
+/*
+ * Unlink @delayed_item from the insertion or deletion tree of its delayed
+ * node and update the item counters.  Waiters on the delayed root are woken
+ * once the global item count is below BTRFS_DELAYED_BACKGROUND.
+ */
+static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
+{
+        struct btrfs_delayed_node *owner = delayed_item->delayed_node;
+        struct btrfs_delayed_root *delayed_root;
+        struct rb_root *tree;
+        delayed_root = owner->root->fs_info->delayed_root;
+        BUG_ON(!delayed_root);
+        BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
+               delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
+        tree = delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM ?
+               &owner->ins_root : &owner->del_root;
+        rb_erase(&delayed_item->rb_node, tree);
+        owner->count--;
+        atomic_dec(&delayed_root->items);
+        if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_BACKGROUND)
+                return;
+        if (waitqueue_active(&delayed_root->wait))
+                wake_up(&delayed_root->wait);
+}
+/*
+ * Remove @item from its delayed node and drop one reference on it, freeing
+ * the item when that was the last reference.  A NULL @item is ignored.
+ */
+static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
+{
+        if (!item)
+                return;
+        __btrfs_remove_delayed_item(item);
+        if (atomic_dec_and_test(&item->refs))
+                kfree(item);
+}
+/*
+ * Return the first item of the insertion tree of @delayed_node, or NULL if
+ * the tree is empty.
+ */
+struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+                                        struct btrfs_delayed_node *delayed_node)
+{
+        struct rb_node *first = rb_first(&delayed_node->ins_root);
+        if (!first)
+                return NULL;
+        return rb_entry(first, struct btrfs_delayed_item, rb_node);
+}
+/*
+ * Return the first item of the deletion tree of @delayed_node, or NULL if
+ * the tree is empty.
+ */
+struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+                                        struct btrfs_delayed_node *delayed_node)
+{
+        struct rb_node *first = rb_first(&delayed_node->del_root);
+        if (!first)
+                return NULL;
+        return rb_entry(first, struct btrfs_delayed_item, rb_node);
+}
+/*
+ * Return the item that follows @item in its rb-tree, or NULL if @item is the
+ * last one.
+ */
+struct btrfs_delayed_item *__btrfs_next_delayed_item(
+                                                struct btrfs_delayed_item *item)
+{
+        struct rb_node *successor = rb_next(&item->rb_node);
+        if (!successor)
+                return NULL;
+        return rb_entry(successor, struct btrfs_delayed_item, rb_node);
+}
+/*
+ * Return the delayed node cached in @inode with a reference taken for the
+ * caller, or NULL if the inode has no delayed node.
+ */
+static inline struct btrfs_delayed_node *btrfs_get_delayed_node(
+                                                        struct inode *inode)
+{
+        struct btrfs_delayed_node *cached = BTRFS_I(inode)->delayed_node;
+        if (!cached)
+                return NULL;
+        atomic_inc(&cached->refs);
+        return cached;
+}
+/*
+ * Return the root whose objectid is @root_id: @root itself when it matches,
+ * otherwise whatever btrfs_read_fs_root_no_name() returns for that id.
+ */
+static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
+                                                   u64 root_id)
+{
+        struct btrfs_key root_key;
+        if (root->objectid != root_id) {
+                root_key.objectid = root_id;
+                root_key.type = BTRFS_ROOT_ITEM_KEY;
+                root_key.offset = (u64)-1;
+                return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
+        }
+        return root;
+}
+/*
+ * Move the metadata space needed for one item from the transaction's block
+ * reservation to the global block reservation and record it in @item.
+ *
+ * Does nothing and returns 0 if the transaction has no bytes reserved;
+ * otherwise returns the result of btrfs_block_rsv_migrate().
+ */
+static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
+                                               struct btrfs_root *root,
+                                               struct btrfs_delayed_item *item)
+{
+        struct btrfs_block_rsv *from;
+        struct btrfs_block_rsv *to;
+        u64 bytes;
+        int err;
+        if (!trans->bytes_reserved)
+                return 0;
+        from = trans->block_rsv;
+        to = &root->fs_info->global_block_rsv;
+        bytes = btrfs_calc_trans_metadata_size(root, 1);
+        err = btrfs_block_rsv_migrate(from, to, bytes);
+        if (err)
+                return err;
+        item->bytes_reserved = bytes;
+        item->block_rsv = to;
+        return 0;
+}
+/*
+ * Give the metadata space recorded in @item back to the block reservation it
+ * was taken from.  Does nothing if the item has no space reserved.
+ */
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
+                                                struct btrfs_delayed_item *item)
+{
+        if (item->bytes_reserved)
+                btrfs_block_rsv_release(root, item->block_rsv,
+                                        item->bytes_reserved);
+}
+/*
+ * Move the metadata space needed for one item from the transaction's block
+ * reservation to the global block reservation and record it in @node.
+ *
+ * Does nothing and returns 0 if the transaction has no bytes reserved;
+ * otherwise returns the result of btrfs_block_rsv_migrate().
+ */
+static int btrfs_delayed_inode_reserve_metadata(
+                                        struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct btrfs_delayed_node *node)
+{
+        struct btrfs_block_rsv *from;
+        struct btrfs_block_rsv *to;
+        u64 bytes;
+        int err;
+        if (!trans->bytes_reserved)
+                return 0;
+        from = trans->block_rsv;
+        to = &root->fs_info->global_block_rsv;
+        bytes = btrfs_calc_trans_metadata_size(root, 1);
+        err = btrfs_block_rsv_migrate(from, to, bytes);
+        if (err)
+                return err;
+        node->bytes_reserved = bytes;
+        return 0;
+}
+/*
+ * Give the metadata space recorded in @node back to the global block
+ * reservation and clear the record.  Does nothing if nothing is reserved.
+ */
+static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
+                                                struct btrfs_delayed_node *node)
+{
+        if (!node->bytes_reserved)
+                return;
+        btrfs_block_rsv_release(root, &root->fs_info->global_block_rsv,
+                                node->bytes_reserved);
+        node->bytes_reserved = 0;
+}
+/*
+ * This helper will insert some continuous items into the same leaf according
+ * to the free space of the leaf.
+ *
+ * @path must already point at a leaf (path->nodes[0]).  Starting at @item, as
+ * many continuous delayed items as fit into the leaf's free space are
+ * inserted with a single setup_items_for_insert() call; every inserted item
+ * has its metadata reservation released and is then released itself.
+ *
+ * Returns 0 on success (also when not even @item fits into the leaf),
+ * -ENOMEM if the temporary key/size arrays cannot be allocated, or the error
+ * returned by setup_items_for_insert().
+ */
+static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                struct btrfs_delayed_item *item)
+{
+        struct btrfs_delayed_item *curr, *next;
+        int free_space;
+        int total_data_size = 0, total_size = 0;
+        struct extent_buffer *leaf;
+        char *data_ptr;
+        struct btrfs_key *keys;
+        u32 *data_size;
+        struct list_head head;
+        int slot;
+        /*
+         * Must start at zero: it is incremented for every item that fits,
+         * tested against zero and then used to size the arrays below.
+         */
+        int nitems = 0;
+        int i;
+        int ret = 0;
+        BUG_ON(!path->nodes[0]);
+        leaf = path->nodes[0];
+        free_space = btrfs_leaf_free_space(root, leaf);
+        INIT_LIST_HEAD(&head);
+        next = item;
+        /*
+         * count the number of the continuous items that we can insert in batch
+         */
+        while (total_size + next->data_len + sizeof(struct btrfs_item) <=
+               free_space) {
+                total_data_size += next->data_len;
+                total_size += next->data_len + sizeof(struct btrfs_item);
+                list_add_tail(&next->tree_list, &head);
+                nitems++;
+                curr = next;
+                next = __btrfs_next_delayed_item(curr);
+                if (!next)
+                        break;
+                if (!btrfs_is_continuous_delayed_item(curr, next))
+                        break;
+        }
+        /* not even the first item fits: nothing to do here */
+        if (!nitems) {
+                ret = 0;
+                goto out;
+        }
+        /*
+         * we need to allocate some memory, but that might cause the task
+         * to sleep, so we set all locked nodes in the path to blocking locks
+         * first.
+         */
+        btrfs_set_path_blocking(path);
+        keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
+        if (!keys) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
+        if (!data_size) {
+                ret = -ENOMEM;
+                goto error;
+        }
+        /* get keys of all the delayed items */
+        i = 0;
+        list_for_each_entry(next, &head, tree_list) {
+                keys[i] = next->key;
+                data_size[i] = next->data_len;
+                i++;
+        }
+        /* reset all the locked nodes in the path to spinning locks. */
+        btrfs_clear_path_blocking(path, NULL);
+        /* insert the keys of the items */
+        ret = setup_items_for_insert(trans, root, path, keys, data_size,
+                                     total_data_size, total_size, nitems);
+        if (ret)
+                goto error;
+        /* insert the dir index items */
+        slot = path->slots[0];
+        list_for_each_entry_safe(curr, next, &head, tree_list) {
+                data_ptr = btrfs_item_ptr(leaf, slot, char);
+                write_extent_buffer(leaf, &curr->data,
+                                    (unsigned long)data_ptr,
+                                    curr->data_len);
+                slot++;
+                btrfs_delayed_item_release_metadata(root, curr);
+                list_del(&curr->tree_list);
+                btrfs_release_delayed_item(curr);
+        }
+error:
+        /* data_size is still NULL here when its own allocation failed */
+        kfree(data_size);
+        kfree(keys);
+out:
+        return ret;
+}
+/*
+ * This helper can just do simple insertion that needn't extend item for new
+ * data, such as directory name index insertion, inode insertion.
+ *
+ * An empty item of delayed_item->data_len bytes is inserted under
+ * delayed_item->key, the delayed item's payload is copied into it and the
+ * metadata reserved for the delayed item is released.  On success @path is
+ * left pointing at the written item; the caller releases it.
+ *
+ * Returns 0 on success or the negative error from btrfs_insert_empty_item().
+ * -EEXIST is deliberately not treated as a failure: the data is then written
+ * to the slot @path was left at.
+ */
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root,
+                                     struct btrfs_path *path,
+                                     struct btrfs_delayed_item *delayed_item)
+{
+        struct extent_buffer *leaf;
+        char *ptr;
+        int ret;
+        ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
+                                      delayed_item->data_len);
+        if (ret < 0 && ret != -EEXIST)
+                return ret;
+        leaf = path->nodes[0];
+        ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+        write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
+                            delayed_item->data_len);
+        btrfs_mark_buffer_dirty(leaf);
+        btrfs_delayed_item_release_metadata(root, delayed_item);
+        return 0;
+}
+/*
+ * Flush every pending insertion item of @node into the tree.
+ *
+ * Each pass takes node->mutex, inserts the first pending item on its own and
+ * then, if the items following it are continuous, tries to batch those into
+ * the same leaf.  The path and the mutex are dropped between passes.
+ *
+ * Returns 0 once no insertion item is left, or the negative error reported
+ * by btrfs_insert_delayed_item().
+ */
+static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
+                                      struct btrfs_path *path,
+                                      struct btrfs_root *root,
+                                      struct btrfs_delayed_node *node)
+{
+        struct btrfs_delayed_item *first, *follower;
+        int ret = 0;
+        for (;;) {
+                mutex_lock(&node->mutex);
+                first = __btrfs_first_delayed_insertion_item(node);
+                if (!first)
+                        break;
+                ret = btrfs_insert_delayed_item(trans, root, path, first);
+                if (ret < 0) {
+                        btrfs_release_path(path);
+                        break;
+                }
+                follower = __btrfs_next_delayed_item(first);
+                if (follower &&
+                    btrfs_is_continuous_delayed_item(first, follower)) {
+                        /* the run goes right behind the item just inserted */
+                        path->slots[0]++;
+                        btrfs_batch_insert_items(trans, root, path, follower);
+                }
+                btrfs_release_delayed_item(first);
+                btrfs_mark_buffer_dirty(path->nodes[0]);
+                btrfs_release_path(path);
+                mutex_unlock(&node->mutex);
+        }
+        /* every way out of the loop leaves node->mutex held */
+        mutex_unlock(&node->mutex);
+        return ret;
+}
+/*
+ * Delete a run of tree items matching continuous delayed deletion items.
+ *
+ * Walks the delayed items starting at @item in step with the leaf slots
+ * starting at path->slots[0]; while the keys match, the delayed item is
+ * queued on a local list.  All matched slots are then removed with one
+ * btrfs_del_items() call and the queued delayed items are released.
+ *
+ * Returns 0 on success or when nothing matched, -ENOENT if path->slots[0] is
+ * past the last item of the leaf, or the error from btrfs_del_items().
+ */
+static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_path *path,
+                                    struct btrfs_delayed_item *item)
+{
+        struct btrfs_delayed_item *cur, *nxt;
+        struct extent_buffer *eb;
+        struct btrfs_key leaf_key;
+        struct list_head batch;
+        int count = 0;
+        int slot, last_slot;
+        int ret;
+        BUG_ON(!path->nodes[0]);
+        eb = path->nodes[0];
+        slot = path->slots[0];
+        last_slot = btrfs_header_nritems(eb) - 1;
+        if (slot > last_slot)
+                return -ENOENT; /* FIXME: Is errno suitable? */
+        INIT_LIST_HEAD(&batch);
+        btrfs_item_key_to_cpu(eb, &leaf_key, slot);
+        /*
+         * count the number of the dir index items that we can delete in batch
+         */
+        nxt = item;
+        while (!btrfs_comp_cpu_keys(&nxt->key, &leaf_key)) {
+                list_add_tail(&nxt->tree_list, &batch);
+                count++;
+                cur = nxt;
+                nxt = __btrfs_next_delayed_item(cur);
+                if (!nxt || !btrfs_is_continuous_delayed_item(cur, nxt))
+                        break;
+                if (++slot > last_slot)
+                        break;
+                btrfs_item_key_to_cpu(eb, &leaf_key, slot);
+        }
+        if (count == 0)
+                return 0;
+        ret = btrfs_del_items(trans, root, path, path->slots[0], count);
+        if (ret)
+                return ret;
+        list_for_each_entry_safe(cur, nxt, &batch, tree_list) {
+                btrfs_delayed_item_release_metadata(root, cur);
+                list_del(&cur->tree_list);
+                btrfs_release_delayed_item(cur);
+        }
+        return ret;
+}
+/*
+ * Apply every pending deletion item of @node to the tree.
+ *
+ * Each pass takes node->mutex, looks up the first pending deletion item and
+ * batch-deletes it together with the continuous items behind it.  A delayed
+ * item whose key is not found in the tree is simply dropped.
+ *
+ * Returns 0 once no deletion item is left, or the negative error from
+ * btrfs_search_slot().
+ */
+static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+                                      struct btrfs_path *path,
+                                      struct btrfs_root *root,
+                                      struct btrfs_delayed_node *node)
+{
+        struct btrfs_delayed_item *curr, *prev;
+        int ret = 0;
+do_again:
+        mutex_lock(&node->mutex);
+        curr = __btrfs_first_delayed_deletion_item(node);
+        if (!curr)
+                goto delete_fail;
+        ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+        if (ret < 0)
+                goto delete_fail;
+        else if (ret > 0) {
+                /*
+                 * can't find the item which the node points to, so this node
+                 * is invalid, just drop it.
+                 */
+                prev = curr;
+                curr = __btrfs_next_delayed_item(prev);
+                btrfs_release_delayed_item(prev);
+                ret = 0;
+                btrfs_release_path(path);
+                if (curr) {
+                        /*
+                         * do_again takes node->mutex again, so it must be
+                         * dropped here or the task deadlocks on itself.
+                         */
+                        mutex_unlock(&node->mutex);
+                        goto do_again;
+                }
+                goto delete_fail;
+        }
+        btrfs_batch_delete_items(trans, root, path, curr);
+        btrfs_release_path(path);
+        mutex_unlock(&node->mutex);
+        goto do_again;
+delete_fail:
+        /* reached with node->mutex held on every path */
+        btrfs_release_path(path);
+        mutex_unlock(&node->mutex);
+        return ret;
+}
+/*
+ * Clear the dirty-inode state of @delayed_node, if it is set.
+ *
+ * Drops the node's count and the global delayed item counter by one, and
+ * wakes up anybody sleeping on delayed_root->wait once that counter is below
+ * BTRFS_DELAYED_BACKGROUND.  A NULL or clean node is a no-op.
+ */
+static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
+{
+        struct btrfs_delayed_root *droot;
+        if (!delayed_node || !delayed_node->inode_dirty)
+                return;
+        BUG_ON(!delayed_node->root);
+        delayed_node->inode_dirty = 0;
+        delayed_node->count--;
+        droot = delayed_node->root->fs_info->delayed_root;
+        atomic_dec(&droot->items);
+        if (atomic_read(&droot->items) >= BTRFS_DELAYED_BACKGROUND)
+                return;
+        if (waitqueue_active(&droot->wait))
+                wake_up(&droot->wait);
+}
+/*
+ * Write the cached inode item of @node back into its on-disk inode item.
+ *
+ * Nothing is done when the node carries no dirty inode.  Otherwise the inode
+ * item keyed by node->inode_id is looked up, overwritten with
+ * node->inode_item, and the node's inode reservation and dirty state are
+ * released.  The whole operation runs under node->mutex.
+ *
+ * Returns 0 on success, -ENOENT if the inode item does not exist, or the
+ * negative error from btrfs_lookup_inode().
+ */
+static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      struct btrfs_delayed_node *node)
+{
+        struct btrfs_key ino_key;
+        struct btrfs_inode_item *dst;
+        struct extent_buffer *eb;
+        int ret = 0;
+        mutex_lock(&node->mutex);
+        if (!node->inode_dirty)
+                goto unlock;
+        ino_key.objectid = node->inode_id;
+        btrfs_set_key_type(&ino_key, BTRFS_INODE_ITEM_KEY);
+        ino_key.offset = 0;
+        ret = btrfs_lookup_inode(trans, root, path, &ino_key, 1);
+        if (ret > 0) {
+                btrfs_release_path(path);
+                ret = -ENOENT;
+                goto unlock;
+        }
+        if (ret < 0)
+                goto unlock;
+        btrfs_unlock_up_safe(path, 1);
+        eb = path->nodes[0];
+        dst = btrfs_item_ptr(eb, path->slots[0], struct btrfs_inode_item);
+        write_extent_buffer(eb, &node->inode_item, (unsigned long)dst,
+                            sizeof(struct btrfs_inode_item));
+        btrfs_mark_buffer_dirty(eb);
+        btrfs_release_path(path);
+        btrfs_delayed_inode_release_metadata(root, node);
+        btrfs_release_delayed_inode(node);
+unlock:
+        mutex_unlock(&node->mutex);
+        return ret;
+}
+/*
+ * Called when committing the transaction.
+ *
+ * Walks every delayed node of the delayed root and, for each one, flushes
+ * its insertions, then its deletions, then its inode update.  The walk stops
+ * at the first failure.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the first
+ * error reported by one of the flush steps.
+ */
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root)
+{
+        struct btrfs_delayed_root *droot;
+        struct btrfs_delayed_node *node, *done;
+        struct btrfs_path *path;
+        int ret = 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->leave_spinning = 1;
+        droot = btrfs_get_delayed_root(root);
+        node = btrfs_first_delayed_node(droot);
+        while (node) {
+                /* each delayed node is flushed into the root it belongs to */
+                root = node->root;
+                ret = btrfs_insert_delayed_items(trans, path, root, node);
+                if (ret == 0)
+                        ret = btrfs_delete_delayed_items(trans, path, root,
+                                                         node);
+                if (ret == 0)
+                        ret = btrfs_update_delayed_inode(trans, root, path,
+                                                         node);
+                if (ret != 0) {
+                        btrfs_release_delayed_node(node);
+                        break;
+                }
+                /* fetch the successor before dropping the current node */
+                done = node;
+                node = btrfs_next_delayed_node(done);
+                btrfs_release_delayed_node(done);
+        }
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Flush everything queued on one delayed node: insertions first, then
+ * deletions, then the inode update.  Later steps are skipped once a step
+ * fails.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the error of
+ * the failing step.
+ */
+static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+                                              struct btrfs_delayed_node *node)
+{
+        struct btrfs_path *path = btrfs_alloc_path();
+        int ret;
+        if (!path)
+                return -ENOMEM;
+        path->leave_spinning = 1;
+        ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+        if (ret)
+                goto free_path;
+        ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+        if (ret)
+                goto free_path;
+        ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+free_path:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Flush the delayed items queued for @inode, if there are any.
+ *
+ * Returns 0 when the inode has no delayed node or the node is empty,
+ * otherwise the result of __btrfs_commit_inode_delayed_items().
+ */
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+                                     struct inode *inode)
+{
+        struct btrfs_delayed_node *node = btrfs_get_delayed_node(inode);
+        int has_work;
+        int ret = 0;
+        if (!node)
+                return 0;
+        /* count is only sampled under the mutex; the flush locks on its own */
+        mutex_lock(&node->mutex);
+        has_work = node->count != 0;
+        mutex_unlock(&node->mutex);
+        if (has_work)
+                ret = __btrfs_commit_inode_delayed_items(trans, node);
+        btrfs_release_delayed_node(node);
+        return ret;
+}
+/*
+ * Detach the delayed node cached in @inode and release it.  Does nothing
+ * when the inode has no cached delayed node.
+ */
+void btrfs_remove_delayed_node(struct inode *inode)
+{
+        struct btrfs_delayed_node *cached;
+        cached = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
+        if (cached) {
+                BTRFS_I(inode)->delayed_node = NULL;
+                btrfs_release_delayed_node(cached);
+        }
+}
+/*
+ * Work item handed to the delayed workers.  One is allocated for every
+ * delayed node queued by btrfs_wq_run_delayed_node() and it is freed by the
+ * work function once the node does not need to be requeued.
+ */
+struct btrfs_async_delayed_node {
+        /* root that was passed to btrfs_wq_run_delayed_node() */
+        struct btrfs_root *root;
+        /* the prepared delayed node this work item has to flush */
+        struct btrfs_delayed_node *delayed_node;
+        /* embedded work; the work function recovers the item via container_of() */
+        struct btrfs_work work;
+};
+/*
+ * Work function of the delayed workers: flush one prepared delayed node.
+ *
+ * Joins the running transaction and flushes the node's insertions, deletions
+ * and inode update.  If the node has items left afterwards the work is
+ * requeued; otherwise the node is dequeued, released and the work item is
+ * freed.  Failing to allocate a path or to join the transaction skips the
+ * flush but still releases the node and frees the work item.
+ */
+static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
+{
+        struct btrfs_async_delayed_node *async_node;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_path *path;
+        struct btrfs_delayed_node *delayed_node = NULL;
+        struct btrfs_root *root;
+        unsigned long nr = 0;
+        int need_requeue = 0;
+        int ret;
+        async_node = container_of(work, struct btrfs_async_delayed_node, work);
+        /*
+         * Pick up the node before anything can fail, so that the "out" path
+         * releases it on every error instead of being handed NULL.
+         */
+        delayed_node = async_node->delayed_node;
+        path = btrfs_alloc_path();
+        if (!path)
+                goto out;
+        path->leave_spinning = 1;
+        root = delayed_node->root;
+        trans = btrfs_join_transaction(root, 0);
+        if (IS_ERR(trans))
+                goto free_path;
+        ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
+        if (!ret)
+                ret = btrfs_delete_delayed_items(trans, path, root,
+                                                 delayed_node);
+        if (!ret)
+                btrfs_update_delayed_inode(trans, root, path, delayed_node);
+        /*
+         * Maybe new delayed items have been inserted, so we need requeue
+         * the work. Besides that, we must dequeue the empty delayed nodes
+         * to avoid the race between delayed items balance and the worker.
+         * The race like this:
+         *      Task1                           Worker thread
+         *                                      count == 0, needn't requeue
+         *                                        also needn't insert the
+         *                                        delayed node into prepare
+         *                                        list again.
+         *      add lots of delayed items
+         *      queue the delayed node
+         *        already in the list,
+         *        and not in the prepare
+         *        list, it means the delayed
+         *        node is being dealt with
+         *        by the worker.
+         *      do delayed items balance
+         *        the delayed node is being
+         *        dealt with by the worker
+         *        now, just wait.
+         *                                      the worker goto idle.
+         * Task1 will sleep until the transaction is committed.
+         */
+        mutex_lock(&delayed_node->mutex);
+        if (delayed_node->count)
+                need_requeue = 1;
+        else
+                btrfs_dequeue_delayed_node(root->fs_info->delayed_root,
+                                           delayed_node);
+        mutex_unlock(&delayed_node->mutex);
+        nr = trans->blocks_used;
+        btrfs_end_transaction_dmeta(trans, root);
+        __btrfs_btree_balance_dirty(root, nr);
+free_path:
+        btrfs_free_path(path);
+out:
+        if (need_requeue)
+                btrfs_requeue_work(&async_node->work);
+        else {
+                btrfs_release_prepared_delayed_node(delayed_node);
+                kfree(async_node);
+        }
+}
+/*
+ * Hand prepared delayed nodes over to the delayed workers.
+ *
+ * With @all set, every prepared node is queued; otherwise at most four are.
+ *
+ * Returns 0 when the limit is reached or no prepared node is left, or
+ * -ENOMEM if a work item cannot be allocated (nodes queued before the
+ * failure stay queued).
+ */
+static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
+                                     struct btrfs_root *root, int all)
+{
+        struct btrfs_async_delayed_node *job;
+        struct btrfs_delayed_node *node;
+        int queued = 0;
+        do {
+                node = btrfs_first_prepared_delayed_node(delayed_root);
+                if (!node)
+                        return 0;
+                job = kmalloc(sizeof(*job), GFP_NOFS);
+                if (!job) {
+                        btrfs_release_prepared_delayed_node(node);
+                        return -ENOMEM;
+                }
+                job->root = root;
+                job->delayed_node = node;
+                job->work.func = btrfs_async_run_delayed_node_done;
+                job->work.flags = 0;
+                btrfs_queue_worker(&root->fs_info->delayed_workers,
+                                   &job->work);
+                queued++;
+        } while (all || queued < 4);
+        return 0;
+}
+/*
+ * Throttle the number of pending delayed items.
+ *
+ * Below BTRFS_DELAYED_BACKGROUND nothing is done.  From there up to
+ * BTRFS_DELAYED_WRITEBACK a few nodes are pushed to the workers.  At or
+ * above BTRFS_DELAYED_WRITEBACK all prepared nodes are pushed and the caller
+ * waits, for up to HZ jiffies, until the count is below
+ * BTRFS_DELAYED_BACKGROUND again.
+ */
+void btrfs_balance_delayed_items(struct btrfs_root *root)
+{
+        struct btrfs_delayed_root *droot = btrfs_get_delayed_root(root);
+        if (atomic_read(&droot->items) < BTRFS_DELAYED_BACKGROUND)
+                return;
+        if (atomic_read(&droot->items) < BTRFS_DELAYED_WRITEBACK) {
+                btrfs_wq_run_delayed_node(droot, root, 0);
+                return;
+        }
+        /* nothing to wait for if the work could not be queued */
+        if (btrfs_wq_run_delayed_node(droot, root, 1))
+                return;
+        wait_event_interruptible_timeout(
+                        droot->wait,
+                        (atomic_read(&droot->items) <
+                         BTRFS_DELAYED_BACKGROUND),
+                        HZ);
+}
+/*
+ * Queue the insertion of a directory index item for @dir.
+ *
+ * A delayed item holding a struct btrfs_dir_item followed by the @name_len
+ * bytes of @name is built under the key (ino of @dir, BTRFS_DIR_INDEX_KEY,
+ * @index) and added to the insertion tree of the directory's delayed node.
+ *
+ * Returns 0 on success, -ENOMEM if the delayed item cannot be allocated, or
+ * the error from btrfs_get_or_create_delayed_node().  Failing to reserve
+ * metadata or to add the item to the tree is treated as a bug.
+ */
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root, const char *name,
+                                   int name_len, struct inode *dir,
+                                   struct btrfs_disk_key *disk_key, u8 type,
+                                   u64 index)
+{
+        struct btrfs_delayed_node *delayed_node;
+        struct btrfs_delayed_item *delayed_item;
+        struct btrfs_dir_item *dir_item;
+        int ret;
+        delayed_node = btrfs_get_or_create_delayed_node(dir);
+        if (IS_ERR(delayed_node))
+                return PTR_ERR(delayed_node);
+        delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+        if (!delayed_item) {
+                ret = -ENOMEM;
+                goto release_node;
+        }
+        ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+        /*
+         * we have reserved enough space when we start a new transaction,
+         * so reserving metadata failure is impossible
+         */
+        BUG_ON(ret);
+        delayed_item->key.objectid = btrfs_ino(dir);
+        btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
+        delayed_item->key.offset = index;
+        dir_item = (struct btrfs_dir_item *)delayed_item->data;
+        dir_item->location = *disk_key;
+        dir_item->transid = cpu_to_le64(trans->transid);
+        dir_item->data_len = 0;
+        dir_item->name_len = cpu_to_le16(name_len);
+        dir_item->type = type;
+        /* the name is stored right behind the dir item header */
+        memcpy((char *)(dir_item + 1), name, name_len);
+        mutex_lock(&delayed_node->mutex);
+        ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+        if (unlikely(ret)) {
+                /*
+                 * @name is only known to be @name_len bytes long, so bound
+                 * the print instead of relying on a terminating NUL.
+                 */
+                printk(KERN_ERR "err add delayed dir index item(name: %.*s) into "
+                                "the insertion tree of the delayed node"
+                                "(root id: %llu, inode id: %llu, errno: %d)\n",
+                                name_len, name,
+                                (unsigned long long)delayed_node->root->objectid,
+                                (unsigned long long)delayed_node->inode_id,
+                                ret);
+                BUG();
+        }
+        mutex_unlock(&delayed_node->mutex);
+release_node:
+        btrfs_release_delayed_node(delayed_node);
+        return ret;
+}
+/*
+ * Cancel a still-pending insertion item of @node.
+ *
+ * Returns 0 if an insertion item with @key was found (its metadata
+ * reservation and the item itself are released), or 1 if there is none.
+ */
+static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
+                                               struct btrfs_delayed_node *node,
+                                               struct btrfs_key *key)
+{
+        struct btrfs_delayed_item *pending;
+        int ret = 1;
+        mutex_lock(&node->mutex);
+        pending = __btrfs_lookup_delayed_insertion_item(node, key);
+        if (pending) {
+                btrfs_delayed_item_release_metadata(root, pending);
+                btrfs_release_delayed_item(pending);
+                ret = 0;
+        }
+        mutex_unlock(&node->mutex);
+        return ret;
+}
+/*
+ * Queue the deletion of the directory index @index of @dir.
+ *
+ * If the index is still waiting in the insertion tree, that pending
+ * insertion is simply cancelled.  Otherwise an empty delayed item with the
+ * index key is added to the deletion tree of the directory's delayed node.
+ *
+ * Returns 0 on success, -ENOMEM if the delayed item cannot be allocated, or
+ * the error from btrfs_get_or_create_delayed_node().  Failing to reserve
+ * metadata or to add the item to the tree is treated as a bug.
+ */
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root, struct inode *dir,
+                                   u64 index)
+{
+        struct btrfs_delayed_node *node;
+        struct btrfs_delayed_item *del_item;
+        struct btrfs_key key;
+        int ret;
+        node = btrfs_get_or_create_delayed_node(dir);
+        if (IS_ERR(node))
+                return PTR_ERR(node);
+        key.objectid = btrfs_ino(dir);
+        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+        key.offset = index;
+        /* a pending insertion of the same index just gets cancelled */
+        ret = btrfs_delete_delayed_insertion_item(root, node, &key);
+        if (ret == 0)
+                goto release_node;
+        del_item = btrfs_alloc_delayed_item(0);
+        if (del_item == NULL) {
+                ret = -ENOMEM;
+                goto release_node;
+        }
+        del_item->key = key;
+        ret = btrfs_delayed_item_reserve_metadata(trans, root, del_item);
+        /*
+         * we have reserved enough space when we start a new transaction,
+         * so reserving metadata failure is impossible.
+         */
+        BUG_ON(ret);
+        mutex_lock(&node->mutex);
+        ret = __btrfs_add_delayed_deletion_item(node, del_item);
+        if (unlikely(ret)) {
+                printk(KERN_ERR "err add delayed dir index item(index: %llu) "
+                                "into the deletion tree of the delayed node"
+                                "(root id: %llu, inode id: %llu, errno: %d)\n",
+                                (unsigned long long)index,
+                                (unsigned long long)node->root->objectid,
+                                (unsigned long long)node->inode_id,
+                                ret);
+                BUG();
+        }
+        mutex_unlock(&node->mutex);
+release_node:
+        btrfs_release_delayed_node(node);
+        return ret;
+}
+/*
+ * Copy the directory index counter cached in the delayed node of @inode
+ * into the btrfs inode.
+ *
+ * Returns 0 on success, -ENOENT if the inode has no delayed node, or
+ * -EINVAL if the delayed node's index_cnt is still zero.
+ */
+int btrfs_inode_delayed_dir_index_count(struct inode *inode)
+{
+        struct btrfs_inode *bi = BTRFS_I(inode);
+        struct btrfs_delayed_node *node = bi->delayed_node;
+        if (node == NULL)
+                return -ENOENT;
+        /*
+         * Since we have held i_mutex of this directory, it is impossible that
+         * a new directory index is added into the delayed node and index_cnt
+         * is updated now. So we needn't lock the delayed node.
+         */
+        if (node->index_cnt == 0)
+                return -EINVAL;
+        bi->index_cnt = node->index_cnt;
+        return 0;
+}
+/*
+ * Collect the delayed items of @inode for readdir.
+ *
+ * Every pending insertion item is appended to @ins_list and every pending
+ * deletion item to @del_list, each with an extra reference taken.  Both
+ * lists are left untouched when the inode has no delayed node.
+ */
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+                             struct list_head *del_list)
+{
+        struct btrfs_delayed_node *node = btrfs_get_delayed_node(inode);
+        struct btrfs_delayed_item *it;
+        if (!node)
+                return;
+        mutex_lock(&node->mutex);
+        for (it = __btrfs_first_delayed_insertion_item(node); it;
+             it = __btrfs_next_delayed_item(it)) {
+                atomic_inc(&it->refs);
+                list_add_tail(&it->readdir_list, ins_list);
+        }
+        for (it = __btrfs_first_delayed_deletion_item(node); it;
+             it = __btrfs_next_delayed_item(it)) {
+                atomic_inc(&it->refs);
+                list_add_tail(&it->readdir_list, del_list);
+        }
+        mutex_unlock(&node->mutex);
+        /*
+         * This delayed node is still cached in the btrfs inode, so refs
+         * must be > 1 now, and we needn't check it is going to be freed
+         * or not.
+         *
+         * Besides that, this function is used to read dir, we do not
+         * insert/delete delayed items in this period. So we also needn't
+         * requeue or dequeue this delayed node.
+         */
+        atomic_dec(&node->refs);
+}
+/*
+ * Unlink every item from a readdir list and drop the reference the list
+ * held on it, freeing the item when that was the last reference.
+ */
+static void btrfs_drop_readdir_list(struct list_head *list)
+{
+        struct btrfs_delayed_item *it, *tmp;
+        list_for_each_entry_safe(it, tmp, list, readdir_list) {
+                list_del(&it->readdir_list);
+                if (atomic_dec_and_test(&it->refs))
+                        kfree(it);
+        }
+}
+/*
+ * Release the items left on the two readdir lists filled by
+ * btrfs_get_delayed_items().
+ */
+void btrfs_put_delayed_items(struct list_head *ins_list,
+                             struct list_head *del_list)
+{
+        btrfs_drop_readdir_list(ins_list);
+        btrfs_drop_readdir_list(del_list);
+}
+/*
+ * Tell whether the directory index @index has a pending delayed deletion.
+ *
+ * Items on @del_list whose offset is not greater than @index are consumed
+ * (unlinked and unreferenced) while searching; the walk stops at the first
+ * item with a larger offset.
+ *
+ * Returns 1 if an item with offset @index was found, 0 otherwise.
+ */
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+                                  u64 index)
+{
+        struct btrfs_delayed_item *it, *tmp;
+        int hit;
+        list_for_each_entry_safe(it, tmp, del_list, readdir_list) {
+                if (it->key.offset > index)
+                        break;
+                list_del(&it->readdir_list);
+                /* compare before the item may be freed below */
+                hit = (it->key.offset == index);
+                if (atomic_dec_and_test(&it->refs))
+                        kfree(it);
+                if (hit)
+                        return 1;
+        }
+        return 0;
+}
+/*
+ * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
+ *
+ * Every item on @ins_list is consumed (unlinked and unreferenced).  Items
+ * whose index is below filp->f_pos are skipped; for the others f_pos is
+ * moved to the item's index and the entry is passed to @filldir.
+ *
+ * Returns 1 as soon as @filldir reports that it cannot take more entries,
+ * 0 once the list is exhausted.
+ */
+int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
+                                    filldir_t filldir,
+                                    struct list_head *ins_list)
+{
+        struct btrfs_delayed_item *it, *tmp;
+        struct btrfs_dir_item *entry;
+        struct btrfs_key location;
+        unsigned char d_type;
+        int name_len;
+        int over;
+        /*
+         * Changing the data of the delayed item is impossible. So
+         * we needn't lock them. And we have held i_mutex of the
+         * directory, nobody can delete any directory indexes now.
+         */
+        list_for_each_entry_safe(it, tmp, ins_list, readdir_list) {
+                list_del(&it->readdir_list);
+                over = 0;
+                if (!(it->key.offset < filp->f_pos)) {
+                        filp->f_pos = it->key.offset;
+                        entry = (struct btrfs_dir_item *)it->data;
+                        name_len = le16_to_cpu(entry->name_len);
+                        d_type = btrfs_filetype_table[entry->type];
+                        btrfs_disk_key_to_cpu(&location, &entry->location);
+                        /* the name sits right behind the dir item header */
+                        over = filldir(dirent, (char *)(entry + 1), name_len,
+                                       it->key.offset, location.objectid,
+                                       d_type);
+                }
+                if (atomic_dec_and_test(&it->refs))
+                        kfree(it);
+                if (over)
+                        return 1;
+        }
+        return 0;
+}
+/*
+ * Instantiate the stack accessors (btrfs_set_stack_inode_uid() and friends,
+ * used by fill_stack_inode_item() below) for the fields of struct
+ * btrfs_inode_item and struct btrfs_timespec.
+ * NOTE(review): the last macro argument is presumably the field width in
+ * bits -- confirm against the BTRFS_SETGET_STACK_FUNCS definition.
+ */
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+                         generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+                         sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+                         transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+                         nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+                         block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+/*
+ * Fill the in-memory copy @inode_item from the current state of @inode.
+ * The transid field is taken from @trans, everything else from the VFS
+ * inode and its btrfs inode.
+ */
+static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
+                                  struct btrfs_inode_item *inode_item,
+                                  struct inode *inode)
+{
+        struct btrfs_inode *bi = BTRFS_I(inode);
+        struct btrfs_timespec *atime = btrfs_inode_atime(inode_item);
+        struct btrfs_timespec *mtime = btrfs_inode_mtime(inode_item);
+        struct btrfs_timespec *ctime = btrfs_inode_ctime(inode_item);
+        /* ownership, mode and link count */
+        btrfs_set_stack_inode_uid(inode_item, inode->i_uid);
+        btrfs_set_stack_inode_gid(inode_item, inode->i_gid);
+        btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
+        btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
+        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
+        /* sizes */
+        btrfs_set_stack_inode_size(inode_item, bi->disk_i_size);
+        btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
+        /* btrfs bookkeeping */
+        btrfs_set_stack_inode_generation(inode_item, bi->generation);
+        btrfs_set_stack_inode_sequence(inode_item, bi->sequence);
+        btrfs_set_stack_inode_transid(inode_item, trans->transid);
+        btrfs_set_stack_inode_flags(inode_item, bi->flags);
+        btrfs_set_stack_inode_block_group(inode_item, bi->block_group);
+        /* timestamps */
+        btrfs_set_stack_timespec_sec(atime, inode->i_atime.tv_sec);
+        btrfs_set_stack_timespec_nsec(atime, inode->i_atime.tv_nsec);
+        btrfs_set_stack_timespec_sec(mtime, inode->i_mtime.tv_sec);
+        btrfs_set_stack_timespec_nsec(mtime, inode->i_mtime.tv_nsec);
+        btrfs_set_stack_timespec_sec(ctime, inode->i_ctime.tv_sec);
+        btrfs_set_stack_timespec_nsec(ctime, inode->i_ctime.tv_nsec);
+}
+/*
+ * Record the current state of @inode in its delayed node so that the inode
+ * item is written back later.
+ *
+ * If the node already carries a dirty inode only the cached copy is
+ * refreshed.  Otherwise metadata is reserved, the copy is filled in and the
+ * node is marked dirty and accounted in the delayed item counters.
+ *
+ * Returns 0 on success or the error from btrfs_get_or_create_delayed_node().
+ * Failing to reserve metadata is treated as a bug.
+ */
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root, struct inode *inode)
+{
+        struct btrfs_delayed_node *delayed_node;
+        /* the already-dirty fast path returns without assigning ret */
+        int ret = 0;
+        delayed_node = btrfs_get_or_create_delayed_node(inode);
+        if (IS_ERR(delayed_node))
+                return PTR_ERR(delayed_node);
+        mutex_lock(&delayed_node->mutex);
+        if (delayed_node->inode_dirty) {
+                fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+                goto release_node;
+        }
+        ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+        /*
+         * we must reserve enough space when we start a new transaction,
+         * so reserving metadata failure is impossible
+         */
+        BUG_ON(ret);
+        fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+        delayed_node->inode_dirty = 1;
+        delayed_node->count++;
+        atomic_inc(&root->fs_info->delayed_root->items);
+release_node:
+        mutex_unlock(&delayed_node->mutex);
+        btrfs_release_delayed_node(delayed_node);
+        return ret;
+}
+/*
+ * Throw away everything queued on @delayed_node without writing it to the
+ * tree: all insertion items, all deletion items and a dirty inode, each
+ * together with its metadata reservation.
+ */
+static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
+{
+        struct btrfs_root *root = delayed_node->root;
+        struct btrfs_delayed_item *it, *victim;
+        mutex_lock(&delayed_node->mutex);
+        it = __btrfs_first_delayed_insertion_item(delayed_node);
+        while (it) {
+                btrfs_delayed_item_release_metadata(root, it);
+                /* step to the successor before the item is released */
+                victim = it;
+                it = __btrfs_next_delayed_item(victim);
+                btrfs_release_delayed_item(victim);
+        }
+        it = __btrfs_first_delayed_deletion_item(delayed_node);
+        while (it) {
+                btrfs_delayed_item_release_metadata(root, it);
+                victim = it;
+                it = __btrfs_next_delayed_item(victim);
+                btrfs_release_delayed_item(victim);
+        }
+        if (delayed_node->inode_dirty) {
+                btrfs_delayed_inode_release_metadata(root, delayed_node);
+                btrfs_release_delayed_inode(delayed_node);
+        }
+        mutex_unlock(&delayed_node->mutex);
+}
+/*
+ * Discard all delayed items queued for @inode.  Does nothing when the inode
+ * has no delayed node.
+ */
+void btrfs_kill_delayed_inode_items(struct inode *inode)
+{
+        struct btrfs_delayed_node *node = btrfs_get_delayed_node(inode);
+        if (node) {
+                __btrfs_kill_delayed_node(node);
+                btrfs_release_delayed_node(node);
+        }
+}
+/*
+ * Discard the delayed items of every delayed node of @root.
+ *
+ * The nodes are fetched from root->delayed_nodes_tree in batches under
+ * root->inode_lock; a reference is taken on each node of a batch before the
+ * lock is dropped, then each node is killed and released.
+ */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
+{
+        struct btrfs_delayed_node *batch[8];
+        u64 next_ino = 0;
+        int found, k;
+        for (;;) {
+                spin_lock(&root->inode_lock);
+                found = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+                                               (void **)batch, next_ino,
+                                               ARRAY_SIZE(batch));
+                if (found == 0) {
+                        spin_unlock(&root->inode_lock);
+                        break;
+                }
+                /* the next lookup resumes right behind this batch */
+                next_ino = batch[found - 1]->inode_id + 1;
+                for (k = 0; k < found; k++)
+                        atomic_inc(&batch[k]->refs);
+                spin_unlock(&root->inode_lock);
+                for (k = 0; k < found; k++) {
+                        __btrfs_kill_delayed_node(batch[k]);
+                        btrfs_release_delayed_node(batch[k]);
+                }
+        }
+}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
new file mode 100644
index 000000000000..eb7d240aa648
--- /dev/null
+++ b/fs/btrfs/delayed-inode.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2011 Fujitsu.  All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __DELAYED_TREE_OPERATION_H
+#define __DELAYED_TREE_OPERATION_H
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include "ctree.h"
+/* types of the delayed item */
+#define BTRFS_DELAYED_INSERTION_ITEM    1
+#define BTRFS_DELAYED_DELETION_ITEM     2
+/*
+ * Bookkeeping shared by all delayed nodes.  One instance is allocated in
+ * open_ctree() and stored in fs_info->delayed_root.
+ */
+struct btrfs_delayed_root {
+        spinlock_t lock;        /* presumably protects the lists below */
+        /* delayed nodes are linked here through btrfs_delayed_node::n_list */
+        struct list_head node_list;
+        /*
+         * Used for delayed nodes that are waiting to be dealt with by the
+         * worker. Once a delayed node has been inserted into the work queue,
+         * it is dropped from this list.
+         */
+        struct list_head prepare_list;
+        atomic_t items;         /* for delayed items */
+        int nodes;              /* for delayed nodes */
+        wait_queue_head_t wait;
+};
+/*
+ * Delayed-operation state for one inode.  Nodes are found through
+ * root->delayed_nodes_tree, and struct btrfs_inode points at its node via
+ * its delayed_node member.
+ */
+struct btrfs_delayed_node {
+        /* used as the lookup position in root->delayed_nodes_tree */
+        u64 inode_id;
+        u64 bytes_reserved;
+        struct btrfs_root *root;
+        /* Used to add the node into the delayed root's node list. */
+        struct list_head n_list;
+        /*
+         * Used to add the node into the prepare list; the nodes in that list
+         * are waiting to be dealt with by the async worker.
+         */
+        struct list_head p_list;
+        /* presumably rb-trees of pending insertion / deletion items */
+        struct rb_root ins_root;
+        struct rb_root del_root;
+        struct mutex mutex;
+        struct btrfs_inode_item inode_item;
+        /* bumped by btrfs_kill_all_delayed_nodes() before it uses a node */
+        atomic_t refs;
+        u64 index_cnt;
+        bool in_list;
+        /* checked before the delayed inode update is released on kill */
+        bool inode_dirty;
+        int count;
+};
+/*
+ * One pending delayed item.  The struct is followed in memory by its
+ * payload (see data[] below).
+ */
+struct btrfs_delayed_item {
+        struct rb_node rb_node;
+        struct btrfs_key key;
+        struct list_head tree_list;     /* used for batch insert/delete items */
+        struct list_head readdir_list;  /* used for readdir items */
+        u64 bytes_reserved;
+        struct btrfs_block_rsv *block_rsv;
+        struct btrfs_delayed_node *delayed_node;
+        atomic_t refs;
+        /*
+         * presumably BTRFS_DELAYED_INSERTION_ITEM or
+         * BTRFS_DELAYED_DELETION_ITEM -- confirm against the users
+         */
+        int ins_or_del;
+        u32 data_len;   /* presumably the number of bytes in data[] */
+        char data[0];   /* trailing variable-length payload */
+};
+/*
+ * Initialise every member of @delayed_root: both counters start at zero,
+ * the spinlock and wait queue are set up, and both lists start out empty.
+ * open_ctree() calls this right after allocating the structure with
+ * kmalloc(), so no member may be assumed to be pre-zeroed.
+ */
+static inline void btrfs_init_delayed_root(
+                                struct btrfs_delayed_root *delayed_root)
+{
+        atomic_set(&delayed_root->items, 0);
+        delayed_root->nodes = 0;
+        spin_lock_init(&delayed_root->lock);
+        init_waitqueue_head(&delayed_root->wait);
+        INIT_LIST_HEAD(&delayed_root->node_list);
+        INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root, const char *name,
+                                   int name_len, struct inode *dir,
+                                   struct btrfs_disk_key *disk_key, u8 type,
+                                   u64 index);
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root, struct inode *dir,
+                                   u64 index);
+int btrfs_inode_delayed_dir_index_count(struct inode *inode);
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root);
+void btrfs_balance_delayed_items(struct btrfs_root *root);
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+                                     struct inode *inode);
+/* Used for evicting the inode. */
+void btrfs_remove_delayed_node(struct inode *inode);
+void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root, struct inode *inode);
+/* Used when dropping a dead root */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+/* Used for readdir() */
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+                             struct list_head *del_list);
+void btrfs_put_delayed_items(struct list_head *ins_list,
+                             struct list_head *del_list);
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+                                  u64 index);
+int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
+                                    filldir_t filldir,
+                                    struct list_head *ins_list);
+/* for init */
+int __init btrfs_delayed_inode_init(void);
+void btrfs_delayed_inode_exit(void);
+#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index bce28f653899..125cf76fcd08 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -281,44 +281,6 @@ again:
 }
 /*
- * This checks to see if there are any delayed refs in the
- * btree for a given bytenr.  It returns one if it finds any
- * and zero otherwise.
- *
- * If it only finds a head node, it returns 0.
- *
- * The idea is to use this when deciding if you can safely delete an
- * extent from the extent allocation tree.  There may be a pending
- * ref in the rbtree that adds or removes references, so as long as this
- * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
- * allocation tree.
- */
-int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
-{
-        struct btrfs_delayed_ref_node *ref;
-        struct btrfs_delayed_ref_root *delayed_refs;
-        struct rb_node *prev_node;
-        int ret = 0;
-        delayed_refs = &trans->transaction->delayed_refs;
-        spin_lock(&delayed_refs->lock);
-        ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
-        if (ref) {
-                prev_node = rb_prev(&ref->rb_node);
-                if (!prev_node)
-                        goto out;
-                ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
-                               rb_node);
-                if (ref->bytenr == bytenr)
-                        ret = 1;
-        }
-out:
-        spin_unlock(&delayed_refs->lock);
-        return ret;
-}
-/*
 * helper function to update an extent delayed ref in the
 * rbtree.  existing and update must both have the same
 * bytenr and parent
@@ -747,79 +709,3 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
 }
-/*
- * add a delayed ref to the tree.  This does all of the accounting required
- * to make sure the delayed ref is eventually processed before this
- * transaction commits.
- *
- * The main point of this call is to add and remove a backreference in a single
- * shot, taking the lock only once, and only searching for the head node once.
- *
- * It is the same as doing a ref add and delete in two separate calls.
- */
-#if 0
-int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
-                          u64 bytenr, u64 num_bytes, u64 orig_parent,
-                          u64 parent, u64 orig_ref_root, u64 ref_root,
-                          u64 orig_ref_generation, u64 ref_generation,
-                          u64 owner_objectid, int pin)
-{
-        struct btrfs_delayed_ref *ref;
-        struct btrfs_delayed_ref *old_ref;
-        struct btrfs_delayed_ref_head *head_ref;
-        struct btrfs_delayed_ref_root *delayed_refs;
-        int ret;
-        ref = kmalloc(sizeof(*ref), GFP_NOFS);
-        if (!ref)
-                return -ENOMEM;
-        old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
-        if (!old_ref) {
-                kfree(ref);
-                return -ENOMEM;
-        }
-        /*
-         * the parent = 0 case comes from cases where we don't actually
-         * know the parent yet.  It will get updated later via a add/drop
-         * pair.
-         */
-        if (parent == 0)
-                parent = bytenr;
-        if (orig_parent == 0)
-                orig_parent = bytenr;
-        head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
-        if (!head_ref) {
-                kfree(ref);
-                kfree(old_ref);
-                return -ENOMEM;
-        }
-        delayed_refs = &trans->transaction->delayed_refs;
-        spin_lock(&delayed_refs->lock);
-        /*
-         * insert both the head node and the new ref without dropping
-         * the spin lock
-         */
-        ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
-                                      (u64)-1, 0, 0, 0,
-                                      BTRFS_UPDATE_DELAYED_HEAD, 0);
-        BUG_ON(ret);
-        ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
-                                      parent, ref_root, ref_generation,
-                                      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
-        BUG_ON(ret);
-        ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
-                                      orig_parent, orig_ref_root,
-                                      orig_ref_generation, owner_objectid,
-                                      BTRFS_DROP_DELAYED_REF, pin);
-        BUG_ON(ret);
-        spin_unlock(&delayed_refs->lock);
-        return 0;
-}
-#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 50e3cf92fbda..e287e3b0eab0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -166,12 +166,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
-                          u64 bytenr, u64 num_bytes, u64 orig_parent,
-                          u64 parent, u64 orig_ref_root, u64 ref_root,
-                          u64 orig_ref_generation, u64 ref_generation,
-                          u64 owner_objectid, int pin);
 int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
                           struct btrfs_delayed_ref_head *head);
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c62f02f6ae69..685f2593c4f0 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -50,7 +50,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
                if (di)
                        return ERR_PTR(-EEXIST);
                ret = btrfs_extend_item(trans, root, path, data_size);
-                WARN_ON(ret > 0);
        }
        if (ret < 0)
                return ERR_PTR(ret);
@@ -124,8 +123,9 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 * to use for the second index (if one is created).
 */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-                          *root, const char *name, int name_len, u64 dir,
+                          *root, const char *name, int name_len,
-                          struct btrfs_key *location, u8 type, u64 index)
+                          struct inode *dir, struct btrfs_key *location,
+                          u8 type, u64 index)
 {
        int ret = 0;
        int ret2 = 0;
@@ -137,13 +137,17 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
        struct btrfs_disk_key disk_key;
        u32 data_size;
-        key.objectid = dir;
+        key.objectid = btrfs_ino(dir);
        btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
        key.offset = btrfs_name_hash(name, name_len);
        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
        path->leave_spinning = 1;
+        btrfs_cpu_key_to_disk(&disk_key, location);
        data_size = sizeof(*dir_item) + name_len;
        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
                                        name, name_len);
@@ -155,7 +159,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
        }
        leaf = path->nodes[0];
-        btrfs_cpu_key_to_disk(&disk_key, location);
        btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
        btrfs_set_dir_type(leaf, dir_item, type);
        btrfs_set_dir_data_len(leaf, dir_item, 0);
@@ -172,29 +175,11 @@ second_insert:
                ret = 0;
                goto out_free;
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
-        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-        key.offset = index;
-        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
-                                        name, name_len);
-        if (IS_ERR(dir_item)) {
-                ret2 = PTR_ERR(dir_item);
-                goto out_free;
-        }
-        leaf = path->nodes[0];
-        btrfs_cpu_key_to_disk(&disk_key, location);
-        btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
-        btrfs_set_dir_type(leaf, dir_item, type);
-        btrfs_set_dir_data_len(leaf, dir_item, 0);
-        btrfs_set_dir_name_len(leaf, dir_item, name_len);
-        btrfs_set_dir_transid(leaf, dir_item, trans->transid);
-        name_ptr = (unsigned long)(dir_item + 1);
-        write_extent_buffer(leaf, name, name_ptr, name_len);
-        btrfs_mark_buffer_dirty(leaf);
+        ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
+                                              &disk_key, type, index);
 out_free:
        btrfs_free_path(path);
        if (ret)
                return ret;
@@ -452,7 +437,7 @@ int verify_dir_item(struct btrfs_root *root,
                namelen = XATTR_NAME_MAX;
        if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
-                printk(KERN_CRIT "btrfS: invalid dir item name len: %u\n",
+                printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n",
                       (unsigned)btrfs_dir_data_len(leaf, dir_item));
                return 1;
        }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 228cf36ece83..98b6a71decba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,7 @@
 #include <linux/crc32c.h>
 #include <linux/slab.h>
 #include <linux/migrate.h>
+#include <linux/ratelimit.h>
 #include <asm/unaligned.h>
 #include "compat.h"
 #include "ctree.h"
@@ -41,6 +42,7 @@
 #include "locking.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
+#include "inode-map.h"
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -137,7 +139,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
 * that covers the entire device
 */
 static struct extent_map *btree_get_extent(struct inode *inode,
-                struct page *page, size_t page_offset, u64 start, u64 len,
+                struct page *page, size_t pg_offset, u64 start, u64 len,
                int create)
 {
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -154,7 +156,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
        }
        read_unlock(&em_tree->lock);
-        em = alloc_extent_map(GFP_NOFS);
+        em = alloc_extent_map();
        if (!em) {
                em = ERR_PTR(-ENOMEM);
                goto out;
@@ -254,14 +256,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
                        memcpy(&found, result, csum_size);
                        read_extent_buffer(buf, &val, 0, csum_size);
-                        if (printk_ratelimit()) {
+                        printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
-                                printk(KERN_INFO "btrfs: %s checksum verify "
                                       "failed on %llu wanted %X found %X "
                                       "level %d\n",
                                       root->fs_info->sb->s_id,
                                       (unsigned long long)buf->start, val, found,
                                       btrfs_header_level(buf));
-                        }
                        if (result != (char *)&inline_result)
                                kfree(result);
                        return 1;
@@ -296,13 +296,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
                ret = 0;
                goto out;
        }
-        if (printk_ratelimit()) {
+        printk_ratelimited("parent transid verify failed on %llu wanted %llu "
-                printk("parent transid verify failed on %llu wanted %llu "
                       "found %llu\n",
                       (unsigned long long)eb->start,
                       (unsigned long long)parent_transid,
                       (unsigned long long)btrfs_header_generation(eb));
-        }
        ret = 1;
        clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
@@ -380,7 +378,7 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        len = page->private >> 2;
        WARN_ON(len == 0);
-        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        eb = alloc_extent_buffer(tree, start, len, page);
        if (eb == NULL) {
                WARN_ON(1);
                goto out;
@@ -525,7 +523,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        len = page->private >> 2;
        WARN_ON(len == 0);
-        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        eb = alloc_extent_buffer(tree, start, len, page);
        if (eb == NULL) {
                ret = -EIO;
                goto out;
@@ -533,12 +531,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
-                if (printk_ratelimit()) {
+                printk_ratelimited(KERN_INFO "btrfs bad tree block start "
-                        printk(KERN_INFO "btrfs bad tree block start "
                               "%llu %llu\n",
                               (unsigned long long)found_start,
                               (unsigned long long)eb->start);
-                }
                ret = -EIO;
                goto err;
        }
@@ -550,10 +546,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                goto err;
        }
        if (check_tree_block_fsid(root, eb)) {
-                if (printk_ratelimit()) {
+                printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
-                        printk(KERN_INFO "btrfs bad fsid on block %llu\n",
                               (unsigned long long)eb->start);
-                }
                ret = -EIO;
                goto err;
        }
@@ -650,12 +644,6 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
        return 256 * limit;
 }
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
-{
-        return atomic_read(&info->nr_async_bios) >
-                btrfs_async_submit_limit(info);
-}
 static void run_one_async_start(struct btrfs_work *work)
 {
        struct async_submit_bio *async;
@@ -963,7 +951,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
        struct inode *btree_inode = root->fs_info->btree_inode;
        struct extent_buffer *eb;
        eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-                                bytenr, blocksize, GFP_NOFS);
+                                bytenr, blocksize);
        return eb;
 }
@@ -974,7 +962,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
        struct extent_buffer *eb;
        eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-                                 bytenr, blocksize, NULL, GFP_NOFS);
+                                 bytenr, blocksize, NULL);
        return eb;
 }
@@ -1058,13 +1046,13 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->name = NULL;
        root->in_sysfs = 0;
        root->inode_tree = RB_ROOT;
+        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
        root->block_rsv = NULL;
        root->orphan_block_rsv = NULL;
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->orphan_list);
        INIT_LIST_HEAD(&root->root_list);
-        spin_lock_init(&root->node_lock);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
        spin_lock_init(&root->accounting_lock);
@@ -1080,7 +1068,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->log_transid = 0;
        root->last_log_commit = 0;
        extent_io_tree_init(&root->dirty_log_pages,
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+                             fs_info->btree_inode->i_mapping);
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1283,21 +1271,6 @@ out:
        return root;
 }
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
-                                        u64 root_objectid)
-{
-        struct btrfs_root *root;
-        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
-                return fs_info->tree_root;
-        if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
-                return fs_info->extent_root;
-        root = radix_tree_lookup(&fs_info->fs_roots_radix,
-                                 (unsigned long)root_objectid);
-        return root;
-}
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                              struct btrfs_key *location)
 {
@@ -1326,6 +1299,19 @@ again:
        if (IS_ERR(root))
                return root;
+        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+        if (!root->free_ino_ctl)
+                goto fail;
+        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+                                        GFP_NOFS);
+        if (!root->free_ino_pinned)
+                goto fail;
+        btrfs_init_free_ino_ctl(root);
+        mutex_init(&root->fs_commit_mutex);
+        spin_lock_init(&root->cache_lock);
+        init_waitqueue_head(&root->cache_wait);
        set_anon_super(&root->anon_super, NULL);
        if (btrfs_root_refs(&root->root_item) == 0) {
@@ -1369,41 +1355,6 @@ fail:
        return ERR_PTR(ret);
 }
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
-                                      struct btrfs_key *location,
-                                      const char *name, int namelen)
-{
-        return btrfs_read_fs_root_no_name(fs_info, location);
-#if 0
-        struct btrfs_root *root;
-        int ret;
-        root = btrfs_read_fs_root_no_name(fs_info, location);
-        if (!root)
-                return NULL;
-        if (root->in_sysfs)
-                return root;
-        ret = btrfs_set_root_name(root, name, namelen);
-        if (ret) {
-                free_extent_buffer(root->node);
-                kfree(root);
-                return ERR_PTR(ret);
-        }
-        ret = btrfs_sysfs_add_root(root);
-        if (ret) {
-                free_extent_buffer(root->node);
-                kfree(root->name);
-                kfree(root);
-                return ERR_PTR(ret);
-        }
-        root->in_sysfs = 1;
-        return root;
-#endif
-}
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
        struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1411,7 +1362,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
        struct btrfs_device *device;
        struct backing_dev_info *bdi;
-        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+        rcu_read_lock();
+        list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
                if (!device->bdev)
                        continue;
                bdi = blk_get_backing_dev_info(device->bdev);
@@ -1420,6 +1372,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
                        break;
                }
        }
+        rcu_read_unlock();
        return ret;
 }
@@ -1522,6 +1475,7 @@ static int cleaner_kthread(void *arg)
                        btrfs_run_delayed_iputs(root);
                        btrfs_clean_old_snapshots(root);
                        mutex_unlock(&root->fs_info->cleaner_mutex);
+                        btrfs_run_defrag_inodes(root->fs_info);
                }
                if (freezing(current)) {
@@ -1611,7 +1565,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
                                                 GFP_NOFS);
        struct btrfs_root *tree_root = btrfs_sb(sb);
-        struct btrfs_fs_info *fs_info = tree_root->fs_info;
+        struct btrfs_fs_info *fs_info = NULL;
        struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
                                                GFP_NOFS);
        struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1623,11 +1577,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        struct btrfs_super_block *disk_super;
-        if (!extent_root || !tree_root || !fs_info ||
+        if (!extent_root || !tree_root || !tree_root->fs_info ||
            !chunk_root || !dev_root || !csum_root) {
                err = -ENOMEM;
                goto fail;
        }
+        fs_info = tree_root->fs_info;
        ret = init_srcu_struct(&fs_info->subvol_srcu);
        if (ret) {
@@ -1662,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->ref_cache_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
+        spin_lock_init(&fs_info->defrag_inodes_lock);
        init_completion(&fs_info->kobj_unregister);
        fs_info->tree_root = tree_root;
@@ -1684,15 +1640,35 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        atomic_set(&fs_info->async_delalloc_pages, 0);
        atomic_set(&fs_info->async_submit_draining, 0);
        atomic_set(&fs_info->nr_async_bios, 0);
+        atomic_set(&fs_info->defrag_running, 0);
        fs_info->sb = sb;
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
+        fs_info->defrag_inodes = RB_ROOT;
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
        INIT_LIST_HEAD(&fs_info->ordered_extents);
        spin_lock_init(&fs_info->ordered_extent_lock);
+        fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+                                        GFP_NOFS);
+        if (!fs_info->delayed_root) {
+                err = -ENOMEM;
+                goto fail_iput;
+        }
+        btrfs_init_delayed_root(fs_info->delayed_root);
+        mutex_init(&fs_info->scrub_lock);
+        atomic_set(&fs_info->scrubs_running, 0);
+        atomic_set(&fs_info->scrub_pause_req, 0);
+        atomic_set(&fs_info->scrubs_paused, 0);
+        atomic_set(&fs_info->scrub_cancel_req, 0);
+        init_waitqueue_head(&fs_info->scrub_pause_wait);
+        init_rwsem(&fs_info->scrub_super_lock);
+        fs_info->scrub_workers_refcnt = 0;
+        btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+                           fs_info->thread_pool_size, &fs_info->generic_worker);
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
@@ -1711,10 +1687,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
-                             fs_info->btree_inode->i_mapping,
+                             fs_info->btree_inode->i_mapping);
-                             GFP_NOFS);
+        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
-        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
-                             GFP_NOFS);
        BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -1728,9 +1702,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->block_group_cache_tree = RB_ROOT;
        extent_io_tree_init(&fs_info->freed_extents[0],
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+                             fs_info->btree_inode->i_mapping);
        extent_io_tree_init(&fs_info->freed_extents[1],
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+                             fs_info->btree_inode->i_mapping);
        fs_info->pinned_extents = &fs_info->freed_extents[0];
        fs_info->do_barriers = 1;
@@ -1760,7 +1734,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
        if (!bh) {
                err = -EINVAL;
-                goto fail_iput;
+                goto fail_alloc;
        }
        memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
@@ -1772,7 +1746,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        disk_super = &fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
-                goto fail_iput;
+                goto fail_alloc;
        /* check FS state, whether FS is broken. */
        fs_info->fs_state |= btrfs_super_flags(disk_super);
@@ -1788,7 +1762,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        ret = btrfs_parse_options(tree_root, options);
        if (ret) {
                err = ret;
-                goto fail_iput;
+                goto fail_alloc;
        }
        features = btrfs_super_incompat_flags(disk_super) &
@@ -1798,7 +1772,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                       "unsupported optional features (%Lx).\n",
                       (unsigned long long)features);
                err = -EINVAL;
-                goto fail_iput;
+                goto fail_alloc;
        }
        features = btrfs_super_incompat_flags(disk_super);
@@ -1814,7 +1788,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                       "unsupported option features (%Lx).\n",
                       (unsigned long long)features);
                err = -EINVAL;
-                goto fail_iput;
+                goto fail_alloc;
        }
        btrfs_init_workers(&fs_info->generic_worker,
@@ -1861,6 +1835,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                           &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
                           1, &fs_info->generic_worker);
+        btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
+                           fs_info->thread_pool_size,
+                           &fs_info->generic_worker);
        /*
         * endios are largely parallel and should have a very
@@ -1882,6 +1859,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
        btrfs_start_workers(&fs_info->endio_write_workers, 1);
        btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
+        btrfs_start_workers(&fs_info->delayed_workers, 1);
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2138,6 +2116,9 @@ fail_sb_buffer:
        btrfs_stop_workers(&fs_info->endio_write_workers);
        btrfs_stop_workers(&fs_info->endio_freespace_worker);
        btrfs_stop_workers(&fs_info->submit_workers);
+        btrfs_stop_workers(&fs_info->delayed_workers);
+fail_alloc:
+        kfree(fs_info->delayed_root);
 fail_iput:
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        iput(fs_info->btree_inode);
@@ -2165,11 +2146,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
-                if (printk_ratelimit()) {
+                printk_ratelimited(KERN_WARNING "lost page write due to "
-                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                                       bdevname(bh->b_bdev, b));
-                }
                /* note, we dont' set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
                 */
@@ -2333,7 +2312,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        head = &root->fs_info->fs_devices->devices;
-        list_for_each_entry(dev, head, dev_list) {
+        list_for_each_entry_rcu(dev, head, dev_list) {
                if (!dev->bdev) {
                        total_errors++;
                        continue;
@@ -2366,7 +2345,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
        }
        total_errors = 0;
-        list_for_each_entry(dev, head, dev_list) {
+        list_for_each_entry_rcu(dev, head, dev_list) {
                if (!dev->bdev)
                        continue;
                if (!dev->in_fs_metadata || !dev->writeable)
@@ -2404,12 +2383,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
        if (btrfs_root_refs(&root->root_item) == 0)
                synchronize_srcu(&fs_info->subvol_srcu);
+        __btrfs_remove_free_space_cache(root->free_ino_pinned);
+        __btrfs_remove_free_space_cache(root->free_ino_ctl);
        free_fs_root(root);
        return 0;
 }
 static void free_fs_root(struct btrfs_root *root)
 {
+        iput(root->cache_inode);
        WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
        if (root->anon_super.s_dev) {
                down_write(&root->anon_super.s_umount);
@@ -2417,6 +2399,8 @@ static void free_fs_root(struct btrfs_root *root)
        }
        free_extent_buffer(root->node);
        free_extent_buffer(root->commit_root);
+        kfree(root->free_ino_ctl);
+        kfree(root->free_ino_pinned);
        kfree(root->name);
        kfree(root);
 }
@@ -2520,6 +2504,15 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
+        btrfs_scrub_cancel(root);
+        /* wait for any defraggers to finish */
+        wait_event(fs_info->transaction_wait,
+                   (atomic_read(&fs_info->defrag_running) == 0));
+        /* clear out the rbtree of defraggable inodes */
+        btrfs_run_defrag_inodes(root->fs_info);
        btrfs_put_block_group_cache(fs_info);
        /*
@@ -2578,6 +2571,7 @@ int close_ctree(struct btrfs_root *root)
        del_fs_roots(fs_info);
        iput(fs_info->btree_inode);
+        kfree(fs_info->delayed_root);
        btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2589,6 +2583,7 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->endio_write_workers);
        btrfs_stop_workers(&fs_info->endio_freespace_worker);
        btrfs_stop_workers(&fs_info->submit_workers);
+        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2665,6 +2660,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
        if (current->flags & PF_MEMALLOC)
                return;
+        btrfs_balance_delayed_items(root);
+        num_dirty = root->fs_info->dirty_metadata_bytes;
+        if (num_dirty > thresh) {
+                balance_dirty_pages_ratelimited_nr(
+                                   root->fs_info->btree_inode->i_mapping, 1);
+        }
+        return;
+}
+void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+        /*
+         * It looks as though older kernels can get into trouble with
+         * this code: they end up stuck in balance_dirty_pages forever.
+         */
+        u64 num_dirty;
+        unsigned long thresh = 32 * 1024 * 1024;
+        if (current->flags & PF_MEMALLOC)
+                return;
        num_dirty = root->fs_info->dirty_metadata_bytes;
        if (num_dirty > thresh) {
@@ -2697,7 +2715,7 @@ int btree_lock_page_hook(struct page *page)
                goto out;
        len = page->private >> 2;
-        eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+        eb = find_extent_buffer(io_tree, bytenr, len);
        if (!eb)
                goto out;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 07b20dc2fd95..a0b610a67aae 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -55,35 +55,20 @@ int btrfs_commit_super(struct btrfs_root *root);
 int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
-                                        u64 root_objectid);
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
-                                      struct btrfs_key *location,
-                                      const char *name, int namelen);
 struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
                                               struct btrfs_key *location);
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                              struct btrfs_key *location);
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-int btrfs_insert_dev_radix(struct btrfs_root *root,
-                           struct block_device *bdev,
-                           u64 device_id,
-                           u64 block_start,
-                           u64 num_blocks);
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
-int wait_on_tree_block_writeback(struct btrfs_root *root,
-                                 struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
-int btrfs_open_device(struct btrfs_device *dev);
-int btrfs_verify_block_csum(struct btrfs_root *root,
-                            struct extent_buffer *buf);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
@@ -91,8 +76,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                        unsigned long bio_flags, u64 bio_offset,
                        extent_submit_bio_hook_t *submit_bio_start,
                        extent_submit_bio_hook_t *submit_bio_done);
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index b4ffad859adb..1b8dc33778f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -32,7 +32,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
        type = FILEID_BTRFS_WITHOUT_PARENT;
-        fid->objectid = inode->i_ino;
+        fid->objectid = btrfs_ino(inode);
        fid->root_objectid = BTRFS_I(inode)->root->objectid;
        fid->gen = inode->i_generation;
@@ -178,13 +178,13 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
        if (!path)
                return ERR_PTR(-ENOMEM);
-        if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+        if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
                key.objectid = root->root_key.objectid;
                key.type = BTRFS_ROOT_BACKREF_KEY;
                key.offset = (u64)-1;
                root = root->fs_info->tree_root;
        } else {
-                key.objectid = dir->i_ino;
+                key.objectid = btrfs_ino(dir);
                key.type = BTRFS_INODE_REF_KEY;
                key.offset = (u64)-1;
        }
@@ -244,6 +244,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
        struct btrfs_key key;
        int name_len;
        int ret;
+        u64 ino;
        if (!dir || !inode)
                return -EINVAL;
@@ -251,19 +252,21 @@ static int btrfs_get_name(struct dentry *parent, char *name,
        if (!S_ISDIR(dir->i_mode))
                return -EINVAL;
+        ino = btrfs_ino(inode);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->leave_spinning = 1;
-        if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+        if (ino == BTRFS_FIRST_FREE_OBJECTID) {
                key.objectid = BTRFS_I(inode)->root->root_key.objectid;
                key.type = BTRFS_ROOT_BACKREF_KEY;
                key.offset = (u64)-1;
                root = root->fs_info->tree_root;
        } else {
-                key.objectid = inode->i_ino;
+                key.objectid = ino;
-                key.offset = dir->i_ino;
+                key.offset = btrfs_ino(dir);
                key.type = BTRFS_INODE_REF_KEY;
        }
@@ -272,7 +275,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
                btrfs_free_path(path);
                return ret;
        } else if (ret > 0) {
-                if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+                if (ino == BTRFS_FIRST_FREE_OBJECTID) {
                        path->slots[0]--;
                } else {
                        btrfs_free_path(path);
@@ -281,11 +284,11 @@ static int btrfs_get_name(struct dentry *parent, char *name,
        }
        leaf = path->nodes[0];
-        if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+        if (ino == BTRFS_FIRST_FREE_OBJECTID) {
-               rref = btrfs_item_ptr(leaf, path->slots[0],
+                rref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_root_ref);
-               name_ptr = (unsigned long)(rref + 1);
+                name_ptr = (unsigned long)(rref + 1);
-               name_len = btrfs_root_ref_name_len(leaf, rref);
+                name_len = btrfs_root_ref_name_len(leaf, rref);
        } else {
                iref = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_inode_ref);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ee6bd55e16c..169bd62ce776 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,7 +94,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
        return (cache->flags & bits) == bits;
 }
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 {
        atomic_inc(&cache->count);
 }
@@ -105,6 +105,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
                WARN_ON(cache->reserved_pinned > 0);
+                kfree(cache->free_space_ctl);
                kfree(cache);
        }
 }
@@ -379,7 +380,7 @@ again:
                                break;
                        caching_ctl->progress = last;
-                        btrfs_release_path(extent_root, path);
+                        btrfs_release_path(path);
                        up_read(&fs_info->extent_commit_sem);
                        mutex_unlock(&caching_ctl->mutex);
                        if (btrfs_transaction_in_commit(fs_info))
@@ -754,8 +755,12 @@ again:
                        atomic_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);
-                        btrfs_release_path(root->fs_info->extent_root, path);
+                        btrfs_release_path(path);
+                        /*
+                         * Mutex was contended, block until it's released and try
+                         * again
+                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref(&head->node);
@@ -934,7 +939,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                        break;
                }
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);
@@ -947,7 +952,6 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        ret = btrfs_extend_item(trans, root, path, new_size);
-        BUG_ON(ret);
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1042,7 +1046,7 @@ again:
                        return 0;
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                key.type = BTRFS_EXTENT_REF_V0_KEY;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0) {
                        err = ret;
@@ -1080,7 +1084,7 @@ again:
                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
-                                btrfs_release_path(root, path);
+                                btrfs_release_path(path);
                                goto again;
                        }
                        err = 0;
@@ -1141,7 +1145,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
@@ -1167,7 +1171,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
 fail:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        return ret;
 }
@@ -1293,7 +1297,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                ret = -ENOENT;
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (ret == -ENOENT && parent) {
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
@@ -1322,7 +1326,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
        }
        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        return ret;
 }
@@ -1555,7 +1559,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
        size = btrfs_extent_inline_ref_size(type);
        ret = btrfs_extend_item(trans, root, path, size);
-        BUG_ON(ret);
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
@@ -1608,7 +1611,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
        if (ret != -ENOENT)
                return ret;
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        *ref_ret = NULL;
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
@@ -1684,7 +1687,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
                                              end - ptr - size);
                item_size -= size;
                ret = btrfs_truncate_item(trans, root, path, item_size, 1);
-                BUG_ON(ret);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
@@ -1862,7 +1864,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                __run_delayed_extent_op(extent_op, leaf, item);
        btrfs_mark_buffer_dirty(leaf);
-        btrfs_release_path(root->fs_info->extent_root, path);
+        btrfs_release_path(path);
        path->reada = 1;
        path->leave_spinning = 1;
@@ -2297,6 +2299,10 @@ again:
                                atomic_inc(&ref->refs);
                                spin_unlock(&delayed_refs->lock);
+                                /*
+                                 * Mutex was contended, block until it's
+                                 * released and try again
+                                 */
                                mutex_lock(&head->mutex);
                                mutex_unlock(&head->mutex);
@@ -2361,8 +2367,12 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
                atomic_inc(&head->node.refs);
                spin_unlock(&delayed_refs->lock);
-                btrfs_release_path(root->fs_info->extent_root, path);
+                btrfs_release_path(path);
+                /*
+                 * Mutex was contended, block until it's released and let
+                 * caller try again
+                 */
                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref(&head->node);
@@ -2510,126 +2520,6 @@ out:
        return ret;
 }
-#if 0
-int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                    struct extent_buffer *buf, u32 nr_extents)
-{
-        struct btrfs_key key;
-        struct btrfs_file_extent_item *fi;
-        u64 root_gen;
-        u32 nritems;
-        int i;
-        int level;
-        int ret = 0;
-        int shared = 0;
-        if (!root->ref_cows)
-                return 0;
-        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
-                shared = 0;
-                root_gen = root->root_key.offset;
-        } else {
-                shared = 1;
-                root_gen = trans->transid - 1;
-        }
-        level = btrfs_header_level(buf);
-        nritems = btrfs_header_nritems(buf);
-        if (level == 0) {
-                struct btrfs_leaf_ref *ref;
-                struct btrfs_extent_info *info;
-                ref = btrfs_alloc_leaf_ref(root, nr_extents);
-                if (!ref) {
-                        ret = -ENOMEM;
-                        goto out;
-                }
-                ref->root_gen = root_gen;
-                ref->bytenr = buf->start;
-                ref->owner = btrfs_header_owner(buf);
-                ref->generation = btrfs_header_generation(buf);
-                ref->nritems = nr_extents;
-                info = ref->extents;
-                for (i = 0; nr_extents > 0 && i < nritems; i++) {
-                        u64 disk_bytenr;
-                        btrfs_item_key_to_cpu(buf, &key, i);
-                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-                                continue;
-                        fi = btrfs_item_ptr(buf, i,
-                                            struct btrfs_file_extent_item);
-                        if (btrfs_file_extent_type(buf, fi) ==
-                            BTRFS_FILE_EXTENT_INLINE)
-                                continue;
-                        disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-                        if (disk_bytenr == 0)
-                                continue;
-                        info->bytenr = disk_bytenr;
-                        info->num_bytes =
-                                btrfs_file_extent_disk_num_bytes(buf, fi);
-                        info->objectid = key.objectid;
-                        info->offset = key.offset;
-                        info++;
-                }
-                ret = btrfs_add_leaf_ref(root, ref, shared);
-                if (ret == -EEXIST && shared) {
-                        struct btrfs_leaf_ref *old;
-                        old = btrfs_lookup_leaf_ref(root, ref->bytenr);
-                        BUG_ON(!old);
-                        btrfs_remove_leaf_ref(root, old);
-                        btrfs_free_leaf_ref(root, old);
-                        ret = btrfs_add_leaf_ref(root, ref, shared);
-                }
-                WARN_ON(ret);
-                btrfs_free_leaf_ref(root, ref);
-        }
-out:
-        return ret;
-}
-/* when a block goes through cow, we update the reference counts of
- * everything that block points to.  The internal pointers of the block
- * can be in just about any order, and it is likely to have clusters of
- * things that are close together and clusters of things that are not.
- *
- * To help reduce the seeks that come with updating all of these reference
- * counts, sort them by byte number before actual updates are done.
- *
- * struct refsort is used to match byte number to slot in the btree block.
- * we sort based on the byte number and then use the slot to actually
- * find the item.
- *
- * struct refsort is smaller than strcut btrfs_item and smaller than
- * struct btrfs_key_ptr.  Since we're currently limited to the page size
- * for a btree block, there's no way for a kmalloc of refsorts for a
- * single node to be bigger than a page.
- */
-struct refsort {
-        u64 bytenr;
-        u32 slot;
-};
-/*
- * for passing into sort()
- */
-static int refsort_cmp(const void *a_void, const void *b_void)
-{
-        const struct refsort *a = a_void;
-        const struct refsort *b = b_void;
-        if (a->bytenr < b->bytenr)
-                return -1;
-        if (a->bytenr > b->bytenr)
-                return 1;
-        return 0;
-}
-#endif
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
@@ -2732,7 +2622,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
        btrfs_mark_buffer_dirty(leaf);
-        btrfs_release_path(extent_root, path);
+        btrfs_release_path(path);
 fail:
        if (ret)
                return ret;
@@ -2785,7 +2675,7 @@ again:
        inode = lookup_free_space_inode(root, block_group, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
                ret = PTR_ERR(inode);
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                goto out;
        }
@@ -2854,7 +2744,7 @@ again:
 out_put:
        iput(inode);
 out_free:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
 out:
        spin_lock(&block_group->lock);
        block_group->disk_cache_state = dcs;
@@ -3144,7 +3034,8 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
-        if (root == root->fs_info->tree_root) {
+        if (root == root->fs_info->tree_root ||
+            BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
                alloc_chunk = 0;
                committed = 1;
        }
@@ -3211,18 +3102,6 @@ commit_trans:
                        goto again;
                }
-#if 0 /* I hope we never need this code again, just in case */
-                printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
-                       "%llu bytes_reserved, " "%llu bytes_pinned, "
-                       "%llu bytes_readonly, %llu may use %llu total\n",
-                       (unsigned long long)bytes,
-                       (unsigned long long)data_sinfo->bytes_used,
-                       (unsigned long long)data_sinfo->bytes_reserved,
-                       (unsigned long long)data_sinfo->bytes_pinned,
-                       (unsigned long long)data_sinfo->bytes_readonly,
-                       (unsigned long long)data_sinfo->bytes_may_use,
-                       (unsigned long long)data_sinfo->total_bytes);
-#endif
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
@@ -3425,6 +3304,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
        if (reserved == 0)
                return 0;
+        /* no delalloc bytes outstanding, so there is nothing to reclaim */
+        if (root->fs_info->delalloc_bytes == 0)
+                return 0;
        max_reclaim = min(reserved, to_reclaim);
        while (loops < 1024) {
@@ -3651,8 +3534,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
        spin_unlock(&block_rsv->lock);
 }
-void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
-                             struct btrfs_block_rsv *dest, u64 num_bytes)
+                                    struct btrfs_block_rsv *dest, u64 num_bytes)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3855,23 +3738,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
        u64 meta_used;
        u64 data_used;
        int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
-#if 0
-        /*
-         * per tree used space accounting can be inaccuracy, so we
-         * can't rely on it.
-         */
-        spin_lock(&fs_info->extent_root->accounting_lock);
-        num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
-        spin_unlock(&fs_info->extent_root->accounting_lock);
-        spin_lock(&fs_info->csum_root->accounting_lock);
-        num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
-        spin_unlock(&fs_info->csum_root->accounting_lock);
-        spin_lock(&fs_info->tree_root->accounting_lock);
-        num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
-        spin_unlock(&fs_info->tree_root->accounting_lock);
-#endif
        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
        spin_lock(&sinfo->lock);
        data_used = sinfo->bytes_used;
@@ -3924,10 +3791,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = 1;
        }
-#if 0
-        printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
-                block_rsv->size, block_rsv->reserved);
-#endif
        spin_unlock(&sinfo->lock);
        spin_unlock(&block_rsv->lock);
 }
@@ -3973,12 +3837,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 }
-static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
-{
-        return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
-                3 * num_items;
-}
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 int num_items)
@@ -3989,7 +3847,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
        if (num_items == 0 || root->fs_info->chunk_root == root)
                return 0;
-        num_bytes = calc_trans_metadata_size(root, num_items);
+        num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
        ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
                                  num_bytes);
        if (!ret) {
@@ -4028,14 +3886,14 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         * If all of the metadata space is used, we can commit
         * transaction and use space it freed.
         */
-        u64 num_bytes = calc_trans_metadata_size(root, 4);
+        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 void btrfs_orphan_release_metadata(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        u64 num_bytes = calc_trans_metadata_size(root, 4);
+        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
@@ -4049,7 +3907,7 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
         * two for root back/forward refs, two for directory entries
         * and one for root of the snapshot.
         */
-        u64 num_bytes = calc_trans_metadata_size(root, 5);
+        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
        dst_rsv->space_info = src_rsv->space_info;
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
@@ -4078,7 +3936,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        if (nr_extents > reserved_extents) {
                nr_extents -= reserved_extents;
-                to_reserve = calc_trans_metadata_size(root, nr_extents);
+                to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
        } else {
                nr_extents = 0;
                to_reserve = 0;
@@ -4132,7 +3990,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        to_free = calc_csum_metadata_size(inode, num_bytes);
        if (nr_extents > 0)
-                to_free += calc_trans_metadata_size(root, nr_extents);
+                to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
@@ -4541,7 +4399,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                                    NULL, refs_to_drop,
                                                    is_data);
                        BUG_ON(ret);
-                        btrfs_release_path(extent_root, path);
+                        btrfs_release_path(path);
                        path->leave_spinning = 1;
                        key.objectid = bytenr;
@@ -4580,7 +4438,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                             owner_objectid, 0);
                BUG_ON(ret < 0);
-                btrfs_release_path(extent_root, path);
+                btrfs_release_path(path);
                path->leave_spinning = 1;
                key.objectid = bytenr;
@@ -4650,7 +4508,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                BUG_ON(ret);
-                btrfs_release_path(extent_root, path);
+                btrfs_release_path(path);
                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
@@ -4893,7 +4751,7 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
                return 0;
        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
-                   (cache->free_space >= num_bytes));
+                   (cache->free_space_ctl->free_space >= num_bytes));
        put_caching_control(caching_ctl);
        return 0;
@@ -6480,7 +6338,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                trans->block_rsv = block_rsv;
                }
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        BUG_ON(err);
        ret = btrfs_del_root(trans, tree_root, &root->root_key);
@@ -6584,1514 +6442,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        return ret;
 }
-#if 0
-static unsigned long calc_ra(unsigned long start, unsigned long last,
-                             unsigned long nr)
-{
-        return min(last, start + nr - 1);
-}
-static noinline int relocate_inode_pages(struct inode *inode, u64 start,
-                                         u64 len)
-{
-        u64 page_start;
-        u64 page_end;
-        unsigned long first_index;
-        unsigned long last_index;
-        unsigned long i;
-        struct page *page;
-        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-        struct file_ra_state *ra;
-        struct btrfs_ordered_extent *ordered;
-        unsigned int total_read = 0;
-        unsigned int total_dirty = 0;
-        int ret = 0;
-        ra = kzalloc(sizeof(*ra), GFP_NOFS);
-        if (!ra)
-                return -ENOMEM;
-        mutex_lock(&inode->i_mutex);
-        first_index = start >> PAGE_CACHE_SHIFT;
-        last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
-        /* make sure the dirty trick played by the caller work */
-        ret = invalidate_inode_pages2_range(inode->i_mapping,
-                                            first_index, last_index);
-        if (ret)
-                goto out_unlock;
-        file_ra_state_init(ra, inode->i_mapping);
-        for (i = first_index ; i <= last_index; i++) {
-                if (total_read % ra->ra_pages == 0) {
-                        btrfs_force_ra(inode->i_mapping, ra, NULL, i,
-                                       calc_ra(i, last_index, ra->ra_pages));
-                }
-                total_read++;
-again:
-                if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
-                        BUG_ON(1);
-                page = grab_cache_page(inode->i_mapping, i);
-                if (!page) {
-                        ret = -ENOMEM;
-                        goto out_unlock;
-                }
-                if (!PageUptodate(page)) {
-                        btrfs_readpage(NULL, page);
-                        lock_page(page);
-                        if (!PageUptodate(page)) {
-                                unlock_page(page);
-                                page_cache_release(page);
-                                ret = -EIO;
-                                goto out_unlock;
-                        }
-                }
-                wait_on_page_writeback(page);
-                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-                page_end = page_start + PAGE_CACHE_SIZE - 1;
-                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                ordered = btrfs_lookup_ordered_extent(inode, page_start);
-                if (ordered) {
-                        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                        unlock_page(page);
-                        page_cache_release(page);
-                        btrfs_start_ordered_extent(inode, ordered, 1);
-                        btrfs_put_ordered_extent(ordered);
-                        goto again;
-                }
-                set_page_extent_mapped(page);
-                if (i == first_index)
-                        set_extent_bits(io_tree, page_start, page_end,
-                                        EXTENT_BOUNDARY, GFP_NOFS);
-                btrfs_set_extent_delalloc(inode, page_start, page_end);
-                set_page_dirty(page);
-                total_dirty++;
-                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                unlock_page(page);
-                page_cache_release(page);
-        }
-out_unlock:
-        kfree(ra);
-        mutex_unlock(&inode->i_mutex);
-        balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
-        return ret;
-}
-static noinline int relocate_data_extent(struct inode *reloc_inode,
-                                         struct btrfs_key *extent_key,
-                                         u64 offset)
-{
-        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
-        struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
-        struct extent_map *em;
-        u64 start = extent_key->objectid - offset;
-        u64 end = start + extent_key->offset - 1;
-        em = alloc_extent_map(GFP_NOFS);
-        BUG_ON(!em);
-        em->start = start;
-        em->len = extent_key->offset;
-        em->block_len = extent_key->offset;
-        em->block_start = extent_key->objectid;
-        em->bdev = root->fs_info->fs_devices->latest_bdev;
-        set_bit(EXTENT_FLAG_PINNED, &em->flags);
-        /* setup extent map to cheat btrfs_readpage */
-        lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
-        while (1) {
-                int ret;
-                write_lock(&em_tree->lock);
-                ret = add_extent_mapping(em_tree, em);
-                write_unlock(&em_tree->lock);
-                if (ret != -EEXIST) {
-                        free_extent_map(em);
-                        break;
-                }
-                btrfs_drop_extent_cache(reloc_inode, start, end, 0);
-        }
-        unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
-        return relocate_inode_pages(reloc_inode, start, extent_key->offset);
-}
-struct btrfs_ref_path {
-        u64 extent_start;
-        u64 nodes[BTRFS_MAX_LEVEL];
-        u64 root_objectid;
-        u64 root_generation;
-        u64 owner_objectid;
-        u32 num_refs;
-        int lowest_level;
-        int current_level;
-        int shared_level;
-        struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
-        u64 new_nodes[BTRFS_MAX_LEVEL];
-};
-struct disk_extent {
-        u64 ram_bytes;
-        u64 disk_bytenr;
-        u64 disk_num_bytes;
-        u64 offset;
-        u64 num_bytes;
-        u8 compression;
-        u8 encryption;
-        u16 other_encoding;
-};
-static int is_cowonly_root(u64 root_objectid)
-{
-        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
-            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
-            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
-            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
-            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
-            root_objectid == BTRFS_CSUM_TREE_OBJECTID)
-                return 1;
-        return 0;
-}
-static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *extent_root,
-                                    struct btrfs_ref_path *ref_path,
-                                    int first_time)
-{
-        struct extent_buffer *leaf;
-        struct btrfs_path *path;
-        struct btrfs_extent_ref *ref;
-        struct btrfs_key key;
-        struct btrfs_key found_key;
-        u64 bytenr;
-        u32 nritems;
-        int level;
-        int ret = 1;
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
-        if (first_time) {
-                ref_path->lowest_level = -1;
-                ref_path->current_level = -1;
-                ref_path->shared_level = -1;
-                goto walk_up;
-        }
-walk_down:
-        level = ref_path->current_level - 1;
-        while (level >= -1) {
-                u64 parent;
-                if (level < ref_path->lowest_level)
-                        break;
-                if (level >= 0)
-                        bytenr = ref_path->nodes[level];
-                else
-                        bytenr = ref_path->extent_start;
-                BUG_ON(bytenr == 0);
-                parent = ref_path->nodes[level + 1];
-                ref_path->nodes[level + 1] = 0;
-                ref_path->current_level = level;
-                BUG_ON(parent == 0);
-                key.objectid = bytenr;
-                key.offset = parent + 1;
-                key.type = BTRFS_EXTENT_REF_KEY;
-                ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
-                if (ret < 0)
-                        goto out;
-                BUG_ON(ret == 0);
-                leaf = path->nodes[0];
-                nritems = btrfs_header_nritems(leaf);
-                if (path->slots[0] >= nritems) {
-                        ret = btrfs_next_leaf(extent_root, path);
-                        if (ret < 0)
-                                goto out;
-                        if (ret > 0)
-                                goto next;
-                        leaf = path->nodes[0];
-                }
-                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                if (found_key.objectid == bytenr &&
-                    found_key.type == BTRFS_EXTENT_REF_KEY) {
-                        if (level < ref_path->shared_level)
-                                ref_path->shared_level = level;
-                        goto found;
-                }
-next:
-                level--;
-                btrfs_release_path(extent_root, path);
-                cond_resched();
-        }
-        /* reached lowest level */
-        ret = 1;
-        goto out;
-walk_up:
-        level = ref_path->current_level;
-        while (level < BTRFS_MAX_LEVEL - 1) {
-                u64 ref_objectid;
-                if (level >= 0)
-                        bytenr = ref_path->nodes[level];
-                else
-                        bytenr = ref_path->extent_start;
-                BUG_ON(bytenr == 0);
-                key.objectid = bytenr;
-                key.offset = 0;
-                key.type = BTRFS_EXTENT_REF_KEY;
-                ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
-                if (ret < 0)
-                        goto out;
-                leaf = path->nodes[0];
-                nritems = btrfs_header_nritems(leaf);
-                if (path->slots[0] >= nritems) {
-                        ret = btrfs_next_leaf(extent_root, path);
-                        if (ret < 0)
-                                goto out;
-                        if (ret > 0) {
-                                /* the extent was freed by someone */
-                                if (ref_path->lowest_level == level)
-                                        goto out;
-                                btrfs_release_path(extent_root, path);
-                                goto walk_down;
-                        }
-                        leaf = path->nodes[0];
-                }
-                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                if (found_key.objectid != bytenr ||
-                                found_key.type != BTRFS_EXTENT_REF_KEY) {
-                        /* the extent was freed by someone */
-                        if (ref_path->lowest_level == level) {
-                                ret = 1;
-                                goto out;
-                        }
-                        btrfs_release_path(extent_root, path);
-                        goto walk_down;
-                }
-found:
-                ref = btrfs_item_ptr(leaf, path->slots[0],
-                                struct btrfs_extent_ref);
-                ref_objectid = btrfs_ref_objectid(leaf, ref);
-                if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-                        if (first_time) {
-                                level = (int)ref_objectid;
-                                BUG_ON(level >= BTRFS_MAX_LEVEL);
-                                ref_path->lowest_level = level;
-                                ref_path->current_level = level;
-                                ref_path->nodes[level] = bytenr;
-                        } else {
-                                WARN_ON(ref_objectid != level);
-                        }
-                } else {
-                        WARN_ON(level != -1);
-                }
-                first_time = 0;
-                if (ref_path->lowest_level == level) {
-                        ref_path->owner_objectid = ref_objectid;
-                        ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
-                }
-                /*
-                 * the block is tree root or the block isn't in reference
-                 * counted tree.
-                 */
-                if (found_key.objectid == found_key.offset ||
-                    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
-                        ref_path->root_objectid = btrfs_ref_root(leaf, ref);
-                        ref_path->root_generation =
-                                btrfs_ref_generation(leaf, ref);
-                        if (level < 0) {
-                                /* special reference from the tree log */
-                                ref_path->nodes[0] = found_key.offset;
-                                ref_path->current_level = 0;
-                        }
-                        ret = 0;
-                        goto out;
-                }
-                level++;
-                BUG_ON(ref_path->nodes[level] != 0);
-                ref_path->nodes[level] = found_key.offset;
-                ref_path->current_level = level;
-                /*
-                 * the reference was created in the running transaction,
-                 * no need to continue walking up.
-                 */
-                if (btrfs_ref_generation(leaf, ref) == trans->transid) {
-                        ref_path->root_objectid = btrfs_ref_root(leaf, ref);
-                        ref_path->root_generation =
-                                btrfs_ref_generation(leaf, ref);
-                        ret = 0;
-                        goto out;
-                }
-                btrfs_release_path(extent_root, path);
-                cond_resched();
-        }
-        /* reached max tree level, but no tree root found. */
-        BUG();
-out:
-        btrfs_free_path(path);
-        return ret;
-}
-static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *extent_root,
-                                struct btrfs_ref_path *ref_path,
-                                u64 extent_start)
-{
-        memset(ref_path, 0, sizeof(*ref_path));
-        ref_path->extent_start = extent_start;
-        return __next_ref_path(trans, extent_root, ref_path, 1);
-}
-static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *extent_root,
-                               struct btrfs_ref_path *ref_path)
-{
-        return __next_ref_path(trans, extent_root, ref_path, 0);
-}
-static noinline int get_new_locations(struct inode *reloc_inode,
-                                      struct btrfs_key *extent_key,
-                                      u64 offset, int no_fragment,
-                                      struct disk_extent **extents,
-                                      int *nr_extents)
-{
-        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
-        struct btrfs_path *path;
-        struct btrfs_file_extent_item *fi;
-        struct extent_buffer *leaf;
-        struct disk_extent *exts = *extents;
-        struct btrfs_key found_key;
-        u64 cur_pos;
-        u64 last_byte;
-        u32 nritems;
-        int nr = 0;
-        int max = *nr_extents;
-        int ret;
-        WARN_ON(!no_fragment && *extents);
-        if (!exts) {
-                max = 1;
-                exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
-                if (!exts)
-                        return -ENOMEM;
-        }
-        path = btrfs_alloc_path();
-        if (!path) {
-                if (exts != *extents)
-                        kfree(exts);
-                return -ENOMEM;
-        }
-        cur_pos = extent_key->objectid - offset;
-        last_byte = extent_key->objectid + extent_key->offset;
-        ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
-                                       cur_pos, 0);
-        if (ret < 0)
-                goto out;
-        if (ret > 0) {
-                ret = -ENOENT;
-                goto out;
-        }
-        while (1) {
-                leaf = path->nodes[0];
-                nritems = btrfs_header_nritems(leaf);
-                if (path->slots[0] >= nritems) {
-                        ret = btrfs_next_leaf(root, path);
-                        if (ret < 0)
-                                goto out;
-                        if (ret > 0)
-                                break;
-                        leaf = path->nodes[0];
-                }
-                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                if (found_key.offset != cur_pos ||
-                    found_key.type != BTRFS_EXTENT_DATA_KEY ||
-                    found_key.objectid != reloc_inode->i_ino)
-                        break;
-                fi = btrfs_item_ptr(leaf, path->slots[0],
-                                    struct btrfs_file_extent_item);
-                if (btrfs_file_extent_type(leaf, fi) !=
-                    BTRFS_FILE_EXTENT_REG ||
-                    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
-                        break;
-                if (nr == max) {
-                        struct disk_extent *old = exts;
-                        max *= 2;
-                        exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
-                        if (!exts) {
-                                ret = -ENOMEM;
-                                goto out;
-                        }
-                        memcpy(exts, old, sizeof(*exts) * nr);
-                        if (old != *extents)
-                                kfree(old);
-                }
-                exts[nr].disk_bytenr =
-                        btrfs_file_extent_disk_bytenr(leaf, fi);
-                exts[nr].disk_num_bytes =
-                        btrfs_file_extent_disk_num_bytes(leaf, fi);
-                exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
-                exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
-                exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
-                exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
-                exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
-                exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
-                                                                           fi);
-                BUG_ON(exts[nr].offset > 0);
-                BUG_ON(exts[nr].compression || exts[nr].encryption);
-                BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
-                cur_pos += exts[nr].num_bytes;
-                nr++;
-                if (cur_pos + offset >= last_byte)
-                        break;
-                if (no_fragment) {
-                        ret = 1;
-                        goto out;
-                }
-                path->slots[0]++;
-        }
-        BUG_ON(cur_pos + offset > last_byte);
-        if (cur_pos + offset < last_byte) {
-                ret = -ENOENT;
-                goto out;
-        }
-        ret = 0;
-out:
-        btrfs_free_path(path);
-        if (ret) {
-                if (exts != *extents)
-                        kfree(exts);
-        } else {
-                *extents = exts;
-                *nr_extents = nr;
-        }
-        return ret;
-}
-static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
-                                        struct btrfs_root *root,
-                                        struct btrfs_path *path,
-                                        struct btrfs_key *extent_key,
-                                        struct btrfs_key *leaf_key,
-                                        struct btrfs_ref_path *ref_path,
-                                        struct disk_extent *new_extents,
-                                        int nr_extents)
-{
-        struct extent_buffer *leaf;
-        struct btrfs_file_extent_item *fi;
-        struct inode *inode = NULL;
-        struct btrfs_key key;
-        u64 lock_start = 0;
-        u64 lock_end = 0;
-        u64 num_bytes;
-        u64 ext_offset;
-        u64 search_end = (u64)-1;
-        u32 nritems;
-        int nr_scaned = 0;
-        int extent_locked = 0;
-        int extent_type;
-        int ret;
-        memcpy(&key, leaf_key, sizeof(key));
-        if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
-                if (key.objectid < ref_path->owner_objectid ||
-                    (key.objectid == ref_path->owner_objectid &&
-                     key.type < BTRFS_EXTENT_DATA_KEY)) {
-                        key.objectid = ref_path->owner_objectid;
-                        key.type = BTRFS_EXTENT_DATA_KEY;
-                        key.offset = 0;
-                }
-        }
-        while (1) {
-                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-                if (ret < 0)
-                        goto out;
-                leaf = path->nodes[0];
-                nritems = btrfs_header_nritems(leaf);
-next:
-                if (extent_locked && ret > 0) {
-                        /*
-                         * the file extent item was modified by someone
-                         * before the extent got locked.
-                         */
-                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
-                                      lock_end, GFP_NOFS);
-                        extent_locked = 0;
-                }
-                if (path->slots[0] >= nritems) {
-                        if (++nr_scaned > 2)
-                                break;
-                        BUG_ON(extent_locked);
-                        ret = btrfs_next_leaf(root, path);
-                        if (ret < 0)
-                                goto out;
-                        if (ret > 0)
-                                break;
-                        leaf = path->nodes[0];
-                        nritems = btrfs_header_nritems(leaf);
-                }
-                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-                if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
-                        if ((key.objectid > ref_path->owner_objectid) ||
-                            (key.objectid == ref_path->owner_objectid &&
-                             key.type > BTRFS_EXTENT_DATA_KEY) ||
-                            key.offset >= search_end)
-                                break;
-                }
-                if (inode && key.objectid != inode->i_ino) {
-                        BUG_ON(extent_locked);
-                        btrfs_release_path(root, path);
-                        mutex_unlock(&inode->i_mutex);
-                        iput(inode);
-                        inode = NULL;
-                        continue;
-                }
-                if (key.type != BTRFS_EXTENT_DATA_KEY) {
-                        path->slots[0]++;
-                        ret = 1;
-                        goto next;
-                }
-                fi = btrfs_item_ptr(leaf, path->slots[0],
-                                    struct btrfs_file_extent_item);
-                extent_type = btrfs_file_extent_type(leaf, fi);
-                if ((extent_type != BTRFS_FILE_EXTENT_REG &&
-                     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
-                    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
-                     extent_key->objectid)) {
-                        path->slots[0]++;
-                        ret = 1;
-                        goto next;
-                }
-                num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
-                ext_offset = btrfs_file_extent_offset(leaf, fi);
-                if (search_end == (u64)-1) {
-                        search_end = key.offset - ext_offset +
-                                btrfs_file_extent_ram_bytes(leaf, fi);
-                }
-                if (!extent_locked) {
-                        lock_start = key.offset;
-                        lock_end = lock_start + num_bytes - 1;
-                } else {
-                        if (lock_start > key.offset ||
-                            lock_end + 1 < key.offset + num_bytes) {
-                                unlock_extent(&BTRFS_I(inode)->io_tree,
-                                              lock_start, lock_end, GFP_NOFS);
-                                extent_locked = 0;
-                        }
-                }
-                if (!inode) {
-                        btrfs_release_path(root, path);
-                        inode = btrfs_iget_locked(root->fs_info->sb,
-                                                  key.objectid, root);
-                        if (inode->i_state & I_NEW) {
-                                BTRFS_I(inode)->root = root;
-                                BTRFS_I(inode)->location.objectid =
-                                        key.objectid;
-                                BTRFS_I(inode)->location.type =
-                                        BTRFS_INODE_ITEM_KEY;
-                                BTRFS_I(inode)->location.offset = 0;
-                                btrfs_read_locked_inode(inode);
-                                unlock_new_inode(inode);
-                        }
-                        /*
-                         * some code call btrfs_commit_transaction while
-                         * holding the i_mutex, so we can't use mutex_lock
-                         * here.
-                         */
-                        if (is_bad_inode(inode) ||
-                            !mutex_trylock(&inode->i_mutex)) {
-                                iput(inode);
-                                inode = NULL;
-                                key.offset = (u64)-1;
-                                goto skip;
-                        }
-                }
-                if (!extent_locked) {
-                        struct btrfs_ordered_extent *ordered;
-                        btrfs_release_path(root, path);
-                        lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
-                                    lock_end, GFP_NOFS);
-                        ordered = btrfs_lookup_first_ordered_extent(inode,
-                                                                    lock_end);
-                        if (ordered &&
-                            ordered->file_offset <= lock_end &&
-                            ordered->file_offset + ordered->len > lock_start) {
-                                unlock_extent(&BTRFS_I(inode)->io_tree,
-                                              lock_start, lock_end, GFP_NOFS);
-                                btrfs_start_ordered_extent(inode, ordered, 1);
-                                btrfs_put_ordered_extent(ordered);
-                                key.offset += num_bytes;
-                                goto skip;
-                        }
-                        if (ordered)
-                                btrfs_put_ordered_extent(ordered);
-                        extent_locked = 1;
-                        continue;
-                }
-                if (nr_extents == 1) {
-                        /* update extent pointer in place */
-                        btrfs_set_file_extent_disk_bytenr(leaf, fi,
-                                                new_extents[0].disk_bytenr);
-                        btrfs_set_file_extent_disk_num_bytes(leaf, fi,
-                                                new_extents[0].disk_num_bytes);
-                        btrfs_mark_buffer_dirty(leaf);
-                        btrfs_drop_extent_cache(inode, key.offset,
-                                                key.offset + num_bytes - 1, 0);
-                        ret = btrfs_inc_extent_ref(trans, root,
-                                                new_extents[0].disk_bytenr,
-                                                new_extents[0].disk_num_bytes,
-                                                leaf->start,
-                                                root->root_key.objectid,
-                                                trans->transid,
-                                                key.objectid);
-                        BUG_ON(ret);
-                        ret = btrfs_free_extent(trans, root,
-                                                extent_key->objectid,
-                                                extent_key->offset,
-                                                leaf->start,
-                                                btrfs_header_owner(leaf),
-                                                btrfs_header_generation(leaf),
-                                                key.objectid, 0);
-                        BUG_ON(ret);
-                        btrfs_release_path(root, path);
-                        key.offset += num_bytes;
-                } else {
-                        BUG_ON(1);
-#if 0
-                        u64 alloc_hint;
-                        u64 extent_len;
-                        int i;
-                        /*
-                         * drop old extent pointer at first, then insert the
-                         * new pointers one bye one
-                         */
-                        btrfs_release_path(root, path);
-                        ret = btrfs_drop_extents(trans, root, inode, key.offset,
-                                                 key.offset + num_bytes,
-                                                 key.offset, &alloc_hint);
-                        BUG_ON(ret);
-                        for (i = 0; i < nr_extents; i++) {
-                                if (ext_offset >= new_extents[i].num_bytes) {
-                                        ext_offset -= new_extents[i].num_bytes;
-                                        continue;
-                                }
-                                extent_len = min(new_extents[i].num_bytes -
-                                                 ext_offset, num_bytes);
-                                ret = btrfs_insert_empty_item(trans, root,
-                                                              path, &key,
-                                                              sizeof(*fi));
-                                BUG_ON(ret);
-                                leaf = path->nodes[0];
-                                fi = btrfs_item_ptr(leaf, path->slots[0],
-                                                struct btrfs_file_extent_item);
-                                btrfs_set_file_extent_generation(leaf, fi,
-                                                        trans->transid);
-                                btrfs_set_file_extent_type(leaf, fi,
-                                                        BTRFS_FILE_EXTENT_REG);
-                                btrfs_set_file_extent_disk_bytenr(leaf, fi,
-                                                new_extents[i].disk_bytenr);
-                                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
-                                                new_extents[i].disk_num_bytes);
-                                btrfs_set_file_extent_ram_bytes(leaf, fi,
-                                                new_extents[i].ram_bytes);
-                                btrfs_set_file_extent_compression(leaf, fi,
-                                                new_extents[i].compression);
-                                btrfs_set_file_extent_encryption(leaf, fi,
-                                                new_extents[i].encryption);
-                                btrfs_set_file_extent_other_encoding(leaf, fi,
-                                                new_extents[i].other_encoding);
-                                btrfs_set_file_extent_num_bytes(leaf, fi,
-                                                        extent_len);
-                                ext_offset += new_extents[i].offset;
-                                btrfs_set_file_extent_offset(leaf, fi,
-                                                        ext_offset);
-                                btrfs_mark_buffer_dirty(leaf);
-                                btrfs_drop_extent_cache(inode, key.offset,
-                                                key.offset + extent_len - 1, 0);
-                                ret = btrfs_inc_extent_ref(trans, root,
-                                                new_extents[i].disk_bytenr,
-                                                new_extents[i].disk_num_bytes,
-                                                leaf->start,
-                                                root->root_key.objectid,
-                                                trans->transid, key.objectid);
-                                BUG_ON(ret);
-                                btrfs_release_path(root, path);
-                                inode_add_bytes(inode, extent_len);
-                                ext_offset = 0;
-                                num_bytes -= extent_len;
-                                key.offset += extent_len;
-                                if (num_bytes == 0)
-                                        break;
-                        }
-                        BUG_ON(i >= nr_extents);
-#endif
-                }
-                if (extent_locked) {
-                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
-                                      lock_end, GFP_NOFS);
-                        extent_locked = 0;
-                }
-skip:
-                if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
-                    key.offset >= search_end)
-                        break;
-                cond_resched();
-        }
-        ret = 0;
-out:
-        btrfs_release_path(root, path);
-        if (inode) {
-                mutex_unlock(&inode->i_mutex);
-                if (extent_locked) {
-                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
-                                      lock_end, GFP_NOFS);
-                }
-                iput(inode);
-        }
-        return ret;
-}
-int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
-                               struct extent_buffer *buf, u64 orig_start)
-{
-        int level;
-        int ret;
-        BUG_ON(btrfs_header_generation(buf) != trans->transid);
-        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
-        level = btrfs_header_level(buf);
-        if (level == 0) {
-                struct btrfs_leaf_ref *ref;
-                struct btrfs_leaf_ref *orig_ref;
-                orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
-                if (!orig_ref)
-                        return -ENOENT;
-                ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
-                if (!ref) {
-                        btrfs_free_leaf_ref(root, orig_ref);
-                        return -ENOMEM;
-                }
-                ref->nritems = orig_ref->nritems;
-                memcpy(ref->extents, orig_ref->extents,
-                        sizeof(ref->extents[0]) * ref->nritems);
-                btrfs_free_leaf_ref(root, orig_ref);
-                ref->root_gen = trans->transid;
-                ref->bytenr = buf->start;
-                ref->owner = btrfs_header_owner(buf);
-                ref->generation = btrfs_header_generation(buf);
-                ret = btrfs_add_leaf_ref(root, ref, 0);
-                WARN_ON(ret);
-                btrfs_free_leaf_ref(root, ref);
-        }
-        return 0;
-}
-static noinline int invalidate_extent_cache(struct btrfs_root *root,
-                                        struct extent_buffer *leaf,
-                                        struct btrfs_block_group_cache *group,
-                                        struct btrfs_root *target_root)
-{
-        struct btrfs_key key;
-        struct inode *inode = NULL;
-        struct btrfs_file_extent_item *fi;
-        struct extent_state *cached_state = NULL;
-        u64 num_bytes;
-        u64 skip_objectid = 0;
-        u32 nritems;
-        u32 i;
-        nritems = btrfs_header_nritems(leaf);
-        for (i = 0; i < nritems; i++) {
-                btrfs_item_key_to_cpu(leaf, &key, i);
-                if (key.objectid == skip_objectid ||
-                    key.type != BTRFS_EXTENT_DATA_KEY)
-                        continue;
-                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-                if (btrfs_file_extent_type(leaf, fi) ==
-                    BTRFS_FILE_EXTENT_INLINE)
-                        continue;
-                if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
-                        continue;
-                if (!inode || inode->i_ino != key.objectid) {
-                        iput(inode);
-                        inode = btrfs_ilookup(target_root->fs_info->sb,
-                                              key.objectid, target_root, 1);
-                }
-                if (!inode) {
-                        skip_objectid = key.objectid;
-                        continue;
-                }
-                num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
-                lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
-                                 key.offset + num_bytes - 1, 0, &cached_state,
-                                 GFP_NOFS);
-                btrfs_drop_extent_cache(inode, key.offset,
-                                        key.offset + num_bytes - 1, 1);
-                unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
-                                     key.offset + num_bytes - 1, &cached_state,
-                                     GFP_NOFS);
-                cond_resched();
-        }
-        iput(inode);
-        return 0;
-}
-static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
-                                        struct btrfs_root *root,
-                                        struct extent_buffer *leaf,
-                                        struct btrfs_block_group_cache *group,
-                                        struct inode *reloc_inode)
-{
-        struct btrfs_key key;
-        struct btrfs_key extent_key;
-        struct btrfs_file_extent_item *fi;
-        struct btrfs_leaf_ref *ref;
-        struct disk_extent *new_extent;
-        u64 bytenr;
-        u64 num_bytes;
-        u32 nritems;
-        u32 i;
-        int ext_index;
-        int nr_extent;
-        int ret;
-        new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
-        if (!new_extent)
-                return -ENOMEM;
-        ref = btrfs_lookup_leaf_ref(root, leaf->start);
-        BUG_ON(!ref);
-        ext_index = -1;
-        nritems = btrfs_header_nritems(leaf);
-        for (i = 0; i < nritems; i++) {
-                btrfs_item_key_to_cpu(leaf, &key, i);
-                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-                        continue;
-                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-                if (btrfs_file_extent_type(leaf, fi) ==
-                    BTRFS_FILE_EXTENT_INLINE)
-                        continue;
-                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
-                if (bytenr == 0)
-                        continue;
-                ext_index++;
-                if (bytenr >= group->key.objectid + group->key.offset ||
-                    bytenr + num_bytes <= group->key.objectid)
-                        continue;
-                extent_key.objectid = bytenr;
-                extent_key.offset = num_bytes;
-                extent_key.type = BTRFS_EXTENT_ITEM_KEY;
-                nr_extent = 1;
-                ret = get_new_locations(reloc_inode, &extent_key,
-                                        group->key.objectid, 1,
-                                        &new_extent, &nr_extent);
-                if (ret > 0)
-                        continue;
-                BUG_ON(ret < 0);
-                BUG_ON(ref->extents[ext_index].bytenr != bytenr);
-                BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
-                ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
-                ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
-                btrfs_set_file_extent_disk_bytenr(leaf, fi,
-                                                new_extent->disk_bytenr);
-                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
-                                                new_extent->disk_num_bytes);
-                btrfs_mark_buffer_dirty(leaf);
-                ret = btrfs_inc_extent_ref(trans, root,
-                                        new_extent->disk_bytenr,
-                                        new_extent->disk_num_bytes,
-                                        leaf->start,
-                                        root->root_key.objectid,
-                                        trans->transid, key.objectid);
-                BUG_ON(ret);
-                ret = btrfs_free_extent(trans, root,
-                                        bytenr, num_bytes, leaf->start,
-                                        btrfs_header_owner(leaf),
-                                        btrfs_header_generation(leaf),
-                                        key.objectid, 0);
-                BUG_ON(ret);
-                cond_resched();
-        }
-        kfree(new_extent);
-        BUG_ON(ext_index + 1 != ref->nritems);
-        btrfs_free_leaf_ref(root, ref);
-        return 0;
-}
-int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root)
-{
-        struct btrfs_root *reloc_root;
-        int ret;
-        if (root->reloc_root) {
-                reloc_root = root->reloc_root;
-                root->reloc_root = NULL;
-                list_add(&reloc_root->dead_list,
-                         &root->fs_info->dead_reloc_roots);
-                btrfs_set_root_bytenr(&reloc_root->root_item,
-                                      reloc_root->node->start);
-                btrfs_set_root_level(&root->root_item,
-                                     btrfs_header_level(reloc_root->node));
-                memset(&reloc_root->root_item.drop_progress, 0,
-                        sizeof(struct btrfs_disk_key));
-                reloc_root->root_item.drop_level = 0;
-                ret = btrfs_update_root(trans, root->fs_info->tree_root,
-                                        &reloc_root->root_key,
-                                        &reloc_root->root_item);
-                BUG_ON(ret);
-        }
-        return 0;
-}
-int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
-{
-        struct btrfs_trans_handle *trans;
-        struct btrfs_root *reloc_root;
-        struct btrfs_root *prev_root = NULL;
-        struct list_head dead_roots;
-        int ret;
-        unsigned long nr;
-        INIT_LIST_HEAD(&dead_roots);
-        list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
-        while (!list_empty(&dead_roots)) {
-                reloc_root = list_entry(dead_roots.prev,
-                                        struct btrfs_root, dead_list);
-                list_del_init(&reloc_root->dead_list);
-                BUG_ON(reloc_root->commit_root != NULL);
-                while (1) {
-                        trans = btrfs_join_transaction(root, 1);
-                        BUG_ON(IS_ERR(trans));
-                        mutex_lock(&root->fs_info->drop_mutex);
-                        ret = btrfs_drop_snapshot(trans, reloc_root);
-                        if (ret != -EAGAIN)
-                                break;
-                        mutex_unlock(&root->fs_info->drop_mutex);
-                        nr = trans->blocks_used;
-                        ret = btrfs_end_transaction(trans, root);
-                        BUG_ON(ret);
-                        btrfs_btree_balance_dirty(root, nr);
-                }
-                free_extent_buffer(reloc_root->node);
-                ret = btrfs_del_root(trans, root->fs_info->tree_root,
-                                     &reloc_root->root_key);
-                BUG_ON(ret);
-                mutex_unlock(&root->fs_info->drop_mutex);
-                nr = trans->blocks_used;
-                ret = btrfs_end_transaction(trans, root);
-                BUG_ON(ret);
-                btrfs_btree_balance_dirty(root, nr);
-                kfree(prev_root);
-                prev_root = reloc_root;
-        }
-        if (prev_root) {
-                btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
-                kfree(prev_root);
-        }
-        return 0;
-}
-int btrfs_add_dead_reloc_root(struct btrfs_root *root)
-{
-        list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
-        return 0;
-}
-int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
-{
-        struct btrfs_root *reloc_root;
-        struct btrfs_trans_handle *trans;
-        struct btrfs_key location;
-        int found;
-        int ret;
-        mutex_lock(&root->fs_info->tree_reloc_mutex);
-        ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
-        BUG_ON(ret);
-        found = !list_empty(&root->fs_info->dead_reloc_roots);
-        mutex_unlock(&root->fs_info->tree_reloc_mutex);
-        if (found) {
-                trans = btrfs_start_transaction(root, 1);
-                BUG_ON(IS_ERR(trans));
-                ret = btrfs_commit_transaction(trans, root);
-                BUG_ON(ret);
-        }
-        location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
-        location.offset = (u64)-1;
-        location.type = BTRFS_ROOT_ITEM_KEY;
-        reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
-        BUG_ON(!reloc_root);
-        ret = btrfs_orphan_cleanup(reloc_root);
-        BUG_ON(ret);
-        return 0;
-}
-static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *root)
-{
-        struct btrfs_root *reloc_root;
-        struct extent_buffer *eb;
-        struct btrfs_root_item *root_item;
-        struct btrfs_key root_key;
-        int ret;
-        BUG_ON(!root->ref_cows);
-        if (root->reloc_root)
-                return 0;
-        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
-        if (!root_item)
-                return -ENOMEM;
-        ret = btrfs_copy_root(trans, root, root->commit_root,
-                              &eb, BTRFS_TREE_RELOC_OBJECTID);
-        BUG_ON(ret);
-        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
-        root_key.offset = root->root_key.objectid;
-        root_key.type = BTRFS_ROOT_ITEM_KEY;
-        memcpy(root_item, &root->root_item, sizeof(root_item));
-        btrfs_set_root_refs(root_item, 0);
-        btrfs_set_root_bytenr(root_item, eb->start);
-        btrfs_set_root_level(root_item, btrfs_header_level(eb));
-        btrfs_set_root_generation(root_item, trans->transid);
-        btrfs_tree_unlock(eb);
-        free_extent_buffer(eb);
-        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
-                                &root_key, root_item);
-        BUG_ON(ret);
-        kfree(root_item);
-        reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
-                                                 &root_key);
-        BUG_ON(IS_ERR(reloc_root));
-        reloc_root->last_trans = trans->transid;
-        reloc_root->commit_root = NULL;
-        reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
-        root->reloc_root = reloc_root;
-        return 0;
-}
-/*
- * Core function of space balance.
- *
- * The idea is using reloc trees to relocate tree blocks in reference
- * counted roots. There is one reloc tree for each subvol, and all
- * reloc trees share same root key objectid. Reloc trees are snapshots
- * of the latest committed roots of subvols (root->commit_root).
- *
- * To relocate a tree block referenced by a subvol, there are two steps.
- * COW the block through subvol's reloc tree, then update block pointer
- * in the subvol to point to the new block. Since all reloc trees share
- * same root key objectid, doing special handing for tree blocks owned
- * by them is easy. Once a tree block has been COWed in one reloc tree,
- * we can use the resulting new block directly when the same block is
- * required to COW again through other reloc trees. By this way, relocated
- * tree blocks are shared between reloc trees, so they are also shared
- * between subvols.
- */
-static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
-                                      struct btrfs_root *root,
-                                      struct btrfs_path *path,
-                                      struct btrfs_key *first_key,
-                                      struct btrfs_ref_path *ref_path,
-                                      struct btrfs_block_group_cache *group,
-                                      struct inode *reloc_inode)
-{
-        struct btrfs_root *reloc_root;
-        struct extent_buffer *eb = NULL;
-        struct btrfs_key *keys;
-        u64 *nodes;
-        int level;
-        int shared_level;
-        int lowest_level = 0;
-        int ret;
-        if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
-                lowest_level = ref_path->owner_objectid;
-        if (!root->ref_cows) {
-                path->lowest_level = lowest_level;
-                ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
-                BUG_ON(ret < 0);
-                path->lowest_level = 0;
-                btrfs_release_path(root, path);
-                return 0;
-        }
-        mutex_lock(&root->fs_info->tree_reloc_mutex);
-        ret = init_reloc_tree(trans, root);
-        BUG_ON(ret);
-        reloc_root = root->reloc_root;
-        shared_level = ref_path->shared_level;
-        ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
-        keys = ref_path->node_keys;
-        nodes = ref_path->new_nodes;
-        memset(&keys[shared_level + 1], 0,
-               sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
-        memset(&nodes[shared_level + 1], 0,
-               sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
-        if (nodes[lowest_level] == 0) {
-                path->lowest_level = lowest_level;
-                ret = btrfs_search_slot(trans, reloc_root, first_key, path,
-                                        0, 1);
-                BUG_ON(ret);
-                for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
-                        eb = path->nodes[level];
-                        if (!eb || eb == reloc_root->node)
-                                break;
-                        nodes[level] = eb->start;
-                        if (level == 0)
-                                btrfs_item_key_to_cpu(eb, &keys[level], 0);
-                        else
-                                btrfs_node_key_to_cpu(eb, &keys[level], 0);
-                }
-                if (nodes[0] &&
-                    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-                        eb = path->nodes[0];
-                        ret = replace_extents_in_leaf(trans, reloc_root, eb,
-                                                      group, reloc_inode);
-                        BUG_ON(ret);
-                }
-                btrfs_release_path(reloc_root, path);
-        } else {
-                ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
-                                       lowest_level);
-                BUG_ON(ret);
-        }
-        /*
-         * replace tree blocks in the fs tree with tree blocks in
-         * the reloc tree.
-         */
-        ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
-        BUG_ON(ret < 0);
-        if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-                ret = btrfs_search_slot(trans, reloc_root, first_key, path,
-                                        0, 0);
-                BUG_ON(ret);
-                extent_buffer_get(path->nodes[0]);
-                eb = path->nodes[0];
-                btrfs_release_path(reloc_root, path);
-                ret = invalidate_extent_cache(reloc_root, eb, group, root);
-                BUG_ON(ret);
-                free_extent_buffer(eb);
-        }
-        mutex_unlock(&root->fs_info->tree_reloc_mutex);
-        path->lowest_level = 0;
-        return 0;
-}
-static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
-                                        struct btrfs_root *root,
-                                        struct btrfs_path *path,
-                                        struct btrfs_key *first_key,
-                                        struct btrfs_ref_path *ref_path)
-{
-        int ret;
-        ret = relocate_one_path(trans, root, path, first_key,
-                                ref_path, NULL, NULL);
-        BUG_ON(ret);
-        return 0;
-}
-static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *extent_root,
-                                    struct btrfs_path *path,
-                                    struct btrfs_key *extent_key)
-{
-        int ret;
-        ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
-        if (ret)
-                goto out;
-        ret = btrfs_del_item(trans, extent_root, path);
-out:
-        btrfs_release_path(extent_root, path);
-        return ret;
-}
-static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
-                                                struct btrfs_ref_path *ref_path)
-{
-        struct btrfs_key root_key;
-        root_key.objectid = ref_path->root_objectid;
-        root_key.type = BTRFS_ROOT_ITEM_KEY;
-        if (is_cowonly_root(ref_path->root_objectid))
-                root_key.offset = 0;
-        else
-                root_key.offset = (u64)-1;
-        return btrfs_read_fs_root_no_name(fs_info, &root_key);
-}
-static noinline int relocate_one_extent(struct btrfs_root *extent_root,
-                                        struct btrfs_path *path,
-                                        struct btrfs_key *extent_key,
-                                        struct btrfs_block_group_cache *group,
-                                        struct inode *reloc_inode, int pass)
-{
-        struct btrfs_trans_handle *trans;
-        struct btrfs_root *found_root;
-        struct btrfs_ref_path *ref_path = NULL;
-        struct disk_extent *new_extents = NULL;
-        int nr_extents = 0;
-        int loops;
-        int ret;
-        int level;
-        struct btrfs_key first_key;
-        u64 prev_block = 0;
-        trans = btrfs_start_transaction(extent_root, 1);
-        BUG_ON(IS_ERR(trans));
-        if (extent_key->objectid == 0) {
-                ret = del_extent_zero(trans, extent_root, path, extent_key);
-                goto out;
-        }
-        ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
-        if (!ref_path) {
-                ret = -ENOMEM;
-                goto out;
-        }
-        for (loops = 0; ; loops++) {
-                if (loops == 0) {
-                        ret = btrfs_first_ref_path(trans, extent_root, ref_path,
-                                                   extent_key->objectid);
-                } else {
-                        ret = btrfs_next_ref_path(trans, extent_root, ref_path);
-                }
-                if (ret < 0)
-                        goto out;
-                if (ret > 0)
-                        break;
-                if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
-                    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-                        continue;
-                found_root = read_ref_root(extent_root->fs_info, ref_path);
-                BUG_ON(!found_root);
-                /*
-                 * for reference counted tree, only process reference paths
-                 * rooted at the latest committed root.
-                 */
-                if (found_root->ref_cows &&
-                    ref_path->root_generation != found_root->root_key.offset)
-                        continue;
-                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-                        if (pass == 0) {
-                                /*
-                                 * copy data extents to new locations
-                                 */
-                                u64 group_start = group->key.objectid;
-                                ret = relocate_data_extent(reloc_inode,
-                                                           extent_key,
-                                                           group_start);
-                                if (ret < 0)
-                                        goto out;
-                                break;
-                        }
-                        level = 0;
-                } else {
-                        level = ref_path->owner_objectid;
-                }
-                if (prev_block != ref_path->nodes[level]) {
-                        struct extent_buffer *eb;
-                        u64 block_start = ref_path->nodes[level];
-                        u64 block_size = btrfs_level_size(found_root, level);
-                        eb = read_tree_block(found_root, block_start,
-                                             block_size, 0);
-                        if (!eb) {
-                                ret = -EIO;
-                                goto out;
-                        }
-                        btrfs_tree_lock(eb);
-                        BUG_ON(level != btrfs_header_level(eb));
-                        if (level == 0)
-                                btrfs_item_key_to_cpu(eb, &first_key, 0);
-                        else
-                                btrfs_node_key_to_cpu(eb, &first_key, 0);
-                        btrfs_tree_unlock(eb);
-                        free_extent_buffer(eb);
-                        prev_block = block_start;
-                }
-                mutex_lock(&extent_root->fs_info->trans_mutex);
-                btrfs_record_root_in_trans(found_root);
-                mutex_unlock(&extent_root->fs_info->trans_mutex);
-                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-                        /*
-                         * try to update data extent references while
-                         * keeping metadata shared between snapshots.
-                         */
-                        if (pass == 1) {
-                                ret = relocate_one_path(trans, found_root,
-                                                path, &first_key, ref_path,
-                                                group, reloc_inode);
-                                if (ret < 0)
-                                        goto out;
-                                continue;
-                        }
-                        /*
-                         * use fallback method to process the remaining
-                         * references.
-                         */
-                        if (!new_extents) {
-                                u64 group_start = group->key.objectid;
-                                new_extents = kmalloc(sizeof(*new_extents),
-                                                      GFP_NOFS);
-                                if (!new_extents) {
-                                        ret = -ENOMEM;
-                                        goto out;
-                                }
-                                nr_extents = 1;
-                                ret = get_new_locations(reloc_inode,
-                                                        extent_key,
-                                                        group_start, 1,
-                                                        &new_extents,
-                                                        &nr_extents);
-                                if (ret)
-                                        goto out;
-                        }
-                        ret = replace_one_extent(trans, found_root,
-                                                path, extent_key,
-                                                &first_key, ref_path,
-                                                new_extents, nr_extents);
-                } else {
-                        ret = relocate_tree_block(trans, found_root, path,
-                                                  &first_key, ref_path);
-                }
-                if (ret < 0)
-                        goto out;
-        }
-        ret = 0;
-out:
-        btrfs_end_transaction(trans, extent_root);
-        kfree(new_extents);
-        kfree(ref_path);
-        return ret;
-}
-#endif
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
        u64 num_devices;
@@ -8555,10 +6905,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                        ret = -ENOMEM;
                        goto error;
                }
+                cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+                                                GFP_NOFS);
+                if (!cache->free_space_ctl) {
+                        kfree(cache);
+                        ret = -ENOMEM;
+                        goto error;
+                }
                atomic_set(&cache->count, 1);
                spin_lock_init(&cache->lock);
-                spin_lock_init(&cache->tree_lock);
                cache->fs_info = info;
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
@@ -8566,24 +6922,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                if (need_clear)
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
-                /*
-                 * we only want to have 32k of ram per block group for keeping
-                 * track of free space, and if we pass 1/2 of that we want to
-                 * start converting things over to using bitmaps
-                 */
-                cache->extents_thresh = ((1024 * 32) / 2) /
-                        sizeof(struct btrfs_free_space);
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
                                   sizeof(cache->item));
                memcpy(&cache->key, &found_key, sizeof(found_key));
                key.objectid = found_key.objectid + found_key.offset;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                cache->flags = btrfs_block_group_flags(&cache->item);
                cache->sectorsize = root->sectorsize;
+                btrfs_init_free_space_ctl(cache);
                /*
                 * We need to exclude the super stripes now so that the space
                 * info has super bytes accounted for, otherwise we'll think
@@ -8670,6 +7020,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache = kzalloc(sizeof(*cache), GFP_NOFS);
        if (!cache)
                return -ENOMEM;
+        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+                                        GFP_NOFS);
+        if (!cache->free_space_ctl) {
+                kfree(cache);
+                return -ENOMEM;
+        }
        cache->key.objectid = chunk_offset;
        cache->key.offset = size;
@@ -8677,19 +7033,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->sectorsize = root->sectorsize;
        cache->fs_info = root->fs_info;
-        /*
-         * we only want to have 32k of ram per block group for keeping track
-         * of free space, and if we pass 1/2 of that we want to start
-         * converting things over to using bitmaps
-         */
-        cache->extents_thresh = ((1024 * 32) / 2) /
-                sizeof(struct btrfs_free_space);
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
-        spin_lock_init(&cache->tree_lock);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
+        btrfs_init_free_space_ctl(cache);
        btrfs_set_block_group_used(&cache->item, bytes_used);
        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
        cache->flags = type;
@@ -8802,12 +7152,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        if (ret < 0)
                goto out;
        if (ret > 0)
-                btrfs_release_path(tree_root, path);
+                btrfs_release_path(path);
        if (ret == 0) {
                ret = btrfs_del_item(trans, tree_root, path);
                if (ret)
                        goto out;
-                btrfs_release_path(tree_root, path);
+                btrfs_release_path(path);
        }
        spin_lock(&root->fs_info->block_group_cache_lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4f9893243dae..c5d9fbb92bc3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -103,7 +103,7 @@ void extent_io_exit(void)
 }
 void extent_io_tree_init(struct extent_io_tree *tree,
-                          struct address_space *mapping, gfp_t mask)
+                         struct address_space *mapping)
 {
        tree->state = RB_ROOT;
        INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
@@ -441,6 +441,15 @@ static int clear_state_bit(struct extent_io_tree *tree,
        return ret;
 }
+static struct extent_state *
+alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+        if (!prealloc)
+                prealloc = alloc_extent_state(GFP_ATOMIC);
+        return prealloc;
+}
 /*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
@@ -531,8 +540,8 @@ hit_next:
         */
        if (state->start < start) {
-                if (!prealloc)
+                prealloc = alloc_extent_state_atomic(prealloc);
-                        prealloc = alloc_extent_state(GFP_ATOMIC);
+                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
                BUG_ON(err == -EEXIST);
                prealloc = NULL;
@@ -553,8 +562,8 @@ hit_next:
         * on the first half
         */
        if (state->start <= end && state->end > end) {
-                if (!prealloc)
+                prealloc = alloc_extent_state_atomic(prealloc);
-                        prealloc = alloc_extent_state(GFP_ATOMIC);
+                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
                BUG_ON(err == -EEXIST);
                if (wake)
@@ -727,8 +736,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 again:
        if (!prealloc && (mask & __GFP_WAIT)) {
                prealloc = alloc_extent_state(mask);
-                if (!prealloc)
+                BUG_ON(!prealloc);
-                        return -ENOMEM;
        }
        spin_lock(&tree->lock);
@@ -745,6 +753,8 @@ again:
         */
        node = tree_search(tree, start);
        if (!node) {
+                prealloc = alloc_extent_state_atomic(prealloc);
+                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end, &bits);
                prealloc = NULL;
                BUG_ON(err == -EEXIST);
@@ -773,20 +783,18 @@ hit_next:
                if (err)
                        goto out;
+                next_node = rb_next(node);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
-                if (start < end && prealloc && !need_resched()) {
+                if (next_node && start < end && prealloc && !need_resched()) {
-                        next_node = rb_next(node);
+                        state = rb_entry(next_node, struct extent_state,
-                        if (next_node) {
+                                         rb_node);
-                                state = rb_entry(next_node, struct extent_state,
+                        if (state->start == start)
-                                                 rb_node);
+                                goto hit_next;
-                                if (state->start == start)
-                                        goto hit_next;
-                        }
                }
                goto search_again;
        }
@@ -813,6 +821,9 @@ hit_next:
                        err = -EEXIST;
                        goto out;
                }
+                prealloc = alloc_extent_state_atomic(prealloc);
+                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
                BUG_ON(err == -EEXIST);
                prealloc = NULL;
@@ -843,14 +854,25 @@ hit_next:
                        this_end = end;
                else
                        this_end = last_start - 1;
+                prealloc = alloc_extent_state_atomic(prealloc);
+                BUG_ON(!prealloc);
+                /*
+                 * Hold an extra reference so that 'prealloc' is not freed
+                 * if it gets merged with the later extent.
+                 */
+                atomic_inc(&prealloc->refs);
                err = insert_state(tree, prealloc, start, this_end,
                                   &bits);
                BUG_ON(err == -EEXIST);
                if (err) {
+                        free_extent_state(prealloc);
                        prealloc = NULL;
                        goto out;
                }
                cache_state(prealloc, cached_state);
+                free_extent_state(prealloc);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
@@ -867,6 +889,9 @@ hit_next:
                        err = -EEXIST;
                        goto out;
                }
+                prealloc = alloc_extent_state_atomic(prealloc);
+                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
                BUG_ON(err == -EEXIST);
@@ -943,13 +968,6 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                              NULL, mask);
 }
-static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
-                       gfp_t mask)
-{
-        return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
-                                NULL, mask);
-}
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask)
 {
@@ -965,11 +983,6 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
                                cached_state, mask);
 }
-int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
-{
-        return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
-}
 /*
 * either insert or lock state struct between start and end use mask to tell
 * us if waiting is desired.
@@ -1030,25 +1043,6 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 }
 /*
- * helper function to set pages and extents in the tree dirty
- */
-int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
-{
-        unsigned long index = start >> PAGE_CACHE_SHIFT;
-        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-        struct page *page;
-        while (index <= end_index) {
-                page = find_get_page(tree->mapping, index);
-                BUG_ON(!page);
-                __set_page_dirty_nobuffers(page);
-                page_cache_release(page);
-                index++;
-        }
-        return 0;
-}
-/*
 * helper function to set both pages and extents in the tree writeback
 */
 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1821,46 +1815,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
        bio_put(bio);
 }
-/*
- * IO done from prepare_write is pretty simple, we just unlock
- * the structs in the extent tree when done, and set the uptodate bits
- * as appropriate.
- */
-static void end_bio_extent_preparewrite(struct bio *bio, int err)
-{
-        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-        struct extent_io_tree *tree;
-        u64 start;
-        u64 end;
-        do {
-                struct page *page = bvec->bv_page;
-                struct extent_state *cached = NULL;
-                tree = &BTRFS_I(page->mapping->host)->io_tree;
-                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-                        bvec->bv_offset;
-                end = start + bvec->bv_len - 1;
-                if (--bvec >= bio->bi_io_vec)
-                        prefetchw(&bvec->bv_page->flags);
-                if (uptodate) {
-                        set_extent_uptodate(tree, start, end, &cached,
-                                            GFP_ATOMIC);
-                } else {
-                        ClearPageUptodate(page);
-                        SetPageError(page);
-                }
-                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
-        } while (bvec >= bio->bi_io_vec);
-        bio_put(bio);
-}
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
                gfp_t gfp_flags)
@@ -2009,7 +1963,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        struct btrfs_ordered_extent *ordered;
        int ret;
        int nr = 0;
-        size_t page_offset = 0;
+        size_t pg_offset = 0;
        size_t iosize;
        size_t disk_io_size;
        size_t blocksize = inode->i_sb->s_blocksize;
@@ -2052,9 +2006,9 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        char *userpage;
                        struct extent_state *cached = NULL;
-                        iosize = PAGE_CACHE_SIZE - page_offset;
+                        iosize = PAGE_CACHE_SIZE - pg_offset;
                        userpage = kmap_atomic(page, KM_USER0);
-                        memset(userpage + page_offset, 0, iosize);
+                        memset(userpage + pg_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
@@ -2063,9 +2017,9 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                                             &cached, GFP_NOFS);
                        break;
                }
-                em = get_extent(inode, page, page_offset, cur,
+                em = get_extent(inode, page, pg_offset, cur,
                                end - cur + 1, 0);
-                if (IS_ERR(em) || !em) {
+                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
                        unlock_extent(tree, cur, end, GFP_NOFS);
                        break;
@@ -2103,7 +2057,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        struct extent_state *cached = NULL;
                        userpage = kmap_atomic(page, KM_USER0);
-                        memset(userpage + page_offset, 0, iosize);
+                        memset(userpage + pg_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
@@ -2112,7 +2066,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        unlock_extent_cached(tree, cur, cur + iosize - 1,
                                             &cached, GFP_NOFS);
                        cur = cur + iosize;
-                        page_offset += iosize;
+                        pg_offset += iosize;
                        continue;
                }
                /* the get_extent function already copied into the page */
@@ -2121,7 +2075,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        check_page_uptodate(tree, page);
                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
                        cur = cur + iosize;
-                        page_offset += iosize;
+                        pg_offset += iosize;
                        continue;
                }
                /* we have an inline extent but it didn't get marked up
@@ -2131,7 +2085,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        SetPageError(page);
                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
                        cur = cur + iosize;
-                        page_offset += iosize;
+                        pg_offset += iosize;
                        continue;
                }
@@ -2144,7 +2098,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
                        pnr -= page->index;
                        ret = submit_extent_page(READ, tree, page,
-                                         sector, disk_io_size, page_offset,
+                                         sector, disk_io_size, pg_offset,
                                         bdev, bio, pnr,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
@@ -2155,7 +2109,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                if (ret)
                        SetPageError(page);
                cur = cur + iosize;
-                page_offset += iosize;
+                pg_offset += iosize;
        }
 out:
        if (!nr) {
@@ -2351,7 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                }
                em = epd->get_extent(inode, page, pg_offset, cur,
                                     end - cur + 1, 1);
-                if (IS_ERR(em) || !em) {
+                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
                        break;
                }
@@ -2730,128 +2684,6 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 }
 /*
- * simple commit_write call, set_range_dirty is used to mark both
- * the pages and the extent records as dirty
- */
-int extent_commit_write(struct extent_io_tree *tree,
-                        struct inode *inode, struct page *page,
-                        unsigned from, unsigned to)
-{
-        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-        set_page_extent_mapped(page);
-        set_page_dirty(page);
-        if (pos > inode->i_size) {
-                i_size_write(inode, pos);
-                mark_inode_dirty(inode);
-        }
-        return 0;
-}
-int extent_prepare_write(struct extent_io_tree *tree,
-                         struct inode *inode, struct page *page,
-                         unsigned from, unsigned to, get_extent_t *get_extent)
-{
-        u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-        u64 block_start;
-        u64 orig_block_start;
-        u64 block_end;
-        u64 cur_end;
-        struct extent_map *em;
-        unsigned blocksize = 1 << inode->i_blkbits;
-        size_t page_offset = 0;
-        size_t block_off_start;
-        size_t block_off_end;
-        int err = 0;
-        int iocount = 0;
-        int ret = 0;
-        int isnew;
-        set_page_extent_mapped(page);
-        block_start = (page_start + from) & ~((u64)blocksize - 1);
-        block_end = (page_start + to - 1) | (blocksize - 1);
-        orig_block_start = block_start;
-        lock_extent(tree, page_start, page_end, GFP_NOFS);
-        while (block_start <= block_end) {
-                em = get_extent(inode, page, page_offset, block_start,
-                                block_end - block_start + 1, 1);
-                if (IS_ERR(em) || !em)
-                        goto err;
-                cur_end = min(block_end, extent_map_end(em) - 1);
-                block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
-                block_off_end = block_off_start + blocksize;
-                isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
-                if (!PageUptodate(page) && isnew &&
-                    (block_off_end > to || block_off_start < from)) {
-                        void *kaddr;
-                        kaddr = kmap_atomic(page, KM_USER0);
-                        if (block_off_end > to)
-                                memset(kaddr + to, 0, block_off_end - to);
-                        if (block_off_start < from)
-                                memset(kaddr + block_off_start, 0,
-                                       from - block_off_start);
-                        flush_dcache_page(page);
-                        kunmap_atomic(kaddr, KM_USER0);
-                }
-                if ((em->block_start != EXTENT_MAP_HOLE &&
-                     em->block_start != EXTENT_MAP_INLINE) &&
-                    !isnew && !PageUptodate(page) &&
-                    (block_off_end > to || block_off_start < from) &&
-                    !test_range_bit(tree, block_start, cur_end,
-                                    EXTENT_UPTODATE, 1, NULL)) {
-                        u64 sector;
-                        u64 extent_offset = block_start - em->start;
-                        size_t iosize;
-                        sector = (em->block_start + extent_offset) >> 9;
-                        iosize = (cur_end - block_start + blocksize) &
-                                ~((u64)blocksize - 1);
-                        /*
-                         * we've already got the extent locked, but we
-                         * need to split the state such that our end_bio
-                         * handler can clear the lock.
-                         */
-                        set_extent_bit(tree, block_start,
-                                       block_start + iosize - 1,
-                                       EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
-                        ret = submit_extent_page(READ, tree, page,
-                                         sector, iosize, page_offset, em->bdev,
-                                         NULL, 1,
-                                         end_bio_extent_preparewrite, 0,
-                                         0, 0);
-                        if (ret && !err)
-                                err = ret;
-                        iocount++;
-                        block_start = block_start + iosize;
-                } else {
-                        struct extent_state *cached = NULL;
-                        set_extent_uptodate(tree, block_start, cur_end, &cached,
-                                            GFP_NOFS);
-                        unlock_extent_cached(tree, block_start, cur_end,
-                                             &cached, GFP_NOFS);
-                        block_start = cur_end + 1;
-                }
-                page_offset = block_start & (PAGE_CACHE_SIZE - 1);
-                free_extent_map(em);
-        }
-        if (iocount) {
-                wait_extent_bit(tree, orig_block_start,
-                                block_end, EXTENT_LOCKED);
-        }
-        check_page_uptodate(tree, page);
-err:
-        /* FIXME, zero out newly allocated blocks on error */
-        return err;
-}
-/*
 * a helper for releasepage, this tests for areas of the page that
 * are locked or under IO and drops the related state bits if it is safe
 * to drop the page.
@@ -2909,7 +2741,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                        len = end - start + 1;
                        write_lock(&map->lock);
                        em = lookup_extent_mapping(map, start, len);
-                        if (!em || IS_ERR(em)) {
+                        if (IS_ERR_OR_NULL(em)) {
                                write_unlock(&map->lock);
                                break;
                        }
@@ -2937,33 +2769,6 @@ int try_release_extent_mapping(struct extent_map_tree *map,
        return try_release_extent_state(map, tree, page, mask);
 }
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
-                get_extent_t *get_extent)
-{
-        struct inode *inode = mapping->host;
-        struct extent_state *cached_state = NULL;
-        u64 start = iblock << inode->i_blkbits;
-        sector_t sector = 0;
-        size_t blksize = (1 << inode->i_blkbits);
-        struct extent_map *em;
-        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-                         0, &cached_state, GFP_NOFS);
-        em = get_extent(inode, NULL, 0, start, blksize, 0);
-        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
-                             start + blksize - 1, &cached_state, GFP_NOFS);
-        if (!em || IS_ERR(em))
-                return 0;
-        if (em->block_start > EXTENT_MAP_LAST_BYTE)
-                goto out;
-        sector = (em->block_start + start - em->start) >> inode->i_blkbits;
-out:
-        free_extent_map(em);
-        return sector;
-}
 /*
 * helper function for fiemap, which doesn't want to see any holes.
 * This maps until we find something past 'last'
@@ -2986,7 +2791,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
                        break;
                len = (len + sectorsize - 1) & ~(sectorsize - 1);
                em = get_extent(inode, NULL, 0, offset, len, 0);
-                if (!em || IS_ERR(em))
+                if (IS_ERR_OR_NULL(em))
                        return em;
                /* if this isn't a hole return it */
@@ -3040,7 +2845,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
         * because there might be preallocation past i_size
         */
        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
-                                       path, inode->i_ino, -1, 0);
+                                       path, btrfs_ino(inode), -1, 0);
        if (ret < 0) {
                btrfs_free_path(path);
                return ret;
@@ -3053,7 +2858,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        found_type = btrfs_key_type(&found_key);
        /* No extents, but there might be delalloc bits */
-        if (found_key.objectid != inode->i_ino ||
+        if (found_key.objectid != btrfs_ino(inode) ||
            found_type != BTRFS_EXTENT_DATA_KEY) {
                /* have to trust i_size as the end */
                last = (u64)-1;
@@ -3276,8 +3081,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len,
-                                          struct page *page0,
+                                          struct page *page0)
-                                          gfp_t mask)
 {
        unsigned long num_pages = num_extent_pages(start, len);
        unsigned long i;
@@ -3298,7 +3102,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        }
        rcu_read_unlock();
-        eb = __alloc_extent_buffer(tree, start, len, mask);
+        eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
        if (!eb)
                return NULL;
@@ -3315,7 +3119,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                i = 0;
        }
        for (; i < num_pages; i++, index++) {
-                p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+                p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
                if (!p) {
                        WARN_ON(1);
                        goto free_eb;
@@ -3387,8 +3191,7 @@ free_eb:
 }
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-                                         u64 start, unsigned long len,
+                                         u64 start, unsigned long len)
-                                          gfp_t mask)
 {
        struct extent_buffer *eb;
@@ -3449,13 +3252,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
        return 0;
 }
-int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
-                                    struct extent_buffer *eb)
-{
-        return wait_on_extent_writeback(tree, eb->start,
-                                        eb->start + eb->len - 1);
-}
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
                             struct extent_buffer *eb)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index af2d7179c372..4e8445a4757c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -153,23 +153,14 @@ static inline int extent_compress_type(unsigned long bio_flags)
 struct extent_map_tree;
-static inline struct extent_state *extent_state_next(struct extent_state *state)
-{
-        struct rb_node *node;
-        node = rb_next(&state->rb_node);
-        if (!node)
-                return NULL;
-        return rb_entry(node, struct extent_state, rb_node);
-}
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
                                          struct page *page,
-                                          size_t page_offset,
+                                          size_t pg_offset,
                                          u64 start, u64 len,
                                          int create);
 void extent_io_tree_init(struct extent_io_tree *tree,
-                          struct address_space *mapping, gfp_t mask);
+                         struct address_space *mapping);
 int try_release_extent_mapping(struct extent_map_tree *map,
                               struct extent_io_tree *tree, struct page *page,
                               gfp_t mask);
@@ -215,14 +206,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask);
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-                       gfp_t mask);
-int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
-                                  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-                     gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
                          u64 *start_ret, u64 *end_ret, int bits);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
@@ -243,28 +228,17 @@ int extent_readpages(struct extent_io_tree *tree,
                     struct address_space *mapping,
                     struct list_head *pages, unsigned nr_pages,
                     get_extent_t get_extent);
-int extent_prepare_write(struct extent_io_tree *tree,
-                         struct inode *inode, struct page *page,
-                         unsigned from, unsigned to, get_extent_t *get_extent);
-int extent_commit_write(struct extent_io_tree *tree,
-                        struct inode *inode, struct page *page,
-                        unsigned from, unsigned to);
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
-                get_extent_t *get_extent);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent);
-int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
 void set_page_extent_mapped(struct page *page);
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len,
-                                          struct page *page0,
+                                          struct page *page0);
-                                          gfp_t mask);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-                                         u64 start, unsigned long len,
+                                         u64 start, unsigned long len);
-                                          gfp_t mask);
 void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
                             struct extent_buffer *eb, u64 start, int wait,
@@ -292,16 +266,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                           unsigned long src_offset, unsigned long len);
 void memset_extent_buffer(struct extent_buffer *eb, char c,
                          unsigned long start, unsigned long len);
-int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
-                                    struct extent_buffer *eb);
-int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                              struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
                             struct extent_buffer *eb);
-int test_extent_buffer_dirty(struct extent_io_tree *tree,
-                             struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
                               struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -319,7 +288,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
                      unsigned long *map_start,
                      unsigned long *map_len, int km);
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
-int release_extent_buffer_tail_pages(struct extent_buffer *eb);
 int extent_range_uptodate(struct extent_io_tree *tree,
                          u64 start, u64 end);
 int extent_clear_unlock_delalloc(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a24a3f2fa13e..2d0410344ea3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -28,12 +28,11 @@ void extent_map_exit(void)
 /**
 * extent_map_tree_init - initialize extent map tree
 * @tree:               tree to initialize
- * @mask:               flags for memory allocations during tree operations
 *
 * Initialize the extent tree @tree.  Should be called for each new inode
 * or other user of the extent_map interface.
 */
-void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
+void extent_map_tree_init(struct extent_map_tree *tree)
 {
        tree->map = RB_ROOT;
        rwlock_init(&tree->lock);
@@ -41,16 +40,15 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 /**
 * alloc_extent_map - allocate new extent map structure
- * @mask:       memory allocation flags
 *
 * Allocate a new extent_map structure.  The new structure is
 * returned with a reference count of one and needs to be
 * freed using free_extent_map()
 */
-struct extent_map *alloc_extent_map(gfp_t mask)
+struct extent_map *alloc_extent_map(void)
 {
        struct extent_map *em;
-        em = kmem_cache_alloc(extent_map_cache, mask);
+        em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
        if (!em)
                return NULL;
        em->in_tree = 0;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 28b44dbd1e35..33a7890b1f40 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -49,14 +49,14 @@ static inline u64 extent_map_block_end(struct extent_map *em)
        return em->block_start + em->block_len;
 }
-void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
+void extent_map_tree_init(struct extent_map_tree *tree);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len);
 int add_extent_mapping(struct extent_map_tree *tree,
                       struct extent_map *em);
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
-struct extent_map *alloc_extent_map(gfp_t mask);
+struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a6a9d4e8b491..90d4ee52cd45 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -193,7 +193,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
                        u32 item_size;
                        if (item)
-                                btrfs_release_path(root, path);
+                                btrfs_release_path(path);
                        item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
                                                 path, disk_bytenr, 0);
                        if (IS_ERR(item)) {
@@ -208,12 +208,13 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
                                                EXTENT_NODATASUM, GFP_NOFS);
                                } else {
                                        printk(KERN_INFO "btrfs no csum found "
-                                               "for inode %lu start %llu\n",
+                                               "for inode %llu start %llu\n",
-                                               inode->i_ino,
+                                               (unsigned long long)
+                                               btrfs_ino(inode),
                                               (unsigned long long)offset);
                                }
                                item = NULL;
-                                btrfs_release_path(root, path);
+                                btrfs_release_path(path);
                                goto found;
                        }
                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
@@ -266,7 +267,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
 }
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
-                             struct list_head *list)
+                             struct list_head *list, int search_commit)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
@@ -283,6 +284,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        if (search_commit) {
+                path->skip_locking = 1;
+                path->reada = 2;
+                path->search_commit_root = 1;
+        }
        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        key.offset = start;
        key.type = BTRFS_EXTENT_CSUM_KEY;
@@ -495,7 +502,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
                u32 new_size = (bytenr - key->offset) >> blocksize_bits;
                new_size *= csum_size;
                ret = btrfs_truncate_item(trans, root, path, new_size, 1);
-                BUG_ON(ret);
        } else if (key->offset >= bytenr && csum_end > end_byte &&
                   end_byte > key->offset) {
                /*
@@ -508,7 +514,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
                new_size *= csum_size;
                ret = btrfs_truncate_item(trans, root, path, new_size, 0);
-                BUG_ON(ret);
                key->offset = end_byte;
                ret = btrfs_set_item_key_safe(trans, root, path, key);
@@ -551,10 +556,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0) {
                        if (path->slots[0] == 0)
-                                goto out;
+                                break;
                        path->slots[0]--;
                } else if (ret < 0) {
-                        goto out;
+                        break;
                }
                leaf = path->nodes[0];
@@ -579,7 +584,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
                /* delete the entire item, it is inside our range */
                if (key.offset >= bytenr && csum_end <= end_byte) {
                        ret = btrfs_del_item(trans, root, path);
-                        BUG_ON(ret);
+                        if (ret)
+                                goto out;
                        if (key.offset == bytenr)
                                break;
                } else if (key.offset < bytenr && csum_end > end_byte) {
@@ -631,11 +637,12 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
                        if (key.offset < bytenr)
                                break;
                }
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
        }
+        ret = 0;
 out:
        btrfs_free_path(path);
-        return 0;
+        return ret;
 }
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
@@ -722,7 +729,7 @@ again:
         * at this point, we know the tree has an item, but it isn't big
         * enough yet to put our csum in.  Grow it
         */
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        ret = btrfs_search_slot(trans, root, &file_key, path,
                                csum_size, 1);
        if (ret < 0)
@@ -761,12 +768,11 @@ again:
                        goto insert;
                ret = btrfs_extend_item(trans, root, path, diff);
-                BUG_ON(ret);
                goto csum;
        }
 insert:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        csum_offset = 0;
        if (found_next) {
                u64 tmp = total_bytes + root->sectorsize;
@@ -850,7 +856,7 @@ next_sector:
        }
        btrfs_mark_buffer_dirty(path->nodes[0]);
        if (total_bytes < sums->len) {
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                cond_resched();
                goto again;
        }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 75899a01dded..c6a22d783c35 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,6 +40,263 @@
 #include "locking.h"
 #include "compat.h"
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ *
+ * Records live in the fs_info->defrag_inodes rbtree, sorted by ino.
+ */
+struct inode_defrag {
+        /* linkage into the fs_info->defrag_inodes rbtree */
+        struct rb_node rb_node;
+        /* objectid of the inode; this is the rbtree sort key */
+        u64 ino;
+        /*
+         * transid where the defrag was added, we search for
+         * extents newer than this
+         */
+        u64 transid;
+        /* objectid of the root the inode lives in */
+        u64 root;
+        /*
+         * last offset we were able to defrag; the next pass over the
+         * inode starts from here
+         */
+        u64 last_offset;
+        /* if we've wrapped around back to zero once already */
+        int cycled;
+};
+/*
+ * Insert a defrag record for an inode into the defrag tree, which is
+ * sorted by inode objectid.  The defrag_inodes lock must be held
+ * already.
+ *
+ * If a record with the same objectid is already in the tree the two
+ * are merged: the existing record ends up with the lower of the two
+ * transids and the higher of the two last_offsets, and the record
+ * passed in is freed.
+ *
+ * Otherwise the record is linked into the tree and the inode is
+ * flagged as in_defrag.
+ *
+ * Always returns 0.  The caller must not touch @defrag afterwards
+ * unless it still holds the lock and knows no duplicate existed.
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+                                    struct inode_defrag *defrag)
+{
+        struct rb_root *tree = &BTRFS_I(inode)->root->fs_info->defrag_inodes;
+        struct rb_node **link = &tree->rb_node;
+        struct rb_node *parent = NULL;
+        struct inode_defrag *cur;
+        while (*link) {
+                parent = *link;
+                cur = rb_entry(parent, struct inode_defrag, rb_node);
+                if (defrag->ino < cur->ino) {
+                        link = &parent->rb_left;
+                        continue;
+                }
+                if (defrag->ino > cur->ino) {
+                        link = &parent->rb_right;
+                        continue;
+                }
+                /*
+                 * Same objectid: fold the new request into the record
+                 * we already have.  A re-queued older run must lower
+                 * the transid so no newer-than extents are missed.
+                 */
+                if (defrag->transid < cur->transid)
+                        cur->transid = defrag->transid;
+                if (defrag->last_offset > cur->last_offset)
+                        cur->last_offset = defrag->last_offset;
+                kfree(defrag);
+                return 0;
+        }
+        /* no record for this objectid yet, link the new one in */
+        BTRFS_I(inode)->in_defrag = 1;
+        rb_link_node(&defrag->rb_node, parent, link);
+        rb_insert_color(&defrag->rb_node, tree);
+        return 0;
+}
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ *
+ * @trans: running transaction, may be NULL.  When given, its transid
+ *         is recorded in the defrag record; otherwise the root's
+ *         last_trans is used.
+ * @inode: inode to queue for a defrag pass.
+ *
+ * Nothing is queued when the AUTO_DEFRAG mount option is off, when the
+ * filesystem is closing, or when the inode is already flagged
+ * in_defrag.
+ *
+ * Returns 0 on success (including the nothing-to-do cases) and
+ * -ENOMEM if the record could not be allocated.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+                           struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct inode_defrag *defrag;
+        int ret = 0;
+        u64 transid;
+        if (!btrfs_test_opt(root, AUTO_DEFRAG))
+                return 0;
+        if (root->fs_info->closing)
+                return 0;
+        /* unlocked fast path, rechecked under the lock below */
+        if (BTRFS_I(inode)->in_defrag)
+                return 0;
+        if (trans)
+                transid = trans->transid;
+        else
+                transid = root->last_trans;
+        defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+        if (!defrag)
+                return -ENOMEM;
+        /*
+         * btrfs_run_defrag_inodes() uses this value as the objectid of
+         * the inode item key, so record the btrfs objectid and not the
+         * VFS i_ino.
+         */
+        defrag->ino = btrfs_ino(inode);
+        defrag->transid = transid;
+        defrag->root = root->root_key.objectid;
+        spin_lock(&root->fs_info->defrag_inodes_lock);
+        if (!BTRFS_I(inode)->in_defrag) {
+                /* on success the tree (or the merge path) owns defrag */
+                ret = __btrfs_add_inode_defrag(inode, defrag);
+        } else {
+                /*
+                 * somebody queued this inode between our unlocked
+                 * check and taking the lock; drop our record instead
+                 * of leaking it
+                 */
+                kfree(defrag);
+        }
+        spin_unlock(&root->fs_info->defrag_inodes_lock);
+        return ret;
+}
+/*
+ * Look up the defrag record for objectid @ino.
+ *
+ * must be called with the defrag_inodes lock held
+ *
+ * Returns the matching record, or NULL when there is none.  On a miss,
+ * if @next is not NULL, *next is set to the rbtree node of the first
+ * record whose objectid is larger than @ino, or to NULL when no such
+ * record exists.  *next is left untouched when a match is returned.
+ */
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+                                             struct rb_node **next)
+{
+        struct rb_node *node = info->defrag_inodes.rb_node;
+        struct rb_node *last = NULL;
+        struct inode_defrag *entry = NULL;
+        while (node) {
+                last = node;
+                entry = rb_entry(node, struct inode_defrag, rb_node);
+                if (ino == entry->ino)
+                        return entry;
+                node = (ino < entry->ino) ? node->rb_left : node->rb_right;
+        }
+        if (!next)
+                return NULL;
+        /*
+         * last is the node where the search fell off the tree.  Step
+         * forward in key order until we reach a record past ino.  The
+         * loop tests last before entry, so entry is never dereferenced
+         * once rb_next() has run off the end of the tree.
+         */
+        while (last && ino > entry->ino) {
+                last = rb_next(last);
+                entry = rb_entry(last, struct inode_defrag, rb_node);
+        }
+        *next = last;
+        return NULL;
+}
+/*
+ * Put a record that btrfs_run_defrag_inodes() took out of the defrag
+ * tree back into it.  __btrfs_add_inode_defrag() must run under
+ * defrag_inodes_lock, so take it here.
+ *
+ * The record is either linked into the tree or merged into an existing
+ * record and freed, so the caller must not use or free it afterwards.
+ */
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+                                       struct inode_defrag *defrag)
+{
+        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+        spin_lock(&fs_info->defrag_inodes_lock);
+        __btrfs_add_inode_defrag(inode, defrag);
+        spin_unlock(&fs_info->defrag_inodes_lock);
+}
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ *
+ * Records are taken out of the defrag tree in increasing objectid
+ * order, wrapping back to the start once the end is reached, until the
+ * tree is empty.  For each record the inode is looked up and one batch
+ * of defrag work is done on it.  A record is put back in the tree when
+ * the batch was filled completely, or when the pass did not start at
+ * offset zero and has not wrapped around yet.
+ *
+ * If the filesystem is closing, records are only removed and freed.
+ *
+ * fs_info->defrag_running is raised for the duration of the run and
+ * transaction_wait is woken at the end.  Always returns 0.
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+        struct inode_defrag *defrag;
+        struct btrfs_root *inode_root;
+        struct inode *inode;
+        struct rb_node *n;
+        struct btrfs_key key;
+        struct btrfs_ioctl_defrag_range_args range;
+        u64 first_ino = 0;
+        int num_defrag;
+        int defrag_batch = 1024;
+        memset(&range, 0, sizeof(range));
+        range.len = (u64)-1;
+        atomic_inc(&fs_info->defrag_running);
+        spin_lock(&fs_info->defrag_inodes_lock);
+        while (1) {
+                n = NULL;
+                /* find an inode to defrag */
+                defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+                if (!defrag) {
+                        if (n)
+                                defrag = rb_entry(n, struct inode_defrag, rb_node);
+                        else if (first_ino) {
+                                /* ran off the end, wrap to the start */
+                                first_ino = 0;
+                                continue;
+                        } else {
+                                /* the tree is empty */
+                                break;
+                        }
+                }
+                /* remove it from the rbtree */
+                first_ino = defrag->ino + 1;
+                rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
+                if (fs_info->closing)
+                        goto next_free;
+                /* the lookups and the defrag itself run unlocked */
+                spin_unlock(&fs_info->defrag_inodes_lock);
+                /* get the inode */
+                key.objectid = defrag->root;
+                btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+                key.offset = (u64)-1;
+                inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+                if (IS_ERR(inode_root))
+                        goto next;
+                key.objectid = defrag->ino;
+                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+                key.offset = 0;
+                inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+                if (IS_ERR(inode))
+                        goto next;
+                /* do a chunk of defrag */
+                BTRFS_I(inode)->in_defrag = 0;
+                range.start = defrag->last_offset;
+                num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+                                               defrag_batch);
+                /*
+                 * if we filled the whole defrag batch, there
+                 * must be more work to do.  Queue this defrag
+                 * again
+                 */
+                if (num_defrag == defrag_batch) {
+                        defrag->last_offset = range.start;
+                        /*
+                         * we dropped defrag_inodes_lock above, so the
+                         * requeue has to take it again before touching
+                         * the rbtree
+                         */
+                        btrfs_requeue_inode_defrag(inode, defrag);
+                        /*
+                         * we don't want to kfree defrag, we added it back to
+                         * the rbtree
+                         */
+                        defrag = NULL;
+                } else if (defrag->last_offset && !defrag->cycled) {
+                        /*
+                         * we didn't fill our defrag batch, but
+                         * we didn't start at zero.  Make sure we loop
+                         * around to the start of the file.
+                         */
+                        defrag->last_offset = 0;
+                        defrag->cycled = 1;
+                        btrfs_requeue_inode_defrag(inode, defrag);
+                        defrag = NULL;
+                }
+                iput(inode);
+next:
+                spin_lock(&fs_info->defrag_inodes_lock);
+next_free:
+                /* defrag is NULL here if the record was requeued */
+                kfree(defrag);
+        }
+        spin_unlock(&fs_info->defrag_inodes_lock);
+        atomic_dec(&fs_info->defrag_running);
+        /*
+         * during unmount, we use the transaction_wait queue to
+         * wait for the defragger to stop
+         */
+        wake_up(&fs_info->transaction_wait);
+        return 0;
+}
 /* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
@@ -191,9 +448,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
        }
        while (1) {
                if (!split)
-                        split = alloc_extent_map(GFP_NOFS);
+                        split = alloc_extent_map();
                if (!split2)
-                        split2 = alloc_extent_map(GFP_NOFS);
+                        split2 = alloc_extent_map();
                BUG_ON(!split || !split2);
                write_lock(&em_tree->lock);
@@ -298,6 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key new_key;
+        u64 ino = btrfs_ino(inode);
        u64 search_start = start;
        u64 disk_bytenr = 0;
        u64 num_bytes = 0;
@@ -318,14 +576,14 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
        while (1) {
                recow = 0;
-                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                ret = btrfs_lookup_file_extent(trans, root, path, ino,
                                               search_start, -1);
                if (ret < 0)
                        break;
                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
-                        if (key.objectid == inode->i_ino &&
+                        if (key.objectid == ino &&
                            key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
@@ -346,7 +604,7 @@ next_slot:
                }
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-                if (key.objectid > inode->i_ino ||
+                if (key.objectid > ino ||
                    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
                        break;
@@ -376,7 +634,7 @@ next_slot:
                search_start = max(key.offset, start);
                if (recow) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        continue;
                }
@@ -393,7 +651,7 @@ next_slot:
                        ret = btrfs_duplicate_item(trans, root, path,
                                                   &new_key);
                        if (ret == -EAGAIN) {
-                                btrfs_release_path(root, path);
+                                btrfs_release_path(path);
                                continue;
                        }
                        if (ret < 0)
@@ -516,7 +774,7 @@ next_slot:
                        del_nr = 0;
                        del_slot = 0;
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        continue;
                }
@@ -592,6 +850,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
        int del_slot = 0;
        int recow;
        int ret;
+        u64 ino = btrfs_ino(inode);
        btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -600,7 +859,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 again:
        recow = 0;
        split = start;
-        key.objectid = inode->i_ino;
+        key.objectid = ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = split;
@@ -612,8 +871,7 @@ again:
        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-        BUG_ON(key.objectid != inode->i_ino ||
+        BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
-               key.type != BTRFS_EXTENT_DATA_KEY);
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        BUG_ON(btrfs_file_extent_type(leaf, fi) !=
@@ -630,7 +888,7 @@ again:
                other_start = 0;
                other_end = start;
                if (extent_mergeable(leaf, path->slots[0] - 1,
-                                     inode->i_ino, bytenr, orig_offset,
+                                     ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        new_key.offset = end;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);
@@ -653,7 +911,7 @@ again:
                other_start = end;
                other_end = 0;
                if (extent_mergeable(leaf, path->slots[0] + 1,
-                                     inode->i_ino, bytenr, orig_offset,
+                                     ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
@@ -681,7 +939,7 @@ again:
                new_key.offset = split;
                ret = btrfs_duplicate_item(trans, root, path, &new_key);
                if (ret == -EAGAIN) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto again;
                }
                BUG_ON(ret < 0);
@@ -702,7 +960,7 @@ again:
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
-                                           inode->i_ino, orig_offset);
+                                           ino, orig_offset);
                BUG_ON(ret);
                if (split == start) {
@@ -718,10 +976,10 @@ again:
        other_start = end;
        other_end = 0;
        if (extent_mergeable(leaf, path->slots[0] + 1,
-                             inode->i_ino, bytenr, orig_offset,
+                             ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto again;
                }
                extent_end = other_end;
@@ -729,16 +987,16 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                        inode->i_ino, orig_offset);
+                                        ino, orig_offset);
                BUG_ON(ret);
        }
        other_start = 0;
        other_end = start;
        if (extent_mergeable(leaf, path->slots[0] - 1,
-                             inode->i_ino, bytenr, orig_offset,
+                             ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto again;
                }
                key.offset = other_start;
@@ -746,7 +1004,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                        inode->i_ino, orig_offset);
+                                        ino, orig_offset);
                BUG_ON(ret);
        }
        if (del_nr == 0) {
@@ -1375,7 +1633,7 @@ static long btrfs_fallocate(struct file *file, int mode,
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
-                BUG_ON(IS_ERR(em) || !em);
+                BUG_ON(IS_ERR_OR_NULL(em));
                last_byte = min(extent_map_end(em), alloc_end);
                last_byte = (last_byte + mask) & ~mask;
                if (em->block_start == EXTENT_MAP_HOLE ||
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 63731a1fb0a1..70d45795d758 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -25,18 +25,17 @@
 #include "transaction.h"
 #include "disk-io.h"
 #include "extent_io.h"
+#include "inode-map.h"
 #define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
-static void recalculate_thresholds(struct btrfs_block_group_cache
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
-                                   *block_group);
-static int link_free_space(struct btrfs_block_group_cache *block_group,
                           struct btrfs_free_space *info);
-struct inode *lookup_free_space_inode(struct btrfs_root *root,
+static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
-                                      struct btrfs_block_group_cache
+                                               struct btrfs_path *path,
-                                      *block_group, struct btrfs_path *path)
+                                               u64 offset)
 {
        struct btrfs_key key;
        struct btrfs_key location;
@@ -46,22 +45,15 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
        struct inode *inode = NULL;
        int ret;
-        spin_lock(&block_group->lock);
-        if (block_group->inode)
-                inode = igrab(block_group->inode);
-        spin_unlock(&block_group->lock);
-        if (inode)
-                return inode;
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
-        key.offset = block_group->key.objectid;
+        key.offset = offset;
        key.type = 0;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ERR_PTR(ret);
        if (ret > 0) {
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                return ERR_PTR(-ENOENT);
        }
@@ -70,7 +62,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
                                struct btrfs_free_space_header);
        btrfs_free_space_key(leaf, header, &disk_key);
        btrfs_disk_key_to_cpu(&location, &disk_key);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
        if (!inode)
@@ -84,6 +76,27 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
        inode->i_mapping->flags &= ~__GFP_FS;
+        return inode;
+}
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+                                      struct btrfs_block_group_cache
+                                      *block_group, struct btrfs_path *path)
+{
+        struct inode *inode = NULL;
+        spin_lock(&block_group->lock);
+        if (block_group->inode)
+                inode = igrab(block_group->inode);
+        spin_unlock(&block_group->lock);
+        if (inode)
+                return inode;
+        inode = __lookup_free_space_inode(root, path,
+                                          block_group->key.objectid);
+        if (IS_ERR(inode))
+                return inode;
        spin_lock(&block_group->lock);
        if (!root->fs_info->closing) {
                block_group->inode = igrab(inode);
@@ -94,24 +107,18 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
        return inode;
 }
-int create_free_space_inode(struct btrfs_root *root,
+int __create_free_space_inode(struct btrfs_root *root,
-                            struct btrfs_trans_handle *trans,
+                              struct btrfs_trans_handle *trans,
-                            struct btrfs_block_group_cache *block_group,
+                              struct btrfs_path *path, u64 ino, u64 offset)
-                            struct btrfs_path *path)
 {
        struct btrfs_key key;
        struct btrfs_disk_key disk_key;
        struct btrfs_free_space_header *header;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
-        u64 objectid;
        int ret;
-        ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+        ret = btrfs_insert_empty_inode(trans, root, path, ino);
-        if (ret < 0)
-                return ret;
-        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
        if (ret)
                return ret;
@@ -131,19 +138,18 @@ int create_free_space_inode(struct btrfs_root *root,
                              BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
        btrfs_set_inode_nlink(leaf, inode_item, 1);
        btrfs_set_inode_transid(leaf, inode_item, trans->transid);
-        btrfs_set_inode_block_group(leaf, inode_item,
+        btrfs_set_inode_block_group(leaf, inode_item, offset);
-                                    block_group->key.objectid);
        btrfs_mark_buffer_dirty(leaf);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
-        key.offset = block_group->key.objectid;
+        key.offset = offset;
        key.type = 0;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(struct btrfs_free_space_header));
        if (ret < 0) {
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                return ret;
        }
        leaf = path->nodes[0];
@@ -152,11 +158,27 @@ int create_free_space_inode(struct btrfs_root *root,
        memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
        btrfs_set_free_space_key(leaf, header, &disk_key);
        btrfs_mark_buffer_dirty(leaf);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        return 0;
 }
+/*
+ * Create the free space cache inode for @block_group.
+ *
+ * Picks a free objectid in @root and passes it, together with the block
+ * group's key.objectid as the cache offset, to __create_free_space_inode().
+ *
+ * Returns the negative result of btrfs_find_free_objectid() if that fails,
+ * otherwise whatever __create_free_space_inode() returns.
+ */
+int create_free_space_inode(struct btrfs_root *root,
+                            struct btrfs_trans_handle *trans,
+                            struct btrfs_block_group_cache *block_group,
+                            struct btrfs_path *path)
+{
+        u64 objectid;
+        int err;
+        err = btrfs_find_free_objectid(root, &objectid);
+        if (err >= 0)
+                err = __create_free_space_inode(root, trans, path, objectid,
+                                                block_group->key.objectid);
+        return err;
+}
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
                                    struct btrfs_trans_handle *trans,
                                    struct btrfs_path *path,
@@ -187,7 +209,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
                return ret;
        }
-        return btrfs_update_inode(trans, root, inode);
+        ret = btrfs_update_inode(trans, root, inode);
+        return ret;
 }
 static int readahead_cache(struct inode *inode)
@@ -209,15 +232,13 @@ static int readahead_cache(struct inode *inode)
        return 0;
 }
-int load_free_space_cache(struct btrfs_fs_info *fs_info,
+int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
-                          struct btrfs_block_group_cache *block_group)
+                            struct btrfs_free_space_ctl *ctl,
+                            struct btrfs_path *path, u64 offset)
 {
-        struct btrfs_root *root = fs_info->tree_root;
-        struct inode *inode;
        struct btrfs_free_space_header *header;
        struct extent_buffer *leaf;
        struct page *page;
-        struct btrfs_path *path;
        u32 *checksums = NULL, *crc;
        char *disk_crcs = NULL;
        struct btrfs_key key;
@@ -225,76 +246,47 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
        u64 num_entries;
        u64 num_bitmaps;
        u64 generation;
-        u64 used = btrfs_block_group_used(&block_group->item);
        u32 cur_crc = ~(u32)0;
        pgoff_t index = 0;
        unsigned long first_page_offset;
        int num_checksums;
-        int ret = 0;
+        int ret = 0, ret2;
-        /*
-         * If we're unmounting then just return, since this does a search on the
-         * normal root and not the commit root and we could deadlock.
-         */
-        smp_mb();
-        if (fs_info->closing)
-                return 0;
-        /*
-         * If this block group has been marked to be cleared for one reason or
-         * another then we can't trust the on disk cache, so just return.
-         */
-        spin_lock(&block_group->lock);
-        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
-                spin_unlock(&block_group->lock);
-                return 0;
-        }
-        spin_unlock(&block_group->lock);
        INIT_LIST_HEAD(&bitmaps);
-        path = btrfs_alloc_path();
-        if (!path)
-                return 0;
-        inode = lookup_free_space_inode(root, block_group, path);
-        if (IS_ERR(inode)) {
-                btrfs_free_path(path);
-                return 0;
-        }
        /* Nothing in the space cache, goodbye */
-        if (!i_size_read(inode)) {
+        if (!i_size_read(inode))
-                btrfs_free_path(path);
                goto out;
-        }
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
-        key.offset = block_group->key.objectid;
+        key.offset = offset;
        key.type = 0;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-        if (ret) {
+        if (ret < 0)
-                btrfs_free_path(path);
+                goto out;
+        else if (ret > 0) {
+                btrfs_release_path(path);
+                ret = 0;
                goto out;
        }
+        ret = -1;
        leaf = path->nodes[0];
        header = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_free_space_header);
        num_entries = btrfs_free_space_entries(leaf, header);
        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
        generation = btrfs_free_space_generation(leaf, header);
-        btrfs_free_path(path);
+        btrfs_release_path(path);
        if (BTRFS_I(inode)->generation != generation) {
                printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
-                       " not match free space cache generation (%llu) for "
+                       " not match free space cache generation (%llu)\n",
-                       "block group %llu\n",
                       (unsigned long long)BTRFS_I(inode)->generation,
-                       (unsigned long long)generation,
+                       (unsigned long long)generation);
-                       (unsigned long long)block_group->key.objectid);
+                goto out;
-                goto free_cache;
        }
        if (!num_entries)
@@ -311,10 +303,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                goto out;
        ret = readahead_cache(inode);
-        if (ret) {
+        if (ret)
-                ret = 0;
                goto out;
-        }
        while (1) {
                struct btrfs_free_space_entry *entry;
@@ -333,10 +323,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                }
                page = grab_cache_page(inode->i_mapping, index);
-                if (!page) {
+                if (!page)
-                        ret = 0;
                        goto free_cache;
-                }
                if (!PageUptodate(page)) {
                        btrfs_readpage(NULL, page);
@@ -345,9 +333,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                                unlock_page(page);
                                page_cache_release(page);
                                printk(KERN_ERR "btrfs: error reading free "
-                                       "space cache: %llu\n",
+                                       "space cache\n");
-                                       (unsigned long long)
-                                       block_group->key.objectid);
                                goto free_cache;
                        }
                }
@@ -360,13 +346,10 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                        gen = addr + (sizeof(u32) * num_checksums);
                        if (*gen != BTRFS_I(inode)->generation) {
                                printk(KERN_ERR "btrfs: space cache generation"
-                                       " (%llu) does not match inode (%llu) "
+                                       " (%llu) does not match inode (%llu)\n",
-                                       "for block group %llu\n",
                                       (unsigned long long)*gen,
                                       (unsigned long long)
-                                       BTRFS_I(inode)->generation,
+                                       BTRFS_I(inode)->generation);
-                                       (unsigned long long)
-                                       block_group->key.objectid);
                                kunmap(page);
                                unlock_page(page);
                                page_cache_release(page);
@@ -382,9 +365,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                                          PAGE_CACHE_SIZE - start_offset);
                btrfs_csum_final(cur_crc, (char *)&cur_crc);
                if (cur_crc != *crc) {
-                        printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
+                        printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
-                               "block group %llu\n", index,
+                               index);
-                               (unsigned long long)block_group->key.objectid);
                        kunmap(page);
                        unlock_page(page);
                        page_cache_release(page);
@@ -417,9 +399,9 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                        }
                        if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
-                                spin_lock(&block_group->tree_lock);
+                                spin_lock(&ctl->tree_lock);
-                                ret = link_free_space(block_group, e);
+                                ret = link_free_space(ctl, e);
-                                spin_unlock(&block_group->tree_lock);
+                                spin_unlock(&ctl->tree_lock);
                                BUG_ON(ret);
                        } else {
                                e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
@@ -431,11 +413,11 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                                        page_cache_release(page);
                                        goto free_cache;
                                }
-                                spin_lock(&block_group->tree_lock);
+                                spin_lock(&ctl->tree_lock);
-                                ret = link_free_space(block_group, e);
+                                ret2 = link_free_space(ctl, e);
-                                block_group->total_bitmaps++;
+                                ctl->total_bitmaps++;
-                                recalculate_thresholds(block_group);
+                                ctl->op->recalc_thresholds(ctl);
-                                spin_unlock(&block_group->tree_lock);
+                                spin_unlock(&ctl->tree_lock);
                                list_add_tail(&e->list, &bitmaps);
                        }
@@ -471,41 +453,97 @@ next:
                index++;
        }
-        spin_lock(&block_group->tree_lock);
-        if (block_group->free_space != (block_group->key.offset - used -
-                                        block_group->bytes_super)) {
-                spin_unlock(&block_group->tree_lock);
-                printk(KERN_ERR "block group %llu has an wrong amount of free "
-                       "space\n", block_group->key.objectid);
-                ret = 0;
-                goto free_cache;
-        }
-        spin_unlock(&block_group->tree_lock);
        ret = 1;
 out:
        kfree(checksums);
        kfree(disk_crcs);
-        iput(inode);
        return ret;
 free_cache:
-        /* This cache is bogus, make sure it gets cleared */
+        __btrfs_remove_free_space_cache(ctl);
+        goto out;
+}
+/*
+ * Load the on-disk free space cache of @block_group into its free_space_ctl.
+ *
+ * Returns 1 if the cache was loaded and its free-space total matches the
+ * block group, and 0 in every other case.  Errors are never propagated to
+ * the caller: a negative result from __load_free_space_cache() or a
+ * free-space mismatch marks the block group BTRFS_DC_CLEAR, logs a message
+ * and is reported as 0.
+ */
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+                          struct btrfs_block_group_cache *block_group)
+{
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+        struct btrfs_root *root = fs_info->tree_root;
+        struct inode *inode;
+        struct btrfs_path *path;
+        int ret;
+        bool matched;
+        u64 used = btrfs_block_group_used(&block_group->item);
+        /*
+         * If we're unmounting then just return, since this does a search on the
+         * normal root and not the commit root and we could deadlock.
+         */
+        smp_mb();
+        if (fs_info->closing)
+                return 0;
+        /*
+         * If this block group has been marked to be cleared for one reason or
+         * another then we can't trust the on disk cache, so just return.
+         */
         spin_lock(&block_group->lock);
-        block_group->disk_cache_state = BTRFS_DC_CLEAR;
+        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+                spin_unlock(&block_group->lock);
+                return 0;
+        }
         spin_unlock(&block_group->lock);
-        btrfs_remove_free_space_cache(block_group);
-        goto out;
+        path = btrfs_alloc_path();
+        if (!path)
+                return 0;
+        inode = lookup_free_space_inode(root, block_group, path);
+        if (IS_ERR(inode)) {
+                btrfs_free_path(path);
+                return 0;
+        }
+        ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+                                      path, block_group->key.objectid);
+        btrfs_free_path(path);
+        if (ret <= 0)
+                goto out;
+        /*
+         * Cross-check the loaded total against
+         * key.offset - used - bytes_super before trusting the cache.
+         */
+        spin_lock(&ctl->tree_lock);
+        matched = (ctl->free_space == (block_group->key.offset - used -
+                                       block_group->bytes_super));
+        spin_unlock(&ctl->tree_lock);
+        if (!matched) {
+                __btrfs_remove_free_space_cache(ctl);
+                printk(KERN_ERR "block group %llu has an wrong amount of free "
+                       "space\n",
+                       (unsigned long long)block_group->key.objectid);
+                ret = -1;
+        }
+out:
+        if (ret < 0) {
+                /* This cache is bogus, make sure it gets cleared */
+                spin_lock(&block_group->lock);
+                block_group->disk_cache_state = BTRFS_DC_CLEAR;
+                spin_unlock(&block_group->lock);
+                ret = 0;
+                printk(KERN_ERR "btrfs: failed to load free space cache "
+                       "for block group %llu\n",
+                       (unsigned long long)block_group->key.objectid);
+        }
+        iput(inode);
+        return ret;
 }
-int btrfs_write_out_cache(struct btrfs_root *root,
+int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
-                          struct btrfs_trans_handle *trans,
+                            struct btrfs_free_space_ctl *ctl,
-                          struct btrfs_block_group_cache *block_group,
+                            struct btrfs_block_group_cache *block_group,
-                          struct btrfs_path *path)
+                            struct btrfs_trans_handle *trans,
+                            struct btrfs_path *path, u64 offset)
 {
        struct btrfs_free_space_header *header;
        struct extent_buffer *leaf;
-        struct inode *inode;
        struct rb_node *node;
        struct list_head *pos, *n;
        struct page **pages;
@@ -522,35 +560,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        int index = 0, num_pages = 0;
        int entries = 0;
        int bitmaps = 0;
-        int ret = 0;
+        int ret = -1;
        bool next_page = false;
        bool out_of_space = false;
-        root = root->fs_info->tree_root;
        INIT_LIST_HEAD(&bitmap_list);
-        spin_lock(&block_group->lock);
+        node = rb_first(&ctl->free_space_offset);
-        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+        if (!node)
-                spin_unlock(&block_group->lock);
-                return 0;
-        }
-        spin_unlock(&block_group->lock);
-        inode = lookup_free_space_inode(root, block_group, path);
-        if (IS_ERR(inode))
-                return 0;
-        if (!i_size_read(inode)) {
-                iput(inode);
                return 0;
-        }
-        node = rb_first(&block_group->free_space_offset);
+        if (!i_size_read(inode))
-        if (!node) {
+                return -1;
-                iput(inode);
-                return 0;
-        }
        num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
                PAGE_CACHE_SHIFT;
@@ -560,16 +581,13 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        /* We need a checksum per page. */
        crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
-        if (!crc) {
+        if (!crc)
-                iput(inode);
+                return -1;
-                return 0;
-        }
        pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
        if (!pages) {
                kfree(crc);
-                iput(inode);
+                return -1;
-                return 0;
        }
        /* Since the first page has all of our checksums and our generation we
@@ -579,7 +597,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
        /* Get the cluster for this block_group if it exists */
-        if (!list_empty(&block_group->cluster_list))
+        if (block_group && !list_empty(&block_group->cluster_list))
                cluster = list_entry(block_group->cluster_list.next,
                                     struct btrfs_free_cluster,
                                     block_group_list);
@@ -621,7 +639,8 @@ int btrfs_write_out_cache(struct btrfs_root *root,
         * When searching for pinned extents, we need to start at our start
         * offset.
         */
-        start = block_group->key.objectid;
+        if (block_group)
+                start = block_group->key.objectid;
        /* Write out the extent entries */
        do {
@@ -679,8 +698,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                 * We want to add any pinned extents to our free space cache
                 * so we don't leak the space
                 */
-                while (!next_page && (start < block_group->key.objectid +
+                while (block_group && !next_page &&
-                                      block_group->key.offset)) {
+                       (start < block_group->key.objectid +
+                        block_group->key.offset)) {
                        ret = find_first_extent_bit(unpin, start, &start, &end,
                                                    EXTENT_DIRTY);
                        if (ret) {
@@ -798,12 +818,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        filemap_write_and_wait(inode->i_mapping);
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
-        key.offset = block_group->key.objectid;
+        key.offset = offset;
        key.type = 0;
        ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
        if (ret < 0) {
-                ret = 0;
+                ret = -1;
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
@@ -816,13 +836,13 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                path->slots[0]--;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
-                    found_key.offset != block_group->key.objectid) {
+                    found_key.offset != offset) {
-                        ret = 0;
+                        ret = -1;
                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
                                         EXTENT_DIRTY | EXTENT_DELALLOC |
                                         EXTENT_DO_ACCOUNTING, 0, 0, NULL,
                                         GFP_NOFS);
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto out_free;
                }
        }
@@ -832,49 +852,83 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
        btrfs_set_free_space_generation(leaf, header, trans->transid);
        btrfs_mark_buffer_dirty(leaf);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        ret = 1;
 out_free:
-        if (ret == 0) {
+        if (ret != 1) {
                invalidate_inode_pages2_range(inode->i_mapping, 0, index);
-                spin_lock(&block_group->lock);
-                block_group->disk_cache_state = BTRFS_DC_ERROR;
-                spin_unlock(&block_group->lock);
                BTRFS_I(inode)->generation = 0;
        }
        kfree(checksums);
        kfree(pages);
        btrfs_update_inode(trans, root, inode);
+        return ret;
+}
+/*
+ * Write the in-memory free space cache of @block_group to its cache inode.
+ *
+ * Returns the non-negative result of __btrfs_write_out_cache() (1 once the
+ * cache has been written), or 0 when there is nothing to do: the block group
+ * is below BTRFS_DC_SETUP or its cache inode cannot be looked up.  A negative
+ * result from __btrfs_write_out_cache() is not propagated: the block group
+ * is marked BTRFS_DC_ERROR, a message is logged and 0 is returned.
+ */
+int btrfs_write_out_cache(struct btrfs_root *root,
+                          struct btrfs_trans_handle *trans,
+                          struct btrfs_block_group_cache *block_group,
+                          struct btrfs_path *path)
+{
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+        struct inode *inode;
+        int ret = 0;
+        /* Everything below operates on the tree root, whatever was passed in */
+        root = root->fs_info->tree_root;
+        spin_lock(&block_group->lock);
+        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+                spin_unlock(&block_group->lock);
+                return 0;
+        }
+        spin_unlock(&block_group->lock);
+        inode = lookup_free_space_inode(root, block_group, path);
+        if (IS_ERR(inode))
+                return 0;
+        ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+                                      path, block_group->key.objectid);
+        if (ret < 0) {
+                spin_lock(&block_group->lock);
+                block_group->disk_cache_state = BTRFS_DC_ERROR;
+                spin_unlock(&block_group->lock);
+                ret = 0;
+                printk(KERN_ERR "btrfs: failed to write free space cache "
+                       "for block group %llu\n",
+                       (unsigned long long)block_group->key.objectid);
+        }
         iput(inode);
         return ret;
 }
-static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
+static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
                                          u64 offset)
 {
        BUG_ON(offset < bitmap_start);
        offset -= bitmap_start;
-        return (unsigned long)(div64_u64(offset, sectorsize));
+        return (unsigned long)(div_u64(offset, unit));
 }
-static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
+static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
 {
-        return (unsigned long)(div64_u64(bytes, sectorsize));
+        return (unsigned long)(div_u64(bytes, unit));
 }
-static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
+static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
                                   u64 offset)
 {
        u64 bitmap_start;
        u64 bytes_per_bitmap;
-        bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
+        bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
-        bitmap_start = offset - block_group->key.objectid;
+        bitmap_start = offset - ctl->start;
        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
        bitmap_start *= bytes_per_bitmap;
-        bitmap_start += block_group->key.objectid;
+        bitmap_start += ctl->start;
        return bitmap_start;
 }
@@ -932,10 +986,10 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
 * offset.
 */
 static struct btrfs_free_space *
-tree_search_offset(struct btrfs_block_group_cache *block_group,
+tree_search_offset(struct btrfs_free_space_ctl *ctl,
                   u64 offset, int bitmap_only, int fuzzy)
 {
-        struct rb_node *n = block_group->free_space_offset.rb_node;
+        struct rb_node *n = ctl->free_space_offset.rb_node;
        struct btrfs_free_space *entry, *prev = NULL;
        /* find entry that is closest to the 'offset' */
@@ -1031,8 +1085,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
                                break;
                        }
                }
-                if (entry->offset + BITS_PER_BITMAP *
+                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
-                    block_group->sectorsize > offset)
                        return entry;
        } else if (entry->offset + entry->bytes > offset)
                return entry;
@@ -1043,7 +1096,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
        while (1) {
                if (entry->bitmap) {
                        if (entry->offset + BITS_PER_BITMAP *
-                            block_group->sectorsize > offset)
+                            ctl->unit > offset)
                                break;
                } else {
                        if (entry->offset + entry->bytes > offset)
@@ -1059,42 +1112,47 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
 }
 static inline void
-__unlink_free_space(struct btrfs_block_group_cache *block_group,
+__unlink_free_space(struct btrfs_free_space_ctl *ctl,
                    struct btrfs_free_space *info)
 {
-        rb_erase(&info->offset_index, &block_group->free_space_offset);
+        rb_erase(&info->offset_index, &ctl->free_space_offset);
-        block_group->free_extents--;
+        ctl->free_extents--;
 }
-static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info)
 {
-        __unlink_free_space(block_group, info);
+        __unlink_free_space(ctl, info);
-        block_group->free_space -= info->bytes;
+        ctl->free_space -= info->bytes;
 }
-static int link_free_space(struct btrfs_block_group_cache *block_group,
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info)
 {
        int ret = 0;
        BUG_ON(!info->bitmap && !info->bytes);
-        ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
+        ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
                                 &info->offset_index, (info->bitmap != NULL));
        if (ret)
                return ret;
-        block_group->free_space += info->bytes;
+        ctl->free_space += info->bytes;
-        block_group->free_extents++;
+        ctl->free_extents++;
        return ret;
 }
-static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 {
+        struct btrfs_block_group_cache *block_group = ctl->private;
        u64 max_bytes;
        u64 bitmap_bytes;
        u64 extent_bytes;
        u64 size = block_group->key.offset;
+        u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+        int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+        BUG_ON(ctl->total_bitmaps > max_bitmaps);
        /*
         * The goal is to keep the total amount of memory used per 1gb of space
@@ -1112,10 +1170,10 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
         * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
         * we add more bitmaps.
         */
-        bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+        bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
        if (bitmap_bytes >= max_bytes) {
-                block_group->extents_thresh = 0;
+                ctl->extents_thresh = 0;
                return;
        }
@@ -1126,47 +1184,43 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
        extent_bytes = max_bytes - bitmap_bytes;
        extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
-        block_group->extents_thresh =
+        ctl->extents_thresh =
                div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
 }
-static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info, u64 offset,
                              u64 bytes)
 {
-        unsigned long start, end;
+        unsigned long start, count;
-        unsigned long i;
-        start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+        start = offset_to_bit(info->offset, ctl->unit, offset);
-        end = start + bytes_to_bits(bytes, block_group->sectorsize);
+        count = bytes_to_bits(bytes, ctl->unit);
-        BUG_ON(end > BITS_PER_BITMAP);
+        BUG_ON(start + count > BITS_PER_BITMAP);
-        for (i = start; i < end; i++)
+        bitmap_clear(info->bitmap, start, count);
-                clear_bit(i, info->bitmap);
        info->bytes -= bytes;
-        block_group->free_space -= bytes;
+        ctl->free_space -= bytes;
 }
-static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
                            struct btrfs_free_space *info, u64 offset,
                            u64 bytes)
 {
-        unsigned long start, end;
+        unsigned long start, count;
-        unsigned long i;
-        start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+        start = offset_to_bit(info->offset, ctl->unit, offset);
-        end = start + bytes_to_bits(bytes, block_group->sectorsize);
+        count = bytes_to_bits(bytes, ctl->unit);
-        BUG_ON(end > BITS_PER_BITMAP);
+        BUG_ON(start + count > BITS_PER_BITMAP);
-        for (i = start; i < end; i++)
+        bitmap_set(info->bitmap, start, count);
-                set_bit(i, info->bitmap);
        info->bytes += bytes;
-        block_group->free_space += bytes;
+        ctl->free_space += bytes;
 }
-static int search_bitmap(struct btrfs_block_group_cache *block_group,
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
                         struct btrfs_free_space *bitmap_info, u64 *offset,
                         u64 *bytes)
 {
@@ -1174,9 +1228,9 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
        unsigned long bits, i;
        unsigned long next_zero;
-        i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+        i = offset_to_bit(bitmap_info->offset, ctl->unit,
                          max_t(u64, *offset, bitmap_info->offset));
-        bits = bytes_to_bits(*bytes, block_group->sectorsize);
+        bits = bytes_to_bits(*bytes, ctl->unit);
        for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
             i < BITS_PER_BITMAP;
@@ -1191,29 +1245,25 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
        }
        if (found_bits) {
-                *offset = (u64)(i * block_group->sectorsize) +
+                *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
-                        bitmap_info->offset;
+                *bytes = (u64)(found_bits) * ctl->unit;
-                *bytes = (u64)(found_bits) * block_group->sectorsize;
                return 0;
        }
        return -1;
 }
-static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
+static struct btrfs_free_space *
-                                                *block_group, u64 *offset,
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
-                                                u64 *bytes, int debug)
 {
        struct btrfs_free_space *entry;
        struct rb_node *node;
        int ret;
-        if (!block_group->free_space_offset.rb_node)
+        if (!ctl->free_space_offset.rb_node)
                return NULL;
-        entry = tree_search_offset(block_group,
+        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
-                                   offset_to_bitmap(block_group, *offset),
-                                   0, 1);
        if (!entry)
                return NULL;
@@ -1223,7 +1273,7 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
                        continue;
                if (entry->bitmap) {
-                        ret = search_bitmap(block_group, entry, offset, bytes);
+                        ret = search_bitmap(ctl, entry, offset, bytes);
                        if (!ret)
                                return entry;
                        continue;
@@ -1237,33 +1287,28 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
        return NULL;
 }
-static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info, u64 offset)
 {
-        u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+        info->offset = offset_to_bitmap(ctl, offset);
-        int max_bitmaps = (int)div64_u64(block_group->key.offset +
-                                         bytes_per_bg - 1, bytes_per_bg);
-        BUG_ON(block_group->total_bitmaps >= max_bitmaps);
-        info->offset = offset_to_bitmap(block_group, offset);
        info->bytes = 0;
-        link_free_space(block_group, info);
+        link_free_space(ctl, info);
-        block_group->total_bitmaps++;
+        ctl->total_bitmaps++;
-        recalculate_thresholds(block_group);
+        ctl->op->recalc_thresholds(ctl);
 }
-static void free_bitmap(struct btrfs_block_group_cache *block_group,
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
                        struct btrfs_free_space *bitmap_info)
 {
-        unlink_free_space(block_group, bitmap_info);
+        unlink_free_space(ctl, bitmap_info);
        kfree(bitmap_info->bitmap);
        kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
-        block_group->total_bitmaps--;
+        ctl->total_bitmaps--;
-        recalculate_thresholds(block_group);
+        ctl->op->recalc_thresholds(ctl);
 }
-static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *bitmap_info,
                              u64 *offset, u64 *bytes)
 {
@@ -1272,8 +1317,7 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
        int ret;
 again:
-        end = bitmap_info->offset +
+        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
-                (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
        /*
         * XXX - this can go away after a few releases.
@@ -1288,24 +1332,22 @@ again:
        search_start = *offset;
        search_bytes = *bytes;
        search_bytes = min(search_bytes, end - search_start + 1);
-        ret = search_bitmap(block_group, bitmap_info, &search_start,
+        ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
-                            &search_bytes);
        BUG_ON(ret < 0 || search_start != *offset);
        if (*offset > bitmap_info->offset && *offset + *bytes > end) {
-                bitmap_clear_bits(block_group, bitmap_info, *offset,
+                bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
-                                  end - *offset + 1);
                *bytes -= end - *offset + 1;
                *offset = end + 1;
        } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
-                bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
+                bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
                *bytes = 0;
        }
        if (*bytes) {
                struct rb_node *next = rb_next(&bitmap_info->offset_index);
                if (!bitmap_info->bytes)
-                        free_bitmap(block_group, bitmap_info);
+                        free_bitmap(ctl, bitmap_info);
                /*
                 * no entry after this bitmap, but we still have bytes to
@@ -1332,31 +1374,28 @@ again:
                 */
                search_start = *offset;
                search_bytes = *bytes;
-                ret = search_bitmap(block_group, bitmap_info, &search_start,
+                ret = search_bitmap(ctl, bitmap_info, &search_start,
                                    &search_bytes);
                if (ret < 0 || search_start != *offset)
                        return -EAGAIN;
                goto again;
        } else if (!bitmap_info->bytes)
-                free_bitmap(block_group, bitmap_info);
+                free_bitmap(ctl, bitmap_info);
        return 0;
 }
-static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
-                              struct btrfs_free_space *info)
+                      struct btrfs_free_space *info)
 {
-        struct btrfs_free_space *bitmap_info;
+        struct btrfs_block_group_cache *block_group = ctl->private;
-        int added = 0;
-        u64 bytes, offset, end;
-        int ret;
        /*
         * If we are below the extents threshold then we can add this as an
         * extent, and don't have to deal with the bitmap
         */
-        if (block_group->free_extents < block_group->extents_thresh) {
+        if (ctl->free_extents < ctl->extents_thresh) {
                /*
                 * If this block group has some small extents we don't want to
                 * use up all of our free slots in the cache with them, we want
@@ -1365,11 +1404,10 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
                 * the overhead of a bitmap if we don't have to.
                 */
                if (info->bytes <= block_group->sectorsize * 4) {
-                        if (block_group->free_extents * 2 <=
+                        if (ctl->free_extents * 2 <= ctl->extents_thresh)
-                            block_group->extents_thresh)
+                                return false;
-                                return 0;
                } else {
-                        return 0;
+                        return false;
                }
        }
@@ -1379,31 +1417,42 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
         */
        if (BITS_PER_BITMAP * block_group->sectorsize >
            block_group->key.offset)
-                return 0;
+                return false;
+        return true;
+}
+static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
+                              struct btrfs_free_space *info)
+{
+        struct btrfs_free_space *bitmap_info;
+        int added = 0;
+        u64 bytes, offset, end;
+        int ret;
        bytes = info->bytes;
        offset = info->offset;
+        if (!ctl->op->use_bitmap(ctl, info))
+                return 0;
 again:
-        bitmap_info = tree_search_offset(block_group,
+        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
-                                         offset_to_bitmap(block_group, offset),
                                         1, 0);
        if (!bitmap_info) {
                BUG_ON(added);
                goto new_bitmap;
        }
-        end = bitmap_info->offset +
+        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
-                (u64)(BITS_PER_BITMAP * block_group->sectorsize);
        if (offset >= bitmap_info->offset && offset + bytes > end) {
-                bitmap_set_bits(block_group, bitmap_info, offset,
+                bitmap_set_bits(ctl, bitmap_info, offset, end - offset);
-                                end - offset);
                bytes -= end - offset;
                offset = end;
                added = 0;
        } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
-                bitmap_set_bits(block_group, bitmap_info, offset, bytes);
+                bitmap_set_bits(ctl, bitmap_info, offset, bytes);
                bytes = 0;
        } else {
                BUG();
@@ -1417,19 +1466,19 @@ again:
 new_bitmap:
        if (info && info->bitmap) {
-                add_new_bitmap(block_group, info, offset);
+                add_new_bitmap(ctl, info, offset);
                added = 1;
                info = NULL;
                goto again;
        } else {
-                spin_unlock(&block_group->tree_lock);
+                spin_unlock(&ctl->tree_lock);
                /* no pre-allocated info, allocate a new one */
                if (!info) {
                        info = kmem_cache_zalloc(btrfs_free_space_cachep,
                                                 GFP_NOFS);
                        if (!info) {
-                                spin_lock(&block_group->tree_lock);
+                                spin_lock(&ctl->tree_lock);
                                ret = -ENOMEM;
                                goto out;
                        }
@@ -1437,7 +1486,7 @@ new_bitmap:
                /* allocate the bitmap */
                info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
-                spin_lock(&block_group->tree_lock);
+                spin_lock(&ctl->tree_lock);
                if (!info->bitmap) {
                        ret = -ENOMEM;
                        goto out;
@@ -1455,7 +1504,7 @@ out:
        return ret;
 }
-bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
+static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
                          struct btrfs_free_space *info, bool update_stat)
 {
        struct btrfs_free_space *left_info;
@@ -1469,18 +1518,18 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
         * are adding, if there is remove that struct and add a new one to
         * cover the entire range
         */
-        right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
+        right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
        if (right_info && rb_prev(&right_info->offset_index))
                left_info = rb_entry(rb_prev(&right_info->offset_index),
                                     struct btrfs_free_space, offset_index);
        else
-                left_info = tree_search_offset(block_group, offset - 1, 0, 0);
+                left_info = tree_search_offset(ctl, offset - 1, 0, 0);
        if (right_info && !right_info->bitmap) {
                if (update_stat)
-                        unlink_free_space(block_group, right_info);
+                        unlink_free_space(ctl, right_info);
                else
-                        __unlink_free_space(block_group, right_info);
+                        __unlink_free_space(ctl, right_info);
                info->bytes += right_info->bytes;
                kmem_cache_free(btrfs_free_space_cachep, right_info);
                merged = true;
@@ -1489,9 +1538,9 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
        if (left_info && !left_info->bitmap &&
            left_info->offset + left_info->bytes == offset) {
                if (update_stat)
-                        unlink_free_space(block_group, left_info);
+                        unlink_free_space(ctl, left_info);
                else
-                        __unlink_free_space(block_group, left_info);
+                        __unlink_free_space(ctl, left_info);
                info->offset = left_info->offset;
                info->bytes += left_info->bytes;
                kmem_cache_free(btrfs_free_space_cachep, left_info);
@@ -1501,8 +1550,8 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
        return merged;
 }
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
-                         u64 offset, u64 bytes)
+                           u64 offset, u64 bytes)
 {
        struct btrfs_free_space *info;
        int ret = 0;
@@ -1514,9 +1563,9 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
        info->offset = offset;
        info->bytes = bytes;
-        spin_lock(&block_group->tree_lock);
+        spin_lock(&ctl->tree_lock);
-        if (try_merge_free_space(block_group, info, true))
+        if (try_merge_free_space(ctl, info, true))
                goto link;
        /*
@@ -1524,7 +1573,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
         * extent then we know we're going to have to allocate a new extent, so
         * before we do that see if we need to drop this into a bitmap
         */
-        ret = insert_into_bitmap(block_group, info);
+        ret = insert_into_bitmap(ctl, info);
        if (ret < 0) {
                goto out;
        } else if (ret) {
@@ -1532,11 +1581,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                goto out;
        }
 link:
-        ret = link_free_space(block_group, info);
+        ret = link_free_space(ctl, info);
        if (ret)
                kmem_cache_free(btrfs_free_space_cachep, info);
 out:
-        spin_unlock(&block_group->tree_lock);
+        spin_unlock(&ctl->tree_lock);
        if (ret) {
                printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
@@ -1549,21 +1598,21 @@ out:
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                            u64 offset, u64 bytes)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *info;
        struct btrfs_free_space *next_info = NULL;
        int ret = 0;
-        spin_lock(&block_group->tree_lock);
+        spin_lock(&ctl->tree_lock);
 again:
-        info = tree_search_offset(block_group, offset, 0, 0);
+        info = tree_search_offset(ctl, offset, 0, 0);
        if (!info) {
                /*
                 * oops didn't find an extent that matched the space we wanted
                 * to remove, look for a bitmap instead
                 */
-                info = tree_search_offset(block_group,
+                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
-                                          offset_to_bitmap(block_group, offset),
                                          1, 0);
                if (!info) {
                        WARN_ON(1);
@@ -1578,8 +1627,8 @@ again:
                                             offset_index);
                if (next_info->bitmap)
-                        end = next_info->offset + BITS_PER_BITMAP *
+                        end = next_info->offset +
-                                block_group->sectorsize - 1;
+                              BITS_PER_BITMAP * ctl->unit - 1;
                else
                        end = next_info->offset + next_info->bytes;
@@ -1599,20 +1648,20 @@ again:
        }
        if (info->bytes == bytes) {
-                unlink_free_space(block_group, info);
+                unlink_free_space(ctl, info);
                if (info->bitmap) {
                        kfree(info->bitmap);
-                        block_group->total_bitmaps--;
+                        ctl->total_bitmaps--;
                }
                kmem_cache_free(btrfs_free_space_cachep, info);
                goto out_lock;
        }
        if (!info->bitmap && info->offset == offset) {
-                unlink_free_space(block_group, info);
+                unlink_free_space(ctl, info);
                info->offset += bytes;
                info->bytes -= bytes;
-                link_free_space(block_group, info);
+                link_free_space(ctl, info);
                goto out_lock;
        }
@@ -1626,13 +1675,13 @@ again:
                 * first unlink the old info and then
                 * insert it again after the hole we're creating
                 */
-                unlink_free_space(block_group, info);
+                unlink_free_space(ctl, info);
                if (offset + bytes < info->offset + info->bytes) {
                        u64 old_end = info->offset + info->bytes;
                        info->offset = offset + bytes;
                        info->bytes = old_end - info->offset;
-                        ret = link_free_space(block_group, info);
+                        ret = link_free_space(ctl, info);
                        WARN_ON(ret);
                        if (ret)
                                goto out_lock;
@@ -1642,7 +1691,7 @@ again:
                         */
                        kmem_cache_free(btrfs_free_space_cachep, info);
                }
-                spin_unlock(&block_group->tree_lock);
+                spin_unlock(&ctl->tree_lock);
                /* step two, insert a new info struct to cover
                 * anything before the hole
@@ -1653,12 +1702,12 @@ again:
                goto out;
        }
-        ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
        if (ret == -EAGAIN)
                goto again;
        BUG_ON(ret);
 out_lock:
-        spin_unlock(&block_group->tree_lock);
+        spin_unlock(&ctl->tree_lock);
 out:
        return ret;
 }
@@ -1666,11 +1715,12 @@ out:
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                           u64 bytes)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *info;
        struct rb_node *n;
        int count = 0;
-        for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                if (info->bytes >= bytes)
                        count++;
@@ -1685,19 +1735,28 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
               "\n", count);
 }
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+static struct btrfs_free_space_op free_space_op = {
+        .recalc_thresholds      = recalculate_thresholds,
+        .use_bitmap             = use_bitmap,
+};
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
 {
-        struct btrfs_free_space *info;
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-        struct rb_node *n;
-        u64 ret = 0;
-        for (n = rb_first(&block_group->free_space_offset); n;
+        spin_lock_init(&ctl->tree_lock);
-             n = rb_next(n)) {
+        ctl->unit = block_group->sectorsize;
-                info = rb_entry(n, struct btrfs_free_space, offset_index);
+        ctl->start = block_group->key.objectid;
-                ret += info->bytes;
+        ctl->private = block_group;
-        }
+        ctl->op = &free_space_op;
-        return ret;
+        /*
+         * we only want to have 32k of ram per block group for keeping
+         * track of free space, and if we pass 1/2 of that we want to
+         * start converting things over to using bitmaps
+         */
+        ctl->extents_thresh = ((1024 * 32) / 2) /
+                                sizeof(struct btrfs_free_space);
 }
 /*
@@ -1711,6 +1770,7 @@ __btrfs_return_cluster_to_free_space(
                             struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
        struct rb_node *node;
@@ -1732,8 +1792,8 @@ __btrfs_return_cluster_to_free_space(
                bitmap = (entry->bitmap != NULL);
                if (!bitmap)
-                        try_merge_free_space(block_group, entry, false);
+                        try_merge_free_space(ctl, entry, false);
-                tree_insert_offset(&block_group->free_space_offset,
+                tree_insert_offset(&ctl->free_space_offset,
                                   entry->offset, &entry->offset_index, bitmap);
        }
        cluster->root = RB_ROOT;
@@ -1744,14 +1804,38 @@ out:
        return 0;
 }
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
 {
        struct btrfs_free_space *info;
        struct rb_node *node;
+        while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+                info = rb_entry(node, struct btrfs_free_space, offset_index);
+                unlink_free_space(ctl, info);
+                kfree(info->bitmap);
+                kmem_cache_free(btrfs_free_space_cachep, info);
+                if (need_resched()) {
+                        spin_unlock(&ctl->tree_lock);
+                        cond_resched();
+                        spin_lock(&ctl->tree_lock);
+                }
+        }
+}
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+        spin_lock(&ctl->tree_lock);
+        __btrfs_remove_free_space_cache_locked(ctl);
+        spin_unlock(&ctl->tree_lock);
+}
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_cluster *cluster;
        struct list_head *head;
-        spin_lock(&block_group->tree_lock);
+        spin_lock(&ctl->tree_lock);
        while ((head = block_group->cluster_list.next) !=
               &block_group->cluster_list) {
                cluster = list_entry(head, struct btrfs_free_cluster,
@@ -1760,60 +1844,46 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
                WARN_ON(cluster->block_group != block_group);
                __btrfs_return_cluster_to_free_space(block_group, cluster);
                if (need_resched()) {
-                        spin_unlock(&block_group->tree_lock);
+                        spin_unlock(&ctl->tree_lock);
                        cond_resched();
-                        spin_lock(&block_group->tree_lock);
+                        spin_lock(&ctl->tree_lock);
                }
        }
+        __btrfs_remove_free_space_cache_locked(ctl);
+        spin_unlock(&ctl->tree_lock);
-        while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
-                info = rb_entry(node, struct btrfs_free_space, offset_index);
-                if (!info->bitmap) {
-                        unlink_free_space(block_group, info);
-                        kmem_cache_free(btrfs_free_space_cachep, info);
-                } else {
-                        free_bitmap(block_group, info);
-                }
-                if (need_resched()) {
-                        spin_unlock(&block_group->tree_lock);
-                        cond_resched();
-                        spin_lock(&block_group->tree_lock);
-                }
-        }
-        spin_unlock(&block_group->tree_lock);
 }
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
                               u64 offset, u64 bytes, u64 empty_size)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry = NULL;
        u64 bytes_search = bytes + empty_size;
        u64 ret = 0;
-        spin_lock(&block_group->tree_lock);
+        spin_lock(&ctl->tree_lock);
-        entry = find_free_space(block_group, &offset, &bytes_search, 0);
+        entry = find_free_space(ctl, &offset, &bytes_search);
        if (!entry)
                goto out;
        ret = offset;
        if (entry->bitmap) {
-                bitmap_clear_bits(block_group, entry, offset, bytes);
+                bitmap_clear_bits(ctl, entry, offset, bytes);
                if (!entry->bytes)
-                        free_bitmap(block_group, entry);
+                        free_bitmap(ctl, entry);
        } else {
-                unlink_free_space(block_group, entry);
+                unlink_free_space(ctl, entry);
                entry->offset += bytes;
                entry->bytes -= bytes;
                if (!entry->bytes)
                        kmem_cache_free(btrfs_free_space_cachep, entry);
                else
-                        link_free_space(block_group, entry);
+                        link_free_space(ctl, entry);
        }
 out:
-        spin_unlock(&block_group->tree_lock);
+        spin_unlock(&ctl->tree_lock);
        return ret;
 }
@@ -1830,6 +1900,7 @@ int btrfs_return_cluster_to_free_space(
                               struct btrfs_block_group_cache *block_group,
                               struct btrfs_free_cluster *cluster)
 {
+        struct btrfs_free_space_ctl *ctl;
        int ret;
        /* first, get a safe pointer to the block group */
@@ -1848,10 +1919,12 @@ int btrfs_return_cluster_to_free_space(
        atomic_inc(&block_group->count);
        spin_unlock(&cluster->lock);
+        ctl = block_group->free_space_ctl;
        /* now return any extents the cluster had on it */
-        spin_lock(&block_group->tree_lock);
+        spin_lock(&ctl->tree_lock);
        ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
-        spin_unlock(&block_group->tree_lock);
+        spin_unlock(&ctl->tree_lock);
        /* finally drop our ref */
        btrfs_put_block_group(block_group);
@@ -1863,6 +1936,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
                                   struct btrfs_free_space *entry,
                                   u64 bytes, u64 min_start)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        int err;
        u64 search_start = cluster->window_start;
        u64 search_bytes = bytes;
@@ -1871,13 +1945,12 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
        search_start = min_start;
        search_bytes = bytes;
-        err = search_bitmap(block_group, entry, &search_start,
+        err = search_bitmap(ctl, entry, &search_start, &search_bytes);
-                            &search_bytes);
        if (err)
                return 0;
        ret = search_start;
-        bitmap_clear_bits(block_group, entry, ret, bytes);
+        bitmap_clear_bits(ctl, entry, ret, bytes);
        return ret;
 }
@@ -1891,6 +1964,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster, u64 bytes,
                             u64 min_start)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        u64 ret = 0;
@@ -1910,8 +1984,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
        while(1) {
                if (entry->bytes < bytes ||
                    (!entry->bitmap && entry->offset < min_start)) {
-                        struct rb_node *node;
                        node = rb_next(&entry->offset_index);
                        if (!node)
                                break;
@@ -1925,7 +1997,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
                                                      cluster, entry, bytes,
                                                      min_start);
                        if (ret == 0) {
-                                struct rb_node *node;
                                node = rb_next(&entry->offset_index);
                                if (!node)
                                        break;
@@ -1951,20 +2022,20 @@ out:
        if (!ret)
                return 0;
-        spin_lock(&block_group->tree_lock);
+        spin_lock(&ctl->tree_lock);
-        block_group->free_space -= bytes;
+        ctl->free_space -= bytes;
        if (entry->bytes == 0) {
-                block_group->free_extents--;
+                ctl->free_extents--;
                if (entry->bitmap) {
                        kfree(entry->bitmap);
-                        block_group->total_bitmaps--;
+                        ctl->total_bitmaps--;
-                        recalculate_thresholds(block_group);
+                        ctl->op->recalc_thresholds(ctl);
                }
                kmem_cache_free(btrfs_free_space_cachep, entry);
        }
-        spin_unlock(&block_group->tree_lock);
+        spin_unlock(&ctl->tree_lock);
        return ret;
 }
@@ -1974,6 +2045,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
                                struct btrfs_free_cluster *cluster,
                                u64 offset, u64 bytes, u64 min_bytes)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        unsigned long next_zero;
        unsigned long i;
        unsigned long search_bits;
@@ -2028,7 +2100,7 @@ again:
        cluster->window_start = start * block_group->sectorsize +
                entry->offset;
-        rb_erase(&entry->offset_index, &block_group->free_space_offset);
+        rb_erase(&entry->offset_index, &ctl->free_space_offset);
        ret = tree_insert_offset(&cluster->root, entry->offset,
                                 &entry->offset_index, 1);
        BUG_ON(ret);
@@ -2043,6 +2115,7 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
                                   struct btrfs_free_cluster *cluster,
                                   u64 offset, u64 bytes, u64 min_bytes)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *first = NULL;
        struct btrfs_free_space *entry = NULL;
        struct btrfs_free_space *prev = NULL;
@@ -2053,7 +2126,7 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
        u64 max_extent;
        u64 max_gap = 128 * 1024;
-        entry = tree_search_offset(block_group, offset, 0, 1);
+        entry = tree_search_offset(ctl, offset, 0, 1);
        if (!entry)
                return -ENOSPC;
@@ -2119,7 +2192,7 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
                if (entry->bitmap)
                        continue;
-                rb_erase(&entry->offset_index, &block_group->free_space_offset);
+                rb_erase(&entry->offset_index, &ctl->free_space_offset);
                ret = tree_insert_offset(&cluster->root, entry->offset,
                                         &entry->offset_index, 0);
                BUG_ON(ret);
@@ -2138,16 +2211,15 @@ static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
                                struct btrfs_free_cluster *cluster,
                                u64 offset, u64 bytes, u64 min_bytes)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
        struct rb_node *node;
        int ret = -ENOSPC;
-        if (block_group->total_bitmaps == 0)
+        if (ctl->total_bitmaps == 0)
                return -ENOSPC;
-        entry = tree_search_offset(block_group,
+        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
-                                   offset_to_bitmap(block_group, offset),
-                                   0, 1);
        if (!entry)
                return -ENOSPC;
@@ -2180,6 +2252,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                             struct btrfs_free_cluster *cluster,
                             u64 offset, u64 bytes, u64 empty_size)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        u64 min_bytes;
        int ret;
@@ -2199,14 +2272,14 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
        } else
                min_bytes = max(bytes, (bytes + empty_size) >> 2);
-        spin_lock(&block_group->tree_lock);
+        spin_lock(&ctl->tree_lock);
        /*
         * If we know we don't have enough space to make a cluster don't even
         * bother doing all the work to try and find one.
         */
-        if (block_group->free_space < min_bytes) {
+        if (ctl->free_space < min_bytes) {
-                spin_unlock(&block_group->tree_lock);
+                spin_unlock(&ctl->tree_lock);
                return -ENOSPC;
        }
@@ -2232,7 +2305,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
        }
 out:
        spin_unlock(&cluster->lock);
-        spin_unlock(&block_group->tree_lock);
+        spin_unlock(&ctl->tree_lock);
        return ret;
 }
@@ -2253,6 +2326,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
                           u64 *trimmed, u64 start, u64 end, u64 minlen)
 {
+        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry = NULL;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        u64 bytes = 0;
@@ -2262,52 +2336,50 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
        *trimmed = 0;
        while (start < end) {
-                spin_lock(&block_group->tree_lock);
+                spin_lock(&ctl->tree_lock);
-                if (block_group->free_space < minlen) {
+                if (ctl->free_space < minlen) {
-                        spin_unlock(&block_group->tree_lock);
+                        spin_unlock(&ctl->tree_lock);
                        break;
                }
-                entry = tree_search_offset(block_group, start, 0, 1);
+                entry = tree_search_offset(ctl, start, 0, 1);
                if (!entry)
-                        entry = tree_search_offset(block_group,
+                        entry = tree_search_offset(ctl,
-                                                   offset_to_bitmap(block_group,
+                                                   offset_to_bitmap(ctl, start),
-                                                                    start),
                                                   1, 1);
                if (!entry || entry->offset >= end) {
-                        spin_unlock(&block_group->tree_lock);
+                        spin_unlock(&ctl->tree_lock);
                        break;
                }
                if (entry->bitmap) {
-                        ret = search_bitmap(block_group, entry, &start, &bytes);
+                        ret = search_bitmap(ctl, entry, &start, &bytes);
                        if (!ret) {
                                if (start >= end) {
-                                        spin_unlock(&block_group->tree_lock);
+                                        spin_unlock(&ctl->tree_lock);
                                        break;
                                }
                                bytes = min(bytes, end - start);
-                                bitmap_clear_bits(block_group, entry,
+                                bitmap_clear_bits(ctl, entry, start, bytes);
-                                                  start, bytes);
                                if (entry->bytes == 0)
-                                        free_bitmap(block_group, entry);
+                                        free_bitmap(ctl, entry);
                        } else {
                                start = entry->offset + BITS_PER_BITMAP *
                                        block_group->sectorsize;
-                                spin_unlock(&block_group->tree_lock);
+                                spin_unlock(&ctl->tree_lock);
                                ret = 0;
                                continue;
                        }
                } else {
                        start = entry->offset;
                        bytes = min(entry->bytes, end - start);
-                        unlink_free_space(block_group, entry);
+                        unlink_free_space(ctl, entry);
                        kmem_cache_free(btrfs_free_space_cachep, entry);
                }
-                spin_unlock(&block_group->tree_lock);
+                spin_unlock(&ctl->tree_lock);
                if (bytes >= minlen) {
                        int update_ret;
@@ -2319,8 +2391,7 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
                                                         bytes,
                                                         &actually_trimmed);
-                        btrfs_add_free_space(block_group,
+                        btrfs_add_free_space(block_group, start, bytes);
-                                             start, bytes);
                        if (!update_ret)
                                btrfs_update_reserved_bytes(block_group,
                                                            bytes, 0, 1);
@@ -2342,3 +2413,145 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
        return ret;
 }
+/*
+ * Find the left-most item in the cache tree, and then return the
+ * smallest inode number in the item.
+ *
+ * Note: the returned inode number may not be the smallest one in
+ * the tree, if the left-most item is a bitmap.
+ */
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+{
+        struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
+        struct btrfs_free_space *first;
+        struct rb_node *node;
+        u64 ino = 0;
+        spin_lock(&ctl->tree_lock);
+        /* An empty tree yields 0; btrfs_find_free_ino() treats 0 as a miss. */
+        node = rb_first(&ctl->free_space_offset);
+        if (!node)
+                goto unlock;
+        first = rb_entry(node, struct btrfs_free_space, offset_index);
+        if (first->bitmap) {
+                u64 bit_offset = 0;
+                u64 nr = 1;
+                int err;
+                /*
+                 * Look up a single number in the bitmap, starting the
+                 * search at offset 0.  A bitmap entry in the tree is
+                 * expected to hold at least one number, hence the BUG_ON.
+                 */
+                err = search_bitmap(ctl, first, &bit_offset, &nr);
+                BUG_ON(err);
+                ino = bit_offset;
+                bitmap_clear_bits(ctl, first, bit_offset, 1);
+                if (first->bytes == 0)
+                        free_bitmap(ctl, first);
+        } else {
+                /*
+                 * Peel the lowest number off the front of the extent.  The
+                 * entry is unlinked first because its offset changes.
+                 */
+                ino = first->offset;
+                unlink_free_space(ctl, first);
+                first->offset++;
+                first->bytes--;
+                if (first->bytes)
+                        link_free_space(ctl, first);
+                else
+                        kmem_cache_free(btrfs_free_space_cachep, first);
+        }
+unlock:
+        spin_unlock(&ctl->tree_lock);
+        return ino;
+}
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+                                    struct btrfs_path *path)
+{
+        struct inode *inode;
+        /* Fast path: take a reference on the inode cached in the root. */
+        spin_lock(&root->cache_lock);
+        inode = root->cache_inode ? igrab(root->cache_inode) : NULL;
+        spin_unlock(&root->cache_lock);
+        if (inode)
+                return inode;
+        /* Slow path: look the inode up; errors are returned as ERR_PTR. */
+        inode = __lookup_free_space_inode(root, path, 0);
+        if (!IS_ERR(inode)) {
+                /*
+                 * Remember the inode in the root with its own reference,
+                 * unless the filesystem is being closed.
+                 */
+                spin_lock(&root->cache_lock);
+                if (!root->fs_info->closing)
+                        root->cache_inode = igrab(inode);
+                spin_unlock(&root->cache_lock);
+        }
+        return inode;
+}
+/*
+ * Create the inode that backs the free inode number cache of @root.
+ *
+ * Thin wrapper around __create_free_space_inode() using the fixed
+ * BTRFS_FREE_INO_OBJECTID and an offset of 0; its return value is passed
+ * through unchanged.
+ */
+int create_free_ino_inode(struct btrfs_root *root,
+                          struct btrfs_trans_handle *trans,
+                          struct btrfs_path *path)
+{
+        return __create_free_space_inode(root, trans, path,
+                                         BTRFS_FREE_INO_OBJECTID, 0);
+}
+/*
+ * Try to populate root->free_ino_ctl from the on-disk free inode number
+ * cache.
+ *
+ * Returns 0 when nothing was loaded (filesystem closing, path allocation
+ * failure, cache inode not found, or generation mismatch); otherwise
+ * returns whatever __load_free_space_cache() returned.  start_caching()
+ * treats a return value of 1 as "cache fully loaded".
+ */
+int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+        struct btrfs_path *path;
+        struct inode *inode;
+        int ret = 0;
+        u64 root_gen = btrfs_root_generation(&root->root_item);
+        /*
+         * If we're unmounting then just return, since this does a search on the
+         * normal root and not the commit root and we could deadlock.
+         */
+        smp_mb();
+        if (fs_info->closing)
+                return 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return 0;
+        inode = lookup_free_ino_inode(root, path);
+        if (IS_ERR(inode))
+                goto out;
+        /* Only use the cache if it was written for the current root generation. */
+        if (root_gen != BTRFS_I(inode)->generation)
+                goto out_put;
+        ret = __load_free_space_cache(root, inode, ctl, path, 0);
+        if (ret < 0)
+                printk(KERN_ERR "btrfs: failed to load free ino cache for "
+                       "root %llu\n", root->root_key.objectid);
+out_put:
+        iput(inode);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Write the in-memory free inode number cache of @root to its cache inode.
+ *
+ * Returns 0 without writing anything if the cache inode cannot be looked
+ * up; otherwise returns the result of __btrfs_write_out_cache(), logging
+ * an error when that result is negative.
+ */
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+                              struct btrfs_trans_handle *trans,
+                              struct btrfs_path *path)
+{
+        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+        struct inode *inode;
+        int ret;
+        inode = lookup_free_ino_inode(root, path);
+        if (IS_ERR(inode))
+                return 0;
+        /* No block group is associated with the inode cache, hence NULL. */
+        ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+        if (ret < 0)
+                printk(KERN_ERR "btrfs: failed to write free ino cache "
+                       "for root %llu\n", root->root_key.objectid);
+        iput(inode);
+        return ret;
+}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 65c3b935289f..8f2613f779ed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,25 @@ struct btrfs_free_space {
        struct list_head list;
 };
+/*
+ * Bookkeeping for one free-space tree.  A block group owns one (reached
+ * through block_group->free_space_ctl) and each root owns two for inode
+ * numbers (root->free_ino_ctl and root->free_ino_pinned).
+ */
+struct btrfs_free_space_ctl {
+        /* taken around lookups and updates of the tree and counters below */
+        spinlock_t tree_lock;
+        /* rb-tree of struct btrfs_free_space, linked via ->offset_index */
+        struct rb_root free_space_offset;
+        /* amount of free space currently tracked by the tree */
+        u64 free_space;
+        /* extent-count threshold consulted by op->use_bitmap() */
+        int extents_thresh;
+        /* number of extent entries (presumably excludes bitmaps; confirm) */
+        int free_extents;
+        /* number of bitmap entries */
+        int total_bitmaps;
+        /* granularity of one tracked item; 1 for inode number trees */
+        int unit;
+        /* base offset of the tracked range; 0 for inode number trees */
+        u64 start;
+        /* policy callbacks, see struct btrfs_free_space_op */
+        struct btrfs_free_space_op *op;
+        /* owner-specific data; NULL for inode number trees */
+        void *private;
+};
+/*
+ * Per-user policy hooks for a struct btrfs_free_space_ctl.
+ *
+ * @recalc_thresholds: recompute ctl->extents_thresh.
+ * @use_bitmap:        return true if @info should be stored in a bitmap
+ *                     rather than as an extent entry.
+ */
+struct btrfs_free_space_op {
+        void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
+        bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+                           struct btrfs_free_space *info);
+};
 struct inode *lookup_free_space_inode(struct btrfs_root *root,
                                      struct btrfs_block_group_cache
                                      *block_group, struct btrfs_path *path);
@@ -45,17 +64,38 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                          struct btrfs_trans_handle *trans,
                          struct btrfs_block_group_cache *block_group,
                          struct btrfs_path *path);
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-                         u64 bytenr, u64 size);
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+                                    struct btrfs_path *path);
+int create_free_ino_inode(struct btrfs_root *root,
+                          struct btrfs_trans_handle *trans,
+                          struct btrfs_path *path);
+int load_free_ino_cache(struct btrfs_fs_info *fs_info,
+                        struct btrfs_root *root);
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+                              struct btrfs_trans_handle *trans,
+                              struct btrfs_path *path);
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+                           u64 bytenr, u64 size);
+/*
+ * Block-group flavoured wrapper: add [bytenr, bytenr + size) to the free
+ * space tree owned by @block_group.  Returns the result of
+ * __btrfs_add_free_space().
+ */
+static inline int
+btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+                     u64 bytenr, u64 size)
+{
+        return __btrfs_add_free_space(block_group->free_space_ctl,
+                                      bytenr, size);
+}
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                            u64 bytenr, u64 size);
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
-                                   *block_group);
+                                     *block_group);
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
                               u64 offset, u64 bytes, u64 empty_size);
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                           u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 64f1150bb48d..baa74f3db691 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -130,7 +130,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
                              item_size - (ptr + sub_item_len - item_start));
        ret = btrfs_truncate_item(trans, root, path,
                                  item_size - sub_item_len, 1);
-        BUG_ON(ret);
 out:
        btrfs_free_path(path);
        return ret;
@@ -167,7 +166,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
                ret = btrfs_extend_item(trans, root, path, ins_len);
-                BUG_ON(ret);
                ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_inode_ref);
                ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c05a08f4c411..3262cd17a12f 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -16,11 +16,446 @@
 * Boston, MA 021110-1307, USA.
 */
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
 #include "ctree.h"
 #include "disk-io.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
 #include "transaction.h"
-int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
+/*
+ * Kernel thread body that fills root->free_ino_ctl by walking the inode
+ * items of the commit root of @data (a struct btrfs_root).
+ *
+ * Every gap between two consecutive inode item objectids below
+ * root->highest_objectid is added as a free range.  The walk yields and
+ * restarts from the saved key when a reschedule is needed or a transaction
+ * is committing, and stops early when the filesystem is closing.
+ *
+ * Returns -ENOMEM if the path cannot be allocated, otherwise the last
+ * value of ret from the tree search / leaf walk.
+ *
+ * NOTE(review): on the -ENOMEM return root->cached is left at
+ * BTRFS_CACHE_STARTED and root->cache_wait is not woken - confirm that
+ * waiters in btrfs_find_free_ino() cannot get stuck.
+ */
+static int caching_kthread(void *data)
+{
+        struct btrfs_root *root = data;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+        struct btrfs_key key;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        /* objectid of the last inode item seen; (u64)-1 means none yet */
+        u64 last = (u64)-1;
+        int slot;
+        int ret;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        /* Since the commit root is read-only, we can safely skip locking. */
+        path->skip_locking = 1;
+        path->search_commit_root = 1;
+        path->reada = 2;
+        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+        key.offset = 0;
+        key.type = BTRFS_INODE_ITEM_KEY;
+again:
+        /* need to make sure the commit_root doesn't disappear */
+        mutex_lock(&root->fs_commit_mutex);
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        while (1) {
+                smp_mb();
+                if (fs_info->closing)
+                        goto out;
+                leaf = path->nodes[0];
+                slot = path->slots[0];
+                if (slot >= btrfs_header_nritems(leaf)) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto out;
+                        else if (ret > 0)
+                                break;
+                        if (need_resched() ||
+                            btrfs_transaction_in_commit(fs_info)) {
+                                leaf = path->nodes[0];
+                                if (btrfs_header_nritems(leaf) == 0) {
+                                        WARN_ON(1);
+                                        break;
+                                }
+                                /*
+                                 * Save the key so we can advance forward
+                                 * in the next search.
+                                 */
+                                btrfs_item_key_to_cpu(leaf, &key, 0);
+                                btrfs_release_path(path);
+                                /* publish how far the scan got before yielding */
+                                root->cache_progress = last;
+                                mutex_unlock(&root->fs_commit_mutex);
+                                schedule_timeout(1);
+                                goto again;
+                        } else
+                                continue;
+                }
+                btrfs_item_key_to_cpu(leaf, &key, slot);
+                if (key.type != BTRFS_INODE_ITEM_KEY)
+                        goto next;
+                if (key.objectid >= root->highest_objectid)
+                        break;
+                /* a hole between two used inode numbers is free space */
+                if (last != (u64)-1 && last + 1 != key.objectid) {
+                        __btrfs_add_free_space(ctl, last + 1,
+                                               key.objectid - last - 1);
+                        wake_up(&root->cache_wait);
+                }
+                last = key.objectid;
+next:
+                path->slots[0]++;
+        }
+        /* numbers between the last item seen and highest_objectid are free */
+        if (last < root->highest_objectid - 1) {
+                __btrfs_add_free_space(ctl, last + 1,
+                                       root->highest_objectid - last - 1);
+        }
+        spin_lock(&root->cache_lock);
+        root->cached = BTRFS_CACHE_FINISHED;
+        spin_unlock(&root->cache_lock);
+        root->cache_progress = (u64)-1;
+        /* fs_commit_mutex is still held here, as btrfs_unpin_free_ino() requires */
+        btrfs_unpin_free_ino(root);
+out:
+        wake_up(&root->cache_wait);
+        mutex_unlock(&root->fs_commit_mutex);
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Kick off population of root->free_ino_ctl if it has not started yet.
+ *
+ * The on-disk cache is tried first; if load_free_ino_cache() reports a
+ * complete load (return value 1) the root is marked BTRFS_CACHE_FINISHED
+ * and no thread is started.  Otherwise the range above the highest used
+ * objectid is made available immediately and caching_kthread() is spawned
+ * to find the remaining holes.
+ */
+static void start_caching(struct btrfs_root *root)
+{
+        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+        struct task_struct *tsk;
+        int ret;
+        u64 objectid;
+        /* Only the caller that moves NO -> STARTED goes on to do the work. */
+        spin_lock(&root->cache_lock);
+        if (root->cached != BTRFS_CACHE_NO) {
+                spin_unlock(&root->cache_lock);
+                return;
+        }
+        root->cached = BTRFS_CACHE_STARTED;
+        spin_unlock(&root->cache_lock);
+        ret = load_free_ino_cache(root->fs_info, root);
+        if (ret == 1) {
+                spin_lock(&root->cache_lock);
+                root->cached = BTRFS_CACHE_FINISHED;
+                spin_unlock(&root->cache_lock);
+                return;
+        }
+        /*
+         * It can be quite time-consuming to fill the cache by searching
+         * through the whole tree, and this can keep the ino allocation
+         * path waiting. Therefore at start we quickly find out the highest
+         * inode number and we know we can use inode numbers which fall in
+         * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
+         */
+        ret = btrfs_find_free_objectid(root, &objectid);
+        if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
+                __btrfs_add_free_space(ctl, objectid,
+                                       BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+        }
+        /* The format string becomes the task name, so no trailing newline. */
+        tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
+                          root->root_key.objectid);
+        BUG_ON(IS_ERR(tsk));
+}
+/*
+ * Allocate a free inode number for @root into *@objectid.
+ *
+ * Returns 0 on success, or -ENOSPC once caching has finished and the
+ * free inode tree is empty.  May sleep waiting for the caching thread.
+ */
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
+{
+        for (;;) {
+                /* 0 from the allocator means the tree had nothing to give. */
+                *objectid = btrfs_find_ino_for_alloc(root);
+                if (*objectid != 0)
+                        return 0;
+                start_caching(root);
+                /* Sleep until caching is done or some numbers showed up. */
+                wait_event(root->cache_wait,
+                           root->cached == BTRFS_CACHE_FINISHED ||
+                           root->free_ino_ctl->free_space > 0);
+                if (root->cached == BTRFS_CACHE_FINISHED &&
+                    root->free_ino_ctl->free_space == 0)
+                        return -ENOSPC;
+                /* Otherwise retry the allocation. */
+        }
+}
+/*
+ * Give inode number @objectid back to @root.
+ *
+ * Once caching has finished the number goes straight into the free_ino
+ * tree.  While caching is still running it goes either into the free_ino
+ * tree or into the pinned tree, depending on where it lies relative to
+ * root->cache_progress and root->highest_objectid.
+ */
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
+{
+        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+        struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+again:
+        if (root->cached == BTRFS_CACHE_FINISHED) {
+                __btrfs_add_free_space(ctl, objectid, 1);
+        } else {
+                /*
+                 * If we are in the process of caching free ino chunks,
+                 * to avoid adding the same inode number to the free_ino
+                 * tree twice due to cross transaction, we'll leave it
+                 * in the pinned tree until a transaction is committed
+                 * or the caching work is done.
+                 */
+                mutex_lock(&root->fs_commit_mutex);
+                spin_lock(&root->cache_lock);
+                /* re-check under the locks: caching may have just finished */
+                if (root->cached == BTRFS_CACHE_FINISHED) {
+                        spin_unlock(&root->cache_lock);
+                        mutex_unlock(&root->fs_commit_mutex);
+                        goto again;
+                }
+                spin_unlock(&root->cache_lock);
+                start_caching(root);
+                /*
+                 * Numbers the caching thread has already passed, or that
+                 * lie above highest_objectid, go directly into the free
+                 * tree; the rest are pinned.
+                 */
+                if (objectid <= root->cache_progress ||
+                    objectid > root->highest_objectid)
+                        __btrfs_add_free_space(ctl, objectid, 1);
+                else
+                        __btrfs_add_free_space(pinned, objectid, 1);
+                mutex_unlock(&root->fs_commit_mutex);
+        }
+}
+/*
+ * When a transaction is committed, we'll move those inode numbers which
+ * are smaller than root->cache_progress from pinned tree to free_ino tree,
+ * and others will just be dropped, because the commit root we were
+ * searching has changed.
+ *
+ * Must be called with root->fs_commit_mutex held
+ */
+void btrfs_unpin_free_ino(struct btrfs_root *root)
+{
+        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+        struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+        struct btrfs_free_space *info;
+        struct rb_node *n;
+        u64 count;
+        /* Drain the pinned tree, always taking the left-most entry. */
+        while (1) {
+                n = rb_first(rbroot);
+                if (!n)
+                        break;
+                info = rb_entry(n, struct btrfs_free_space, offset_index);
+                /* The pinned tree never uses bitmaps, see pinned_use_bitmap(). */
+                BUG_ON(info->bitmap);
+                if (info->offset <= root->cache_progress) {
+                        /*
+                         * Move only the part of the range that lies at or
+                         * below cache_progress; anything above is dropped.
+                         */
+                        if (info->offset + info->bytes > root->cache_progress)
+                                count = root->cache_progress - info->offset + 1;
+                        else
+                                count = info->bytes;
+                        __btrfs_add_free_space(ctl, info->offset, count);
+                }
+                rb_erase(&info->offset_index, rbroot);
+                /*
+                 * Entries elsewhere in this code are released to
+                 * btrfs_free_space_cachep, so free to that slab, not kfree().
+                 */
+                kmem_cache_free(btrfs_free_space_cachep, info);
+        }
+}
+#define INIT_THRESHOLD  (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+/*
+ * The goal is to keep the memory used by the free_ino tree from
+ * exceeding the memory we would use with bitmaps only.
+ */
+/*
+ * recalc_thresholds hook for the free_ino tree: derive ctl->extents_thresh
+ * from the right-most entry in the tree and the number of bitmaps already
+ * in use.  With an empty tree the threshold is reset to INIT_THRESHOLD.
+ */
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+        struct btrfs_free_space *info;
+        struct rb_node *n;
+        int max_ino;
+        int max_bitmaps;
+        n = rb_last(&ctl->free_space_offset);
+        if (!n) {
+                ctl->extents_thresh = INIT_THRESHOLD;
+                return;
+        }
+        info = rb_entry(n, struct btrfs_free_space, offset_index);
+        /*
+         * Find the maximum inode number in the filesystem. Note we
+         * ignore the fact that this can be a bitmap, because we are
+         * not doing precise calculation.
+         */
+        /*
+         * NOTE(review): this uses only info->bytes (not info->offset) and
+         * stores a u64 into an int - confirm this approximation is intended.
+         */
+        max_ino = info->bytes - 1;
+        /* number of bitmaps needed to cover [0, max_ino], rounded up */
+        max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
+        if (max_bitmaps <= ctl->total_bitmaps) {
+                ctl->extents_thresh = 0;
+                return;
+        }
+        /* extent entries that fit in the pages not yet spent on bitmaps */
+        ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
+                                PAGE_CACHE_SIZE / sizeof(*info);
+}
+/*
+ * We don't fall back to bitmap, if we are below the extents threshold
+ * or this chunk of inode numbers is a big one.
+ */
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+                       struct btrfs_free_space *info)
+{
+        /* Still room for more extent entries: keep using extents. */
+        if (ctl->free_extents < ctl->extents_thresh)
+                return false;
+        /* A large chunk is cheaper to describe as one extent. */
+        if (info->bytes > INODES_PER_BITMAP / 10)
+                return false;
+        return true;
+}
+/* Policy hooks installed on root->free_ino_ctl by btrfs_init_free_ino_ctl(). */
+static struct btrfs_free_space_op free_ino_op = {
+        .recalc_thresholds      = recalculate_thresholds,
+        .use_bitmap             = use_bitmap,
+};
+/*
+ * recalc_thresholds hook for the pinned tree: intentionally a no-op, the
+ * pinned tree never uses bitmaps (see pinned_use_bitmap()).
+ */
+static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+}
+/* use_bitmap hook for the pinned tree: always answers "no". */
+static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
+                              struct btrfs_free_space *info)
+{
+        /*
+         * We always use extents for two reasons:
+         *
+         * - The pinned tree is only used during the process of caching
+         *   work.
+         * - Make code simpler. See btrfs_unpin_free_ino().
+         */
+        return false;
+}
+/* Policy hooks installed on root->free_ino_pinned by btrfs_init_free_ino_ctl(). */
+static struct btrfs_free_space_op pinned_free_ino_op = {
+        .recalc_thresholds      = pinned_recalc_thresholds,
+        .use_bitmap             = pinned_use_bitmap,
+};
+/*
+ * Initialise the two inode number trees of @root: the free_ino tree that
+ * allocations are served from, and the pinned tree used while caching is
+ * in progress.
+ */
+void btrfs_init_free_ino_ctl(struct btrfs_root *root)
+{
+        struct btrfs_free_space_ctl *free_ctl = root->free_ino_ctl;
+        struct btrfs_free_space_ctl *pinned_ctl = root->free_ino_pinned;
+        /* Free inode number tree: one unit per inode number, based at 0. */
+        spin_lock_init(&free_ctl->tree_lock);
+        free_ctl->op = &free_ino_op;
+        free_ctl->private = NULL;
+        free_ctl->start = 0;
+        free_ctl->unit = 1;
+        /*
+         * Start out allowing 16K of ram for cached chunks of inode
+         * numbers before falling back to bitmaps.  The value is somewhat
+         * arbitrary and gets adjusted at runtime.
+         */
+        free_ctl->extents_thresh = INIT_THRESHOLD;
+        /* Pinned tree: same geometry, extents only (threshold stays 0). */
+        spin_lock_init(&pinned_ctl->tree_lock);
+        pinned_ctl->op = &pinned_free_ino_op;
+        pinned_ctl->private = NULL;
+        pinned_ctl->start = 0;
+        pinned_ctl->unit = 1;
+        pinned_ctl->extents_thresh = 0;
+}
+/*
+ * Prepare the free inode number cache inode of @root and write the
+ * in-memory cache out within transaction @trans.
+ *
+ * The cache inode is created on first use, its generation is reset to 0,
+ * any old contents are truncated and enough space for the current cache
+ * is preallocated.  The cache is written only if all of that succeeded.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -1 if the
+ * in-memory cache has not finished building yet, or the error from the
+ * step that failed.
+ */
+int btrfs_save_ino_cache(struct btrfs_root *root,
+                         struct btrfs_trans_handle *trans)
+{
+        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+        struct btrfs_path *path;
+        struct inode *inode;
+        u64 alloc_hint = 0;
+        int ret;
+        int prealloc;
+        bool retry = false;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+again:
+        inode = lookup_free_ino_inode(root, path);
+        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+                ret = PTR_ERR(inode);
+                goto out;
+        }
+        if (IS_ERR(inode)) {
+                /* -ENOENT: create the cache inode once, then look it up again. */
+                BUG_ON(retry);
+                retry = true;
+                ret = create_free_ino_inode(root, trans, path);
+                if (ret)
+                        goto out;
+                goto again;
+        }
+        /*
+         * Reset the generation first; load_free_ino_cache() only trusts a
+         * cache whose generation matches the root's.
+         */
+        BTRFS_I(inode)->generation = 0;
+        ret = btrfs_update_inode(trans, root, inode);
+        WARN_ON(ret);
+        if (i_size_read(inode) > 0) {
+                ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+                if (ret)
+                        goto out_put;
+        }
+        /* A cache that is still being built cannot be written out. */
+        spin_lock(&root->cache_lock);
+        if (root->cached != BTRFS_CACHE_FINISHED) {
+                ret = -1;
+                spin_unlock(&root->cache_lock);
+                goto out_put;
+        }
+        spin_unlock(&root->cache_lock);
+        /* Size the file: extent entries rounded up to a page, plus bitmaps. */
+        spin_lock(&ctl->tree_lock);
+        prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
+        prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
+        prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+        spin_unlock(&ctl->tree_lock);
+        /* Just to make sure we have enough space */
+        prealloc += 8 * PAGE_CACHE_SIZE;
+        ret = btrfs_check_data_free_space(inode, prealloc);
+        if (ret)
+                goto out_put;
+        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
+                                              prealloc, prealloc, &alloc_hint);
+        /*
+         * Drop the reservation taken by btrfs_check_data_free_space()
+         * whether or not the preallocation succeeded; releasing it only
+         * on success would leak the reserved space on failure.
+         */
+        btrfs_free_reserved_data_space(inode, prealloc);
+out_put:
+        iput(inode);
+out:
+        if (ret == 0)
+                ret = btrfs_write_out_ino_cache(root, trans, path);
+        btrfs_free_path(path);
+        return ret;
+}
+static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
 {
        struct btrfs_path *path;
        int ret;
@@ -55,15 +490,14 @@ error:
        return ret;
 }
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
-                             struct btrfs_root *root,
-                             u64 dirid, u64 *objectid)
 {
        int ret;
        mutex_lock(&root->objectid_mutex);
        if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
-                ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+                ret = btrfs_find_highest_objectid(root,
+                                                  &root->highest_objectid);
                if (ret)
                        goto out;
        }
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
new file mode 100644
index 000000000000..ddb347bfee23
--- /dev/null
+++ b/fs/btrfs/inode-map.h
@@ -0,0 +1,13 @@
+#ifndef __BTRFS_INODE_MAP
+#define __BTRFS_INODE_MAP
+void btrfs_init_free_ino_ctl(struct btrfs_root *root);
+void btrfs_unpin_free_ino(struct btrfs_root *root);
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
+int btrfs_save_ino_cache(struct btrfs_root *root,
+                         struct btrfs_trans_handle *trans);
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7cd8ab0ef04d..bb51bb1fa44f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -37,6 +37,7 @@
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -51,6 +52,7 @@
 #include "compression.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "inode-map.h"
 struct btrfs_iget_args {
        u64 ino;
@@ -138,7 +140,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        path->leave_spinning = 1;
        btrfs_set_trans_block_group(trans, inode);
-        key.objectid = inode->i_ino;
+        key.objectid = btrfs_ino(inode);
        key.offset = start;
        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
        datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -340,6 +342,10 @@ static noinline int compress_file_range(struct inode *inode,
        int will_compress;
        int compress_type = root->fs_info->compress_type;
+        /* if this is a small write inside eof, kick off a defragbot */
+        if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+                btrfs_add_inode_defrag(NULL, inode);
        actual_end = min_t(u64, isize, end + 1);
 again:
        will_compress = 0;
@@ -649,7 +655,7 @@ retry:
                                        async_extent->start +
                                        async_extent->ram_size - 1, 0);
-                em = alloc_extent_map(GFP_NOFS);
+                em = alloc_extent_map();
                BUG_ON(!em);
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
@@ -745,6 +751,15 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
        return alloc_hint;
 }
+static inline bool is_free_space_inode(struct btrfs_root *root,
+                                       struct inode *inode)
+{
+        /*
+         * True for any inode in the tree root, and for the free inode
+         * number cache inode (BTRFS_FREE_INO_OBJECTID) of any root.
+         */
+        return root == root->fs_info->tree_root ||
+               BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID;
+}
 /*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
@@ -777,7 +792,7 @@ static noinline int cow_file_range(struct inode *inode,
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;
-        BUG_ON(root == root->fs_info->tree_root);
+        BUG_ON(is_free_space_inode(root, inode));
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(IS_ERR(trans));
        btrfs_set_trans_block_group(trans, inode);
@@ -788,6 +803,10 @@ static noinline int cow_file_range(struct inode *inode,
        disk_num_bytes = num_bytes;
        ret = 0;
+        /* if this is a small write inside eof, kick off defrag */
+        if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+                btrfs_add_inode_defrag(trans, inode);
        if (start == 0) {
                /* lets try to make an inline extent */
                ret = cow_file_range_inline(trans, root, inode,
@@ -826,7 +845,7 @@ static noinline int cow_file_range(struct inode *inode,
                                           (u64)-1, &ins, 1);
                BUG_ON(ret);
-                em = alloc_extent_map(GFP_NOFS);
+                em = alloc_extent_map();
                BUG_ON(!em);
                em->start = start;
                em->orig_start = em->start;
@@ -1008,7 +1027,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
        LIST_HEAD(list);
        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
-                                       bytenr + num_bytes - 1, &list);
+                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;
@@ -1049,29 +1068,31 @@ static noinline int run_delalloc_nocow(struct inode *inode,
        int type;
        int nocow;
        int check_prev = 1;
-        bool nolock = false;
+        bool nolock;
+        u64 ino = btrfs_ino(inode);
        path = btrfs_alloc_path();
        BUG_ON(!path);
-        if (root == root->fs_info->tree_root) {
-                nolock = true;
+        nolock = is_free_space_inode(root, inode);
+        if (nolock)
                trans = btrfs_join_transaction_nolock(root, 1);
-        } else {
+        else
                trans = btrfs_join_transaction(root, 1);
-        }
        BUG_ON(IS_ERR(trans));
        cow_start = (u64)-1;
        cur_offset = start;
        while (1) {
-                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                ret = btrfs_lookup_file_extent(trans, root, path, ino,
                                               cur_offset, 0);
                BUG_ON(ret < 0);
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0] - 1);
-                        if (found_key.objectid == inode->i_ino &&
+                        if (found_key.objectid == ino &&
                            found_key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
@@ -1092,7 +1113,7 @@ next_slot:
                num_bytes = 0;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                if (found_key.objectid > inode->i_ino ||
+                if (found_key.objectid > ino ||
                    found_key.type > BTRFS_EXTENT_DATA_KEY ||
                    found_key.offset > end)
                        break;
@@ -1127,7 +1148,7 @@ next_slot:
                                goto out_check;
                        if (btrfs_extent_readonly(root, disk_bytenr))
                                goto out_check;
-                        if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+                        if (btrfs_cross_ref_exist(trans, root, ino,
                                                  found_key.offset -
                                                  extent_offset, disk_bytenr))
                                goto out_check;
@@ -1164,7 +1185,7 @@ out_check:
                        goto next_slot;
                }
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                if (cow_start != (u64)-1) {
                        ret = cow_file_range(inode, locked_page, cow_start,
                                        found_key.offset - 1, page_started,
@@ -1177,7 +1198,7 @@ out_check:
                        struct extent_map *em;
                        struct extent_map_tree *em_tree;
                        em_tree = &BTRFS_I(inode)->extent_tree;
-                        em = alloc_extent_map(GFP_NOFS);
+                        em = alloc_extent_map();
                        BUG_ON(!em);
                        em->start = cur_offset;
                        em->orig_start = em->start;
@@ -1222,7 +1243,7 @@ out_check:
                if (cur_offset > end)
                        break;
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        if (cur_offset <= end && cow_start == (u64)-1)
                cow_start = cur_offset;
@@ -1310,14 +1331,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
-         * but in this case, we are only testeing for the DELALLOC
+         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
                u64 len = state->end + 1 - state->start;
-                int do_list = (root->root_key.objectid !=
+                bool do_list = !is_free_space_inode(root, inode);
-                               BTRFS_ROOT_TREE_OBJECTID);
                if (*bits & EXTENT_FIRST_DELALLOC)
                        *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1344,14 +1364,13 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 {
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
-         * but in this case, we are only testeing for the DELALLOC
+         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
                u64 len = state->end + 1 - state->start;
-                int do_list = (root->root_key.objectid !=
+                bool do_list = !is_free_space_inode(root, inode);
-                               BTRFS_ROOT_TREE_OBJECTID);
                if (*bits & EXTENT_FIRST_DELALLOC)
                        *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1458,7 +1477,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-        if (root == root->fs_info->tree_root)
+        if (is_free_space_inode(root, inode))
                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
        else
                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -1644,7 +1663,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                 &hint, 0);
        BUG_ON(ret);
-        ins.objectid = inode->i_ino;
+        ins.objectid = btrfs_ino(inode);
        ins.offset = file_pos;
        ins.type = BTRFS_EXTENT_DATA_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
@@ -1675,7 +1694,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        ins.type = BTRFS_EXTENT_ITEM_KEY;
        ret = btrfs_alloc_reserved_file_extent(trans, root,
                                        root->root_key.objectid,
-                                        inode->i_ino, file_pos, &ins);
+                                        btrfs_ino(inode), file_pos, &ins);
        BUG_ON(ret);
        btrfs_free_path(path);
@@ -1701,7 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        struct extent_state *cached_state = NULL;
        int compress_type = 0;
        int ret;
-        bool nolock = false;
+        bool nolock;
        ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
                                             end - start + 1);
@@ -1709,7 +1728,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                return 0;
        BUG_ON(!ordered_extent);
-        nolock = (root == root->fs_info->tree_root);
+        nolock = is_free_space_inode(root, inode);
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list));
@@ -1855,7 +1874,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
                }
                read_unlock(&em_tree->lock);
-                if (!em || IS_ERR(em)) {
+                if (IS_ERR_OR_NULL(em)) {
                        kfree(failrec);
                        return -EIO;
                }
@@ -2004,12 +2023,11 @@ good:
        return 0;
 zeroit:
-        if (printk_ratelimit()) {
+        printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
-                printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+                       "private %llu\n",
-                       "private %llu\n", page->mapping->host->i_ino,
+                       (unsigned long long)btrfs_ino(page->mapping->host),
                       (unsigned long long)start, csum,
                       (unsigned long long)private);
-        }
        memset(kaddr + offset, 1, end - start + 1);
        flush_dcache_page(page);
        kunmap_atomic(kaddr, KM_USER0);
@@ -2244,7 +2262,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
        /* insert an orphan item to track this unlinked/truncated file */
        if (insert >= 1) {
-                ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
                BUG_ON(ret);
        }
@@ -2281,7 +2299,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
        spin_unlock(&root->orphan_lock);
        if (trans && delete_item) {
-                ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+                ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
                BUG_ON(ret);
        }
@@ -2346,7 +2364,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                        break;
                /* release the path since we're done with it */
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                /*
                 * this is where we are basically btrfs_lookup, without the
@@ -2543,7 +2561,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
         * try to precache a NULL acl entry for files that don't have
         * any xattrs or acls
         */
-        maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
+        maybe_acls = acls_after_inode_item(leaf, path->slots[0],
+                                           btrfs_ino(inode));
        if (!maybe_acls)
                cache_no_acl(inode);
@@ -2647,11 +2666,26 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
        struct extent_buffer *leaf;
        int ret;
+        /*
+         * If root is the tree root, this inode is used to store free
+         * space information. Such inodes are updated while the
+         * transaction is committing, so their updates must not be
+         * delayed, or a deadlock will occur.
+         */
+        if (!is_free_space_inode(root, inode)) {
+                ret = btrfs_delayed_update_inode(trans, root, inode);
+                if (!ret)
+                        btrfs_set_inode_last_trans(trans, inode);
+                return ret;
+        }
        path = btrfs_alloc_path();
-        BUG_ON(!path);
+        if (!path)
+                return -ENOMEM;
        path->leave_spinning = 1;
-        ret = btrfs_lookup_inode(trans, root, path,
+        ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
-                                 &BTRFS_I(inode)->location, 1);
+                                 1);
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
@@ -2661,7 +2695,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
        btrfs_unlock_up_safe(path, 1);
        leaf = path->nodes[0];
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
-                                  struct btrfs_inode_item);
+                                    struct btrfs_inode_item);
        fill_inode_item(trans, leaf, inode_item, inode);
        btrfs_mark_buffer_dirty(leaf);
@@ -2672,7 +2706,6 @@ failed:
        return ret;
 }
 /*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It remove a link in a directory with a given name, and
@@ -2689,6 +2722,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        struct btrfs_dir_item *di;
        struct btrfs_key key;
        u64 index;
+        u64 ino = btrfs_ino(inode);
+        u64 dir_ino = btrfs_ino(dir);
        path = btrfs_alloc_path();
        if (!path) {
@@ -2697,7 +2732,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        }
        path->leave_spinning = 1;
-        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
                                    name, name_len, -1);
        if (IS_ERR(di)) {
                ret = PTR_ERR(di);
@@ -2712,33 +2747,23 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        ret = btrfs_delete_one_dir_name(trans, root, path, di);
        if (ret)
                goto err;
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
-        ret = btrfs_del_inode_ref(trans, root, name, name_len,
+        ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
-                                  inode->i_ino,
+                                  dir_ino, &index);
-                                  dir->i_ino, &index);
        if (ret) {
                printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
-                       "inode %lu parent %lu\n", name_len, name,
+                       "inode %llu parent %llu\n", name_len, name,
-                       inode->i_ino, dir->i_ino);
+                       (unsigned long long)ino, (unsigned long long)dir_ino);
                goto err;
        }
-        di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
-                                         index, name, name_len, -1);
+        if (ret)
-        if (IS_ERR(di)) {
-                ret = PTR_ERR(di);
-                goto err;
-        }
-        if (!di) {
-                ret = -ENOENT;
                goto err;
-        }
-        ret = btrfs_delete_one_dir_name(trans, root, path, di);
-        btrfs_release_path(root, path);
        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
-                                         inode, dir->i_ino);
+                                         inode, dir_ino);
        BUG_ON(ret != 0 && ret != -ENOENT);
        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
@@ -2816,12 +2841,14 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        int check_link = 1;
        int err = -ENOSPC;
        int ret;
+        u64 ino = btrfs_ino(inode);
+        u64 dir_ino = btrfs_ino(dir);
        trans = btrfs_start_transaction(root, 10);
        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
                return trans;
-        if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+        if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                return ERR_PTR(-ENOSPC);
        /* check if there is someone else holds reference */
@@ -2862,7 +2889,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        } else {
                check_link = 0;
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        ret = btrfs_lookup_inode(trans, root, path,
                                &BTRFS_I(inode)->location, 0);
@@ -2876,11 +2903,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        } else {
                check_link = 0;
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        if (ret == 0 && S_ISREG(inode->i_mode)) {
                ret = btrfs_lookup_file_extent(trans, root, path,
-                                               inode->i_ino, (u64)-1, 0);
+                                               ino, (u64)-1, 0);
                if (ret < 0) {
                        err = ret;
                        goto out;
@@ -2888,7 +2915,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
                BUG_ON(ret == 0);
                if (check_path_shared(root, path))
                        goto out;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
        }
        if (!check_link) {
@@ -2896,7 +2923,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
                goto out;
        }
-        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
                                dentry->d_name.name, dentry->d_name.len, 0);
        if (IS_ERR(di)) {
                err = PTR_ERR(di);
@@ -2909,11 +2936,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
                err = 0;
                goto out;
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        ref = btrfs_lookup_inode_ref(trans, root, path,
                                dentry->d_name.name, dentry->d_name.len,
-                                inode->i_ino, dir->i_ino, 0);
+                                ino, dir_ino, 0);
        if (IS_ERR(ref)) {
                err = PTR_ERR(ref);
                goto out;
@@ -2922,9 +2949,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        if (check_path_shared(root, path))
                goto out;
        index = btrfs_inode_ref_index(path->nodes[0], ref);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
-        di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+        /*
+         * This is a commit root search. If we can look up the inode item and
+         * other related items in the commit root, the transaction that
+         * created the dir/file has been committed, and the dir index item
+         * whose insertion we delayed has also been inserted into the commit
+         * root. So we needn't worry about the delayed insertion of the dir
+         * index item here.
+         */
+        di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
                                dentry->d_name.name, dentry->d_name.len, 0);
        if (IS_ERR(di)) {
                err = PTR_ERR(di);
@@ -2999,54 +3034,48 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        u64 index;
        int ret;
+        u64 dir_ino = btrfs_ino(dir);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
                                   name, name_len, -1);
-        BUG_ON(!di || IS_ERR(di));
+        BUG_ON(IS_ERR_OR_NULL(di));
        leaf = path->nodes[0];
        btrfs_dir_item_key_to_cpu(leaf, di, &key);
        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
        ret = btrfs_delete_one_dir_name(trans, root, path, di);
        BUG_ON(ret);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
                                 objectid, root->root_key.objectid,
-                                 dir->i_ino, &index, name, name_len);
+                                 dir_ino, &index, name, name_len);
        if (ret < 0) {
                BUG_ON(ret != -ENOENT);
-                di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+                di = btrfs_search_dir_index_item(root, path, dir_ino,
                                                 name, name_len);
-                BUG_ON(!di || IS_ERR(di));
+                BUG_ON(IS_ERR_OR_NULL(di));
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                index = key.offset;
        }
+        btrfs_release_path(path);
-        di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
-                                         index, name, name_len, -1);
-        BUG_ON(!di || IS_ERR(di));
-        leaf = path->nodes[0];
-        btrfs_dir_item_key_to_cpu(leaf, di, &key);
-        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
-        ret = btrfs_delete_one_dir_name(trans, root, path, di);
        BUG_ON(ret);
-        btrfs_release_path(root, path);
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, dir);
        BUG_ON(ret);
        btrfs_free_path(path);
        return 0;
 }
@@ -3059,7 +3087,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        unsigned long nr = 0;
        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-            inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+            btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
                return -ENOTEMPTY;
        trans = __unlink_start_trans(dir, dentry);
@@ -3068,7 +3096,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        btrfs_set_trans_block_group(trans, dir);
-        if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+        if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
                err = btrfs_unlink_subvol(trans, root, dir,
                                          BTRFS_I(inode)->location.objectid,
                                          dentry->d_name.name,
@@ -3093,178 +3121,6 @@ out:
        return err;
 }
-#if 0
-/*
- * when truncating bytes in a file, it is possible to avoid reading
- * the leaves that contain only checksum items.  This can be the
- * majority of the IO required to delete a large file, but it must
- * be done carefully.
- *
- * The keys in the level just above the leaves are checked to make sure
- * the lowest key in a given leaf is a csum key, and starts at an offset
- * after the new  size.
- *
- * Then the key for the next leaf is checked to make sure it also has
- * a checksum item for the same file.  If it does, we know our target leaf
- * contains only checksum items, and it can be safely freed without reading
- * it.
- *
- * This is just an optimization targeted at large files.  It may do
- * nothing.  It will return 0 unless things went badly.
- */
-static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
-                                     struct btrfs_root *root,
-                                     struct btrfs_path *path,
-                                     struct inode *inode, u64 new_size)
-{
-        struct btrfs_key key;
-        int ret;
-        int nritems;
-        struct btrfs_key found_key;
-        struct btrfs_key other_key;
-        struct btrfs_leaf_ref *ref;
-        u64 leaf_gen;
-        u64 leaf_start;
-        path->lowest_level = 1;
-        key.objectid = inode->i_ino;
-        key.type = BTRFS_CSUM_ITEM_KEY;
-        key.offset = new_size;
-again:
-        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-        if (ret < 0)
-                goto out;
-        if (path->nodes[1] == NULL) {
-                ret = 0;
-                goto out;
-        }
-        ret = 0;
-        btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
-        nritems = btrfs_header_nritems(path->nodes[1]);
-        if (!nritems)
-                goto out;
-        if (path->slots[1] >= nritems)
-                goto next_node;
-        /* did we find a key greater than anything we want to delete? */
-        if (found_key.objectid > inode->i_ino ||
-           (found_key.objectid == inode->i_ino && found_key.type > key.type))
-                goto out;
-        /* we check the next key in the node to make sure the leave contains
-         * only checksum items.  This comparison doesn't work if our
-         * leaf is the last one in the node
-         */
-        if (path->slots[1] + 1 >= nritems) {
-next_node:
-                /* search forward from the last key in the node, this
-                 * will bring us into the next node in the tree
-                 */
-                btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
-                /* unlikely, but we inc below, so check to be safe */
-                if (found_key.offset == (u64)-1)
-                        goto out;
-                /* search_forward needs a path with locks held, do the
-                 * search again for the original key.  It is possible
-                 * this will race with a balance and return a path that
-                 * we could modify, but this drop is just an optimization
-                 * and is allowed to miss some leaves.
-                 */
-                btrfs_release_path(root, path);
-                found_key.offset++;
-                /* setup a max key for search_forward */
-                other_key.offset = (u64)-1;
-                other_key.type = key.type;
-                other_key.objectid = key.objectid;
-                path->keep_locks = 1;
-                ret = btrfs_search_forward(root, &found_key, &other_key,
-                                           path, 0, 0);
-                path->keep_locks = 0;
-                if (ret || found_key.objectid != key.objectid ||
-                    found_key.type != key.type) {
-                        ret = 0;
-                        goto out;
-                }
-                key.offset = found_key.offset;
-                btrfs_release_path(root, path);
-                cond_resched();
-                goto again;
-        }
-        /* we know there's one more slot after us in the tree,
-         * read that key so we can verify it is also a checksum item
-         */
-        btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
-        if (found_key.objectid < inode->i_ino)
-                goto next_key;
-        if (found_key.type != key.type || found_key.offset < new_size)
-                goto next_key;
-        /*
-         * if the key for the next leaf isn't a csum key from this objectid,
-         * we can't be sure there aren't good items inside this leaf.
-         * Bail out
-         */
-        if (other_key.objectid != inode->i_ino || other_key.type != key.type)
-                goto out;
-        leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
-        leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
-        /*
-         * it is safe to delete this leaf, it contains only
-         * csum items from this inode at an offset >= new_size
-         */
-        ret = btrfs_del_leaf(trans, root, path, leaf_start);
-        BUG_ON(ret);
-        if (root->ref_cows && leaf_gen < trans->transid) {
-                ref = btrfs_alloc_leaf_ref(root, 0);
-                if (ref) {
-                        ref->root_gen = root->root_key.offset;
-                        ref->bytenr = leaf_start;
-                        ref->owner = 0;
-                        ref->generation = leaf_gen;
-                        ref->nritems = 0;
-                        btrfs_sort_leaf_ref(ref);
-                        ret = btrfs_add_leaf_ref(root, ref, 0);
-                        WARN_ON(ret);
-                        btrfs_free_leaf_ref(root, ref);
-                } else {
-                        WARN_ON(1);
-                }
-        }
-next_key:
-        btrfs_release_path(root, path);
-        if (other_key.objectid == inode->i_ino &&
-            other_key.type == key.type && other_key.offset > key.offset) {
-                key.offset = other_key.offset;
-                cond_resched();
-                goto again;
-        }
-        ret = 0;
-out:
-        /* fixup any changes we've made to the path */
-        path->lowest_level = 0;
-        path->keep_locks = 0;
-        btrfs_release_path(root, path);
-        return ret;
-}
-#endif
 /*
 * this can truncate away extent items, csum items and directory items.
 * It starts at a high offset and removes keys until it can't find
@@ -3300,17 +3156,27 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        int encoding;
        int ret;
        int err = 0;
+        u64 ino = btrfs_ino(inode);
        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
        if (root->ref_cows || root == root->fs_info->tree_root)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+        /*
+         * This function is also used to drop the items in the log tree before
+         * we relog the inode, so if root != BTRFS_I(inode)->root, it means
+         * it is used to drop the logged items. So we shouldn't kill the delayed
+         * items.
+         */
+        if (min_type == 0 && root == BTRFS_I(inode)->root)
+                btrfs_kill_delayed_inode_items(inode);
        path = btrfs_alloc_path();
        BUG_ON(!path);
        path->reada = -1;
-        key.objectid = inode->i_ino;
+        key.objectid = ino;
        key.offset = (u64)-1;
        key.type = (u8)-1;
@@ -3338,7 +3204,7 @@ search_again:
                found_type = btrfs_key_type(&found_key);
                encoding = 0;
-                if (found_key.objectid != inode->i_ino)
+                if (found_key.objectid != ino)
                        break;
                if (found_type < min_type)
@@ -3428,7 +3294,6 @@ search_again:
                                    btrfs_file_extent_calc_inline_size(size);
                                ret = btrfs_truncate_item(trans, root, path,
                                                          size, 1);
-                                BUG_ON(ret);
                        } else if (root->ref_cows) {
                                inode_sub_bytes(inode, item_end + 1 -
                                                found_key.offset);
@@ -3457,7 +3322,7 @@ delete:
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
-                                                inode->i_ino, extent_offset);
+                                                ino, extent_offset);
                        BUG_ON(ret);
                }
@@ -3466,7 +3331,9 @@ delete:
                if (path->slots[0] == 0 ||
                    path->slots[0] != pending_del_slot) {
-                        if (root->ref_cows) {
+                        if (root->ref_cows &&
+                            BTRFS_I(inode)->location.objectid !=
+                                                BTRFS_FREE_INO_OBJECTID) {
                                err = -EAGAIN;
                                goto out;
                        }
@@ -3477,7 +3344,7 @@ delete:
                                BUG_ON(ret);
                                pending_del_nr = 0;
                        }
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto search_again;
                } else {
                        path->slots[0]--;
@@ -3635,7 +3502,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                block_end - cur_offset, 0);
-                BUG_ON(IS_ERR(em) || !em);
+                BUG_ON(IS_ERR_OR_NULL(em));
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask;
                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3656,7 +3523,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                break;
                        err = btrfs_insert_file_extent(trans, root,
-                                        inode->i_ino, cur_offset, 0,
+                                        btrfs_ino(inode), cur_offset, 0,
                                        0, hole_size, 0, hole_size,
                                        0, 0, 0);
                        if (err)
@@ -3758,7 +3625,7 @@ void btrfs_evict_inode(struct inode *inode)
        truncate_inode_pages(&inode->i_data, 0);
        if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
-                               root == root->fs_info->tree_root))
+                               is_free_space_inode(root, inode)))
                goto no_delete;
        if (is_bad_inode(inode)) {
@@ -3811,6 +3678,10 @@ void btrfs_evict_inode(struct inode *inode)
                BUG_ON(ret);
        }
+        if (!(root == root->fs_info->tree_root ||
+              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
+                btrfs_return_ino(root, btrfs_ino(inode));
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
@@ -3836,12 +3707,12 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
        path = btrfs_alloc_path();
        BUG_ON(!path);
-        di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+        di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
                                    namelen, 0);
        if (IS_ERR(di))
                ret = PTR_ERR(di);
-        if (!di || IS_ERR(di))
+        if (IS_ERR_OR_NULL(di))
                goto out_err;
        btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
@@ -3889,7 +3760,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
        leaf = path->nodes[0];
        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
-        if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+        if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
            btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
                goto out;
@@ -3899,7 +3770,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
        if (ret)
                goto out;
-        btrfs_release_path(root->fs_info->tree_root, path);
+        btrfs_release_path(path);
        new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
        if (IS_ERR(new_root)) {
@@ -3928,6 +3799,7 @@ static void inode_tree_add(struct inode *inode)
        struct btrfs_inode *entry;
        struct rb_node **p;
        struct rb_node *parent;
+        u64 ino = btrfs_ino(inode);
 again:
        p = &root->inode_tree.rb_node;
        parent = NULL;
@@ -3940,9 +3812,9 @@ again:
                parent = *p;
                entry = rb_entry(parent, struct btrfs_inode, rb_node);
-                if (inode->i_ino < entry->vfs_inode.i_ino)
+                if (ino < btrfs_ino(&entry->vfs_inode))
                        p = &parent->rb_left;
-                else if (inode->i_ino > entry->vfs_inode.i_ino)
+                else if (ino > btrfs_ino(&entry->vfs_inode))
                        p = &parent->rb_right;
                else {
                        WARN_ON(!(entry->vfs_inode.i_state &
@@ -4006,9 +3878,9 @@ again:
                prev = node;
                entry = rb_entry(node, struct btrfs_inode, rb_node);
-                if (objectid < entry->vfs_inode.i_ino)
+                if (objectid < btrfs_ino(&entry->vfs_inode))
                        node = node->rb_left;
-                else if (objectid > entry->vfs_inode.i_ino)
+                else if (objectid > btrfs_ino(&entry->vfs_inode))
                        node = node->rb_right;
                else
                        break;
@@ -4016,7 +3888,7 @@ again:
        if (!node) {
                while (prev) {
                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
-                        if (objectid <= entry->vfs_inode.i_ino) {
+                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
                                node = prev;
                                break;
                        }
@@ -4025,7 +3897,7 @@ again:
        }
        while (node) {
                entry = rb_entry(node, struct btrfs_inode, rb_node);
-                objectid = entry->vfs_inode.i_ino + 1;
+                objectid = btrfs_ino(&entry->vfs_inode) + 1;
                inode = igrab(&entry->vfs_inode);
                if (inode) {
                        spin_unlock(&root->inode_lock);
@@ -4063,7 +3935,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
        struct btrfs_iget_args *args = opaque;
-        return args->ino == inode->i_ino &&
+        return args->ino == btrfs_ino(inode) &&
                args->root == BTRFS_I(inode)->root;
 }
@@ -4208,7 +4080,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
        return d_splice_alias(inode, dentry);
 }
-static unsigned char btrfs_filetype_table[] = {
+unsigned char btrfs_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
@@ -4222,6 +4094,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;
+        struct list_head ins_list;
+        struct list_head del_list;
        int ret;
        struct extent_buffer *leaf;
        int slot;
@@ -4234,6 +4108,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
        char tmp_name[32];
        char *name_ptr;
        int name_len;
+        int is_curr = 0;        /* filp->f_pos points to the current index? */
        /* FIXME, use a real flag for deciding about the key type */
        if (root->fs_info->tree_root == root)
@@ -4241,9 +4116,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
        /* special case for "." */
        if (filp->f_pos == 0) {
-                over = filldir(dirent, ".", 1,
+                over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR);
-                               1, inode->i_ino,
-                               DT_DIR);
                if (over)
                        return 0;
                filp->f_pos = 1;
@@ -4258,11 +4131,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
                filp->f_pos = 2;
        }
        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
        path->reada = 2;
+        if (key_type == BTRFS_DIR_INDEX_KEY) {
+                INIT_LIST_HEAD(&ins_list);
+                INIT_LIST_HEAD(&del_list);
+                btrfs_get_delayed_items(inode, &ins_list, &del_list);
+        }
        btrfs_set_key_type(&key, key_type);
        key.offset = filp->f_pos;
-        key.objectid = inode->i_ino;
+        key.objectid = btrfs_ino(inode);
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
@@ -4289,8 +4170,13 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
                        break;
                if (found_key.offset < filp->f_pos)
                        goto next;
+                if (key_type == BTRFS_DIR_INDEX_KEY &&
+                    btrfs_should_delete_dir_index(&del_list,
+                                                  found_key.offset))
+                        goto next;
                filp->f_pos = found_key.offset;
+                is_curr = 1;
                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
                di_cur = 0;
@@ -4345,6 +4231,15 @@ next:
                path->slots[0]++;
        }
+        if (key_type == BTRFS_DIR_INDEX_KEY) {
+                if (is_curr)
+                        filp->f_pos++;
+                ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
+                                                      &ins_list);
+                if (ret)
+                        goto nopos;
+        }
        /* Reached end of directory/root. Bump pos past the last item. */
        if (key_type == BTRFS_DIR_INDEX_KEY)
                /*
@@ -4357,6 +4252,8 @@ next:
 nopos:
        ret = 0;
 err:
+        if (key_type == BTRFS_DIR_INDEX_KEY)
+                btrfs_put_delayed_items(&ins_list, &del_list);
        btrfs_free_path(path);
        return ret;
 }
@@ -4372,7 +4269,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
                return 0;
        smp_mb();
-        nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
+        if (root->fs_info->closing && is_free_space_inode(root, inode))
+                nolock = true;
        if (wbc->sync_mode == WB_SYNC_ALL) {
                if (nolock)
@@ -4415,25 +4313,25 @@ void btrfs_dirty_inode(struct inode *inode)
                btrfs_end_transaction(trans, root);
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
-                        if (printk_ratelimit()) {
+                        printk_ratelimited(KERN_ERR "btrfs: fail to "
-                                printk(KERN_ERR "btrfs: fail to "
+                                       "dirty  inode %llu error %ld\n",
-                                       "dirty  inode %lu error %ld\n",
+                                       (unsigned long long)btrfs_ino(inode),
-                                       inode->i_ino, PTR_ERR(trans));
+                                       PTR_ERR(trans));
-                        }
                        return;
                }
                btrfs_set_trans_block_group(trans, inode);
                ret = btrfs_update_inode(trans, root, inode);
                if (ret) {
-                        if (printk_ratelimit()) {
+                        printk_ratelimited(KERN_ERR "btrfs: fail to "
-                                printk(KERN_ERR "btrfs: fail to "
+                                       "dirty  inode %llu error %d\n",
-                                       "dirty  inode %lu error %d\n",
+                                       (unsigned long long)btrfs_ino(inode),
-                                       inode->i_ino, ret);
+                                       ret);
-                        }
                }
        }
        btrfs_end_transaction(trans, root);
+        if (BTRFS_I(inode)->delayed_node)
+                btrfs_balance_delayed_items(root);
 }
 /*
@@ -4449,7 +4347,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
        struct extent_buffer *leaf;
        int ret;
-        key.objectid = inode->i_ino;
+        key.objectid = btrfs_ino(inode);
        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
        key.offset = (u64)-1;
@@ -4481,7 +4379,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-        if (found_key.objectid != inode->i_ino ||
+        if (found_key.objectid != btrfs_ino(inode) ||
            btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
                BTRFS_I(inode)->index_cnt = 2;
                goto out;
@@ -4502,9 +4400,12 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
        int ret = 0;
        if (BTRFS_I(dir)->index_cnt == (u64)-1) {
-                ret = btrfs_set_inode_index_count(dir);
+                ret = btrfs_inode_delayed_dir_index_count(dir);
-                if (ret)
+                if (ret) {
-                        return ret;
+                        ret = btrfs_set_inode_index_count(dir);
+                        if (ret)
+                                return ret;
+                }
        }
        *index = BTRFS_I(dir)->index_cnt;
@@ -4540,6 +4441,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                return ERR_PTR(-ENOMEM);
        }
+        /*
+         * we have to initialize this early, so we can reclaim the inode
+         * number if we fail afterwards in this function.
+         */
+        inode->i_ino = objectid;
        if (dir) {
                trace_btrfs_inode_request(dir);
@@ -4585,7 +4492,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                goto fail;
        inode_init_owner(inode, dir, mode);
-        inode->i_ino = objectid;
        inode_set_bytes(inode, 0);
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -4649,29 +4555,29 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
        int ret = 0;
        struct btrfs_key key;
        struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+        u64 ino = btrfs_ino(inode);
+        u64 parent_ino = btrfs_ino(parent_inode);
-        if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
                memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
        } else {
-                key.objectid = inode->i_ino;
+                key.objectid = ino;
                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
                key.offset = 0;
        }
-        if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
                ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
                                         key.objectid, root->root_key.objectid,
-                                         parent_inode->i_ino,
+                                         parent_ino, index, name, name_len);
-                                         index, name, name_len);
        } else if (add_backref) {
-                ret = btrfs_insert_inode_ref(trans, root,
+                ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
-                                             name, name_len, inode->i_ino,
+                                             parent_ino, index);
-                                             parent_inode->i_ino, index);
        }
        if (ret == 0) {
                ret = btrfs_insert_dir_item(trans, root, name, name_len,
-                                            parent_inode->i_ino, &key,
+                                            parent_inode, &key,
                                            btrfs_inode_type(inode), index);
                BUG_ON(ret);
@@ -4714,10 +4620,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (!new_valid_dev(rdev))
                return -EINVAL;
-        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
-        if (err)
-                return err;
        /*
         * 2 for inode item and ref
         * 2 for dir items
@@ -4729,8 +4631,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
+        err = btrfs_find_free_ino(root, &objectid);
+        if (err)
+                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len, dir->i_ino, objectid,
+                                dentry->d_name.len, btrfs_ino(dir), objectid,
                                BTRFS_I(dir)->block_group, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
@@ -4777,9 +4683,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        u64 objectid;
        u64 index = 0;
-        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
-        if (err)
-                return err;
        /*
         * 2 for inode item and ref
         * 2 for dir items
@@ -4791,8 +4694,12 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
+        err = btrfs_find_free_ino(root, &objectid);
+        if (err)
+                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len, dir->i_ino, objectid,
+                                dentry->d_name.len, btrfs_ino(dir), objectid,
                                BTRFS_I(dir)->block_group, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
@@ -4903,10 +4810,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        u64 index = 0;
        unsigned long nr = 1;
-        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
-        if (err)
-                return err;
        /*
         * 2 items for inode and ref
         * 2 items for dir items
@@ -4917,8 +4820,12 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                return PTR_ERR(trans);
        btrfs_set_trans_block_group(trans, dir);
+        err = btrfs_find_free_ino(root, &objectid);
+        if (err)
+                goto out_fail;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len, dir->i_ino, objectid,
+                                dentry->d_name.len, btrfs_ino(dir), objectid,
                                BTRFS_I(dir)->block_group, S_IFDIR | mode,
                                &index);
        if (IS_ERR(inode)) {
@@ -5041,7 +4948,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
        u64 bytenr;
        u64 extent_start = 0;
        u64 extent_end = 0;
-        u64 objectid = inode->i_ino;
+        u64 objectid = btrfs_ino(inode);
        u32 found_type;
        struct btrfs_path *path = NULL;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5069,7 +4976,7 @@ again:
                else
                        goto out;
        }
-        em = alloc_extent_map(GFP_NOFS);
+        em = alloc_extent_map();
        if (!em) {
                err = -ENOMEM;
                goto out;
@@ -5223,7 +5130,7 @@ again:
                                kunmap(page);
                                free_extent_map(em);
                                em = NULL;
-                                btrfs_release_path(root, path);
+                                btrfs_release_path(path);
                                trans = btrfs_join_transaction(root, 1);
                                if (IS_ERR(trans))
                                        return ERR_CAST(trans);
@@ -5249,7 +5156,7 @@ not_found_em:
        em->block_start = EXTENT_MAP_HOLE;
        set_bit(EXTENT_FLAG_VACANCY, &em->flags);
 insert:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        if (em->start > start || extent_map_end(em) <= start) {
                printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
                       "[%llu %llu]\n", (unsigned long long)em->start,
@@ -5382,7 +5289,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
                u64 hole_start = start;
                u64 hole_len = len;
-                em = alloc_extent_map(GFP_NOFS);
+                em = alloc_extent_map();
                if (!em) {
                        err = -ENOMEM;
                        goto out;
@@ -5472,6 +5379,9 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        if (IS_ERR(trans))
                return ERR_CAST(trans);
+        if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
+                btrfs_add_inode_defrag(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5483,7 +5393,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        }
        if (!em) {
-                em = alloc_extent_map(GFP_NOFS);
+                em = alloc_extent_map();
                if (!em) {
                        em = ERR_PTR(-ENOMEM);
                        goto out;
@@ -5549,7 +5459,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
-        ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+        ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
                                       offset, 0);
        if (ret < 0)
                goto out;
@@ -5566,7 +5476,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
        ret = 0;
        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, slot);
-        if (key.objectid != inode->i_ino ||
+        if (key.objectid != btrfs_ino(inode) ||
            key.type != BTRFS_EXTENT_DATA_KEY) {
                /* not our file or wrong item type, must cow */
                goto out;
@@ -5600,7 +5510,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
         * look for other files referencing this extent, if we
         * find any we must cow
         */
-        if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+        if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
                                  key.offset - backref_offset, disk_bytenr))
                goto out;
@@ -5790,9 +5700,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
                        flush_dcache_page(bvec->bv_page);
                        if (csum != *private) {
-                                printk(KERN_ERR "btrfs csum failed ino %lu off"
+                                printk(KERN_ERR "btrfs csum failed ino %llu off"
                                      " %llu csum %u private %u\n",
-                                      inode->i_ino, (unsigned long long)start,
+                                      (unsigned long long)btrfs_ino(inode),
+                                      (unsigned long long)start,
                                      csum, *private);
                                err = -EIO;
                        }
@@ -5939,9 +5850,9 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
        struct btrfs_dio_private *dip = bio->bi_private;
        if (err) {
-                printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
+                printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
                      "sector %#Lx len %u err no %d\n",
-                      dip->inode->i_ino, bio->bi_rw,
+                      (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
                      (unsigned long long)bio->bi_sector, bio->bi_size, err);
                dip->errors = 1;
@@ -6782,12 +6693,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->ordered_data_close = 0;
        ei->orphan_meta_reserved = 0;
        ei->dummy_inode = 0;
+        ei->in_defrag = 0;
        ei->force_compress = BTRFS_COMPRESS_NONE;
+        ei->delayed_node = NULL;
        inode = &ei->vfs_inode;
-        extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+        extent_map_tree_init(&ei->extent_tree);
-        extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+        extent_io_tree_init(&ei->io_tree, &inode->i_data);
-        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
        mutex_init(&ei->log_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
@@ -6851,8 +6765,8 @@ void btrfs_destroy_inode(struct inode *inode)
        spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-                printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
+                printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
-                       inode->i_ino);
+                       (unsigned long long)btrfs_ino(inode));
                list_del_init(&BTRFS_I(inode)->i_orphan);
        }
        spin_unlock(&root->orphan_lock);
@@ -6874,6 +6788,7 @@ void btrfs_destroy_inode(struct inode *inode)
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
+        btrfs_remove_delayed_node(inode);
        call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
@@ -6882,7 +6797,7 @@ int btrfs_drop_inode(struct inode *inode)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        if (btrfs_root_refs(&root->root_item) == 0 &&
-            root != root->fs_info->tree_root)
+            !is_free_space_inode(root, inode))
                return 1;
        else
                return generic_drop_inode(inode);
@@ -6991,16 +6906,17 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        u64 index = 0;
        u64 root_objectid;
        int ret;
+        u64 old_ino = btrfs_ino(old_inode);
-        if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+        if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                return -EPERM;
        /* we only allow rename subvolume link between subvolumes */
-        if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
                return -EXDEV;
-        if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+        if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
-            (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
+            (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
                return -ENOTEMPTY;
        if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -7016,7 +6932,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                filemap_flush(old_inode->i_mapping);
        /* close the racy window with snapshot create/destroy ioctl */
-        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
                down_read(&root->fs_info->subvol_sem);
        /*
         * We want to reserve the absolute worst case amount of items.  So if
@@ -7041,15 +6957,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (ret)
                goto out_fail;
-        if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                /* force full log commit if subvolume involved. */
                root->fs_info->last_trans_log_full_commit = trans->transid;
        } else {
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
-                                             old_inode->i_ino,
+                                             old_ino,
-                                             new_dir->i_ino, index);
+                                             btrfs_ino(new_dir), index);
                if (ret)
                        goto out_fail;
                /*
@@ -7065,10 +6981,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * make sure the inode gets flushed if it is replacing
         * something.
         */
-        if (new_inode && new_inode->i_size &&
+        if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
-            old_inode && S_ISREG(old_inode->i_mode)) {
                btrfs_add_ordered_operation(trans, root, old_inode);
-        }
        old_dir->i_ctime = old_dir->i_mtime = ctime;
        new_dir->i_ctime = new_dir->i_mtime = ctime;
@@ -7077,7 +6991,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (old_dentry->d_parent != new_dentry->d_parent)
                btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
-        if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
                ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
                                        old_dentry->d_name.name,
@@ -7094,7 +7008,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (new_inode) {
                new_inode->i_ctime = CURRENT_TIME;
-                if (unlikely(new_inode->i_ino ==
+                if (unlikely(btrfs_ino(new_inode) ==
                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
                        root_objectid = BTRFS_I(new_inode)->location.objectid;
                        ret = btrfs_unlink_subvol(trans, dest, new_dir,
@@ -7122,7 +7036,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                             new_dentry->d_name.len, 0, index);
        BUG_ON(ret);
-        if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+        if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
                struct dentry *parent = dget_parent(new_dentry);
                btrfs_log_new_name(trans, old_inode, old_dir, parent);
                dput(parent);
@@ -7131,7 +7045,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 out_fail:
        btrfs_end_transaction_throttle(trans, root);
 out_notrans:
-        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&root->fs_info->subvol_sem);
        return ret;
@@ -7185,58 +7099,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        return 0;
 }
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
-                                   int sync)
-{
-        struct btrfs_inode *binode;
-        struct inode *inode = NULL;
-        spin_lock(&root->fs_info->delalloc_lock);
-        while (!list_empty(&root->fs_info->delalloc_inodes)) {
-                binode = list_entry(root->fs_info->delalloc_inodes.next,
-                                    struct btrfs_inode, delalloc_inodes);
-                inode = igrab(&binode->vfs_inode);
-                if (inode) {
-                        list_move_tail(&binode->delalloc_inodes,
-                                       &root->fs_info->delalloc_inodes);
-                        break;
-                }
-                list_del_init(&binode->delalloc_inodes);
-                cond_resched_lock(&root->fs_info->delalloc_lock);
-        }
-        spin_unlock(&root->fs_info->delalloc_lock);
-        if (inode) {
-                if (sync) {
-                        filemap_write_and_wait(inode->i_mapping);
-                        /*
-                         * We have to do this because compression doesn't
-                         * actually set PG_writeback until it submits the pages
-                         * for IO, which happens in an async thread, so we could
-                         * race and not actually wait for any writeback pages
-                         * because they've not been submitted yet.  Technically
-                         * this could still be the case for the ordered stuff
-                         * since the async thread may not have started to do its
-                         * work yet.  If this becomes the case then we need to
-                         * figure out a way to make sure that in writepage we
-                         * wait for any async pages to be submitted before
-                         * returning so that fdatawait does what its supposed to
-                         * do.
-                         */
-                        btrfs_wait_ordered_range(inode, 0, (u64)-1);
-                } else {
-                        filemap_flush(inode->i_mapping);
-                }
-                if (delay_iput)
-                        btrfs_add_delayed_iput(inode);
-                else
-                        iput(inode);
-                return 1;
-        }
-        return 0;
-}
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                         const char *symname)
 {
@@ -7260,9 +7122,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
                return -ENAMETOOLONG;
-        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
-        if (err)
-                return err;
        /*
         * 2 items for inode item and ref
         * 2 items for dir items
@@ -7274,8 +7133,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
+        err = btrfs_find_free_ino(root, &objectid);
+        if (err)
+                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len, dir->i_ino, objectid,
+                                dentry->d_name.len, btrfs_ino(dir), objectid,
                                BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
                                &index);
        if (IS_ERR(inode)) {
@@ -7307,7 +7170,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        path = btrfs_alloc_path();
        BUG_ON(!path);
-        key.objectid = inode->i_ino;
+        key.objectid = btrfs_ino(inode);
        key.offset = 0;
        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
        datasize = btrfs_file_extent_calc_inline_size(name_len);
@@ -7315,6 +7178,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                                      datasize);
        if (err) {
                drop_inode = 1;
+                btrfs_free_path(path);
                goto out_unlock;
        }
        leaf = path->nodes[0];
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2616f7ed4799..85e818ce00c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -50,6 +50,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
+#include "inode-map.h"
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -281,8 +282,9 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        mutex_lock(&fs_info->fs_devices->device_list_mutex);
+        rcu_read_lock();
-        list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+        list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
+                                dev_list) {
                if (!device->bdev)
                        continue;
                q = bdev_get_queue(device->bdev);
@@ -292,7 +294,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
                                     minlen);
                }
        }
-        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+        rcu_read_unlock();
        if (!num_devices)
                return -EOPNOTSUPP;
@@ -329,8 +331,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
        u64 index = 0;
-        ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+        ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
-                                       0, &objectid);
        if (ret) {
                dput(parent);
                return ret;
@@ -422,7 +423,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        BUG_ON(ret);
        ret = btrfs_insert_dir_item(trans, root,
-                                    name, namelen, dir->i_ino, &key,
+                                    name, namelen, dir, &key,
                                    BTRFS_FT_DIR, index);
        if (ret)
                goto fail;
@@ -433,7 +434,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
                                 objectid, root->root_key.objectid,
-                                 dir->i_ino, index, name, namelen);
+                                 btrfs_ino(dir), index, name, namelen);
        BUG_ON(ret);
@@ -655,6 +656,106 @@ out_unlock:
        return error;
 }
+/*
+ * Decide whether a range that looks fragmented on disk is worth
+ * defragging again, or whether it is merely waiting for delalloc to
+ * push it out.
+ *
+ * Returns 0 ("skip this part of the file") when the cached extent map
+ * covering offset already reaches more than thresh bytes past it, or
+ * when at least thresh / 2 bytes of delalloc are already queued there.
+ * Returns 1 when the range should be defragged.
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+{
+        struct btrfs_inode *binode = BTRFS_I(inode);
+        struct extent_map_tree *em_tree = &binode->extent_tree;
+        struct extent_io_tree *io_tree = &binode->io_tree;
+        struct extent_map *em;
+        u64 delalloc_bytes;
+        read_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+        read_unlock(&em_tree->lock);
+        if (em) {
+                u64 em_end = extent_map_end(em);
+                /* only the end offset is needed, drop our reference now */
+                free_extent_map(em);
+                if (em_end - offset > thresh)
+                        return 0;
+        }
+        /* a delalloc run of half the threshold is already good enough */
+        thresh /= 2;
+        delalloc_bytes = count_range_bits(io_tree, &offset, offset + thresh,
+                                          thresh, EXTENT_DELALLOC, 1);
+        return delalloc_bytes >= thresh ? 0 : 1;
+}
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ *
+ * @root:       tree that is searched for the file's items
+ * @inode:      file being defragged
+ * @newer_than: minimum transid handed to btrfs_search_forward()
+ * @off:        in: file offset to start searching from,
+ *              out: file offset of the extent that was found
+ * @thresh:     regular extents of at least this many bytes are ignored
+ *
+ * Returns 0 and updates *off when a candidate extent was found,
+ * -ENOMEM if no path could be allocated and -ENOENT when the search
+ * ran off the end of the file's extent items.
+ */
+static int find_new_extents(struct btrfs_root *root,
+                            struct inode *inode, u64 newer_than,
+                            u64 *off, int thresh)
+{
+        struct btrfs_path *path;
+        struct btrfs_key min_key;
+        struct btrfs_key max_key;
+        struct extent_buffer *leaf;
+        struct btrfs_file_extent_item *extent;
+        int type;
+        int ret;
+        /*
+         * Tree keys carry the btrfs objectid, so use btrfs_ino() like
+         * every other key lookup in this file instead of the VFS i_ino.
+         */
+        u64 ino = btrfs_ino(inode);
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        min_key.objectid = ino;
+        min_key.type = BTRFS_EXTENT_DATA_KEY;
+        min_key.offset = *off;
+        max_key.objectid = ino;
+        max_key.type = (u8)-1;
+        max_key.offset = (u64)-1;
+        path->keep_locks = 1;
+        while (1) {
+                ret = btrfs_search_forward(root, &min_key, &max_key,
+                                           path, 0, newer_than);
+                if (ret != 0)
+                        goto none;
+                /* the search moved past this inode's extent items */
+                if (min_key.objectid != ino)
+                        goto none;
+                if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+                        goto none;
+                leaf = path->nodes[0];
+                extent = btrfs_item_ptr(leaf, path->slots[0],
+                                        struct btrfs_file_extent_item);
+                type = btrfs_file_extent_type(leaf, extent);
+                if (type == BTRFS_FILE_EXTENT_REG &&
+                    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+                    check_defrag_in_cache(inode, min_key.offset, thresh)) {
+                        *off = min_key.offset;
+                        btrfs_free_path(path);
+                        return 0;
+                }
+                /* cannot advance past the last possible offset */
+                if (min_key.offset == (u64)-1)
+                        goto none;
+                min_key.offset++;
+                btrfs_release_path(path);
+        }
+none:
+        btrfs_free_path(path);
+        return -ENOENT;
+}
 static int should_defrag_range(struct inode *inode, u64 start, u64 len,
                               int thresh, u64 *last_len, u64 *skip,
                               u64 *defrag_end)
@@ -664,10 +765,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 1;
-        if (thresh == 0)
-                thresh = 256 * 1024;
        /*
         * make sure that once we start defragging and extent, we keep on
         * defragging it
@@ -726,27 +823,176 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
        return ret;
 }
-static int btrfs_defrag_file(struct file *file,
+/*
-                             struct btrfs_ioctl_defrag_range_args *range)
+ * it doesn't do much good to defrag one or two pages
+ * at a time.  This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ *
+ * @pages must have room for num_pages entries; it is only used as
+ * scratch space while the pages are held locked.
+ *
+ * Returns the number of pages that were marked delalloc and dirtied,
+ * 0 when there was nothing to do, or an error (the reservation
+ * failure code or -EIO when a page could not be read).
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+                                    struct page **pages,
+                                    unsigned long start_index,
+                                    int num_pages)
 {
-        struct inode *inode = fdentry(file)->d_inode;
+        unsigned long file_end;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
+        u64 isize = i_size_read(inode);
-        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        u64 page_start;
+        u64 page_end;
+        int ret;
+        int i;
+        int i_done;
         struct btrfs_ordered_extent *ordered;
-        struct page *page;
+        struct extent_state *cached_state = NULL;
+        if (isize == 0)
+                return 0;
+        file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+        /*
+         * reserve for the whole cluster up front; whatever is not
+         * used is handed back once we know how many pages we got
+         */
+        ret = btrfs_delalloc_reserve_space(inode,
+                                           num_pages << PAGE_CACHE_SHIFT);
+        if (ret)
+                return ret;
+again:
+        ret = 0;
+        i_done = 0;
+        /* step one, lock all the pages */
+        for (i = 0; i < num_pages; i++) {
+                struct page *page;
+                page = grab_cache_page(inode->i_mapping,
+                                            start_index + i);
+                if (!page)
+                        break;
+                if (!PageUptodate(page)) {
+                        /*
+                         * NOTE(review): the lock_page() below implies
+                         * btrfs_readpage() returns with the page
+                         * unlocked - confirm
+                         */
+                        btrfs_readpage(NULL, page);
+                        lock_page(page);
+                        if (!PageUptodate(page)) {
+                                unlock_page(page);
+                                page_cache_release(page);
+                                ret = -EIO;
+                                break;
+                        }
+                }
+                /* re-read i_size for every page before the eof check */
+                isize = i_size_read(inode);
+                file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+                if (!isize || page->index > file_end ||
+                    page->mapping != inode->i_mapping) {
+                        /* whoops, we blew past eof, skip this page */
+                        unlock_page(page);
+                        page_cache_release(page);
+                        break;
+                }
+                pages[i] = page;
+                i_done++;
+        }
+        if (!i_done || ret)
+                goto out;
+        if (!(inode->i_sb->s_flags & MS_ACTIVE))
+                goto out;
+        /*
+         * so now we have a nice long stream of locked
+         * and up to date pages, lets wait on them
+         */
+        for (i = 0; i < i_done; i++)
+                wait_on_page_writeback(pages[i]);
+        page_start = page_offset(pages[0]);
+        page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+        lock_extent_bits(&BTRFS_I(inode)->io_tree,
+                         page_start, page_end - 1, 0, &cached_state,
+                         GFP_NOFS);
+        /*
+         * an ordered extent overlaps our range: drop every lock and
+         * page we hold, wait for it to finish and start over
+         */
+        ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
+        if (ordered &&
+            ordered->file_offset + ordered->len > page_start &&
+            ordered->file_offset < page_end) {
+                btrfs_put_ordered_extent(ordered);
+                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                     page_start, page_end - 1,
+                                     &cached_state, GFP_NOFS);
+                for (i = 0; i < i_done; i++) {
+                        unlock_page(pages[i]);
+                        page_cache_release(pages[i]);
+                }
+                btrfs_wait_ordered_range(inode, page_start,
+                                         page_end - page_start);
+                goto again;
+        }
+        if (ordered)
+                btrfs_put_ordered_extent(ordered);
+        /* clear the old state bits before the range is re-marked below */
+        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+                          page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+                          EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+                          GFP_NOFS);
+        /* fewer pages than reserved: give back the unused reservation */
+        if (i_done != num_pages) {
+                atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+                btrfs_delalloc_release_space(inode,
+                                     (num_pages - i_done) << PAGE_CACHE_SHIFT);
+        }
+        btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
+                                  &cached_state);
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                             page_start, page_end - 1, &cached_state,
+                             GFP_NOFS);
+        /* re-dirty every page, then drop its lock and our reference */
+        for (i = 0; i < i_done; i++) {
+                clear_page_dirty_for_io(pages[i]);
+                ClearPageChecked(pages[i]);
+                set_page_extent_mapped(pages[i]);
+                set_page_dirty(pages[i]);
+                unlock_page(pages[i]);
+                page_cache_release(pages[i]);
+        }
+        return i_done;
+out:
+        /* nothing was defragged: drop the pages and the full reservation */
+        for (i = 0; i < i_done; i++) {
+                unlock_page(pages[i]);
+                page_cache_release(pages[i]);
+        }
+        btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+        return ret;
+}
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+                      struct btrfs_ioctl_defrag_range_args *range,
+                      u64 newer_than, unsigned long max_to_defrag)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_super_block *disk_super;
+        struct file_ra_state *ra = NULL;
        unsigned long last_index;
-        unsigned long ra_pages = root->fs_info->bdi.ra_pages;
-        unsigned long total_read = 0;
        u64 features;
-        u64 page_start;
-        u64 page_end;
        u64 last_len = 0;
        u64 skip = 0;
        u64 defrag_end = 0;
+        u64 newer_off = range->start;
+        int newer_left = 0;
        unsigned long i;
        int ret;
+        int defrag_count = 0;
        int compress_type = BTRFS_COMPRESS_ZLIB;
+        int extent_thresh = range->extent_thresh;
+        int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+        u64 new_align = ~((u64)128 * 1024 - 1);
+        struct page **pages = NULL;
+        if (extent_thresh == 0)
+                extent_thresh = 256 * 1024;
        if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
                if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -758,6 +1004,27 @@ static int btrfs_defrag_file(struct file *file,
        if (inode->i_size == 0)
                return 0;
+        /*
+         * if we were not given a file, allocate a readahead
+         * context
+         */
+        if (!file) {
+                ra = kzalloc(sizeof(*ra), GFP_NOFS);
+                if (!ra)
+                        return -ENOMEM;
+                file_ra_state_init(ra, inode->i_mapping);
+        } else {
+                ra = &file->f_ra;
+        }
+        pages = kmalloc(sizeof(struct page *) * newer_cluster,
+                        GFP_NOFS);
+        if (!pages) {
+                ret = -ENOMEM;
+                goto out_ra;
+        }
+        /* find the last page to defrag */
        if (range->start + range->len > range->start) {
                last_index = min_t(u64, inode->i_size - 1,
                         range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
@@ -765,11 +1032,37 @@ static int btrfs_defrag_file(struct file *file,
                last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
        }
-        i = range->start >> PAGE_CACHE_SHIFT;
+        if (newer_than) {
-        while (i <= last_index) {
+                ret = find_new_extents(root, inode, newer_than,
-                if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+                                       &newer_off, 64 * 1024);
+                if (!ret) {
+                        range->start = newer_off;
+                        /*
+                         * we always align our defrag to help keep
+                         * the extents in the file evenly spaced
+                         */
+                        i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+                        newer_left = newer_cluster;
+                } else
+                        goto out_ra;
+        } else {
+                i = range->start >> PAGE_CACHE_SHIFT;
+        }
+        if (!max_to_defrag)
+                max_to_defrag = last_index - 1;
+        while (i <= last_index && defrag_count < max_to_defrag) {
+                /*
+                 * make sure we stop running if someone unmounts
+                 * the FS
+                 */
+                if (!(inode->i_sb->s_flags & MS_ACTIVE))
+                        break;
+                if (!newer_than &&
+                    !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
                                        PAGE_CACHE_SIZE,
-                                        range->extent_thresh,
+                                        extent_thresh,
                                        &last_len, &skip,
                                        &defrag_end)) {
                        unsigned long next;
@@ -781,92 +1074,39 @@ static int btrfs_defrag_file(struct file *file,
                        i = max(i + 1, next);
                        continue;
                }
-                if (total_read % ra_pages == 0) {
-                        btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
-                                       min(last_index, i + ra_pages - 1));
-                }
-                total_read++;
-                mutex_lock(&inode->i_mutex);
                if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
                        BTRFS_I(inode)->force_compress = compress_type;
-                ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+                btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
-                if (ret)
-                        goto err_unlock;
-again:
-                if (inode->i_size == 0 ||
-                    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
-                        ret = 0;
-                        goto err_reservations;
-                }
-                page = grab_cache_page(inode->i_mapping, i);
+                ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
-                if (!page) {
+                if (ret < 0)
-                        ret = -ENOMEM;
+                        goto out_ra;
-                        goto err_reservations;
-                }
-                if (!PageUptodate(page)) {
-                        btrfs_readpage(NULL, page);
-                        lock_page(page);
-                        if (!PageUptodate(page)) {
-                                unlock_page(page);
-                                page_cache_release(page);
-                                ret = -EIO;
-                                goto err_reservations;
-                        }
-                }
-                if (page->mapping != inode->i_mapping) {
-                        unlock_page(page);
-                        page_cache_release(page);
-                        goto again;
-                }
-                wait_on_page_writeback(page);
+                defrag_count += ret;
+                balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+                i += ret;
-                if (PageDirty(page)) {
+                if (newer_than) {
-                        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+                        if (newer_off == (u64)-1)
-                        goto loop_unlock;
+                                break;
-                }
-                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-                page_end = page_start + PAGE_CACHE_SIZE - 1;
-                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                ordered = btrfs_lookup_ordered_extent(inode, page_start);
+                        newer_off = max(newer_off + 1,
-                if (ordered) {
+                                        (u64)i << PAGE_CACHE_SHIFT);
-                        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                        unlock_page(page);
+                        ret = find_new_extents(root, inode,
-                        page_cache_release(page);
+                                               newer_than, &newer_off,
-                        btrfs_start_ordered_extent(inode, ordered, 1);
+                                               64 * 1024);
-                        btrfs_put_ordered_extent(ordered);
+                        if (!ret) {
-                        goto again;
+                                range->start = newer_off;
+                                i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+                                newer_left = newer_cluster;
+                        } else {
+                                break;
+                        }
+                } else {
+                        i++;
                }
-                set_page_extent_mapped(page);
-                /*
-                 * this makes sure page_mkwrite is called on the
-                 * page if it is dirtied again later
-                 */
-                clear_page_dirty_for_io(page);
-                clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
-                                  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
-                                  EXTENT_DO_ACCOUNTING, GFP_NOFS);
-                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
-                ClearPageChecked(page);
-                set_page_dirty(page);
-                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-loop_unlock:
-                unlock_page(page);
-                page_cache_release(page);
-                mutex_unlock(&inode->i_mutex);
-                balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
-                i++;
        }
        if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
@@ -898,12 +1138,14 @@ loop_unlock:
                btrfs_set_super_incompat_flags(disk_super, features);
        }
-        return 0;
+        if (!file)
+                kfree(ra);
+        return defrag_count;
-err_reservations:
+out_ra:
-        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+        if (!file)
-err_unlock:
+                kfree(ra);
-        mutex_unlock(&inode->i_mutex);
+        kfree(pages);
        return ret;
 }
@@ -1129,7 +1371,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
        int ret = 0;
        u64 flags = 0;
-        if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
                return -EINVAL;
        down_read(&root->fs_info->subvol_sem);
@@ -1156,7 +1398,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
-        if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
                return -EINVAL;
        if (copy_from_user(&flags, arg, sizeof(flags)))
@@ -1279,7 +1521,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
        int nritems;
        int i;
        int slot;
-        int found = 0;
        int ret = 0;
        leaf = path->nodes[0];
@@ -1326,7 +1567,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
                                           item_off, item_len);
                        *sk_offset += item_len;
                }
-                found++;
+                (*num_found)++;
                if (*num_found >= sk->nr_items)
                        break;
@@ -1345,7 +1586,6 @@ advance_key:
        } else
                ret = 1;
 overflow:
-        *num_found += found;
        return ret;
 }
@@ -1402,7 +1642,7 @@ static noinline int search_ioctl(struct inode *inode,
                }
                ret = copy_to_sk(root, path, &key, sk, args->buf,
                                 &sk_offset, &num_found);
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                if (ret || num_found >= sk->nr_items)
                        break;
@@ -1509,7 +1749,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
                if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
                        break;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                key.objectid = key.offset;
                key.offset = (u64)-1;
                dirid = key.objectid;
@@ -1639,7 +1879,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                        goto out_dput;
        }
-        if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
                err = -EINVAL;
                goto out_dput;
        }
@@ -1757,7 +1997,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                        /* the rest are all set to zero by kzalloc */
                        range->len = (u64)-1;
                }
-                ret = btrfs_defrag_file(file, range);
+                ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+                                        range, 0, 0);
+                if (ret > 0)
+                        ret = 0;
                kfree(range);
                break;
        default:
@@ -1809,6 +2052,75 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
        return ret;
 }
+/*
+ * ioctl helper: report the number of devices, the highest devid in
+ * use and the fsid of the filesystem to user space.
+ *
+ * @root: any root of the filesystem being queried
+ * @arg:  user pointer to a struct btrfs_ioctl_fs_info_args to fill in
+ *
+ * Requires CAP_SYS_ADMIN.  Returns 0 on success, -EPERM without the
+ * capability and -EFAULT if the result cannot be copied out.
+ */
+static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
+{
+        struct btrfs_ioctl_fs_info_args fi_args;
+        struct btrfs_device *device;
+        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /*
+         * The whole structure is copied to user space below, so clear
+         * it first: anything not filled in explicitly must not carry
+         * stale kernel stack contents.  This also sets max_id to 0.
+         */
+        memset(&fi_args, 0, sizeof(fi_args));
+        memcpy(&fi_args.fsid, root->fs_info->fsid, sizeof(fi_args.fsid));
+        mutex_lock(&fs_devices->device_list_mutex);
+        /* sample the count under the same lock as the list walk */
+        fi_args.num_devices = fs_devices->num_devices;
+        /* nothing is removed while walking, the plain iterator is enough */
+        list_for_each_entry(device, &fs_devices->devices, dev_list) {
+                if (device->devid > fi_args.max_id)
+                        fi_args.max_id = device->devid;
+        }
+        mutex_unlock(&fs_devices->device_list_mutex);
+        if (copy_to_user(arg, &fi_args, sizeof(fi_args)))
+                return -EFAULT;
+        return 0;
+}
+/*
+ * ioctl helper: look up one device of the filesystem and report its
+ * devid, bytes used, total size, uuid and path to user space.
+ *
+ * @root: any root of the filesystem being queried
+ * @arg:  user pointer to a struct btrfs_ioctl_dev_info_args; devid and
+ *        uuid are read as the lookup key, and an all-zero uuid is
+ *        passed to btrfs_find_device() as NULL
+ *
+ * Requires CAP_SYS_ADMIN.  Returns 0 on success, -EPERM without the
+ * capability, the memdup_user() error if the request cannot be read,
+ * -ENODEV if no device matches and -EFAULT if the result cannot be
+ * copied out.
+ */
+static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
+{
+        struct btrfs_ioctl_dev_info_args *di_args;
+        struct btrfs_device *dev;
+        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+        int ret = 0;
+        char *s_uuid = NULL;
+        char empty_uuid[BTRFS_UUID_SIZE] = {0};
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        di_args = memdup_user(arg, sizeof(*di_args));
+        if (IS_ERR(di_args))
+                return PTR_ERR(di_args);
+        if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0)
+                s_uuid = di_args->uuid;
+        /*
+         * Keep the device list locked for as long as dev is
+         * dereferenced, so the device cannot go away underneath us.
+         */
+        mutex_lock(&fs_devices->device_list_mutex);
+        dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
+        if (!dev) {
+                ret = -ENODEV;
+                goto out_unlock;
+        }
+        di_args->devid = dev->devid;
+        di_args->bytes_used = dev->bytes_used;
+        di_args->total_bytes = dev->total_bytes;
+        memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+        if (dev->name) {
+                strncpy(di_args->path, dev->name, sizeof(di_args->path));
+                /* strncpy() does not terminate when the name fills the buffer */
+                di_args->path[sizeof(di_args->path) - 1] = '\0';
+        } else {
+                /* NOTE(review): presumably a device without a name - report an empty path */
+                di_args->path[0] = '\0';
+        }
+out_unlock:
+        mutex_unlock(&fs_devices->device_list_mutex);
+        /* copy out only after the lock is dropped, it may fault */
+        if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
+                ret = -EFAULT;
+        kfree(di_args);
+        return ret;
+}
 static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                       u64 off, u64 olen, u64 destoff)
 {
@@ -1925,7 +2237,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        }
        /* clone data */
-        key.objectid = src->i_ino;
+        key.objectid = btrfs_ino(src);
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = 0;
@@ -1952,7 +2264,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
-                    key.objectid != src->i_ino)
+                    key.objectid != btrfs_ino(src))
                        break;
                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
@@ -1988,14 +2300,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                datal = btrfs_file_extent_ram_bytes(leaf,
                                                                    extent);
                        }
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        if (key.offset + datal <= off ||
                            key.offset >= off+len)
                                goto next;
                        memcpy(&new_key, &key, sizeof(new_key));
-                        new_key.objectid = inode->i_ino;
+                        new_key.objectid = btrfs_ino(inode);
                        if (off <= key.offset)
                                new_key.offset = key.offset + destoff - off;
                        else
@@ -2049,7 +2361,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                        ret = btrfs_inc_extent_ref(trans, root,
                                                        disko, diskl, 0,
                                                        root->root_key.objectid,
-                                                        inode->i_ino,
+                                                        btrfs_ino(inode),
                                                        new_key.offset - datao);
                                        BUG_ON(ret);
                                }
@@ -2098,7 +2410,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        }
                        btrfs_mark_buffer_dirty(leaf);
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -2119,12 +2431,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        btrfs_end_transaction(trans, root);
                }
 next:
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                key.offset++;
        }
        ret = 0;
 out:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 out_unlock:
        mutex_unlock(&src->i_mutex);
@@ -2471,6 +2783,58 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
        return btrfs_wait_for_commit(root, transid);
 }
+/*
+ * BTRFS_IOC_SCRUB: run btrfs_scrub_dev() on the device and range described
+ * by the caller's struct btrfs_ioctl_scrub_args.
+ *
+ * The argument block, including the progress counters filled in by
+ * btrfs_scrub_dev(), is copied back to userspace whatever the scrub
+ * returned; a failed copy-out replaces the result with -EFAULT.
+ */
+static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+{
+        struct btrfs_ioctl_scrub_args *args;
+        int err;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* work on a kernel copy of the user's request */
+        args = memdup_user(arg, sizeof(*args));
+        if (IS_ERR(args))
+                return PTR_ERR(args);
+        err = btrfs_scrub_dev(root, args->devid, args->start, args->end,
+                              &args->progress,
+                              args->flags & BTRFS_SCRUB_READONLY);
+        if (copy_to_user(arg, args, sizeof(*args)))
+                err = -EFAULT;
+        kfree(args);
+        return err;
+}
+/*
+ * BTRFS_IOC_SCRUB_CANCEL: forward to btrfs_scrub_cancel() for callers that
+ * hold CAP_SYS_ADMIN; everyone else gets -EPERM.  @arg is not used.
+ */
+static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
+{
+        if (capable(CAP_SYS_ADMIN))
+                return btrfs_scrub_cancel(root);
+        return -EPERM;
+}
+/*
+ * BTRFS_IOC_SCRUB_PROGRESS: ask btrfs_scrub_progress() for the counters of
+ * the device named by devid in the caller's struct btrfs_ioctl_scrub_args.
+ *
+ * The argument block is copied back to userspace regardless of what
+ * btrfs_scrub_progress() returned; a failed copy-out turns the result into
+ * -EFAULT.
+ */
+static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
+                                       void __user *arg)
+{
+        struct btrfs_ioctl_scrub_args *args;
+        int err;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* work on a kernel copy of the user's request */
+        args = memdup_user(arg, sizeof(*args));
+        if (IS_ERR(args))
+                return PTR_ERR(args);
+        err = btrfs_scrub_progress(root, args->devid, &args->progress);
+        if (copy_to_user(arg, args, sizeof(*args)))
+                err = -EFAULT;
+        kfree(args);
+        return err;
+}
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -2510,6 +2874,10 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_add_dev(root, argp);
        case BTRFS_IOC_RM_DEV:
                return btrfs_ioctl_rm_dev(root, argp);
+        case BTRFS_IOC_FS_INFO:
+                return btrfs_ioctl_fs_info(root, argp);
+        case BTRFS_IOC_DEV_INFO:
+                return btrfs_ioctl_dev_info(root, argp);
        case BTRFS_IOC_BALANCE:
                return btrfs_balance(root->fs_info->dev_root);
        case BTRFS_IOC_CLONE:
@@ -2533,6 +2901,12 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_start_sync(file, argp);
        case BTRFS_IOC_WAIT_SYNC:
                return btrfs_ioctl_wait_sync(file, argp);
+        case BTRFS_IOC_SCRUB:
+                return btrfs_ioctl_scrub(root, argp);
+        case BTRFS_IOC_SCRUB_CANCEL:
+                return btrfs_ioctl_scrub_cancel(root, argp);
+        case BTRFS_IOC_SCRUB_PROGRESS:
+                return btrfs_ioctl_scrub_progress(root, argp);
        }
        return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8fb382167b13..ad1ea789fcb4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -32,6 +32,8 @@ struct btrfs_ioctl_vol_args {
 #define BTRFS_SUBVOL_CREATE_ASYNC       (1ULL << 0)
 #define BTRFS_SUBVOL_RDONLY             (1ULL << 1)
+#define BTRFS_FSID_SIZE 16
+#define BTRFS_UUID_SIZE 16
 #define BTRFS_SUBVOL_NAME_MAX 4039
 struct btrfs_ioctl_vol_args_v2 {
@@ -42,6 +44,71 @@ struct btrfs_ioctl_vol_args_v2 {
        char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
+/*
+ * structure to report errors and progress to userspace, either as a
+ * result of a finished scrub, a canceled scrub or a progress inquiry
+ */
+struct btrfs_scrub_progress {
+        __u64 data_extents_scrubbed;    /* # of data extents scrubbed */
+        __u64 tree_extents_scrubbed;    /* # of tree extents scrubbed */
+        __u64 data_bytes_scrubbed;      /* # of data bytes scrubbed */
+        __u64 tree_bytes_scrubbed;      /* # of tree bytes scrubbed */
+        __u64 read_errors;              /* # of read errors encountered (EIO) */
+        __u64 csum_errors;              /* # of failed csum checks */
+        __u64 verify_errors;            /* # of occurrences where the metadata
+                                         * of a tree block did not match the
+                                         * expected values, like generation or
+                                         * logical */
+        __u64 no_csum;                  /* # of 4k data blocks for which no csum
+                                         * is present, probably the result of
+                                         * data written with nodatasum */
+        __u64 csum_discards;            /* # of csums for which no data was found
+                                         * in the extent tree. */
+        __u64 super_errors;             /* # of bad super blocks encountered */
+        __u64 malloc_errors;            /* # of internal kmalloc errors. These
+                                         * will likely cause an incomplete
+                                         * scrub */
+        __u64 uncorrectable_errors;     /* # of errors where either no intact
+                                         * copy was found or the writeback
+                                         * failed */
+        __u64 corrected_errors;         /* # of errors corrected */
+        __u64 last_physical;            /* last physical address scrubbed. In
+                                         * case a scrub was aborted, this can
+                                         * be used to restart the scrub */
+        __u64 unverified_errors;        /* # of occurrences where a read for a
+                                         * full (64k) bio failed, but the re-
+                                         * check succeeded for each 4k piece.
+                                         * Intermittent error. */
+};
+#define BTRFS_SCRUB_READONLY    1
+struct btrfs_ioctl_scrub_args {
+        __u64 devid;                            /* in */
+        __u64 start;                            /* in */
+        __u64 end;                              /* in */
+        __u64 flags;                            /* in */
+        struct btrfs_scrub_progress progress;   /* out */
+        /* pad to 1k: 1024 bytes minus the four __u64 fields above (32 bytes)
+         * and the progress struct, expressed in __u64 units */
+        __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
+};
+#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+struct btrfs_ioctl_dev_info_args {
+        __u64 devid;                            /* in: devid to look up; out: devid of the device found */
+        __u8 uuid[BTRFS_UUID_SIZE];             /* in: uuid to look up, all zero for none; out: device uuid */
+        __u64 bytes_used;                       /* out */
+        __u64 total_bytes;                      /* out */
+        __u64 unused[379];                      /* pad to 4k: 40 bytes of fields + 379 * 8 + 1024 bytes of path */
+        __u8 path[BTRFS_DEVICE_PATH_NAME_MAX];  /* out: device name */
+};
+struct btrfs_ioctl_fs_info_args {
+        __u64 max_id;                           /* out */
+        __u64 num_devices;                      /* out */
+        __u8 fsid[BTRFS_FSID_SIZE];             /* out */
+        __u64 reserved[124];                    /* pad to 1k: 32 bytes of fields + 124 * 8 */
+};
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
        __u64 treeid;
@@ -114,37 +181,6 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
 #define BTRFS_DEFRAG_RANGE_START_IO 2
-struct btrfs_ioctl_defrag_range_args {
-        /* start of the defrag operation */
-        __u64 start;
-        /* number of bytes to defrag, use (u64)-1 to say all */
-        __u64 len;
-        /*
-         * flags for the operation, which can include turning
-         * on compression for this one defrag
-         */
-        __u64 flags;
-        /*
-         * any extent bigger than this will be considered
-         * already defragged.  Use 0 to take the kernel default
-         * Use 1 to say every single extent must be rewritten
-         */
-        __u32 extent_thresh;
-        /*
-         * which compression method to use if turning on compression
-         * for this defrag operation.  If unspecified, zlib will
-         * be used
-         */
-        __u32 compress_type;
-        /* spare for later */
-        __u32 unused[4];
-};
 struct btrfs_ioctl_space_info {
        __u64 flags;
        __u64 total_bytes;
@@ -203,4 +239,13 @@ struct btrfs_ioctl_space_args {
                                   struct btrfs_ioctl_vol_args_v2)
 #define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
 #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
+#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
+                              struct btrfs_ioctl_scrub_args)
+#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
+#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
+                                       struct btrfs_ioctl_scrub_args)
+#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
+                                 struct btrfs_ioctl_dev_info_args)
+#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
+                               struct btrfs_ioctl_fs_info_args)
 #endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6151f2ea38bb..66fa43dc3f0f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -185,31 +185,6 @@ sleep:
        return 0;
 }
-/*
- * Very quick trylock, this does not spin or schedule.  It returns
- * 1 with the spinlock held if it was able to take the lock, or it
- * returns zero if it was unable to take the lock.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
- */
-int btrfs_try_tree_lock(struct extent_buffer *eb)
-{
-        if (spin_trylock(&eb->lock)) {
-                if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-                        /*
-                         * we've got the spinlock, but the real owner is
-                         * blocking.  Drop the spinlock and return failure
-                         */
-                        spin_unlock(&eb->lock);
-                        return 0;
-                }
-                return 1;
-        }
-        /* someone else has the spinlock giveup */
-        return 0;
-}
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
        /*
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6c4ce457168c..5c33a560a2f1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,8 +21,6 @@
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_try_tree_lock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 void btrfs_set_lock_blocking(struct extent_buffer *eb);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index a97314cf6bd6..82d569cb6267 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -23,56 +23,6 @@
 #include "ref-cache.h"
 #include "transaction.h"
-/*
- * leaf refs are used to cache the information about which extents
- * a given leaf has references on.  This allows us to process that leaf
- * in btrfs_drop_snapshot without needing to read it back from disk.
- */
-/*
- * kmalloc a leaf reference struct and update the counters for the
- * total ref cache size
- */
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
-                                            int nr_extents)
-{
-        struct btrfs_leaf_ref *ref;
-        size_t size = btrfs_leaf_ref_size(nr_extents);
-        ref = kmalloc(size, GFP_NOFS);
-        if (ref) {
-                spin_lock(&root->fs_info->ref_cache_lock);
-                root->fs_info->total_ref_cache_size += size;
-                spin_unlock(&root->fs_info->ref_cache_lock);
-                memset(ref, 0, sizeof(*ref));
-                atomic_set(&ref->usage, 1);
-                INIT_LIST_HEAD(&ref->list);
-        }
-        return ref;
-}
-/*
- * free a leaf reference struct and update the counters for the
- * total ref cache size
- */
-void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
-{
-        if (!ref)
-                return;
-        WARN_ON(atomic_read(&ref->usage) == 0);
-        if (atomic_dec_and_test(&ref->usage)) {
-                size_t size = btrfs_leaf_ref_size(ref->nritems);
-                BUG_ON(ref->in_tree);
-                kfree(ref);
-                spin_lock(&root->fs_info->ref_cache_lock);
-                root->fs_info->total_ref_cache_size -= size;
-                spin_unlock(&root->fs_info->ref_cache_lock);
-        }
-}
 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
                                   struct rb_node *node)
 {
@@ -116,117 +66,3 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
        }
        return NULL;
 }
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
-                           int shared)
-{
-        struct btrfs_leaf_ref *ref = NULL;
-        struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-        if (shared)
-                tree = &root->fs_info->shared_ref_tree;
-        if (!tree)
-                return 0;
-        spin_lock(&tree->lock);
-        while (!list_empty(&tree->list)) {
-                ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
-                BUG_ON(ref->tree != tree);
-                if (ref->root_gen > max_root_gen)
-                        break;
-                if (!xchg(&ref->in_tree, 0)) {
-                        cond_resched_lock(&tree->lock);
-                        continue;
-                }
-                rb_erase(&ref->rb_node, &tree->root);
-                list_del_init(&ref->list);
-                spin_unlock(&tree->lock);
-                btrfs_free_leaf_ref(root, ref);
-                cond_resched();
-                spin_lock(&tree->lock);
-        }
-        spin_unlock(&tree->lock);
-        return 0;
-}
-/*
- * find the leaf ref for a given extent.  This returns the ref struct with
- * a usage reference incremented
- */
-struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
-                                             u64 bytenr)
-{
-        struct rb_node *rb;
-        struct btrfs_leaf_ref *ref = NULL;
-        struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-again:
-        if (tree) {
-                spin_lock(&tree->lock);
-                rb = tree_search(&tree->root, bytenr);
-                if (rb)
-                        ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
-                if (ref)
-                        atomic_inc(&ref->usage);
-                spin_unlock(&tree->lock);
-                if (ref)
-                        return ref;
-        }
-        if (tree != &root->fs_info->shared_ref_tree) {
-                tree = &root->fs_info->shared_ref_tree;
-                goto again;
-        }
-        return NULL;
-}
-/*
- * add a fully filled in leaf ref struct
- * remove all the refs older than a given root generation
- */
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
-                       int shared)
-{
-        int ret = 0;
-        struct rb_node *rb;
-        struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-        if (shared)
-                tree = &root->fs_info->shared_ref_tree;
-        spin_lock(&tree->lock);
-        rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
-        if (rb) {
-                ret = -EEXIST;
-        } else {
-                atomic_inc(&ref->usage);
-                ref->tree = tree;
-                ref->in_tree = 1;
-                list_add_tail(&ref->list, &tree->list);
-        }
-        spin_unlock(&tree->lock);
-        return ret;
-}
-/*
- * remove a single leaf ref from the tree.  This drops the ref held by the tree
- * only
- */
-int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
-{
-        struct btrfs_leaf_ref_tree *tree;
-        if (!xchg(&ref->in_tree, 0))
-                return 0;
-        tree = ref->tree;
-        spin_lock(&tree->lock);
-        rb_erase(&ref->rb_node, &tree->root);
-        list_del_init(&ref->list);
-        spin_unlock(&tree->lock);
-        btrfs_free_leaf_ref(root, ref);
-        return 0;
-}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index e2a55cb2072b..24f7001f6387 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -49,28 +49,4 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
        return sizeof(struct btrfs_leaf_ref) +
               sizeof(struct btrfs_extent_info) * nr_extents;
 }
-static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
-{
-        tree->root = RB_ROOT;
-        INIT_LIST_HEAD(&tree->list);
-        spin_lock_init(&tree->lock);
-}
-static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
-{
-        return RB_EMPTY_ROOT(&tree->root);
-}
-void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
-                                            int nr_extents);
-void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
-                                             u64 bytenr);
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
-                       int shared);
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
-                           int shared);
-int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 #endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f340f7c99d09..ca38eca70af0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -30,6 +30,7 @@
 #include "btrfs_inode.h"
 #include "async-thread.h"
 #include "free-space-cache.h"
+#include "inode-map.h"
 /*
 * backref_node, mapping_node and tree_block start with this
@@ -507,6 +508,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
        return 1;
 }
 static int should_ignore_root(struct btrfs_root *root)
 {
        struct btrfs_root *reloc_root;
@@ -529,7 +531,6 @@ static int should_ignore_root(struct btrfs_root *root)
         */
        return 1;
 }
 /*
 * find reloc tree by address of tree root
 */
@@ -961,7 +962,7 @@ again:
                        lower = upper;
                        upper = NULL;
                }
-                btrfs_release_path(root, path2);
+                btrfs_release_path(path2);
 next:
                if (ptr < end) {
                        ptr += btrfs_extent_inline_ref_size(key.type);
@@ -974,7 +975,7 @@ next:
                if (ptr >= end)
                        path1->slots[0]++;
        }
-        btrfs_release_path(rc->extent_root, path1);
+        btrfs_release_path(path1);
        cur->checked = 1;
        WARN_ON(exist);
@@ -1409,9 +1410,9 @@ again:
                prev = node;
                entry = rb_entry(node, struct btrfs_inode, rb_node);
-                if (objectid < entry->vfs_inode.i_ino)
+                if (objectid < btrfs_ino(&entry->vfs_inode))
                        node = node->rb_left;
-                else if (objectid > entry->vfs_inode.i_ino)
+                else if (objectid > btrfs_ino(&entry->vfs_inode))
                        node = node->rb_right;
                else
                        break;
@@ -1419,7 +1420,7 @@ again:
        if (!node) {
                while (prev) {
                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
-                        if (objectid <= entry->vfs_inode.i_ino) {
+                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
                                node = prev;
                                break;
                        }
@@ -1434,7 +1435,7 @@ again:
                        return inode;
                }
-                objectid = entry->vfs_inode.i_ino + 1;
+                objectid = btrfs_ino(&entry->vfs_inode) + 1;
                if (cond_resched_lock(&root->inode_lock))
                        goto again;
@@ -1470,7 +1471,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
                return -ENOMEM;
        bytenr -= BTRFS_I(reloc_inode)->index_cnt;
-        ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
                                       bytenr, 0);
        if (ret < 0)
                goto out;
@@ -1558,11 +1559,11 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                        if (first) {
                                inode = find_next_inode(root, key.objectid);
                                first = 0;
-                        } else if (inode && inode->i_ino < key.objectid) {
+                        } else if (inode && btrfs_ino(inode) < key.objectid) {
                                btrfs_add_delayed_iput(inode);
                                inode = find_next_inode(root, key.objectid);
                        }
-                        if (inode && inode->i_ino == key.objectid) {
+                        if (inode && btrfs_ino(inode) == key.objectid) {
                                end = key.offset +
                                      btrfs_file_extent_num_bytes(leaf, fi);
                                WARN_ON(!IS_ALIGNED(key.offset,
@@ -1749,7 +1750,7 @@ again:
                btrfs_node_key_to_cpu(path->nodes[level], &key,
                                      path->slots[level]);
-                btrfs_release_path(src, path);
+                btrfs_release_path(path);
                path->lowest_level = level;
                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
@@ -1893,6 +1894,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
        struct inode *inode = NULL;
        u64 objectid;
        u64 start, end;
+        u64 ino;
        objectid = min_key->objectid;
        while (1) {
@@ -1905,17 +1907,18 @@ static int invalidate_extent_cache(struct btrfs_root *root,
                inode = find_next_inode(root, objectid);
                if (!inode)
                        break;
+                ino = btrfs_ino(inode);
-                if (inode->i_ino > max_key->objectid) {
+                if (ino > max_key->objectid) {
                        iput(inode);
                        break;
                }
-                objectid = inode->i_ino + 1;
+                objectid = ino + 1;
                if (!S_ISREG(inode->i_mode))
                        continue;
-                if (unlikely(min_key->objectid == inode->i_ino)) {
+                if (unlikely(min_key->objectid == ino)) {
                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
                                continue;
                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
@@ -1928,7 +1931,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
                        start = 0;
                }
-                if (unlikely(max_key->objectid == inode->i_ino)) {
+                if (unlikely(max_key->objectid == ino)) {
                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
                                continue;
                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
@@ -2496,7 +2499,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                        path->locks[upper->level] = 0;
                        slot = path->slots[upper->level];
-                        btrfs_release_path(NULL, path);
+                        btrfs_release_path(path);
                } else {
                        ret = btrfs_bin_search(upper->eb, key, upper->level,
                                               &slot);
@@ -2737,7 +2740,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
                } else {
                        path->lowest_level = node->level;
                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        if (ret > 0)
                                ret = 0;
                }
@@ -2870,7 +2873,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
        struct extent_map *em;
        int ret = 0;
-        em = alloc_extent_map(GFP_NOFS);
+        em = alloc_extent_map();
        if (!em)
                return -ENOMEM;
@@ -3119,7 +3122,7 @@ static int add_tree_block(struct reloc_control *rc,
 #endif
        }
-        btrfs_release_path(rc->extent_root, path);
+        btrfs_release_path(path);
        BUG_ON(level == -1);
@@ -3220,7 +3223,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
-        if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
+        if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
                if (inode && !IS_ERR(inode))
                        iput(inode);
                return -ENOENT;
@@ -3505,7 +3508,7 @@ int add_data_references(struct reloc_control *rc,
                }
                path->slots[0]++;
        }
-        btrfs_release_path(rc->extent_root, path);
+        btrfs_release_path(path);
        if (err)
                free_block_list(blocks);
        return err;
@@ -3568,7 +3571,7 @@ next:
                                            EXTENT_DIRTY);
                if (ret == 0 && start <= key.objectid) {
-                        btrfs_release_path(rc->extent_root, path);
+                        btrfs_release_path(path);
                        rc->search_start = end + 1;
                } else {
                        rc->search_start = key.objectid + key.offset;
@@ -3576,7 +3579,7 @@ next:
                        return 0;
                }
        }
-        btrfs_release_path(rc->extent_root, path);
+        btrfs_release_path(path);
        return ret;
 }
@@ -3713,7 +3716,7 @@ restart:
                                flags = BTRFS_EXTENT_FLAG_DATA;
                        if (path_change) {
-                                btrfs_release_path(rc->extent_root, path);
+                                btrfs_release_path(path);
                                path->search_commit_root = 1;
                                path->skip_locking = 1;
@@ -3736,7 +3739,7 @@ restart:
                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
                        ret = add_data_references(rc, &key, path, &blocks);
                } else {
-                        btrfs_release_path(rc->extent_root, path);
+                        btrfs_release_path(path);
                        ret = 0;
                }
                if (ret < 0) {
@@ -3799,7 +3802,7 @@ restart:
                }
        }
-        btrfs_release_path(rc->extent_root, path);
+        btrfs_release_path(path);
        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
                          GFP_NOFS);
@@ -3867,7 +3870,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
                                          BTRFS_INODE_PREALLOC);
        btrfs_mark_buffer_dirty(leaf);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
 out:
        btrfs_free_path(path);
        return ret;
@@ -3897,7 +3900,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
        if (IS_ERR(trans))
                return ERR_CAST(trans);
-        err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+        err = btrfs_find_free_objectid(root, &objectid);
        if (err)
                goto out;
@@ -3935,7 +3938,7 @@ static struct reloc_control *alloc_reloc_control(void)
        INIT_LIST_HEAD(&rc->reloc_roots);
        backref_cache_init(&rc->backref_cache);
        mapping_tree_init(&rc->reloc_root_tree);
-        extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+        extent_io_tree_init(&rc->processed_blocks, NULL);
        return rc;
 }
@@ -4109,7 +4112,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
                }
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-                btrfs_release_path(root->fs_info->tree_root, path);
+                btrfs_release_path(path);
                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
                    key.type != BTRFS_ROOT_ITEM_KEY)
@@ -4141,7 +4144,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
                key.offset--;
        }
-        btrfs_release_path(root->fs_info->tree_root, path);
+        btrfs_release_path(path);
        if (list_empty(&reloc_roots))
                goto out;
@@ -4242,7 +4245,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
-                                       disk_bytenr + len - 1, &list);
+                                       disk_bytenr + len - 1, &list, 0);
        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6928bff62daa..ebe45443de06 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,53 +22,6 @@
 #include "print-tree.h"
 /*
- *  search forward for a root, starting with objectid 'search_start'
- *  if a root key is found, the objectid we find is filled into 'found_objectid'
- *  and 0 is returned.  < 0 is returned on error, 1 if there is nothing
- *  left in the tree.
- */
-int btrfs_search_root(struct btrfs_root *root, u64 search_start,
-                      u64 *found_objectid)
-{
-        struct btrfs_path *path;
-        struct btrfs_key search_key;
-        int ret;
-        root = root->fs_info->tree_root;
-        search_key.objectid = search_start;
-        search_key.type = (u8)-1;
-        search_key.offset = (u64)-1;
-        path = btrfs_alloc_path();
-        BUG_ON(!path);
-again:
-        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
-        if (ret < 0)
-                goto out;
-        if (ret == 0) {
-                ret = 1;
-                goto out;
-        }
-        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-                ret = btrfs_next_leaf(root, path);
-                if (ret)
-                        goto out;
-        }
-        btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
-        if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
-                search_key.offset++;
-                btrfs_release_path(root, path);
-                goto again;
-        }
-        ret = 0;
-        *found_objectid = search_key.objectid;
-out:
-        btrfs_free_path(path);
-        return ret;
-}
-/*
 * lookup the root with the highest offset for a given objectid.  The key we do
 * find is copied into 'key'.  If we find something return 0, otherwise 1, < 0
 * on error.
@@ -230,7 +183,7 @@ again:
                memcpy(&found_key, &key, sizeof(key));
                key.offset++;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                dead_root =
                        btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
                                                    &found_key);
@@ -292,7 +245,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                }
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-                btrfs_release_path(tree_root, path);
+                btrfs_release_path(path);
                if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
                    key.type != BTRFS_ORPHAN_ITEM_KEY)
@@ -385,18 +338,22 @@ again:
                *sequence = btrfs_root_ref_sequence(leaf, ref);
                ret = btrfs_del_item(trans, tree_root, path);
-                BUG_ON(ret);
+                if (ret) {
+                        err = ret;
+                        goto out;
+                }
        } else
                err = -ENOENT;
        if (key.type == BTRFS_ROOT_BACKREF_KEY) {
-                btrfs_release_path(tree_root, path);
+                btrfs_release_path(path);
                key.objectid = ref_id;
                key.type = BTRFS_ROOT_REF_KEY;
                key.offset = root_id;
                goto again;
        }
+out:
        btrfs_free_path(path);
        return err;
 }
@@ -463,7 +420,7 @@ again:
        btrfs_mark_buffer_dirty(leaf);
        if (key.type == BTRFS_ROOT_BACKREF_KEY) {
-                btrfs_release_path(tree_root, path);
+                btrfs_release_path(path);
                key.objectid = ref_id;
                key.type = BTRFS_ROOT_REF_KEY;
                key.offset = root_id;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 000000000000..6dfed0c27ac3
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,1369 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+/*
+ * This is only the first step towards a full-featured scrub. It reads all
+ * extents and super blocks and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ *  - To enhance the performance, better read-ahead strategies for the
+ *    extent-tree can be employed.
+ *  - In case an unrepairable extent is encountered, track which files are
+ *    affected and report them
+ *  - In case of a read error on files with nodatasum, map the file and read
+ *    the extent to trigger a writeback of the good copy
+ *  - track and record media errors, throw out bad devices
+ *  - add a mode to also read unallocated space
+ *  - make the prefetch cancellable
+ */
+struct scrub_bio;
+struct scrub_page;
+struct scrub_dev;
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_checksum(struct btrfs_work *work);
+static int scrub_checksum_data(struct scrub_dev *sdev,
+                               struct scrub_page *spag, void *buffer);
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+                                     struct scrub_page *spag, u64 logical,
+                                     void *buffer);
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
+static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
+static void scrub_fixup_end_io(struct bio *bio, int err);
+static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
+                          struct page *page);
+static void scrub_fixup(struct scrub_bio *sbio, int ix);
+#define SCRUB_PAGES_PER_BIO     16      /* 64k per bio */
+#define SCRUB_BIOS_PER_DEV      16      /* 1 MB per device in flight */
+/*
+ * Per-page verification data for one page queued in a scrub_bio.
+ */
+struct scrub_page {
+        u64                     flags;  /* extent flags */
+        /* compared against the generation stored in the on-disk header */
+        u64                     generation;
+        /* stripe index that scrub_fixup() skips when looking for a good copy */
+        u64                     mirror_num;
+        /* set to 1 by scrub_page() when csum[] was filled in, 0 otherwise */
+        int                     have_csum;
+        u8                      csum[BTRFS_CSUM_SIZE];
+};
+/*
+ * One preallocated read request of up to SCRUB_PAGES_PER_BIO pages together
+ * with the per-page data needed to verify it once the read has completed.
+ */
+struct scrub_bio {
+        /* position of this entry in scrub_dev->bios[] */
+        int                     index;
+        struct scrub_dev        *sdev;
+        struct bio              *bio;
+        /* error code handed to scrub_bio_end_io(), 0 while being filled */
+        int                     err;
+        /* logical and physical address of the first queued page */
+        u64                     logical;
+        u64                     physical;
+        struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
+        /* number of pages queued so far */
+        u64                     count;
+        /* index of the next free scrub_bio, -1 at the end of the free list */
+        int                     next_free;
+        /* work item that runs scrub_checksum() */
+        struct btrfs_work       work;
+};
+/*
+ * Scrub state for a single device.
+ */
+struct scrub_dev {
+        struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
+        struct btrfs_device     *dev;
+        /* head of the free list of bios[], -1 when none is free */
+        int                     first_free;
+        /* index of the bio currently being filled, -1 when there is none */
+        int                     curr;
+        /* bios submitted by scrub_submit() and not yet finished */
+        atomic_t                in_flight;
+        /* taken around updates of first_free / next_free */
+        spinlock_t              list_lock;
+        /* woken whenever scrub_checksum() returns a bio to the free list */
+        wait_queue_head_t       list_wait;
+        /* number of checksum bytes that are compared */
+        u16                     csum_size;
+        /* btrfs_ordered_sum entries consumed by scrub_find_csum() */
+        struct list_head        csum_list;
+        atomic_t                cancel_req;
+        /* when set, scrub_fixup() does not write the good copy back */
+        int                     readonly;
+        /*
+         * statistics
+         */
+        struct btrfs_scrub_progress stat;
+        spinlock_t              stat_lock;
+};
+/*
+ * Unlink and free every checksum entry still queued on sdev->csum_list.
+ */
+static void scrub_free_csums(struct scrub_dev *sdev)
+{
+        struct btrfs_ordered_sum *entry;
+        struct btrfs_ordered_sum *next;
+        list_for_each_entry_safe(entry, next, &sdev->csum_list, list) {
+                list_del(&entry->list);
+                kfree(entry);
+        }
+}
+/*
+ * Release a scrub_dev and everything hanging off it: the pages attached to
+ * each bio, the bios, the scrub_bios and any leftover checksum entries.
+ * A NULL @sdev is accepted and ignored.
+ */
+static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+{
+        int bio_nr;
+        if (!sdev)
+                return;
+        for (bio_nr = 0; bio_nr < SCRUB_BIOS_PER_DEV; ++bio_nr) {
+                struct scrub_bio *sbio = sdev->bios[bio_nr];
+                /* slots are filled in order, the first empty one ends the walk */
+                if (!sbio)
+                        break;
+                if (sbio->bio) {
+                        struct bio *bio = sbio->bio;
+                        struct page *prev = NULL;
+                        int vec;
+                        for (vec = 0; vec < bio->bi_vcnt; ++vec) {
+                                struct page *pg = bio->bi_io_vec[vec].bv_page;
+                                /* a page repeated in adjacent vecs is freed once */
+                                if (pg == prev)
+                                        continue;
+                                prev = pg;
+                                __free_page(pg);
+                        }
+                        bio_put(bio);
+                }
+                kfree(sbio);
+        }
+        scrub_free_csums(sdev);
+        kfree(sdev);
+}
+/*
+ * Allocate and initialise the scrub context for one device.
+ *
+ * All SCRUB_BIOS_PER_DEV scrub_bios are created up front, each with a bio
+ * carrying SCRUB_PAGES_PER_BIO freshly allocated pages. The scrub_bios are
+ * chained into a free list through next_free (-1 terminates the list).
+ *
+ * Returns the new scrub_dev, or ERR_PTR(-ENOMEM) when an allocation fails;
+ * everything allocated up to that point is released again.
+ */
+static noinline_for_stack
+struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+{
+        struct scrub_dev *sdev;
+        int             i;
+        int             j;
+        int             ret;
+        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+        sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
+        if (!sdev)
+                goto nomem;
+        sdev->dev = dev;
+        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+                struct bio *bio;
+                struct scrub_bio *sbio;
+                sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+                if (!sbio)
+                        goto nomem;
+                /* publish first so that scrub_free_dev() can find it on error */
+                sdev->bios[i] = sbio;
+                bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+                if (!bio)
+                        goto nomem;
+                sbio->index = i;
+                sbio->sdev = sdev;
+                sbio->bio = bio;
+                sbio->count = 0;
+                sbio->work.func = scrub_checksum;
+                bio->bi_private = sdev->bios[i];
+                bio->bi_end_io = scrub_bio_end_io;
+                bio->bi_sector = 0;
+                bio->bi_bdev = dev->bdev;
+                bio->bi_size = 0;
+                for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
+                        struct page *page;
+                        page = alloc_page(GFP_NOFS);
+                        if (!page)
+                                goto nomem;
+                        ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+                        if (!ret) {
+                                /*
+                                 * the page never made it into the bio, so
+                                 * scrub_free_dev() would not free it
+                                 */
+                                __free_page(page);
+                                goto nomem;
+                        }
+                }
+                WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
+                if (i != SCRUB_BIOS_PER_DEV-1)
+                        sdev->bios[i]->next_free = i + 1;
+                else
+                        sdev->bios[i]->next_free = -1;
+        }
+        sdev->first_free = 0;
+        sdev->curr = -1;
+        atomic_set(&sdev->in_flight, 0);
+        atomic_set(&sdev->cancel_req, 0);
+        sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+        INIT_LIST_HEAD(&sdev->csum_list);
+        spin_lock_init(&sdev->list_lock);
+        spin_lock_init(&sdev->stat_lock);
+        init_waitqueue_head(&sdev->list_wait);
+        return sdev;
+nomem:
+        scrub_free_dev(sdev);
+        return ERR_PTR(-ENOMEM);
+}
+/*
+ * Called for a page that either failed verification or belongs to a bio
+ * that failed to read (e.g. EIO). A failed bio leads to one call per page,
+ * although possibly only a single page is bad, so in that case the page is
+ * first re-read on its own and only handed to scrub_fixup() if that read or
+ * its verification fails as well.
+ */
+static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+{
+        if (sbio->err) {
+                struct page *page = sbio->bio->bi_io_vec[ix].bv_page;
+                int failed;
+                failed = scrub_fixup_io(READ, sbio->sdev->dev->bdev,
+                                        (sbio->physical + ix * PAGE_SIZE) >> 9,
+                                        page);
+                if (!failed && scrub_fixup_check(sbio, ix) == 0)
+                        return;
+        }
+        scrub_fixup(sbio, ix);
+}
+/*
+ * Verify page @ix of @sbio again, using the data or tree block checker that
+ * matches the page's extent flags. Returns 0 when the page verifies,
+ * non-zero otherwise (also for pages that are neither data nor tree block).
+ */
+static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
+{
+        struct scrub_page *spag = sbio->spag + ix;
+        u64 flags = spag->flags;
+        void *kaddr;
+        int bad = 1;
+        kaddr = kmap_atomic(sbio->bio->bi_io_vec[ix].bv_page, KM_USER0);
+        if (flags & BTRFS_EXTENT_FLAG_DATA)
+                bad = scrub_checksum_data(sbio->sdev, spag, kaddr);
+        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+                bad = scrub_checksum_tree_block(sbio->sdev, spag,
+                                                sbio->logical + ix * PAGE_SIZE,
+                                                kaddr);
+        else
+                WARN_ON(1);
+        kunmap_atomic(kaddr, KM_USER0);
+        return bad;
+}
+/*
+ * End-io handler for the bios built by scrub_fixup_io(): signal the
+ * completion stored in bi_private. @err is not looked at here.
+ */
+static void scrub_fixup_end_io(struct bio *bio, int err)
+{
+        struct completion *done = bio->bi_private;
+        complete(done);
+}
+/*
+ * Try to repair page @ix of @sbio from another copy.
+ *
+ * The logical address of the page is mapped to its stripes. Every stripe
+ * except the one the bad page came from is read into the page and verified
+ * until a good copy is found. Unless the scrub is read-only, the good data is
+ * then written back to the scrubbed device.
+ *
+ * Data pages without a checksum and blocks with a single stripe cannot be
+ * repaired. The outcome is counted in stat.corrected_errors or
+ * stat.uncorrectable_errors; a failed mapping is only logged.
+ */
+static void scrub_fixup(struct scrub_bio *sbio, int ix)
+{
+        struct scrub_dev *sdev = sbio->sdev;
+        struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+        struct btrfs_multi_bio *multi = NULL;
+        u64 logical = sbio->logical + ix * PAGE_SIZE;
+        u64 length;
+        int i;
+        int ret;
+        if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
+            (sbio->spag[ix].have_csum == 0)) {
+                /*
+                 * nodatasum, don't try to fix anything
+                 * FIXME: we can do better, open the inode and trigger a
+                 * writeback
+                 */
+                goto uncorrectable;
+        }
+        length = PAGE_SIZE;
+        ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
+                              &multi, 0);
+        if (ret || !multi || length < PAGE_SIZE) {
+                printk(KERN_ERR
+                       "scrub_fixup: btrfs_map_block failed us for %llu\n",
+                       (unsigned long long)logical);
+                WARN_ON(1);
+                /* a mapping may have been handed out despite the failure */
+                kfree(multi);
+                return;
+        }
+        if (multi->num_stripes == 1)
+                /* there aren't any replicas */
+                goto uncorrectable;
+        /*
+         * first find a good copy
+         */
+        for (i = 0; i < multi->num_stripes; ++i) {
+                if (i == sbio->spag[ix].mirror_num)
+                        continue;
+                if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
+                                   multi->stripes[i].physical >> 9,
+                                   sbio->bio->bi_io_vec[ix].bv_page)) {
+                        /* I/O-error, this is not a good copy */
+                        continue;
+                }
+                if (scrub_fixup_check(sbio, ix) == 0)
+                        break;
+        }
+        if (i == multi->num_stripes)
+                goto uncorrectable;
+        if (!sdev->readonly) {
+                /*
+                 * bi_io_vec[ix].bv_page now contains good data, write it back
+                 */
+                if (scrub_fixup_io(WRITE, sdev->dev->bdev,
+                                   (sbio->physical + ix * PAGE_SIZE) >> 9,
+                                   sbio->bio->bi_io_vec[ix].bv_page)) {
+                        /* I/O-error, writeback failed, give up */
+                        goto uncorrectable;
+                }
+        }
+        kfree(multi);
+        spin_lock(&sdev->stat_lock);
+        ++sdev->stat.corrected_errors;
+        spin_unlock(&sdev->stat_lock);
+        if (printk_ratelimit())
+                printk(KERN_ERR "btrfs: fixed up at %llu\n",
+                       (unsigned long long)logical);
+        return;
+uncorrectable:
+        /* multi is still NULL when we get here before the mapping was done */
+        kfree(multi);
+        spin_lock(&sdev->stat_lock);
+        ++sdev->stat.uncorrectable_errors;
+        spin_unlock(&sdev->stat_lock);
+        if (printk_ratelimit())
+                printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
+                         (unsigned long long)logical);
+}
+/*
+ * Synchronously read or write one page at @sector of @bdev.
+ *
+ * @rw is the request direction (the callers pass READ or WRITE); REQ_SYNC is
+ * added because the caller sleeps until the bio has completed.
+ *
+ * Returns 0 when the bio finished with BIO_UPTODATE set and non-zero
+ * otherwise, including the case where the bio could not be set up at all.
+ */
+static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
+                         struct page *page)
+{
+        struct bio *bio = NULL;
+        int ret;
+        DECLARE_COMPLETION_ONSTACK(complete);
+        /* we are going to wait on this IO */
+        rw |= REQ_SYNC;
+        bio = bio_alloc(GFP_NOFS, 1);
+        if (!bio)
+                return 1;
+        bio->bi_bdev = bdev;
+        bio->bi_sector = sector;
+        if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
+                /* never submit a bio that does not carry the whole page */
+                bio_put(bio);
+                return 1;
+        }
+        bio->bi_end_io = scrub_fixup_end_io;
+        bio->bi_private = &complete;
+        submit_bio(rw, bio);
+        wait_for_completion(&complete);
+        ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+        bio_put(bio);
+        return ret;
+}
+/*
+ * End-io handler of the scrub read bios: record the result in the scrub_bio
+ * and queue its work item (scrub_checksum) on the scrub workers.
+ */
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+        struct scrub_bio *sbio = bio->bi_private;
+        struct btrfs_fs_info *fs_info;
+        fs_info = sbio->sdev->dev->dev_root->fs_info;
+        sbio->err = err;
+        btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+/*
+ * Work function of a scrub_bio, queued by scrub_bio_end_io() once the read
+ * has finished. Verifies every queued page, hands bad pages to
+ * scrub_recheck_error() and finally puts the scrub_bio back on the free list.
+ */
+static void scrub_checksum(struct btrfs_work *work)
+{
+        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+        struct scrub_dev *sdev = sbio->sdev;
+        struct page *page;
+        void *buffer;
+        int i;
+        u64 flags;
+        u64 logical;
+        int ret;
+        if (sbio->err) {
+                /* the whole bio failed: recheck each queued page on its own */
+                for (i = 0; i < sbio->count; ++i)
+                        scrub_recheck_error(sbio, i);
+                /*
+                 * reset flags, segment count, index and the io vecs
+                 * NOTE(review): presumably needed so the failed bio can be
+                 * submitted again later - confirm
+                 */
+                sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+                sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+                sbio->bio->bi_phys_segments = 0;
+                sbio->bio->bi_idx = 0;
+                for (i = 0; i < sbio->count; i++) {
+                        struct bio_vec *bi;
+                        bi = &sbio->bio->bi_io_vec[i];
+                        bi->bv_offset = 0;
+                        bi->bv_len = PAGE_SIZE;
+                }
+                spin_lock(&sdev->stat_lock);
+                ++sdev->stat.read_errors;
+                spin_unlock(&sdev->stat_lock);
+                goto out;
+        }
+        for (i = 0; i < sbio->count; ++i) {
+                page = sbio->bio->bi_io_vec[i].bv_page;
+                buffer = kmap_atomic(page, KM_USER0);
+                flags = sbio->spag[i].flags;
+                logical = sbio->logical + i * PAGE_SIZE;
+                ret = 0;
+                if (flags & BTRFS_EXTENT_FLAG_DATA) {
+                        ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
+                } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+                        ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
+                                                        logical, buffer);
+                } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
+                        /* a super block is only expected as the first page */
+                        BUG_ON(i);
+                        /* result ignored: ret stays 0, so no recheck/fixup */
+                        (void)scrub_checksum_super(sbio, buffer);
+                } else {
+                        WARN_ON(1);
+                }
+                /* unmap before the recheck, which may map the page again */
+                kunmap_atomic(buffer, KM_USER0);
+                if (ret)
+                        scrub_recheck_error(sbio, i);
+        }
+out:
+        /* push the scrub_bio onto the head of the free list */
+        spin_lock(&sdev->list_lock);
+        sbio->next_free = sdev->first_free;
+        sdev->first_free = sbio->index;
+        spin_unlock(&sdev->list_lock);
+        atomic_dec(&sdev->in_flight);
+        /* waiters sleep on first_free (scrub_page) or in_flight (pause) */
+        wake_up(&sdev->list_wait);
+}
+/*
+ * Verify one data page against the checksum stored in @spag.
+ *
+ * Pages without a checksum are accepted without being counted. Otherwise
+ * the data statistics are updated under stat_lock and 1 is returned on a
+ * checksum mismatch, 0 on a match.
+ */
+static int scrub_checksum_data(struct scrub_dev *sdev,
+                               struct scrub_page *spag, void *buffer)
+{
+        struct btrfs_root *root = sdev->dev->dev_root;
+        u8 computed[BTRFS_CSUM_SIZE];
+        u32 crc;
+        int mismatch;
+        if (!spag->have_csum)
+                return 0;
+        crc = btrfs_csum_data(root, buffer, ~(u32)0, PAGE_SIZE);
+        btrfs_csum_final(crc, computed);
+        mismatch = memcmp(computed, spag->csum, sdev->csum_size) ? 1 : 0;
+        spin_lock(&sdev->stat_lock);
+        ++sdev->stat.data_extents_scrubbed;
+        sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
+        if (mismatch)
+                ++sdev->stat.csum_errors;
+        spin_unlock(&sdev->stat_lock);
+        return mismatch;
+}
+/*
+ * Verify one page holding a tree block header.
+ *
+ * The header's bytenr, generation, fsid and chunk tree uuid are compared
+ * with the expected values, and the checksum over the page (skipping the
+ * first BTRFS_CSUM_SIZE bytes) is compared with the one in the header.
+ * Header mismatches are counted as verify_errors, a checksum mismatch as
+ * csum_errors. Returns non-zero if any of the checks failed.
+ *
+ * NOTE(review): only PAGE_SIZE bytes are checksummed, so this looks like it
+ * assumes a tree block fits into one page - confirm.
+ */
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+                                     struct scrub_page *spag, u64 logical,
+                                     void *buffer)
+{
+        struct btrfs_header *h;
+        struct btrfs_root *root = sdev->dev->dev_root;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        u8 csum[BTRFS_CSUM_SIZE];
+        u32 crc = ~(u32)0;
+        int fail = 0;
+        int crc_fail = 0;
+        /*
+         * we don't use the getter functions here, as we
+         * a) don't have an extent buffer and
+         * b) the page is already kmapped
+         */
+        h = (struct btrfs_header *)buffer;
+        if (logical != le64_to_cpu(h->bytenr))
+                ++fail;
+        if (spag->generation != le64_to_cpu(h->generation))
+                ++fail;
+        if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+                ++fail;
+        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+                   BTRFS_UUID_SIZE))
+                ++fail;
+        crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+                              PAGE_SIZE - BTRFS_CSUM_SIZE);
+        btrfs_csum_final(crc, csum);
+        if (memcmp(csum, h->csum, sdev->csum_size))
+                ++crc_fail;
+        spin_lock(&sdev->stat_lock);
+        ++sdev->stat.tree_extents_scrubbed;
+        sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
+        if (crc_fail)
+                ++sdev->stat.csum_errors;
+        if (fail)
+                ++sdev->stat.verify_errors;
+        spin_unlock(&sdev->stat_lock);
+        return fail || crc_fail;
+}
+/*
+ * Verify a super block that was read into the first page of @sbio.
+ *
+ * bytenr, generation, fsid and the checksum over the page (skipping the
+ * first BTRFS_CSUM_SIZE bytes) are checked. Any failure is counted once in
+ * stat.super_errors. Returns the number of failed checks, 0 if all passed.
+ */
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+{
+        struct btrfs_super_block *s;
+        u64 logical;
+        struct scrub_dev *sdev = sbio->sdev;
+        struct btrfs_root *root = sdev->dev->dev_root;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        u8 csum[BTRFS_CSUM_SIZE];
+        u32 crc = ~(u32)0;
+        int fail = 0;
+        s = (struct btrfs_super_block *)buffer;
+        logical = sbio->logical;
+        if (logical != le64_to_cpu(s->bytenr))
+                ++fail;
+        if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+                ++fail;
+        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+                ++fail;
+        crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+                              PAGE_SIZE - BTRFS_CSUM_SIZE);
+        btrfs_csum_final(crc, csum);
+        if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+                ++fail;
+        if (fail) {
+                /*
+                 * if we find an error in a super block, we just report it.
+                 * They will get written with the next transaction commit
+                 * anyway
+                 */
+                spin_lock(&sdev->stat_lock);
+                ++sdev->stat.super_errors;
+                spin_unlock(&sdev->stat_lock);
+        }
+        return fail;
+}
+/*
+ * Submit the bio that is currently being filled, if there is one.
+ *
+ * The bio is set up from the scrub_bio (start sector, size from the number
+ * of queued pages, device), sdev->curr is reset to -1 and in_flight is
+ * incremented before the bio is handed to the block layer.
+ * Always returns 0.
+ */
+static int scrub_submit(struct scrub_dev *sdev)
+{
+        struct scrub_bio *sbio;
+        /* nothing queued */
+        if (sdev->curr == -1)
+                return 0;
+        sbio = sdev->bios[sdev->curr];
+        sbio->bio->bi_sector = sbio->physical >> 9;
+        sbio->bio->bi_size = sbio->count * PAGE_SIZE;
+        sbio->bio->bi_next = NULL;
+        sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+        sbio->bio->bi_comp_cpu = -1;
+        sbio->bio->bi_bdev = sdev->dev->bdev;
+        sbio->err = 0;
+        sdev->curr = -1;
+        /* scrub_checksum() drops this count again when it is done */
+        atomic_inc(&sdev->in_flight);
+        submit_bio(0, sbio->bio);
+        return 0;
+}
+/*
+ * Queue one page for scrubbing.
+ *
+ * The page is appended to the bio currently being filled. If there is no
+ * current bio, one is taken from the free list, sleeping until one becomes
+ * available. A page that does not continue the current bio both physically
+ * and logically causes that bio to be submitted and a new one to be started.
+ * The bio is submitted once it holds SCRUB_PAGES_PER_BIO pages or when
+ * @force is set.
+ *
+ * @csum may be NULL if there is no checksum for the page. @len is not used
+ * in this function. Always returns 0.
+ */
+static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
+                      u64 physical, u64 flags, u64 gen, u64 mirror_num,
+                      u8 *csum, int force)
+{
+        struct scrub_bio *sbio;
+again:
+        /*
+         * grab a fresh bio or wait for one to become available
+         */
+        while (sdev->curr == -1) {
+                spin_lock(&sdev->list_lock);
+                sdev->curr = sdev->first_free;
+                if (sdev->curr != -1) {
+                        /* unlink the head of the free list and start it empty */
+                        sdev->first_free = sdev->bios[sdev->curr]->next_free;
+                        sdev->bios[sdev->curr]->next_free = -1;
+                        sdev->bios[sdev->curr]->count = 0;
+                        spin_unlock(&sdev->list_lock);
+                } else {
+                        spin_unlock(&sdev->list_lock);
+                        wait_event(sdev->list_wait, sdev->first_free != -1);
+                }
+        }
+        sbio = sdev->bios[sdev->curr];
+        if (sbio->count == 0) {
+                /* first page determines where the bio starts */
+                sbio->physical = physical;
+                sbio->logical = logical;
+        } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
+                   sbio->logical + sbio->count * PAGE_SIZE != logical) {
+                /* not contiguous with what is queued: flush and start over */
+                scrub_submit(sdev);
+                goto again;
+        }
+        sbio->spag[sbio->count].flags = flags;
+        sbio->spag[sbio->count].generation = gen;
+        sbio->spag[sbio->count].have_csum = 0;
+        sbio->spag[sbio->count].mirror_num = mirror_num;
+        if (csum) {
+                sbio->spag[sbio->count].have_csum = 1;
+                memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+        }
+        ++sbio->count;
+        if (sbio->count == SCRUB_PAGES_PER_BIO || force)
+                scrub_submit(sdev);
+        return 0;
+}
+/*
+ * Look up the checksum for the sector at @logical in sdev->csum_list.
+ *
+ * Entries at the head of the list that end at or before @logical are
+ * dropped and counted in stat.csum_discards. If the first remaining entry
+ * has a sector with exactly this bytenr, its checksum is copied to @csum
+ * and 1 is returned; the entry is freed when that was its last sector.
+ * Returns 0 when no checksum is found. @len is not used in this function.
+ */
+static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+                           u8 *csum)
+{
+        struct btrfs_ordered_sum *sum = NULL;
+        int ret = 0;
+        unsigned long i;
+        unsigned long num_sectors;
+        u32 sectorsize = sdev->dev->dev_root->sectorsize;
+        while (!list_empty(&sdev->csum_list)) {
+                sum = list_first_entry(&sdev->csum_list,
+                                       struct btrfs_ordered_sum, list);
+                /* first entry starts beyond @logical: nothing covers it */
+                if (sum->bytenr > logical)
+                        return 0;
+                /* entry covers @logical */
+                if (sum->bytenr + sum->len > logical)
+                        break;
+                ++sdev->stat.csum_discards;
+                list_del(&sum->list);
+                kfree(sum);
+                sum = NULL;
+        }
+        if (!sum)
+                return 0;
+        num_sectors = sum->len / sectorsize;
+        for (i = 0; i < num_sectors; ++i) {
+                if (sum->sums[i].bytenr == logical) {
+                        memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
+                        ret = 1;
+                        break;
+                }
+        }
+        /* the last sector of the entry was consumed, the entry can go */
+        if (ret && i == num_sectors - 1) {
+                list_del(&sum->list);
+                kfree(sum);
+        }
+        return ret;
+}
+/*
+ * Feed one extent to scrub_page() in pieces of at most PAGE_SIZE, so that
+ * up to 64 kB are collected per bio. For data extents the checksum of each
+ * piece is looked up first; pieces without one are counted in stat.no_csum.
+ * Returns 0, or the first non-zero result of scrub_page().
+ */
+static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
+                        u64 physical, u64 flags, u64 gen, u64 mirror_num)
+{
+        u8 csum[BTRFS_CSUM_SIZE];
+        u64 chunk;
+        int ret;
+        for (; len; len -= chunk, logical += chunk, physical += chunk) {
+                u8 *page_csum = NULL;
+                chunk = min_t(u64, len, PAGE_SIZE);
+                if (flags & BTRFS_EXTENT_FLAG_DATA) {
+                        /* push csums to sbio */
+                        if (scrub_find_csum(sdev, logical, chunk, csum))
+                                page_csum = csum;
+                        else
+                                ++sdev->stat.no_csum;
+                }
+                ret = scrub_page(sdev, logical, chunk, physical, flags, gen,
+                                 mirror_num, page_csum, 0);
+                if (ret)
+                        return ret;
+        }
+        return 0;
+}
+static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
+        struct map_lookup *map, int num, u64 base, u64 length)
+{
+        struct btrfs_path *path;
+        struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+        struct btrfs_root *root = fs_info->extent_root;
+        struct btrfs_root *csum_root = fs_info->csum_root;
+        struct btrfs_extent_item *extent;
+        u64 flags;
+        int ret;
+        int slot;
+        int i;
+        u64 nstripes;
+        int start_stripe;
+        struct extent_buffer *l;
+        struct btrfs_key key;
+        u64 physical;
+        u64 logical;
+        u64 generation;
+        u64 mirror_num;
+        u64 increment = map->stripe_len;
+        u64 offset;
+        nstripes = length;
+        offset = 0;
+        do_div(nstripes, map->stripe_len);
+        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+                offset = map->stripe_len * num;
+                increment = map->stripe_len * map->num_stripes;
+                mirror_num = 0;
+        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                int factor = map->num_stripes / map->sub_stripes;
+                offset = map->stripe_len * (num / map->sub_stripes);
+                increment = map->stripe_len * factor;
+                mirror_num = num % map->sub_stripes;
+        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+                increment = map->stripe_len;
+                mirror_num = num % map->num_stripes;
+        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+                increment = map->stripe_len;
+                mirror_num = num % map->num_stripes;
+        } else {
+                increment = map->stripe_len;
+                mirror_num = 0;
+        }
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 2;
+        path->search_commit_root = 1;
+        path->skip_locking = 1;
+        /*
+         * find all extents for each stripe and just read them to get
+         * them into the page cache
+         * FIXME: we can do better. build a more intelligent prefetching
+         */
+        logical = base + offset;
+        physical = map->stripes[num].physical;
+        ret = 0;
+        for (i = 0; i < nstripes; ++i) {
+                key.objectid = logical;
+                key.type = BTRFS_EXTENT_ITEM_KEY;
+                key.offset = (u64)0;
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto out;
+                l = path->nodes[0];
+                slot = path->slots[0];
+                btrfs_item_key_to_cpu(l, &key, slot);
+                if (key.objectid != logical) {
+                        ret = btrfs_previous_item(root, path, 0,
+                                                  BTRFS_EXTENT_ITEM_KEY);
+                        if (ret < 0)
+                                goto out;
+                }
+                while (1) {
+                        l = path->nodes[0];
+                        slot = path->slots[0];
+                        if (slot >= btrfs_header_nritems(l)) {
+                                ret = btrfs_next_leaf(root, path);
+                                if (ret == 0)
+                                        continue;
+                                if (ret < 0)
+                                        goto out;
+                                break;
+                        }
+                        btrfs_item_key_to_cpu(l, &key, slot);
+                        if (key.objectid >= logical + map->stripe_len)
+                                break;
+                        path->slots[0]++;
+                }
+                btrfs_release_path(path);
+                logical += increment;
+                physical += map->stripe_len;
+                cond_resched();
+        }
+        /*
+         * collect all data csums for the stripe to avoid seeking during
+         * the scrub. This might currently (crc32) end up to be about 1MB
+         */
+        start_stripe = 0;
+again:
+        logical = base + offset + start_stripe * increment;
+        for (i = start_stripe; i < nstripes; ++i) {
+                ret = btrfs_lookup_csums_range(csum_root, logical,
+                                               logical + map->stripe_len - 1,
+                                               &sdev->csum_list, 1);
+                if (ret)
+                        goto out;
+                logical += increment;
+                cond_resched();
+        }
+        /*
+         * now find all extents for each stripe and scrub them
+         */
+        logical = base + offset + start_stripe * increment;
+        physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+        ret = 0;
+        for (i = start_stripe; i < nstripes; ++i) {
+                /*
+                 * canceled?
+                 */
+                if (atomic_read(&fs_info->scrub_cancel_req) ||
+                    atomic_read(&sdev->cancel_req)) {
+                        ret = -ECANCELED;
+                        goto out;
+                }
+                /*
+                 * check to see if we have to pause
+                 */
+                if (atomic_read(&fs_info->scrub_pause_req)) {
+                        /* push queued extents */
+                        scrub_submit(sdev);
+                        wait_event(sdev->list_wait,
+                                   atomic_read(&sdev->in_flight) == 0);
+                        atomic_inc(&fs_info->scrubs_paused);
+                        wake_up(&fs_info->scrub_pause_wait);
+                        mutex_lock(&fs_info->scrub_lock);
+                        while (atomic_read(&fs_info->scrub_pause_req)) {
+                                mutex_unlock(&fs_info->scrub_lock);
+                                wait_event(fs_info->scrub_pause_wait,
+                                   atomic_read(&fs_info->scrub_pause_req) == 0);
+                                mutex_lock(&fs_info->scrub_lock);
+                        }
+                        atomic_dec(&fs_info->scrubs_paused);
+                        mutex_unlock(&fs_info->scrub_lock);
+                        wake_up(&fs_info->scrub_pause_wait);
+                        scrub_free_csums(sdev);
+                        start_stripe = i;
+                        goto again;
+                }
+                key.objectid = logical;
+                key.type = BTRFS_EXTENT_ITEM_KEY;
+                key.offset = (u64)0;
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto out;
+                l = path->nodes[0];
+                slot = path->slots[0];
+                btrfs_item_key_to_cpu(l, &key, slot);
+                if (key.objectid != logical) {
+                        ret = btrfs_previous_item(root, path, 0,
+                                                  BTRFS_EXTENT_ITEM_KEY);
+                        if (ret < 0)
+                                goto out;
+                }
+                while (1) {
+                        l = path->nodes[0];
+                        slot = path->slots[0];
+                        if (slot >= btrfs_header_nritems(l)) {
+                                ret = btrfs_next_leaf(root, path);
+                                if (ret == 0)
+                                        continue;
+                                if (ret < 0)
+                                        goto out;
+                                break;
+                        }
+                        btrfs_item_key_to_cpu(l, &key, slot);
+                        if (key.objectid + key.offset <= logical)
+                                goto next;
+                        if (key.objectid >= logical + map->stripe_len)
+                                break;
+                        if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+                                goto next;
+                        extent = btrfs_item_ptr(l, slot,
+                                                struct btrfs_extent_item);
+                        flags = btrfs_extent_flags(l, extent);
+                        generation = btrfs_extent_generation(l, extent);
+                        if (key.objectid < logical &&
+                            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+                                printk(KERN_ERR
+                                       "btrfs scrub: tree block %llu spanning "
+                                       "stripes, ignored. logical=%llu\n",
+                                       (unsigned long long)key.objectid,
+                                       (unsigned long long)logical);
+                                goto next;
+                        }
+                        /*
+                         * trim extent to this stripe
+                         */
+                        if (key.objectid < logical) {
+                                key.offset -= logical - key.objectid;
+                                key.objectid = logical;
+                        }
+                        if (key.objectid + key.offset >
+                            logical + map->stripe_len) {
+                                key.offset = logical + map->stripe_len -
+                                             key.objectid;
+                        }
+                        ret = scrub_extent(sdev, key.objectid, key.offset,
+                                           key.objectid - logical + physical,
+                                           flags, generation, mirror_num);
+                        if (ret)
+                                goto out;
+next:
+                        path->slots[0]++;
+                }
+                btrfs_release_path(path);
+                logical += increment;
+                physical += map->stripe_len;
+                spin_lock(&sdev->stat_lock);
+                sdev->stat.last_physical = physical;
+                spin_unlock(&sdev->stat_lock);
+        }
+        /* push queued extents */
+        scrub_submit(sdev);
+out:
+        btrfs_free_path(path);
+        return ret < 0 ? ret : 0;
+}
+/*
+ * Scrub the stripes of one chunk that live on sdev's device.
+ *
+ * The chunk mapping is looked up at chunk_offset; scrub_stripe() is then
+ * called for every stripe of that mapping whose device is sdev->dev.
+ * Returns -EINVAL when no mapping is found, when the mapping does not start
+ * at chunk_offset or is shorter than length, or when no stripe belongs to
+ * sdev->dev. Otherwise the first non-zero scrub_stripe() result is returned,
+ * or 0. chunk_tree and chunk_objectid are not used here.
+ */
+static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
+        u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
+{
+        struct btrfs_mapping_tree *mapping =
+                &sdev->dev->dev_root->fs_info->mapping_tree;
+        struct extent_map *em;
+        struct map_lookup *lookup;
+        int result = -EINVAL;
+        int idx;
+        read_lock(&mapping->map_tree.lock);
+        em = lookup_extent_mapping(&mapping->map_tree, chunk_offset, 1);
+        read_unlock(&mapping->map_tree.lock);
+        if (!em)
+                return -EINVAL;
+        lookup = (struct map_lookup *)em->bdev;
+        /* the mapping has to start at the chunk and cover the whole range */
+        if (em->start != chunk_offset || em->len < length)
+                goto out;
+        for (idx = 0; idx < lookup->num_stripes; ++idx) {
+                if (lookup->stripes[idx].dev != sdev->dev)
+                        continue;
+                result = scrub_stripe(sdev, lookup, idx, chunk_offset,
+                                      length);
+                if (result)
+                        break;
+        }
+out:
+        /* drop the reference taken by lookup_extent_mapping() */
+        free_extent_map(em);
+        return result;
+}
+/*
+ * Walk the dev extent items of sdev's device and scrub every chunk whose
+ * dev extent overlaps the device byte range [start, end).
+ *
+ * The device tree is searched through the commit root without locking.
+ * Returns 0 when the walk finishes, -ENOMEM if no path can be allocated,
+ * -ENOENT if the block group of a dev extent cannot be found, or the error
+ * returned by the tree search or by scrub_chunk().
+ */
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+{
+        struct btrfs_dev_extent *dev_extent = NULL;
+        struct btrfs_path *path;
+        struct btrfs_root *root = sdev->dev->dev_root;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        u64 length;
+        u64 chunk_tree;
+        u64 chunk_objectid;
+        u64 chunk_offset;
+        int ret;
+        int slot;
+        struct extent_buffer *l;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        struct btrfs_block_group_cache *cache;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 2;
+        path->search_commit_root = 1;
+        path->skip_locking = 1;
+        key.objectid = sdev->dev->devid;
+        key.offset = 0ull;
+        key.type = BTRFS_DEV_EXTENT_KEY;
+        while (1) {
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto out;
+                l = path->nodes[0];
+                slot = path->slots[0];
+                /*
+                 * an inexact match may leave the slot one past the last
+                 * item of the leaf. step to the next leaf before reading
+                 * a key from it
+                 */
+                if (slot >= btrfs_header_nritems(l)) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto out;
+                        if (ret > 0) {
+                                /* no further leaf, nothing left to scrub */
+                                ret = 0;
+                                break;
+                        }
+                        l = path->nodes[0];
+                        slot = path->slots[0];
+                }
+                ret = 0;
+                btrfs_item_key_to_cpu(l, &found_key, slot);
+                if (found_key.objectid != sdev->dev->devid)
+                        break;
+                /* check the item that was found, not the search key */
+                if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
+                        break;
+                if (found_key.offset >= end)
+                        break;
+                if (found_key.offset < key.offset)
+                        break;
+                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+                length = btrfs_dev_extent_length(l, dev_extent);
+                /* dev extent ends before the requested range, skip it */
+                if (found_key.offset + length <= start) {
+                        key.offset = found_key.offset + length;
+                        btrfs_release_path(path);
+                        continue;
+                }
+                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+                /*
+                 * get a reference on the corresponding block group to prevent
+                 * the chunk from going away while we scrub it
+                 */
+                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+                if (!cache) {
+                        ret = -ENOENT;
+                        goto out;
+                }
+                ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
+                                  chunk_offset, length);
+                btrfs_put_block_group(cache);
+                if (ret)
+                        break;
+                /* continue the search right behind this dev extent */
+                key.offset = found_key.offset + length;
+                btrfs_release_path(path);
+        }
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Queue the superblock copies of the device for scrubbing and wait until no
+ * scrub I/O of this device is in flight any more.
+ *
+ * Returns 0, or the first non-zero result of scrub_page().
+ */
+static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
+{
+        struct btrfs_device *device = sdev->dev;
+        u64 generation = device->dev_root->fs_info->last_trans_committed;
+        u64 sb_start;
+        int mirror;
+        int err;
+        for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
+                sb_start = btrfs_sb_offset(mirror);
+                /* stop at the first copy that reaches the device size */
+                if (sb_start + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+                        break;
+                err = scrub_page(sdev, sb_start, PAGE_SIZE, sb_start,
+                                 BTRFS_EXTENT_FLAG_SUPER, generation, mirror,
+                                 NULL, 1);
+                if (err)
+                        return err;
+        }
+        wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+        return 0;
+}
+/*
+ * get a reference count on fs_info->scrub_workers. start worker if necessary
+ *
+ * Returns 0 on success. If the worker has to be started and that fails, the
+ * error of btrfs_start_workers() is returned and no reference is taken, so
+ * the caller must not call scrub_workers_put() in that case.
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        int ret = 0;
+        mutex_lock(&fs_info->scrub_lock);
+        if (fs_info->scrub_workers_refcnt == 0) {
+                ret = btrfs_start_workers(&fs_info->scrub_workers, 1);
+                if (ret)
+                        goto out;
+        }
+        ++fs_info->scrub_workers_refcnt;
+out:
+        mutex_unlock(&fs_info->scrub_lock);
+        return ret;
+}
+/*
+ * drop a reference on fs_info->scrub_workers. the workers are stopped when
+ * the count reaches zero
+ */
+static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        mutex_lock(&info->scrub_lock);
+        info->scrub_workers_refcnt--;
+        if (info->scrub_workers_refcnt == 0)
+                btrfs_stop_workers(&info->scrub_workers);
+        /* a negative count means a put without a matching get */
+        WARN_ON(info->scrub_workers_refcnt < 0);
+        mutex_unlock(&info->scrub_lock);
+}
+/*
+ * Scrub one device: first its superblock copies, then every chunk whose dev
+ * extent overlaps the device byte range [start, end).
+ *
+ * If progress is not NULL, the scrub statistics are copied into it once the
+ * scrub has stopped. readonly is stored in the scrub_dev.
+ *
+ * Returns -EINVAL if the filesystem is closing or sectorsize, leafsize,
+ * nodesize and PAGE_SIZE are not all equal, -ENODEV if the device is not
+ * found, is missing or is not in_fs_metadata, -EINPROGRESS if the device is
+ * already being scrubbed, an error from scrub_workers_get() or
+ * scrub_setup_dev(), and otherwise the result of the scrub itself.
+ */
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+                    struct btrfs_scrub_progress *progress, int readonly)
+{
+        struct scrub_dev *sdev;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        int ret;
+        struct btrfs_device *dev;
+        if (fs_info->closing)
+                return -EINVAL;
+        /*
+         * check some assumptions
+         */
+        if (root->sectorsize != PAGE_SIZE ||
+            root->sectorsize != root->leafsize ||
+            root->sectorsize != root->nodesize) {
+                printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+                return -EINVAL;
+        }
+        ret = scrub_workers_get(root);
+        if (ret)
+                return ret;
+        mutex_lock(&fs_info->fs_devices->device_list_mutex);
+        dev = btrfs_find_device(root, devid, NULL, NULL);
+        if (!dev || dev->missing) {
+                ret = -ENODEV;
+                goto out_unlock_devices;
+        }
+        mutex_lock(&fs_info->scrub_lock);
+        if (!dev->in_fs_metadata) {
+                ret = -ENODEV;
+                goto out_unlock_scrub;
+        }
+        if (dev->scrub_device) {
+                ret = -EINPROGRESS;
+                goto out_unlock_scrub;
+        }
+        sdev = scrub_setup_dev(dev);
+        if (IS_ERR(sdev)) {
+                ret = PTR_ERR(sdev);
+                goto out_unlock_scrub;
+        }
+        sdev->readonly = readonly;
+        dev->scrub_device = sdev;
+        atomic_inc(&fs_info->scrubs_running);
+        mutex_unlock(&fs_info->scrub_lock);
+        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+        /* scrub_super_lock is held for reading while the supers are read */
+        down_read(&fs_info->scrub_super_lock);
+        ret = scrub_supers(sdev);
+        up_read(&fs_info->scrub_super_lock);
+        if (!ret)
+                ret = scrub_enumerate_chunks(sdev, start, end);
+        wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+        atomic_dec(&fs_info->scrubs_running);
+        wake_up(&fs_info->scrub_pause_wait);
+        if (progress)
+                memcpy(progress, &sdev->stat, sizeof(*progress));
+        mutex_lock(&fs_info->scrub_lock);
+        dev->scrub_device = NULL;
+        mutex_unlock(&fs_info->scrub_lock);
+        /*
+         * btrfs_scrub_cancel_dev() sleeps on scrub_pause_wait until
+         * scrub_device is NULL. the wake_up above happens before the
+         * pointer is cleared, so wake the waiters again now that the
+         * condition is really true
+         */
+        wake_up(&fs_info->scrub_pause_wait);
+        scrub_free_dev(sdev);
+        scrub_workers_put(root);
+        return ret;
+out_unlock_scrub:
+        mutex_unlock(&fs_info->scrub_lock);
+out_unlock_devices:
+        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+        scrub_workers_put(root);
+        return ret;
+}
+/*
+ * Request all running scrubs to pause and wait until the number of paused
+ * scrubs equals the number of running scrubs. Always returns 0.
+ */
+int btrfs_scrub_pause(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        mutex_lock(&info->scrub_lock);
+        atomic_inc(&info->scrub_pause_req);
+        for (;;) {
+                if (atomic_read(&info->scrubs_paused) ==
+                    atomic_read(&info->scrubs_running))
+                        break;
+                /* sleep without the lock, recheck with the lock held */
+                mutex_unlock(&info->scrub_lock);
+                wait_event(info->scrub_pause_wait,
+                           atomic_read(&info->scrubs_paused) ==
+                           atomic_read(&info->scrubs_running));
+                mutex_lock(&info->scrub_lock);
+        }
+        mutex_unlock(&info->scrub_lock);
+        return 0;
+}
+/*
+ * Drop one pause request and wake everybody sleeping on scrub_pause_wait.
+ * Always returns 0.
+ */
+int btrfs_scrub_continue(struct btrfs_root *root)
+{
+        atomic_dec(&root->fs_info->scrub_pause_req);
+        wake_up(&root->fs_info->scrub_pause_wait);
+        return 0;
+}
+/*
+ * Take scrub_super_lock for writing. btrfs_scrub_dev() holds it for reading
+ * around scrub_supers(). Always returns 0.
+ */
+int btrfs_scrub_pause_super(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        down_write(&info->scrub_super_lock);
+        return 0;
+}
+/*
+ * Release the write side of scrub_super_lock. Always returns 0.
+ */
+int btrfs_scrub_continue_super(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        up_write(&info->scrub_super_lock);
+        return 0;
+}
+/*
+ * Cancel all running scrubs of the filesystem and wait until none is
+ * running any more.
+ *
+ * Returns -ENOTCONN if no scrub is running, 0 otherwise.
+ */
+int btrfs_scrub_cancel(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        int ret = -ENOTCONN;
+        mutex_lock(&info->scrub_lock);
+        if (atomic_read(&info->scrubs_running) == 0)
+                goto unlock;
+        atomic_inc(&info->scrub_cancel_req);
+        while (atomic_read(&info->scrubs_running) != 0) {
+                /* sleep without the lock, recheck with the lock held */
+                mutex_unlock(&info->scrub_lock);
+                wait_event(info->scrub_pause_wait,
+                           atomic_read(&info->scrubs_running) == 0);
+                mutex_lock(&info->scrub_lock);
+        }
+        atomic_dec(&info->scrub_cancel_req);
+        ret = 0;
+unlock:
+        mutex_unlock(&info->scrub_lock);
+        return ret;
+}
+/*
+ * Cancel the scrub running on one device and wait until dev->scrub_device
+ * has been cleared.
+ *
+ * Returns -ENOTCONN if the device is not being scrubbed, 0 otherwise.
+ */
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        struct scrub_dev *running;
+        int ret = -ENOTCONN;
+        mutex_lock(&info->scrub_lock);
+        running = dev->scrub_device;
+        if (running == NULL)
+                goto unlock;
+        atomic_inc(&running->cancel_req);
+        while (dev->scrub_device != NULL) {
+                /* sleep without the lock, recheck with the lock held */
+                mutex_unlock(&info->scrub_lock);
+                wait_event(info->scrub_pause_wait,
+                           dev->scrub_device == NULL);
+                mutex_lock(&info->scrub_lock);
+        }
+        ret = 0;
+unlock:
+        mutex_unlock(&info->scrub_lock);
+        return ret;
+}
+/*
+ * Look up the device with the given devid and cancel the scrub running on
+ * it.
+ *
+ * Returns -ENODEV if there is no such device, otherwise the result of
+ * btrfs_scrub_cancel_dev().
+ */
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_device *target;
+        int ret = -ENODEV;
+        /*
+         * we have to hold the device_list_mutex here so the device
+         * does not go away in cancel_dev. FIXME: find a better solution
+         */
+        mutex_lock(&info->fs_devices->device_list_mutex);
+        target = btrfs_find_device(root, devid, NULL, NULL);
+        if (target)
+                ret = btrfs_scrub_cancel_dev(root, target);
+        mutex_unlock(&info->fs_devices->device_list_mutex);
+        return ret;
+}
+/*
+ * Copy the statistics of the scrub running on device devid into progress.
+ *
+ * Returns 0 if the statistics were copied, -ENOTCONN if the device exists
+ * but is not being scrubbed, and -ENODEV if there is no such device.
+ * progress is left untouched unless 0 is returned.
+ */
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+                         struct btrfs_scrub_progress *progress)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_device *dev;
+        struct scrub_dev *sdev = NULL;
+        mutex_lock(&fs_info->fs_devices->device_list_mutex);
+        dev = btrfs_find_device(root, devid, NULL, NULL);
+        if (dev) {
+                /*
+                 * btrfs_scrub_dev() clears dev->scrub_device under
+                 * scrub_lock right before it frees the scrub_dev. hold the
+                 * lock so the structure cannot go away during the copy
+                 */
+                mutex_lock(&fs_info->scrub_lock);
+                sdev = dev->scrub_device;
+                if (sdev)
+                        memcpy(progress, &sdev->stat, sizeof(*progress));
+                mutex_unlock(&fs_info->scrub_lock);
+        }
+        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+        /* sdev is only tested here, never dereferenced without the lock */
+        return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index be4ffa12f3ef..9b2e7e5bc3ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/cleancache.h>
 #include "compat.h"
+#include "delayed-inode.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -160,7 +161,7 @@ enum {
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-        Opt_enospc_debug, Opt_subvolrootid, Opt_err,
+        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
 };
 static match_table_t tokens = {
@@ -191,6 +192,7 @@ static match_table_t tokens = {
        {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
        {Opt_enospc_debug, "enospc_debug"},
        {Opt_subvolrootid, "subvolrootid=%d"},
+        {Opt_defrag, "autodefrag"},
        {Opt_err, NULL},
 };
@@ -369,6 +371,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_enospc_debug:
                        btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
                        break;
+                case Opt_defrag:
+                        printk(KERN_INFO "btrfs: enabling auto defrag");
+                        btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
+                        break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@ -507,8 +513,10 @@ static struct dentry *get_default_root(struct super_block *sb,
         */
        dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
        di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
-        if (IS_ERR(di))
+        if (IS_ERR(di)) {
+                btrfs_free_path(path);
                return ERR_CAST(di);
+        }
        if (!di) {
                /*
                 * Ok the default dir item isn't there.  This is weird since
@@ -741,7 +749,7 @@ static int btrfs_set_super(struct super_block *s, void *data)
 *        for multiple device setup.  Make sure to keep it in sync.
 */
 static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
-                const char *dev_name, void *data)
+                const char *device_name, void *data)
 {
        struct block_device *bdev = NULL;
        struct super_block *s;
@@ -764,7 +772,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        if (error)
                return ERR_PTR(error);
-        error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
+        error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
        if (error)
                goto error_free_subvol_name;
@@ -915,6 +923,32 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        return 0;
 }
+/*
+ * Comparison callback used to sort the devices by max_avail in descending
+ * order: returns -1 if the first entry has more space available than the
+ * second, 1 if it has less, and 0 if both are equal.
+ */
+static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+                                       const void *dev_info2)
+{
+        const struct btrfs_device_info *first = dev_info1;
+        const struct btrfs_device_info *second = dev_info2;
+        if (first->max_avail > second->max_avail)
+                return -1;
+        if (first->max_avail < second->max_avail)
+                return 1;
+        return 0;
+}
+/*
+ * Sort the nr_devices entries of devices in place by max_avail, the field
+ * holding the max free extent size of each device, largest value first.
+ */
+static inline void btrfs_descending_sort_devices(
+                                        struct btrfs_device_info *devices,
+                                        size_t nr_devices)
+{
+        sort(devices, nr_devices, sizeof(*devices),
+             btrfs_cmp_device_free_bytes, NULL);
+}
 /*
 * The helper to calc the free space on the devices that can be used to store
 * file data.
@@ -1208,10 +1242,14 @@ static int __init init_btrfs_fs(void)
        if (err)
                goto free_extent_io;
-        err = btrfs_interface_init();
+        err = btrfs_delayed_inode_init();
        if (err)
                goto free_extent_map;
+        err = btrfs_interface_init();
+        if (err)
+                goto free_delayed_inode;
        err = register_filesystem(&btrfs_fs_type);
        if (err)
                goto unregister_ioctl;
@@ -1221,6 +1259,8 @@ static int __init init_btrfs_fs(void)
 unregister_ioctl:
        btrfs_interface_exit();
+free_delayed_inode:
+        btrfs_delayed_inode_exit();
 free_extent_map:
        extent_map_exit();
 free_extent_io:
@@ -1237,6 +1277,7 @@ free_sysfs:
 static void __exit exit_btrfs_fs(void)
 {
        btrfs_destroy_cachep();
+        btrfs_delayed_inode_exit();
        extent_map_exit();
        extent_io_exit();
        btrfs_interface_exit();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4ce16ef702a3..c3c223ae6691 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -174,86 +174,9 @@ static const struct sysfs_ops btrfs_root_attr_ops = {
        .store  = btrfs_root_attr_store,
 };
-static struct kobj_type btrfs_root_ktype = {
-        .default_attrs  = btrfs_root_attrs,
-        .sysfs_ops      = &btrfs_root_attr_ops,
-        .release        = btrfs_root_release,
-};
-static struct kobj_type btrfs_super_ktype = {
-        .default_attrs  = btrfs_super_attrs,
-        .sysfs_ops      = &btrfs_super_attr_ops,
-        .release        = btrfs_super_release,
-};
 /* /sys/fs/btrfs/ entry */
 static struct kset *btrfs_kset;
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
-{
-        int error;
-        char *name;
-        char c;
-        int len = strlen(fs->sb->s_id) + 1;
-        int i;
-        name = kmalloc(len, GFP_NOFS);
-        if (!name) {
-                error = -ENOMEM;
-                goto fail;
-        }
-        for (i = 0; i < len; i++) {
-                c = fs->sb->s_id[i];
-                if (c == '/' || c == '\\')
-                        c = '!';
-                name[i] = c;
-        }
-        name[len] = '\0';
-        fs->super_kobj.kset = btrfs_kset;
-        error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
-                                     NULL, "%s", name);
-        kfree(name);
-        if (error)
-                goto fail;
-        return 0;
-fail:
-        printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
-        return error;
-}
-int btrfs_sysfs_add_root(struct btrfs_root *root)
-{
-        int error;
-        error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
-                                     &root->fs_info->super_kobj,
-                                     "%s", root->name);
-        if (error)
-                goto fail;
-        return 0;
-fail:
-        printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
-        return error;
-}
-void btrfs_sysfs_del_root(struct btrfs_root *root)
-{
-        kobject_put(&root->root_kobj);
-        wait_for_completion(&root->kobj_unregister);
-}
-void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
-{
-        kobject_put(&fs->super_kobj);
-        wait_for_completion(&fs->kobj_unregister);
-}
 int btrfs_init_sysfs(void)
 {
        btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c571734d5e5a..dc80f7156923 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "locking.h"
 #include "tree-log.h"
+#include "inode-map.h"
 #define BTRFS_ROOT_TRANS_TAG 0
@@ -80,8 +81,7 @@ static noinline int join_transaction(struct btrfs_root *root)
                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
                extent_io_tree_init(&cur_trans->dirty_pages,
-                                     root->fs_info->btree_inode->i_mapping,
+                                     root->fs_info->btree_inode->i_mapping);
-                                     GFP_NOFS);
                spin_lock(&root->fs_info->new_trans_lock);
                root->fs_info->running_transaction = cur_trans;
                spin_unlock(&root->fs_info->new_trans_lock);
@@ -347,49 +347,6 @@ out_unlock:
        return ret;
 }
-#if 0
-/*
- * rate limit against the drop_snapshot code.  This helps to slow down new
- * operations if the drop_snapshot code isn't able to keep up.
- */
-static void throttle_on_drops(struct btrfs_root *root)
-{
-        struct btrfs_fs_info *info = root->fs_info;
-        int harder_count = 0;
-harder:
-        if (atomic_read(&info->throttles)) {
-                DEFINE_WAIT(wait);
-                int thr;
-                thr = atomic_read(&info->throttle_gen);
-                do {
-                        prepare_to_wait(&info->transaction_throttle,
-                                        &wait, TASK_UNINTERRUPTIBLE);
-                        if (!atomic_read(&info->throttles)) {
-                                finish_wait(&info->transaction_throttle, &wait);
-                                break;
-                        }
-                        schedule();
-                        finish_wait(&info->transaction_throttle, &wait);
-                } while (thr == atomic_read(&info->throttle_gen));
-                harder_count++;
-                if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
-                    harder_count < 2)
-                        goto harder;
-                if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
-                    harder_count < 10)
-                        goto harder;
-                if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
-                    harder_count < 20)
-                        goto harder;
-        }
-}
-#endif
 void btrfs_throttle(struct btrfs_root *root)
 {
        mutex_lock(&root->fs_info->trans_mutex);
@@ -487,19 +444,40 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
 {
-        return __btrfs_end_transaction(trans, root, 0, 1);
+        int ret;
+        ret = __btrfs_end_transaction(trans, root, 0, 1);
+        if (ret)
+                return ret;
+        return 0;
 }
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
-        return __btrfs_end_transaction(trans, root, 1, 1);
+        int ret;
+        ret = __btrfs_end_transaction(trans, root, 1, 1);
+        if (ret)
+                return ret;
+        return 0;
 }
 int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root)
 {
-        return __btrfs_end_transaction(trans, root, 0, 0);
+        int ret;
+        ret = __btrfs_end_transaction(trans, root, 0, 0);
+        if (ret)
+                return ret;
+        return 0;
+}
+int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root)
+{
+        return __btrfs_end_transaction(trans, root, 1, 1);
 }
 /*
@@ -760,8 +738,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        btrfs_update_reloc_root(trans, root);
                        btrfs_orphan_commit_root(trans, root);
+                        btrfs_save_ino_cache(root, trans);
                        if (root->commit_root != root->node) {
+                                mutex_lock(&root->fs_commit_mutex);
                                switch_commit_root(root);
+                                btrfs_unpin_free_ino(root);
+                                mutex_unlock(&root->fs_commit_mutex);
                                btrfs_set_root_node(&root->root_item,
                                                    root->node);
                        }
@@ -809,97 +793,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
        return ret;
 }
-#if 0
-/*
- * when dropping snapshots, we generate a ton of delayed refs, and it makes
- * sense not to join the transaction while it is trying to flush the current
- * queue of delayed refs out.
- *
- * This is used by the drop snapshot code only
- */
-static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
-{
-        DEFINE_WAIT(wait);
-        mutex_lock(&info->trans_mutex);
-        while (info->running_transaction &&
-               info->running_transaction->delayed_refs.flushing) {
-                prepare_to_wait(&info->transaction_wait, &wait,
-                                TASK_UNINTERRUPTIBLE);
-                mutex_unlock(&info->trans_mutex);
-                schedule();
-                mutex_lock(&info->trans_mutex);
-                finish_wait(&info->transaction_wait, &wait);
-        }
-        mutex_unlock(&info->trans_mutex);
-        return 0;
-}
-/*
- * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
- * all of them
- */
-int btrfs_drop_dead_root(struct btrfs_root *root)
-{
-        struct btrfs_trans_handle *trans;
-        struct btrfs_root *tree_root = root->fs_info->tree_root;
-        unsigned long nr;
-        int ret;
-        while (1) {
-                /*
-                 * we don't want to jump in and create a bunch of
-                 * delayed refs if the transaction is starting to close
-                 */
-                wait_transaction_pre_flush(tree_root->fs_info);
-                trans = btrfs_start_transaction(tree_root, 1);
-                /*
-                 * we've joined a transaction, make sure it isn't
-                 * closing right now
-                 */
-                if (trans->transaction->delayed_refs.flushing) {
-                        btrfs_end_transaction(trans, tree_root);
-                        continue;
-                }
-                ret = btrfs_drop_snapshot(trans, root);
-                if (ret != -EAGAIN)
-                        break;
-                ret = btrfs_update_root(trans, tree_root,
-                                        &root->root_key,
-                                        &root->root_item);
-                if (ret)
-                        break;
-                nr = trans->blocks_used;
-                ret = btrfs_end_transaction(trans, tree_root);
-                BUG_ON(ret);
-                btrfs_btree_balance_dirty(tree_root, nr);
-                cond_resched();
-        }
-        BUG_ON(ret);
-        ret = btrfs_del_root(trans, tree_root, &root->root_key);
-        BUG_ON(ret);
-        nr = trans->blocks_used;
-        ret = btrfs_end_transaction(trans, tree_root);
-        BUG_ON(ret);
-        free_extent_buffer(root->node);
-        free_extent_buffer(root->commit_root);
-        kfree(root);
-        btrfs_btree_balance_dirty(tree_root, nr);
-        return ret;
-}
-#endif
 /*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
@@ -930,7 +823,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                goto fail;
        }
-        ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
+        ret = btrfs_find_free_objectid(tree_root, &objectid);
        if (ret) {
                pending->error = ret;
                goto fail;
@@ -967,7 +860,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        ret = btrfs_insert_dir_item(trans, parent_root,
                                dentry->d_name.name, dentry->d_name.len,
-                                parent_inode->i_ino, &key,
+                                parent_inode, &key,
                                BTRFS_FT_DIR, index);
        BUG_ON(ret);
@@ -1009,7 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         */
        ret = btrfs_add_root_ref(trans, tree_root, objectid,
                                 parent_root->root_key.objectid,
-                                 parent_inode->i_ino, index,
+                                 btrfs_ino(parent_inode), index,
                                 dentry->d_name.name, dentry->d_name.len);
        BUG_ON(ret);
        dput(parent);
@@ -1037,6 +930,14 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
        int ret;
        list_for_each_entry(pending, head, list) {
+                /*
+                 * We must deal with the delayed items before creating
+                 * snapshots, or we will create a snapshot with inconsistent
+                 * information.
+                */
+                ret = btrfs_run_delayed_items(trans, fs_info->fs_root);
+                BUG_ON(ret);
                ret = create_pending_snapshot(trans, fs_info, pending);
                BUG_ON(ret);
        }
@@ -1290,6 +1191,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        BUG_ON(ret);
                }
+                ret = btrfs_run_delayed_items(trans, root);
+                BUG_ON(ret);
                /*
                  * rename doesn't use btrfs_join_transaction, so, once we
                 * set the transaction to blocked above, we aren't going
@@ -1316,11 +1220,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);
+        ret = btrfs_run_delayed_items(trans, root);
+        BUG_ON(ret);
        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);
        WARN_ON(cur_trans != trans->transaction);
+        btrfs_scrub_pause(root);
        /* btrfs_commit_tree_roots is responsible for getting the
         * various roots consistent with each other.  Every pointer
         * in the tree of tree roots has to point to the most up to date
@@ -1405,6 +1313,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        mutex_unlock(&root->fs_info->trans_mutex);
+        btrfs_scrub_continue(root);
        if (current->journal_info == trans)
                current->journal_info = NULL;
@@ -1432,6 +1342,8 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
                root = list_entry(list.next, struct btrfs_root, root_list);
                list_del(&root->root_list);
+                btrfs_kill_all_delayed_nodes(root);
                if (btrfs_header_backref_rev(root->node) <
                    BTRFS_MIXED_BACKREF_REV)
                        btrfs_drop_snapshot(root, NULL, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e441acc6c584..804c88639e5d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -101,11 +101,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root);
-int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root);
 int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_drop_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -115,6 +112,8 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
                                   int wait_for_unblock);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root);
+int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root);
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 992ab425599d..3b580ee8ab1d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -97,7 +97,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                ret = 0;
                goto out;
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (wret < 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f997ec0c1ba4..592396c6dc47 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -333,13 +333,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
                        goto insert;
                if (item_size == 0) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        return 0;
                }
                dst_copy = kmalloc(item_size, GFP_NOFS);
                src_copy = kmalloc(item_size, GFP_NOFS);
                if (!dst_copy || !src_copy) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        kfree(dst_copy);
                        kfree(src_copy);
                        return -ENOMEM;
@@ -361,13 +361,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
                 * sync
                 */
                if (ret == 0) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        return 0;
                }
        }
 insert:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        /* try to insert the key into the destination tree */
        ret = btrfs_insert_empty_item(trans, root, path,
                                      key, item_size);
@@ -382,7 +382,6 @@ insert:
                } else if (found_size < item_size) {
                        ret = btrfs_extend_item(trans, root, path,
                                                item_size - found_size);
-                        BUG_ON(ret);
                }
        } else if (ret) {
                return ret;
@@ -438,7 +437,7 @@ insert:
        }
 no_copy:
        btrfs_mark_buffer_dirty(path->nodes[0]);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        return 0;
 }
@@ -519,7 +518,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
         * file.  This must be done before the btrfs_drop_extents run
         * so we don't try to drop this extent.
         */
-        ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+        ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
                                       start, 0);
        if (ret == 0 &&
@@ -544,11 +543,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                 * we don't have to do anything
                 */
                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        goto out;
                }
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        saved_nbytes = inode_get_bytes(inode);
        /* drop any overlapping extents */
@@ -590,6 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
                                                key->objectid, offset);
+                                BUG_ON(ret);
                        } else {
                                /*
                                 * insert the extent pointer in the extent
@@ -600,7 +600,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                key->objectid, offset, &ins);
                                BUG_ON(ret);
                        }
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        if (btrfs_file_extent_compression(eb, item)) {
                                csum_start = ins.objectid;
@@ -614,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                        ret = btrfs_lookup_csums_range(root->log_root,
                                                csum_start, csum_end - 1,
-                                                &ordered_sums);
+                                                &ordered_sums, 0);
                        BUG_ON(ret);
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
@@ -629,7 +629,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                kfree(sums);
                        }
                } else {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                }
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                /* inline extents are easy, we just overwrite them */
@@ -675,10 +675,13 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        inode = read_one_inode(root, location.objectid);
-        BUG_ON(!inode);
+        if (!inode) {
+                kfree(name);
+                return -EIO;
+        }
        ret = link_to_fixup_dir(trans, root, path, location.objectid);
        BUG_ON(ret);
@@ -713,7 +716,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
                        goto out;
        } else
                goto out;
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
        if (di && !IS_ERR(di)) {
@@ -724,7 +727,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
                goto out;
        match = 1;
 out:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        return match;
 }
@@ -817,7 +820,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                return -ENOENT;
        inode = read_one_inode(root, key->objectid);
-        BUG_ON(!inode);
+        if (!inode) {
+                iput(dir);
+                return -EIO;
+        }
        ref_ptr = btrfs_item_ptr_offset(eb, slot);
        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
@@ -832,7 +838,7 @@ again:
        read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
        /* if we already have a perfect match, we're done */
-        if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+        if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
                         btrfs_inode_ref_index(eb, ref),
                         name, namelen)) {
                goto out;
@@ -884,7 +890,7 @@ again:
                        if (!backref_in_log(log, key, victim_name,
                                            victim_name_len)) {
                                btrfs_inc_nlink(inode);
-                                btrfs_release_path(root, path);
+                                btrfs_release_path(path);
                                ret = btrfs_unlink_inode(trans, root, dir,
                                                         inode, victim_name,
@@ -901,7 +907,7 @@ again:
                 */
                search_done = 1;
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
 insert:
        /* insert our name */
@@ -922,7 +928,7 @@ out:
        BUG_ON(ret);
 out_nowrite:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        iput(dir);
        iput(inode);
        return 0;
@@ -960,8 +966,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
        unsigned long ptr;
        unsigned long ptr_end;
        int name_len;
+        u64 ino = btrfs_ino(inode);
-        key.objectid = inode->i_ino;
+        key.objectid = ino;
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = (u64)-1;
@@ -980,7 +987,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                }
                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                      path->slots[0]);
-                if (key.objectid != inode->i_ino ||
+                if (key.objectid != ino ||
                    key.type != BTRFS_INODE_REF_KEY)
                        break;
                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
@@ -999,9 +1006,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                if (key.offset == 0)
                        break;
                key.offset--;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        if (nlink != inode->i_nlink) {
                inode->i_nlink = nlink;
                btrfs_update_inode(trans, root, inode);
@@ -1011,10 +1018,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
        if (inode->i_nlink == 0) {
                if (S_ISDIR(inode->i_mode)) {
                        ret = replay_dir_deletes(trans, root, NULL, path,
-                                                 inode->i_ino, 1);
+                                                 ino, 1);
                        BUG_ON(ret);
                }
-                ret = insert_orphan_item(trans, root, inode->i_ino);
+                ret = insert_orphan_item(trans, root, ino);
                BUG_ON(ret);
        }
        btrfs_free_path(path);
@@ -1050,11 +1057,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
                        break;
                ret = btrfs_del_item(trans, root, path);
-                BUG_ON(ret);
+                if (ret)
+                        goto out;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                inode = read_one_inode(root, key.offset);
-                BUG_ON(!inode);
+                if (!inode)
+                        return -EIO;
                ret = fixup_inode_link_count(trans, root, inode);
                BUG_ON(ret);
@@ -1068,8 +1077,10 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
                 */
                key.offset = (u64)-1;
        }
-        btrfs_release_path(root, path);
+        ret = 0;
-        return 0;
+out:
+        btrfs_release_path(path);
+        return ret;
 }
@@ -1088,7 +1099,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
        struct inode *inode;
        inode = read_one_inode(root, objectid);
-        BUG_ON(!inode);
+        if (!inode)
+                return -EIO;
        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
@@ -1096,7 +1108,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        if (ret == 0) {
                btrfs_inc_nlink(inode);
                btrfs_update_inode(trans, root, inode);
@@ -1175,7 +1187,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        int ret;
        dir = read_one_inode(root, key->objectid);
-        BUG_ON(!dir);
+        if (!dir)
+                return -EIO;
        name_len = btrfs_dir_name_len(eb, di);
        name = kmalloc(name_len, GFP_NOFS);
@@ -1192,7 +1205,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                exists = 1;
        else
                exists = 0;
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        if (key->type == BTRFS_DIR_ITEM_KEY) {
                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1205,7 +1218,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        } else {
                BUG();
        }
-        if (!dst_di || IS_ERR(dst_di)) {
+        if (IS_ERR_OR_NULL(dst_di)) {
                /* we need a sequence number to insert, so we only
                 * do inserts for the BTRFS_DIR_INDEX_KEY types
                 */
@@ -1236,13 +1249,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        if (key->type == BTRFS_DIR_INDEX_KEY)
                goto insert;
 out:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        kfree(name);
        iput(dir);
        return 0;
 insert:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        ret = insert_one_name(trans, root, path, key->objectid, key->offset,
                              name, name_len, log_type, &log_key);
@@ -1363,7 +1376,7 @@ next:
        *end_ret = found_end;
        ret = 0;
 out:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        return ret;
 }
@@ -1426,12 +1439,15 @@ again:
                                                     dir_key->offset,
                                                     name, name_len, 0);
                }
-                if (!log_di || IS_ERR(log_di)) {
+                if (IS_ERR_OR_NULL(log_di)) {
                        btrfs_dir_item_key_to_cpu(eb, di, &location);
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
-                        btrfs_release_path(log, log_path);
+                        btrfs_release_path(log_path);
                        inode = read_one_inode(root, location.objectid);
-                        BUG_ON(!inode);
+                        if (!inode) {
+                                kfree(name);
+                                return -EIO;
+                        }
                        ret = link_to_fixup_dir(trans, root,
                                                path, location.objectid);
@@ -1453,7 +1469,7 @@ again:
                        ret = 0;
                        goto out;
                }
-                btrfs_release_path(log, log_path);
+                btrfs_release_path(log_path);
                kfree(name);
                ptr = (unsigned long)(di + 1);
@@ -1461,8 +1477,8 @@ again:
        }
        ret = 0;
 out:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
-        btrfs_release_path(log, log_path);
+        btrfs_release_path(log_path);
        return ret;
 }
@@ -1550,7 +1566,7 @@ again:
                                break;
                        dir_key.offset = found_key.offset + 1;
                }
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                if (range_end == (u64)-1)
                        break;
                range_start = range_end + 1;
@@ -1561,11 +1577,11 @@ next_type:
        if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
                key_type = BTRFS_DIR_LOG_INDEX_KEY;
                dir_key.type = BTRFS_DIR_INDEX_KEY;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                goto again;
        }
 out:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        btrfs_free_path(log_path);
        iput(dir);
        return ret;
@@ -2093,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * the running transaction open, so a full commit can't hop
         * in and cause problems either.
         */
+        btrfs_scrub_pause_super(root);
        write_ctree_super(trans, root->fs_info->tree_root, 1);
+        btrfs_scrub_continue_super(root);
        ret = 0;
        mutex_lock(&root->log_mutex);
@@ -2197,6 +2215,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        int ret;
        int err = 0;
        int bytes_del = 0;
+        u64 dir_ino = btrfs_ino(dir);
        if (BTRFS_I(dir)->logged_trans < trans->transid)
                return 0;
@@ -2214,7 +2233,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                goto out_unlock;
        }
-        di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+        di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
                                   name, name_len, -1);
        if (IS_ERR(di)) {
                err = PTR_ERR(di);
@@ -2225,8 +2244,8 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                bytes_del += name_len;
                BUG_ON(ret);
        }
-        btrfs_release_path(log, path);
+        btrfs_release_path(path);
-        di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+        di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
                                         index, name, name_len, -1);
        if (IS_ERR(di)) {
                err = PTR_ERR(di);
@@ -2244,10 +2263,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        if (bytes_del) {
                struct btrfs_key key;
-                key.objectid = dir->i_ino;
+                key.objectid = dir_ino;
                key.offset = 0;
                key.type = BTRFS_INODE_ITEM_KEY;
-                btrfs_release_path(log, path);
+                btrfs_release_path(path);
                ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
                if (ret < 0) {
@@ -2269,7 +2288,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                        btrfs_mark_buffer_dirty(path->nodes[0]);
                } else
                        ret = 0;
-                btrfs_release_path(log, path);
+                btrfs_release_path(path);
        }
 fail:
        btrfs_free_path(path);
@@ -2303,7 +2322,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
        log = root->log_root;
        mutex_lock(&BTRFS_I(inode)->log_mutex);
-        ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+        ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
                                  dirid, &index);
        mutex_unlock(&BTRFS_I(inode)->log_mutex);
        if (ret == -ENOSPC) {
@@ -2344,7 +2363,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
                              struct btrfs_dir_log_item);
        btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
        btrfs_mark_buffer_dirty(path->nodes[0]);
-        btrfs_release_path(log, path);
+        btrfs_release_path(path);
        return 0;
 }
@@ -2369,13 +2388,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
        int nritems;
        u64 first_offset = min_offset;
        u64 last_offset = (u64)-1;
+        u64 ino = btrfs_ino(inode);
        log = root->log_root;
-        max_key.objectid = inode->i_ino;
+        max_key.objectid = ino;
        max_key.offset = (u64)-1;
        max_key.type = key_type;
-        min_key.objectid = inode->i_ino;
+        min_key.objectid = ino;
        min_key.type = key_type;
        min_key.offset = min_offset;
@@ -2388,18 +2408,17 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
         * we didn't find anything from this transaction, see if there
         * is anything at all
         */
-        if (ret != 0 || min_key.objectid != inode->i_ino ||
+        if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
-            min_key.type != key_type) {
+                min_key.objectid = ino;
-                min_key.objectid = inode->i_ino;
                min_key.type = key_type;
                min_key.offset = (u64)-1;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
                if (ret < 0) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        return ret;
                }
-                ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+                ret = btrfs_previous_item(root, path, ino, key_type);
                /* if ret == 0 there are items for this type,
                 * create a range to tell us the last key of this type.
@@ -2417,7 +2436,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
        }
        /* go backward to find any previous key */
-        ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+        ret = btrfs_previous_item(root, path, ino, key_type);
        if (ret == 0) {
                struct btrfs_key tmp;
                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -2432,7 +2451,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                        }
                }
        }
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
        /* find the first key from this transaction again */
        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -2452,8 +2471,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                for (i = path->slots[0]; i < nritems; i++) {
                        btrfs_item_key_to_cpu(src, &min_key, i);
-                        if (min_key.objectid != inode->i_ino ||
+                        if (min_key.objectid != ino || min_key.type != key_type)
-                            min_key.type != key_type)
                                goto done;
                        ret = overwrite_item(trans, log, dst_path, src, i,
                                             &min_key);
@@ -2474,7 +2492,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                        goto done;
                }
                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
-                if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+                if (tmp.objectid != ino || tmp.type != key_type) {
                        last_offset = (u64)-1;
                        goto done;
                }
@@ -2490,8 +2508,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                }
        }
 done:
-        btrfs_release_path(root, path);
+        btrfs_release_path(path);
-        btrfs_release_path(log, dst_path);
+        btrfs_release_path(dst_path);
        if (err == 0) {
                *last_offset_ret = last_offset;
@@ -2500,8 +2518,7 @@ done:
                 * is valid
                 */
                ret = insert_dir_log_key(trans, log, path, key_type,
-                                         inode->i_ino, first_offset,
+                                         ino, first_offset, last_offset);
-                                         last_offset);
                if (ret)
                        err = ret;
        }
@@ -2587,10 +2604,11 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
                        break;
                ret = btrfs_del_item(trans, log, path);
-                BUG_ON(ret);
+                if (ret)
-                btrfs_release_path(log, path);
+                        break;
+                btrfs_release_path(path);
        }
-        btrfs_release_path(log, path);
+        btrfs_release_path(path);
        return ret;
 }
@@ -2665,6 +2683,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                        extent = btrfs_item_ptr(src, start_slot + i,
                                                struct btrfs_file_extent_item);
+                        if (btrfs_file_extent_generation(src, extent) < trans->transid)
+                                continue;
                        found_type = btrfs_file_extent_type(src, extent);
                        if (found_type == BTRFS_FILE_EXTENT_REG ||
                            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -2689,14 +2710,14 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                                ret = btrfs_lookup_csums_range(
                                                log->fs_info->csum_root,
                                                ds + cs, ds + cs + cl - 1,
-                                                &ordered_sums);
+                                                &ordered_sums, 0);
                                BUG_ON(ret);
                        }
                }
        }
        btrfs_mark_buffer_dirty(dst_path->nodes[0]);
-        btrfs_release_path(log, dst_path);
+        btrfs_release_path(dst_path);
        kfree(ins_data);
        /*
@@ -2745,6 +2766,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        int nritems;
        int ins_start_slot = 0;
        int ins_nr;
+        u64 ino = btrfs_ino(inode);
        log = root->log_root;
@@ -2757,11 +2779,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
-        min_key.objectid = inode->i_ino;
+        min_key.objectid = ino;
        min_key.type = BTRFS_INODE_ITEM_KEY;
        min_key.offset = 0;
-        max_key.objectid = inode->i_ino;
+        max_key.objectid = ino;
        /* today the code can only do partial logging of directories */
        if (!S_ISDIR(inode->i_mode))
@@ -2773,6 +2795,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
+        ret = btrfs_commit_inode_delayed_items(trans, inode);
+        if (ret) {
+                btrfs_free_path(path);
+                btrfs_free_path(dst_path);
+                return ret;
+        }
        mutex_lock(&BTRFS_I(inode)->log_mutex);
        /*
@@ -2784,8 +2813,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                if (inode_only == LOG_INODE_EXISTS)
                        max_key_type = BTRFS_XATTR_ITEM_KEY;
-                ret = drop_objectid_items(trans, log, path,
+                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
-                                          inode->i_ino, max_key_type);
        } else {
                ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
        }
@@ -2803,7 +2831,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                        break;
 again:
                /* note, ins_nr might be > 0 here, cleanup outside the loop */
-                if (min_key.objectid != inode->i_ino)
+                if (min_key.objectid != ino)
                        break;
                if (min_key.type > max_key.type)
                        break;
@@ -2845,7 +2873,7 @@ next_slot:
                        }
                        ins_nr = 0;
                }
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                if (min_key.offset < (u64)-1)
                        min_key.offset++;
@@ -2868,8 +2896,8 @@ next_slot:
        }
        WARN_ON(ins_nr);
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
-                btrfs_release_path(log, dst_path);
+                btrfs_release_path(dst_path);
                ret = log_directory_changes(trans, root, inode, path, dst_path);
                if (ret) {
                        err = ret;
@@ -3136,7 +3164,7 @@ again:
                }
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
-                btrfs_release_path(log_root_tree, path);
+                btrfs_release_path(path);
                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                        break;
@@ -3171,7 +3199,7 @@ again:
                if (found_key.offset == 0)
                        break;
        }
-        btrfs_release_path(log_root_tree, path);
+        btrfs_release_path(path);
        /* step one is to pin it all, step two is to replay just inodes */
        if (wc.pin) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 3dfae84c8cc8..2270ac58d746 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -38,7 +38,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               const char *name, int name_len,
                               struct inode *inode, u64 dirid);
-int btrfs_join_running_log_trans(struct btrfs_root *root);
 int btrfs_end_log_trans(struct btrfs_root *root);
 int btrfs_pin_log_trans(struct btrfs_root *root);
 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
deleted file mode 100644
index 1ca1952fd917..000000000000
--- a/fs/btrfs/version.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-#
-# determine-version -- report a useful version for releases
-#
-# Copyright 2008, Aron Griffis <agriffis@n01se.net>
-# Copyright 2008, Oracle
-# Released under the GNU GPLv2
- 
-v="v0.16"
-which git &> /dev/null
-if [ $? == 0 ]; then
-    git branch >& /dev/null
-    if [ $? == 0 ]; then
-            if head=`git rev-parse --verify HEAD 2>/dev/null`; then
-                if tag=`git describe --tags 2>/dev/null`; then
-                    v="$tag"
-                fi
-                # Are there uncommitted changes?
-                git update-index --refresh --unmerged > /dev/null
-                if git diff-index --name-only HEAD | \
-                    grep -v "^scripts/package" \
-                    | read dummy; then
-                    v="$v"-dirty
-                fi
-            fi
-    fi
-fi
- 
-echo "#ifndef __BUILD_VERSION" > .build-version.h
-echo "#define __BUILD_VERSION" >> .build-version.h
-echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
-echo "#endif" >> .build-version.h
-diff -q version.h .build-version.h >& /dev/null
-if [ $? == 0 ]; then
-    rm .build-version.h
-    exit 0
-fi
-mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c7367ae5a3e6..c48214ef5c09 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -38,22 +38,9 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
-                            (sizeof(struct btrfs_bio_stripe) * (n)))
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
-void btrfs_lock_volumes(void)
-{
-        mutex_lock(&uuid_mutex);
-}
-void btrfs_unlock_volumes(void)
-{
-        mutex_unlock(&uuid_mutex);
-}
 static void lock_chunks(struct btrfs_root *root)
 {
        mutex_lock(&root->fs_info->chunk_mutex);
@@ -363,7 +350,7 @@ static noinline int device_list_add(const char *path,
                INIT_LIST_HEAD(&device->dev_alloc_list);
                mutex_lock(&fs_devices->device_list_mutex);
-                list_add(&device->dev_list, &fs_devices->devices);
+                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);
                device->fs_devices = fs_devices;
@@ -406,7 +393,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
-        mutex_lock(&orig->device_list_mutex);
+        /* We hold the volume lock, so it is safe to read the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
@@ -429,10 +416,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
-        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
 error:
-        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
 }
@@ -443,7 +428,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
        mutex_lock(&uuid_mutex);
 again:
-        mutex_lock(&fs_devices->device_list_mutex);
+        /* This is the initialization path, so it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata)
                        continue;
@@ -463,7 +448,6 @@ again:
                kfree(device->name);
                kfree(device);
        }
-        mutex_unlock(&fs_devices->device_list_mutex);
        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
@@ -474,6 +458,29 @@ again:
        return 0;
 }
+static void __free_device(struct work_struct *work)
+{
+        struct btrfs_device *device;
+        device = container_of(work, struct btrfs_device, rcu_work);
+        if (device->bdev)
+                blkdev_put(device->bdev, device->mode);
+        kfree(device->name);
+        kfree(device);
+}
+static void free_device(struct rcu_head *head)
+{
+        struct btrfs_device *device;
+        device = container_of(head, struct btrfs_device, rcu);
+        INIT_WORK(&device->rcu_work, __free_device);
+        schedule_work(&device->rcu_work);
+}
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
        struct btrfs_device *device;
@@ -481,20 +488,32 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
        if (--fs_devices->opened > 0)
                return 0;
+        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
-                if (device->bdev) {
+                struct btrfs_device *new_device;
-                        blkdev_put(device->bdev, device->mode);
+                if (device->bdev)
                        fs_devices->open_devices--;
-                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }
-                device->bdev = NULL;
+                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
-                device->writeable = 0;
+                BUG_ON(!new_device);
-                device->in_fs_metadata = 0;
+                memcpy(new_device, device, sizeof(*new_device));
+                new_device->name = kstrdup(device->name, GFP_NOFS);
+                BUG_ON(!new_device->name);
+                new_device->bdev = NULL;
+                new_device->writeable = 0;
+                new_device->in_fs_metadata = 0;
+                list_replace_rcu(&device->dev_list, &new_device->dev_list);
+                call_rcu(&device->rcu, free_device);
        }
+        mutex_unlock(&fs_devices->device_list_mutex);
        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
@@ -597,6 +616,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
+                brelse(bh);
                continue;
 error_brelse:
@@ -815,10 +835,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
-        search_start = 1024 * 1024;
+        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
-        if (root->fs_info->alloc_start + num_bytes <= search_end)
-                search_start = max(root->fs_info->alloc_start, search_start);
        max_hole_start = search_start;
        max_hole_size = 0;
@@ -949,14 +966,14 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
-                BUG_ON(ret);
+                if (ret)
+                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
-                ret = 0;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -967,8 +984,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
        if (device->bytes_used > 0)
                device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
        ret = btrfs_del_item(trans, root, path);
-        BUG_ON(ret);
+out:
        btrfs_free_path(path);
        return ret;
 }
@@ -1203,11 +1220,13 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
+        struct btrfs_fs_devices *cur_devices;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;
+        bool clear_super = false;
        mutex_lock(&uuid_mutex);
        mutex_lock(&root->fs_info->volume_mutex);
@@ -1238,14 +1257,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
-                mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+                /*
+                 * It is safe to read the devices since the volume_mutex
+                 * is held.
+                 */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
-                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
@@ -1287,8 +1308,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        }
        if (device->writeable) {
+                lock_chunks(root);
                list_del_init(&device->dev_alloc_list);
+                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices--;
+                clear_super = true;
        }
        ret = btrfs_shrink_device(device, 0);
@@ -1300,15 +1324,17 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                goto error_undo;
        device->in_fs_metadata = 0;
+        btrfs_scrub_cancel_dev(root, device);
        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers.
         */
+        cur_devices = device->fs_devices;
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-        list_del_init(&device->dev_list);
+        list_del_rcu(&device->dev_list);
-        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        device->fs_devices->num_devices--;
@@ -1322,34 +1348,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;
-        if (device->bdev) {
+        if (device->bdev)
-                blkdev_put(device->bdev, device->mode);
-                device->bdev = NULL;
                device->fs_devices->open_devices--;
-        }
+        call_rcu(&device->rcu, free_device);
+        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
-        if (device->fs_devices->open_devices == 0) {
+        if (cur_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
-                        if (fs_devices->seed == device->fs_devices)
+                        if (fs_devices->seed == cur_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
-                fs_devices->seed = device->fs_devices->seed;
+                fs_devices->seed = cur_devices->seed;
-                device->fs_devices->seed = NULL;
+                cur_devices->seed = NULL;
-                __btrfs_close_devices(device->fs_devices);
+                lock_chunks(root);
-                free_fs_devices(device->fs_devices);
+                __btrfs_close_devices(cur_devices);
+                unlock_chunks(root);
+                free_fs_devices(cur_devices);
        }
        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
-        if (device->writeable) {
+        if (clear_super) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
@@ -1358,8 +1386,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                sync_dirty_buffer(bh);
        }
-        kfree(device->name);
-        kfree(device);
        ret = 0;
 error_brelse:
@@ -1373,8 +1399,10 @@ out:
        return ret;
 error_undo:
        if (device->writeable) {
+                lock_chunks(root);
                list_add(&device->dev_alloc_list,
                         &root->fs_info->fs_devices->alloc_list);
+                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices++;
        }
        goto error_brelse;
@@ -1414,7 +1442,12 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        mutex_init(&seed_devices->device_list_mutex);
-        list_splice_init(&fs_devices->devices, &seed_devices->devices);
+        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
+                              synchronize_rcu);
+        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
@@ -1475,7 +1508,7 @@ next_slot:
                                goto error;
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        continue;
                }
@@ -1611,7 +1644,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         * half setup
         */
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-        list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+        list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
        list_add(&device->dev_alloc_list,
                 &root->fs_info->fs_devices->alloc_list);
        root->fs_info->fs_devices->num_devices++;
@@ -1769,10 +1802,9 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        ret = btrfs_del_item(trans, root, path);
-        BUG_ON(ret);
        btrfs_free_path(path);
-        return 0;
+        return ret;
 }
 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
@@ -1947,7 +1979,7 @@ again:
                chunk = btrfs_item_ptr(leaf, path->slots[0],
                                       struct btrfs_chunk);
                chunk_type = btrfs_chunk_type(leaf, chunk);
-                btrfs_release_path(chunk_root, path);
+                btrfs_release_path(path);
                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
                        ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
@@ -2065,7 +2097,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
                if (found_key.offset == 0)
                        break;
-                btrfs_release_path(chunk_root, path);
+                btrfs_release_path(path);
                ret = btrfs_relocate_chunk(chunk_root,
                                           chunk_root->root_key.objectid,
                                           found_key.objectid,
@@ -2137,7 +2169,7 @@ again:
                        goto done;
                if (ret) {
                        ret = 0;
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        break;
                }
@@ -2146,7 +2178,7 @@ again:
                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
                if (key.objectid != device->devid) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        break;
                }
@@ -2154,14 +2186,14 @@ again:
                length = btrfs_dev_extent_length(l, dev_extent);
                if (key.offset + length <= new_size) {
-                        btrfs_release_path(root, path);
+                        btrfs_release_path(path);
                        break;
                }
                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
                                           chunk_offset);
@@ -2237,275 +2269,204 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
        return 0;
 }
-static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
+/*
-                                        int num_stripes, int sub_stripes)
+ * sort the devices in descending order by max_avail, total_avail
+ */
+static int btrfs_cmp_device_info(const void *a, const void *b)
 {
-        if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+        const struct btrfs_device_info *di_a = a;
-                return calc_size;
+        const struct btrfs_device_info *di_b = b;
-        else if (type & BTRFS_BLOCK_GROUP_RAID10)
-                return calc_size * (num_stripes / sub_stripes);
-        else
-                return calc_size * num_stripes;
-}
-/* Used to sort the devices by max_avail(descending sort) */
+        if (di_a->max_avail > di_b->max_avail)
-int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
-{
-        if (((struct btrfs_device_info *)dev_info1)->max_avail >
-            ((struct btrfs_device_info *)dev_info2)->max_avail)
                return -1;
-        else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+        if (di_a->max_avail < di_b->max_avail)
-                 ((struct btrfs_device_info *)dev_info2)->max_avail)
                return 1;
-        else
+        if (di_a->total_avail > di_b->total_avail)
-                return 0;
+                return -1;
+        if (di_a->total_avail < di_b->total_avail)
+                return 1;
+        return 0;
 }
-static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-                                 int *num_stripes, int *min_stripes,
+                               struct btrfs_root *extent_root,
-                                 int *sub_stripes)
+                               struct map_lookup **map_ret,
+                               u64 *num_bytes_out, u64 *stripe_size_out,
+                               u64 start, u64 type)
 {
-        *num_stripes = 1;
+        struct btrfs_fs_info *info = extent_root->fs_info;
-        *min_stripes = 1;
+        struct btrfs_fs_devices *fs_devices = info->fs_devices;
-        *sub_stripes = 0;
+        struct list_head *cur;
+        struct map_lookup *map = NULL;
+        struct extent_map_tree *em_tree;
+        struct extent_map *em;
+        struct btrfs_device_info *devices_info = NULL;
+        u64 total_avail;
+        int num_stripes;        /* total number of stripes to allocate */
+        int sub_stripes;        /* sub_stripes info for map */
+        int dev_stripes;        /* stripes per dev */
+        int devs_max;           /* max devs to use */
+        int devs_min;           /* min devs needed */
+        int devs_increment;     /* ndevs has to be a multiple of this */
+        int ncopies;            /* how many copies the data has */
+        int ret;
+        u64 max_stripe_size;
+        u64 max_chunk_size;
+        u64 stripe_size;
+        u64 num_bytes;
+        int ndevs;
+        int i;
+        int j;
-        if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
+        if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
-                *num_stripes = fs_devices->rw_devices;
+            (type & BTRFS_BLOCK_GROUP_DUP)) {
-                *min_stripes = 2;
+                WARN_ON(1);
-        }
+                type &= ~BTRFS_BLOCK_GROUP_DUP;
-        if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-                *num_stripes = 2;
-                *min_stripes = 2;
-        }
-        if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-                if (fs_devices->rw_devices < 2)
-                        return -ENOSPC;
-                *num_stripes = 2;
-                *min_stripes = 2;
-        }
-        if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-                *num_stripes = fs_devices->rw_devices;
-                if (*num_stripes < 4)
-                        return -ENOSPC;
-                *num_stripes &= ~(u32)1;
-                *sub_stripes = 2;
-                *min_stripes = 4;
        }
-        return 0;
+        if (list_empty(&fs_devices->alloc_list))
-}
+                return -ENOSPC;
-static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+        sub_stripes = 1;
-                                    u64 proposed_size, u64 type,
+        dev_stripes = 1;
-                                    int num_stripes, int small_stripe)
+        devs_increment = 1;
-{
+        ncopies = 1;
-        int min_stripe_size = 1 * 1024 * 1024;
+        devs_max = 0;   /* 0 == as many as possible */
-        u64 calc_size = proposed_size;
+        devs_min = 1;
-        u64 max_chunk_size = calc_size;
-        int ncopies = 1;
-        if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+        /*
-                    BTRFS_BLOCK_GROUP_DUP |
+         * define the properties of each RAID type.
-                    BTRFS_BLOCK_GROUP_RAID10))
+         * FIXME: move this to a global table and use it in all RAID
+         * calculation code
+         */
+        if (type & (BTRFS_BLOCK_GROUP_DUP)) {
+                dev_stripes = 2;
+                ncopies = 2;
+                devs_max = 1;
+        } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
+                devs_min = 2;
+        } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+                devs_increment = 2;
                ncopies = 2;
+                devs_max = 2;
+                devs_min = 2;
+        } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+                sub_stripes = 2;
+                devs_increment = 2;
+                ncopies = 2;
+                devs_min = 4;
+        } else {
+                devs_max = 1;
+        }
        if (type & BTRFS_BLOCK_GROUP_DATA) {
-                max_chunk_size = 10 * calc_size;
+                max_stripe_size = 1024 * 1024 * 1024;
-                min_stripe_size = 64 * 1024 * 1024;
+                max_chunk_size = 10 * max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-                max_chunk_size = 256 * 1024 * 1024;
+                max_stripe_size = 256 * 1024 * 1024;
-                min_stripe_size = 32 * 1024 * 1024;
+                max_chunk_size = max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-                calc_size = 8 * 1024 * 1024;
+                max_stripe_size = 8 * 1024 * 1024;
-                max_chunk_size = calc_size * 2;
+                max_chunk_size = 2 * max_stripe_size;
-                min_stripe_size = 1 * 1024 * 1024;
+        } else {
+                printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
+                       type);
+                BUG_ON(1);
        }
        /* we don't want a chunk larger than 10% of writeable space */
        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
                             max_chunk_size);
-        if (calc_size * num_stripes > max_chunk_size * ncopies) {
+        devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
-                calc_size = max_chunk_size * ncopies;
+                               GFP_NOFS);
-                do_div(calc_size, num_stripes);
+        if (!devices_info)
-                do_div(calc_size, BTRFS_STRIPE_LEN);
+                return -ENOMEM;
-                calc_size *= BTRFS_STRIPE_LEN;
-        }
-        /* we don't want tiny stripes */
+        cur = fs_devices->alloc_list.next;
-        if (!small_stripe)
-                calc_size = max_t(u64, min_stripe_size, calc_size);
        /*
-         * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
+         * in the first pass through the devices list, we gather information
-         * we end up with something bigger than a stripe
+         * about the available holes on each device.
         */
-        calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+        ndevs = 0;
+        while (cur != &fs_devices->alloc_list) {
-        do_div(calc_size, BTRFS_STRIPE_LEN);
+                struct btrfs_device *device;
-        calc_size *= BTRFS_STRIPE_LEN;
+                u64 max_avail;
+                u64 dev_offset;
-        return calc_size;
-}
-static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
-                                                      int num_stripes)
-{
-        struct map_lookup *new;
-        size_t len = map_lookup_size(num_stripes);
-        BUG_ON(map->num_stripes < num_stripes);
-        if (map->num_stripes == num_stripes)
-                return map;
-        new = kmalloc(len, GFP_NOFS);
-        if (!new) {
-                /* just change map->num_stripes */
-                map->num_stripes = num_stripes;
-                return map;
-        }
-        memcpy(new, map, len);
-        new->num_stripes = num_stripes;
-        kfree(map);
-        return new;
-}
-/*
+                device = list_entry(cur, struct btrfs_device, dev_alloc_list);
- * helper to allocate device space from btrfs_device_info, in which we stored
- * max free space information of every device. It is used when we can not
- * allocate chunks by default size.
- *
- * By this helper, we can allocate a new chunk as larger as possible.
- */
-static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
-                                    struct btrfs_fs_devices *fs_devices,
-                                    struct btrfs_device_info *devices,
-                                    int nr_device, u64 type,
-                                    struct map_lookup **map_lookup,
-                                    int min_stripes, u64 *stripe_size)
-{
-        int i, index, sort_again = 0;
-        int min_devices = min_stripes;
-        u64 max_avail, min_free;
-        struct map_lookup *map = *map_lookup;
-        int ret;
-        if (nr_device < min_stripes)
+                cur = cur->next;
-                return -ENOSPC;
-        btrfs_descending_sort_devices(devices, nr_device);
+                if (!device->writeable) {
+                        printk(KERN_ERR
+                               "btrfs: read-only device in alloc_list\n");
+                        WARN_ON(1);
+                        continue;
+                }
-        max_avail = devices[0].max_avail;
+                if (!device->in_fs_metadata)
-        if (!max_avail)
+                        continue;
-                return -ENOSPC;
-        for (i = 0; i < nr_device; i++) {
+                if (device->total_bytes > device->bytes_used)
-                /*
+                        total_avail = device->total_bytes - device->bytes_used;
-                 * if dev_offset = 0, it means the free space of this device
+                else
-                 * is less than what we need, and we didn't search max avail
+                        total_avail = 0;
-                 * extent on this device, so do it now.
+                /* avail is off by max(alloc_start, 1MB), but that is the same
+                 * for all devices, so it doesn't hurt the sorting later on
                 */
-                if (!devices[i].dev_offset) {
-                        ret = find_free_dev_extent(trans, devices[i].dev,
-                                                   max_avail,
-                                                   &devices[i].dev_offset,
-                                                   &devices[i].max_avail);
-                        if (ret != 0 && ret != -ENOSPC)
-                                return ret;
-                        sort_again = 1;
-                }
-        }
-        /* we update the max avail free extent of each devices, sort again */
-        if (sort_again)
-                btrfs_descending_sort_devices(devices, nr_device);
-        if (type & BTRFS_BLOCK_GROUP_DUP)
+                ret = find_free_dev_extent(trans, device,
-                min_devices = 1;
+                                           max_stripe_size * dev_stripes,
+                                           &dev_offset, &max_avail);
+                if (ret && ret != -ENOSPC)
+                        goto error;
-        if (!devices[min_devices - 1].max_avail)
+                if (ret == 0)
-                return -ENOSPC;
+                        max_avail = max_stripe_size * dev_stripes;
-        max_avail = devices[min_devices - 1].max_avail;
+                if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
-        if (type & BTRFS_BLOCK_GROUP_DUP)
+                        continue;
-                do_div(max_avail, 2);
-        max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+                devices_info[ndevs].dev_offset = dev_offset;
-                                             min_stripes, 1);
+                devices_info[ndevs].max_avail = max_avail;
-        if (type & BTRFS_BLOCK_GROUP_DUP)
+                devices_info[ndevs].total_avail = total_avail;
-                min_free = max_avail * 2;
+                devices_info[ndevs].dev = device;
-        else
+                ++ndevs;
-                min_free = max_avail;
+        }
-        if (min_free > devices[min_devices - 1].max_avail)
+        /*
-                return -ENOSPC;
+         * now sort the devices by hole size / available space
+         */
+        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+             btrfs_cmp_device_info, NULL);
-        map = __shrink_map_lookup_stripes(map, min_stripes);
+        /* round down to number of usable stripes */
-        *stripe_size = max_avail;
+        ndevs -= ndevs % devs_increment;
-        index = 0;
+        if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
-        for (i = 0; i < min_stripes; i++) {
+                ret = -ENOSPC;
-                map->stripes[i].dev = devices[index].dev;
+                goto error;
-                map->stripes[i].physical = devices[index].dev_offset;
-                if (type & BTRFS_BLOCK_GROUP_DUP) {
-                        i++;
-                        map->stripes[i].dev = devices[index].dev;
-                        map->stripes[i].physical = devices[index].dev_offset +
-                                                   max_avail;
-                }
-                index++;
        }
-        *map_lookup = map;
-        return 0;
+        if (devs_max && ndevs > devs_max)
-}
+                ndevs = devs_max;
+        /*
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+         * the primary goal is to maximize the number of stripes, so use as many
-                               struct btrfs_root *extent_root,
+         * devices as possible, even if the stripes are not maximum sized.
-                               struct map_lookup **map_ret,
+         */
-                               u64 *num_bytes, u64 *stripe_size,
+        stripe_size = devices_info[ndevs-1].max_avail;
-                               u64 start, u64 type)
+        num_stripes = ndevs * dev_stripes;
-{
-        struct btrfs_fs_info *info = extent_root->fs_info;
-        struct btrfs_device *device = NULL;
-        struct btrfs_fs_devices *fs_devices = info->fs_devices;
-        struct list_head *cur;
-        struct map_lookup *map;
-        struct extent_map_tree *em_tree;
-        struct extent_map *em;
-        struct btrfs_device_info *devices_info;
-        struct list_head private_devs;
-        u64 calc_size = 1024 * 1024 * 1024;
-        u64 min_free;
-        u64 avail;
-        u64 dev_offset;
-        int num_stripes;
-        int min_stripes;
-        int sub_stripes;
-        int min_devices;        /* the min number of devices we need */
-        int i;
-        int ret;
-        int index;
-        if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+        if (stripe_size * num_stripes > max_chunk_size * ncopies) {
-            (type & BTRFS_BLOCK_GROUP_DUP)) {
+                stripe_size = max_chunk_size * ncopies;
-                WARN_ON(1);
+                do_div(stripe_size, num_stripes);
-                type &= ~BTRFS_BLOCK_GROUP_DUP;
        }
-        if (list_empty(&fs_devices->alloc_list))
-                return -ENOSPC;
-        ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
-                                    &min_stripes, &sub_stripes);
-        if (ret)
-                return ret;
-        devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+        do_div(stripe_size, dev_stripes);
-                               GFP_NOFS);
+        do_div(stripe_size, BTRFS_STRIPE_LEN);
-        if (!devices_info)
+        stripe_size *= BTRFS_STRIPE_LEN;
-                return -ENOMEM;
        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
        if (!map) {
@@ -2514,85 +2475,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        }
        map->num_stripes = num_stripes;
-        cur = fs_devices->alloc_list.next;
+        for (i = 0; i < ndevs; ++i) {
-        index = 0;
+                for (j = 0; j < dev_stripes; ++j) {
-        i = 0;
+                        int s = i * dev_stripes + j;
+                        map->stripes[s].dev = devices_info[i].dev;
-        calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+                        map->stripes[s].physical = devices_info[i].dev_offset +
-                                             num_stripes, 0);
+                                                   j * stripe_size;
-        if (type & BTRFS_BLOCK_GROUP_DUP) {
-                min_free = calc_size * 2;
-                min_devices = 1;
-        } else {
-                min_free = calc_size;
-                min_devices = min_stripes;
-        }
-        INIT_LIST_HEAD(&private_devs);
-        while (index < num_stripes) {
-                device = list_entry(cur, struct btrfs_device, dev_alloc_list);
-                BUG_ON(!device->writeable);
-                if (device->total_bytes > device->bytes_used)
-                        avail = device->total_bytes - device->bytes_used;
-                else
-                        avail = 0;
-                cur = cur->next;
-                if (device->in_fs_metadata && avail >= min_free) {
-                        ret = find_free_dev_extent(trans, device, min_free,
-                                                   &devices_info[i].dev_offset,
-                                                   &devices_info[i].max_avail);
-                        if (ret == 0) {
-                                list_move_tail(&device->dev_alloc_list,
-                                               &private_devs);
-                                map->stripes[index].dev = device;
-                                map->stripes[index].physical =
-                                                devices_info[i].dev_offset;
-                                index++;
-                                if (type & BTRFS_BLOCK_GROUP_DUP) {
-                                        map->stripes[index].dev = device;
-                                        map->stripes[index].physical =
-                                                devices_info[i].dev_offset +
-                                                calc_size;
-                                        index++;
-                                }
-                        } else if (ret != -ENOSPC)
-                                goto error;
-                        devices_info[i].dev = device;
-                        i++;
-                } else if (device->in_fs_metadata &&
-                           avail >= BTRFS_STRIPE_LEN) {
-                        devices_info[i].dev = device;
-                        devices_info[i].max_avail = avail;
-                        i++;
-                }
-                if (cur == &fs_devices->alloc_list)
-                        break;
-        }
-        list_splice(&private_devs, &fs_devices->alloc_list);
-        if (index < num_stripes) {
-                if (index >= min_stripes) {
-                        num_stripes = index;
-                        if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-                                num_stripes /= sub_stripes;
-                                num_stripes *= sub_stripes;
-                        }
-                        map = __shrink_map_lookup_stripes(map, num_stripes);
-                } else if (i >= min_devices) {
-                        ret = __btrfs_alloc_tiny_space(trans, fs_devices,
-                                                       devices_info, i, type,
-                                                       &map, min_stripes,
-                                                       &calc_size);
-                        if (ret)
-                                goto error;
-                } else {
-                        ret = -ENOSPC;
-                        goto error;
                }
        }
        map->sector_size = extent_root->sectorsize;
@@ -2603,20 +2491,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        map->sub_stripes = sub_stripes;
        *map_ret = map;
-        *stripe_size = calc_size;
+        num_bytes = stripe_size * (num_stripes / ncopies);
-        *num_bytes = chunk_bytes_by_type(type, calc_size,
-                                         map->num_stripes, sub_stripes);
-        trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
+        *stripe_size_out = stripe_size;
+        *num_bytes_out = num_bytes;
-        em = alloc_extent_map(GFP_NOFS);
+        trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
+        em = alloc_extent_map();
        if (!em) {
                ret = -ENOMEM;
                goto error;
        }
        em->bdev = (struct block_device *)map;
        em->start = start;
-        em->len = *num_bytes;
+        em->len = num_bytes;
        em->block_start = 0;
        em->block_len = em->len;
@@ -2629,20 +2518,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        ret = btrfs_make_block_group(trans, extent_root, 0, type,
                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                                     start, *num_bytes);
+                                     start, num_bytes);
        BUG_ON(ret);
-        index = 0;
+        for (i = 0; i < map->num_stripes; ++i) {
-        while (index < map->num_stripes) {
+                struct btrfs_device *device;
-                device = map->stripes[index].dev;
+                u64 dev_offset;
-                dev_offset = map->stripes[index].physical;
+                device = map->stripes[i].dev;
+                dev_offset = map->stripes[i].physical;
                ret = btrfs_alloc_dev_extent(trans, device,
                                info->chunk_root->root_key.objectid,
                                BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                                start, dev_offset, calc_size);
+                                start, dev_offset, stripe_size);
                BUG_ON(ret);
-                index++;
        }
        kfree(devices_info);
@@ -2849,7 +2739,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
 {
-        extent_map_tree_init(&tree->map_tree, GFP_NOFS);
+        extent_map_tree_init(&tree->map_tree);
 }
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
@@ -3499,7 +3389,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                free_extent_map(em);
        }
-        em = alloc_extent_map(GFP_NOFS);
+        em = alloc_extent_map();
        if (!em)
                return -ENOMEM;
        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3688,15 +3578,6 @@ static int read_one_dev(struct btrfs_root *root,
        return ret;
 }
-int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
-{
-        struct btrfs_dev_item *dev_item;
-        dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
-                                                     dev_item);
-        return read_one_dev(root, buf, dev_item);
-}
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
        struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -3813,7 +3694,7 @@ again:
        }
        if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
                key.objectid = 0;
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                goto again;
        }
        ret = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index cc2eadaf7a27..7c12d61ae7ae 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -85,7 +85,12 @@ struct btrfs_device {
        /* physical drive uuid (or lvm uuid) */
        u8 uuid[BTRFS_UUID_SIZE];
+        /* per-device scrub information */
+        struct scrub_dev *scrub_device;
        struct btrfs_work work;
+        struct rcu_head rcu;
+        struct work_struct rcu_work;
 };
 struct btrfs_fs_devices {
@@ -144,6 +149,7 @@ struct btrfs_device_info {
        struct btrfs_device *dev;
        u64 dev_offset;
        u64 max_avail;
+        u64 total_avail;
 };
 struct map_lookup {
@@ -157,20 +163,8 @@ struct map_lookup {
        struct btrfs_bio_stripe stripes[];
 };
-/* Used to sort the devices by max_avail(descending sort) */
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
-int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+                            (sizeof(struct btrfs_bio_stripe) * (n)))
-/*
- * sort the devices by max_avail, in which max free extent size of each device
- * is stored.(Descending Sort)
- */
-static inline void btrfs_descending_sort_devices(
-                                        struct btrfs_device_info *devices,
-                                        size_t nr_devices)
-{
-        sort(devices, nr_devices, sizeof(struct btrfs_device_info),
-             btrfs_cmp_device_free_bytes, NULL);
-}
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length);
@@ -196,7 +190,6 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                  int mirror_num, int async_submit);
-int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder);
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
@@ -209,8 +202,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
-                      u64 logical, struct page *page);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
                      struct btrfs_device *device, u64 new_size);
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
@@ -218,8 +209,6 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_root *dev_root);
-void btrfs_unlock_volumes(void);
-void btrfs_lock_volumes(void);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index cfd660550ded..f3107e4b4d56 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -44,7 +44,7 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
                return -ENOMEM;
        /* lookup the xattr by name */
-        di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+        di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
                                strlen(name), 0);
        if (!di) {
                ret = -ENODATA;
@@ -103,7 +103,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        /* first lets see if we already have this xattr */
-        di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+        di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
                                strlen(name), -1);
        if (IS_ERR(di)) {
                ret = PTR_ERR(di);
@@ -120,13 +120,13 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
                ret = btrfs_delete_one_dir_name(trans, root, path, di);
                BUG_ON(ret);
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                /* if we don't have a value then we are removing the xattr */
                if (!value)
                        goto out;
        } else {
-                btrfs_release_path(root, path);
+                btrfs_release_path(path);
                if (flags & XATTR_REPLACE) {
                        /* we couldn't find the attr to replace */
@@ -136,7 +136,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
        }
        /* ok we have to create a completely new xattr */
-        ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
+        ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
                                      name, name_len, value, size);
        BUG_ON(ret);
 out:
@@ -190,7 +190,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
         * NOTE: we set key.offset = 0; because we want to start with the
         * first xattr that we find and walk forward
         */
-        key.objectid = inode->i_ino;
+        key.objectid = btrfs_ino(inode);
        btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
        key.offset = 0;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 75c47cd8d086..1cd4c3a1862d 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -153,26 +153,6 @@ config CIFS_ACL
            Allows to fetch CIFS/NTFS ACL from the server.  The DACL blob
            is handed over to the application/caller.
-config CIFS_SMB2
-        bool "SMB2 network file system support (EXPERIMENTAL)"
-        depends on EXPERIMENTAL && INET && BROKEN
-        select NLS
-        select KEYS
-        select FSCACHE
-        select DNS_RESOLVER
-        help
-          This enables experimental support for the SMB2 (Server Message Block
-          version 2) protocol. The SMB2 protocol is the successor to the
-          popular CIFS and SMB network file sharing protocols. SMB2 is the
-          native file sharing mechanism for recent versions of Windows
-          operating systems (since Vista).  SMB2 enablement will eventually
-          allow users better performance, security and features, than would be
-          possible with cifs. Note that smb2 mount options also are simpler
-          (compared to cifs) due to protocol improvements.
-          Unless you are a developer or tester, say N.
 config CIFS_NFSD_EXPORT
          bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
          depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/README b/fs/cifs/README
index 4a3ca0e5ca24..c5c2c5e5f0f2 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -457,6 +457,9 @@ A partial list of the supported mount options follows:
                otherwise - read from the server. All written data are stored
                in the cache, but if the client doesn't have Exclusive Oplock,
                it writes the data to the server.
+  rwpidforward  Forward pid of a process who opened a file to any read or write
+                operation on that file. This prevents applications like WINE
+                from failing on read and write if we use mandatory brlock style.
  acl           Allow setfacl and getfacl to manage posix ACLs if server
                supports them.  (default)
  noacl         Do not allow setfacl and getfacl calls on this mount
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 53d57a3fe427..dd8584d35a14 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -146,7 +146,7 @@ static char *extract_sharename(const char *treename)
 static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
                                   uint16_t maxbuf)
 {
-        const struct cifsTconInfo *tcon = cookie_netfs_data;
+        const struct cifs_tcon *tcon = cookie_netfs_data;
        char *sharename;
        uint16_t len;
@@ -173,7 +173,7 @@ cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
                           uint16_t maxbuf)
 {
        struct cifs_fscache_super_auxdata auxdata;
-        const struct cifsTconInfo *tcon = cookie_netfs_data;
+        const struct cifs_tcon *tcon = cookie_netfs_data;
        memset(&auxdata, 0, sizeof(auxdata));
        auxdata.resource_id = tcon->resource_id;
@@ -192,7 +192,7 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
                                              uint16_t datalen)
 {
        struct cifs_fscache_super_auxdata auxdata;
-        const struct cifsTconInfo *tcon = cookie_netfs_data;
+        const struct cifs_tcon *tcon = cookie_netfs_data;
        if (datalen != sizeof(auxdata))
                return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 18f4272d9047..2fe3cf13b2e9 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -110,8 +110,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
        struct list_head *tmp1, *tmp2, *tmp3;
        struct mid_q_entry *mid_entry;
        struct TCP_Server_Info *server;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        int i, j;
        __u32 dev_type;
@@ -152,7 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                                    tcp_ses_list);
                i++;
                list_for_each(tmp2, &server->smb_ses_list) {
-                        ses = list_entry(tmp2, struct cifsSesInfo,
+                        ses = list_entry(tmp2, struct cifs_ses,
                                         smb_ses_list);
                        if ((ses->serverDomain == NULL) ||
                                (ses->serverOS == NULL) ||
@@ -171,7 +171,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                        seq_printf(m, "TCP status: %d\n\tLocal Users To "
                                   "Server: %d SecMode: 0x%x Req On Wire: %d",
                                   server->tcpStatus, server->srv_count,
-                                   server->secMode,
+                                   server->sec_mode,
                                   atomic_read(&server->inFlight));
 #ifdef CONFIG_CIFS_STATS2
@@ -183,7 +183,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                        seq_puts(m, "\n\tShares:");
                        j = 0;
                        list_for_each(tmp3, &ses->tcon_list) {
-                                tcon = list_entry(tmp3, struct cifsTconInfo,
+                                tcon = list_entry(tmp3, struct cifs_tcon,
                                                  tcon_list);
                                ++j;
                                dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
@@ -256,8 +256,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
        int rc;
        struct list_head *tmp1, *tmp2, *tmp3;
        struct TCP_Server_Info *server;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        rc = get_user(c, buffer);
        if (rc)
@@ -273,11 +273,11 @@ static ssize_t cifs_stats_proc_write(struct file *file,
                        server = list_entry(tmp1, struct TCP_Server_Info,
                                            tcp_ses_list);
                        list_for_each(tmp2, &server->smb_ses_list) {
-                                ses = list_entry(tmp2, struct cifsSesInfo,
+                                ses = list_entry(tmp2, struct cifs_ses,
                                                 smb_ses_list);
                                list_for_each(tmp3, &ses->tcon_list) {
                                        tcon = list_entry(tmp3,
-                                                          struct cifsTconInfo,
+                                                          struct cifs_tcon,
                                                          tcon_list);
                                        atomic_set(&tcon->num_smbs_sent, 0);
                                        atomic_set(&tcon->num_writes, 0);
@@ -312,8 +312,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
        int i;
        struct list_head *tmp1, *tmp2, *tmp3;
        struct TCP_Server_Info *server;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        seq_printf(m,
                        "Resources in use\nCIFS Session: %d\n",
@@ -346,11 +346,11 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
                server = list_entry(tmp1, struct TCP_Server_Info,
                                    tcp_ses_list);
                list_for_each(tmp2, &server->smb_ses_list) {
-                        ses = list_entry(tmp2, struct cifsSesInfo,
+                        ses = list_entry(tmp2, struct cifs_ses,
                                         smb_ses_list);
                        list_for_each(tmp3, &ses->tcon_list) {
                                tcon = list_entry(tmp3,
-                                                  struct cifsTconInfo,
+                                                  struct cifs_tcon,
                                                  tcon_list);
                                i++;
                                seq_printf(m, "\n%d) %s", i, tcon->treeName);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 2b68ac57d97d..8d8f28c94c0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -272,7 +272,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
        struct dfs_info3_param *referrals = NULL;
        unsigned int num_referrals = 0;
        struct cifs_sb_info *cifs_sb;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
        char *full_path;
        int xid, i;
        int rc;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index a9d5692e0c20..ffb1459dc6ec 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -41,6 +41,7 @@
 #define CIFS_MOUNT_MF_SYMLINKS  0x10000 /* Minshall+French Symlinks enabled */
 #define CIFS_MOUNT_MULTIUSER    0x20000 /* multiuser mount */
 #define CIFS_MOUNT_STRICT_IO    0x40000 /* strict cache mode */
+#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
 struct cifs_sb_info {
        struct rb_root tlink_tree;
@@ -56,8 +57,6 @@ struct cifs_sb_info {
        mode_t  mnt_file_mode;
        mode_t  mnt_dir_mode;
        unsigned int mnt_cifs_flags;
-        int     prepathlen;
-        char   *prepath; /* relative path under the share to mount to */
        char   *mountdata; /* options received at mount time or via DFS refs */
        struct backing_dev_info bdi;
        struct delayed_work prune_tlinks;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 33d221394aca..2272fd5fe5b7 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -95,7 +95,7 @@ struct key_type cifs_spnego_key_type = {
 /* get a key struct with a SPNEGO security blob, suitable for session setup */
 struct key *
-cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
+cifs_get_spnego_key(struct cifs_ses *sesInfo)
 {
        struct TCP_Server_Info *server = sesInfo->server;
        struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index e4041ec4d712..31bef9ee078b 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -41,7 +41,7 @@ struct cifs_spnego_msg {
 #ifdef __KERNEL__
 extern struct key_type cifs_spnego_key_type;
-extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo);
+extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo);
 #endif /* KERNEL */
 #endif /* _CIFS_SPNEGO_H */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index f3c6fb9942ac..8f1700623b41 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = {
        1, 1, {0, 0, 0, 0, 0, 1}, {0} };
 /* security id for Authenticated Users system group */
 static const struct cifs_sid sid_authusers = {
-        1, 1, {0, 0, 0, 0, 0, 5}, {11} };
+        1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} };
 /* group users */
 static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
@@ -458,7 +458,8 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
        if (num_subauth) {
                for (i = 0; i < num_subauth; ++i) {
                        if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
-                                if (ctsid->sub_auth[i] > cwsid->sub_auth[i])
+                                if (le32_to_cpu(ctsid->sub_auth[i]) >
+                                        le32_to_cpu(cwsid->sub_auth[i]))
                                        return 1;
                                else
                                        return -1;
@@ -945,7 +946,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
        int oplock = 0;
        int xid, rc;
        __u16 fid;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
@@ -1013,7 +1014,7 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
        int oplock = 0;
        int xid, rc;
        __u16 fid;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 45c3f78c8f81..dfbd9f1f373d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -229,7 +229,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 }
 /* first calculate 24 bytes ntlm response and then 16 byte session key */
-int setup_ntlm_response(struct cifsSesInfo *ses)
+int setup_ntlm_response(struct cifs_ses *ses)
 {
        int rc = 0;
        unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -312,7 +312,7 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 * Allocate domain name which gets freed when session struct is deallocated.
 */
 static int
-build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
 {
        unsigned int dlen;
        unsigned int wlen;
@@ -400,7 +400,7 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 * about target string i.e. for some, just user name might suffice.
 */
 static int
-find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
 {
        unsigned int attrsize;
        unsigned int type;
@@ -445,7 +445,7 @@ find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
        return 0;
 }
-static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
+static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
                            const struct nls_table *nls_cp)
 {
        int rc = 0;
@@ -527,7 +527,7 @@ calc_exit_2:
 }
 static int
-CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
+CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 {
        int rc;
        unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
@@ -563,7 +563,7 @@ CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
 int
-setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 {
        int rc;
        int baselen;
@@ -649,7 +649,7 @@ setup_ntlmv2_rsp_ret:
 }
 int
-calc_seckey(struct cifsSesInfo *ses)
+calc_seckey(struct cifs_ses *ses)
 {
        int rc;
        struct crypto_blkcipher *tfm_arc4;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 493b74ca5648..989442dcfb45 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -104,46 +104,25 @@ cifs_sb_deactive(struct super_block *sb)
 }
 static int
-cifs_read_super(struct super_block *sb, void *data,
+cifs_read_super(struct super_block *sb, struct smb_vol *volume_info,
                const char *devname, int silent)
 {
        struct inode *inode;
        struct cifs_sb_info *cifs_sb;
        int rc = 0;
-        /* BB should we make this contingent on mount parm? */
-        sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
-        sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
        cifs_sb = CIFS_SB(sb);
-        if (cifs_sb == NULL)
-                return -ENOMEM;
        spin_lock_init(&cifs_sb->tlink_tree_lock);
        cifs_sb->tlink_tree = RB_ROOT;
        rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
-        if (rc) {
+        if (rc)
-                kfree(cifs_sb);
                return rc;
-        }
-        cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
-        /*
+        cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
-         * Copy mount params to sb for use in submounts. Better to do
-         * the copy here and deal with the error before cleanup gets
-         * complicated post-mount.
-         */
-        if (data) {
-                cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
-                if (cifs_sb->mountdata == NULL) {
-                        bdi_destroy(&cifs_sb->bdi);
-                        kfree(sb->s_fs_info);
-                        sb->s_fs_info = NULL;
-                        return -ENOMEM;
-                }
-        }
-        rc = cifs_mount(sb, cifs_sb, devname);
+        rc = cifs_mount(sb, cifs_sb, volume_info, devname);
        if (rc) {
                if (!silent)
@@ -194,15 +173,7 @@ out_no_root:
        cifs_umount(sb, cifs_sb);
 out_mount_failed:
-        if (cifs_sb) {
+        bdi_destroy(&cifs_sb->bdi);
-                if (cifs_sb->mountdata) {
-                        kfree(cifs_sb->mountdata);
-                        cifs_sb->mountdata = NULL;
-                }
-                unload_nls(cifs_sb->local_nls);
-                bdi_destroy(&cifs_sb->bdi);
-                kfree(cifs_sb);
-        }
        return rc;
 }
@@ -237,7 +208,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
        int rc = -EOPNOTSUPP;
        int xid;
@@ -390,7 +361,7 @@ static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
        struct sockaddr *srcaddr;
        srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
@@ -444,14 +415,20 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
                seq_printf(s, ",nocase");
        if (tcon->retry)
                seq_printf(s, ",hard");
-        if (cifs_sb->prepath)
+        if (tcon->unix_ext)
-                seq_printf(s, ",prepath=%s", cifs_sb->prepath);
+                seq_printf(s, ",unix");
+        else
+                seq_printf(s, ",nounix");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
                seq_printf(s, ",posixpaths");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
                seq_printf(s, ",setuids");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
                seq_printf(s, ",serverino");
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                seq_printf(s, ",rwpidforward");
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
+                seq_printf(s, ",forcemand");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
                seq_printf(s, ",directio");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
@@ -484,7 +461,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 static void cifs_umount_begin(struct super_block *sb)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        if (cifs_sb == NULL)
                return;
@@ -559,29 +536,189 @@ static const struct super_operations cifs_super_ops = {
 #endif
 };
+/*
+ * Get root dentry from superblock according to prefix path mount option.
+ * Return dentry with refcount + 1 on success and NULL otherwise.
+ *
+ * The path comes from cifs_build_path_to_root(); the walk starts at
+ * sb->s_root and descends one separator-delimited component at a time.
+ * Each component is looked up with d_lookup() and, if missing, allocated
+ * with d_alloc(); a dentry without an inode gets one from
+ * cifs_get_inode_info_unix() or cifs_get_inode_info(), depending on the
+ * master tcon's unix_ext, and is attached with d_materialise_unique().
+ *
+ * Every failure (no path, dentry allocation, inode lookup, error alias)
+ * drops the references taken so far and yields NULL; no error code is
+ * passed back to the caller.
+ */
+static struct dentry *
+cifs_get_root(struct smb_vol *vol, struct super_block *sb)
+{
+        int xid, rc;
+        struct inode *inode;
+        struct qstr name;
+        struct dentry *dparent = NULL, *dchild = NULL, *alias;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+        unsigned int i, full_len, len;
+        char *full_path = NULL, *pstart;
+        char sep;
+        full_path = cifs_build_path_to_root(vol, cifs_sb,
+                                            cifs_sb_master_tcon(cifs_sb));
+        if (full_path == NULL)
+                return NULL;
+        cFYI(1, "Get root dentry for %s", full_path);
+        xid = GetXid();
+        sep = CIFS_DIR_SEP(cifs_sb);
+        dparent = dget(sb->s_root);
+        full_len = strlen(full_path);
+        full_path[full_len] = sep; /* NUL -> sep: ends the last component */
+        pstart = full_path + 1; /* walk starts after full_path[0] */
+        for (i = 1, len = 0; i <= full_len; i++) {
+                if (full_path[i] != sep || !len) { /* not at a boundary yet */
+                        len++;
+                        continue;
+                }
+                full_path[i] = 0; /* terminate pstart and full_path here */
+                cFYI(1, "get dentry for %s", pstart);
+                name.name = pstart;
+                name.len = len;
+                name.hash = full_name_hash(pstart, len);
+                dchild = d_lookup(dparent, &name);
+                if (dchild == NULL) {
+                        cFYI(1, "not exists");
+                        dchild = d_alloc(dparent, &name);
+                        if (dchild == NULL) {
+                                dput(dparent);
+                                dparent = NULL;
+                                goto out;
+                        }
+                }
+                cFYI(1, "get inode");
+                if (dchild->d_inode == NULL) {
+                        cFYI(1, "not exists");
+                        inode = NULL;
+                        if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
+                                rc = cifs_get_inode_info_unix(&inode, full_path,
+                                                              sb, xid);
+                        else
+                                rc = cifs_get_inode_info(&inode, full_path,
+                                                         NULL, sb, xid, NULL);
+                        if (rc) {
+                                dput(dchild);
+                                dput(dparent);
+                                dparent = NULL;
+                                goto out;
+                        }
+                        alias = d_materialise_unique(dchild, inode);
+                        if (alias != NULL) {
+                                dput(dchild); /* alias (if valid) replaces it */
+                                if (IS_ERR(alias)) {
+                                        dput(dparent);
+                                        dparent = NULL;
+                                        goto out;
+                                }
+                                dchild = alias;
+                        }
+                }
+                cFYI(1, "parent %p, child %p", dparent, dchild);
+                dput(dparent); /* child becomes the parent of the next step */
+                dparent = dchild;
+                len = 0;
+                pstart = full_path + i + 1;
+                full_path[i] = sep; /* restore the separator */
+        }
+out:
+        _FreeXid(xid);
+        kfree(full_path);
+        return dparent; /* NULL if any step above failed */
+}
 static struct dentry *
 cifs_do_mount(struct file_system_type *fs_type,
-            int flags, const char *dev_name, void *data)
+              int flags, const char *dev_name, void *data)
 {
        int rc;
        struct super_block *sb;
+        struct cifs_sb_info *cifs_sb;
-        sb = sget(fs_type, NULL, set_anon_super, NULL);
+        struct smb_vol *volume_info;
+        struct cifs_mnt_data mnt_data;
+        struct dentry *root;
        cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
-        if (IS_ERR(sb))
+        rc = cifs_setup_volume_info(&volume_info, (char *)data, dev_name);
-                return ERR_CAST(sb);
+        if (rc)
+                return ERR_PTR(rc);
+        cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
+        if (cifs_sb == NULL) {
+                root = ERR_PTR(-ENOMEM);
+                goto out;
+        }
+        cifs_setup_cifs_sb(volume_info, cifs_sb);
+        mnt_data.vol = volume_info;
+        mnt_data.cifs_sb = cifs_sb;
+        mnt_data.flags = flags;
+        sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data);
+        if (IS_ERR(sb)) {
+                root = ERR_CAST(sb);
+                goto out_cifs_sb;
+        }
+        if (sb->s_fs_info) {
+                cFYI(1, "Use existing superblock");
+                goto out_shared; /* sb already carries an s_fs_info */
+        }
+        /*
+         * Copy mount params for use in submounts. Better to do
+         * the copy here and deal with the error before cleanup gets
+         * complicated post-mount.
+         */
+        cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
+        if (cifs_sb->mountdata == NULL) {
+                root = ERR_PTR(-ENOMEM);
+                goto out_super;
+        }
        sb->s_flags = flags;
+        /* BB should we make this contingent on mount parm? */
+        sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
+        sb->s_fs_info = cifs_sb;
-        rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
+        rc = cifs_read_super(sb, volume_info, dev_name,
+                             flags & MS_SILENT ? 1 : 0);
        if (rc) {
-                deactivate_locked_super(sb);
+                root = ERR_PTR(rc);
-                return ERR_PTR(rc);
+                goto out_super;
        }
        sb->s_flags |= MS_ACTIVE;
-        return dget(sb->s_root);
+        root = cifs_get_root(volume_info, sb);
+        if (root == NULL)
+                goto out_super; /* NOTE(review): root is still NULL here, so
+                                 * NULL (not an ERR_PTR like the other
+                                 * failures) is returned - confirm the
+                                 * caller copes with that */
+        cFYI(1, "dentry root is: %p", root);
+        goto out;
+out_shared:
+        root = cifs_get_root(volume_info, sb);
+        if (root)
+                cFYI(1, "dentry root is: %p", root);
+        goto out; /* NOTE(review): this bypasses out_cifs_sb, so the cifs_sb
+                   * kzalloc'ed by this call is not freed on this path, and
+                   * root may be NULL - confirm both are intended */
+out_super:
+        kfree(cifs_sb->mountdata);
+        deactivate_locked_super(sb);
+out_cifs_sb:
+        unload_nls(cifs_sb->local_nls);
+        kfree(cifs_sb);
+out:
+        cifs_cleanup_volume_info(&volume_info);
+        return root;
 }
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 76b4517e74b0..6255fa812c7a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -155,6 +155,81 @@ struct cifs_cred {
 *****************************************************************
 */
+struct smb_vol {
+        char *username;
+        char *password;
+        char *domainname;
+        char *UNC;
+        char *UNCip;
+        char *iocharset;  /* local code page for mapping to and from Unicode */
+        char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* client NetBIOS name */
+        char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* server NetBIOS name */
+        uid_t cred_uid;
+        uid_t linux_uid;
+        gid_t linux_gid;
+        mode_t file_mode;
+        mode_t dir_mode;
+        unsigned secFlg;
+        bool retry:1;
+        bool intr:1;
+        bool setuids:1;
+        bool override_uid:1;
+        bool override_gid:1;
+        bool dynperm:1;
+        bool noperm:1;
+        bool no_psx_acl:1; /* set if posix acl support should be disabled */
+        bool cifs_acl:1;
+        bool no_xattr:1;   /* set if xattr (EA) support should be disabled */
+        bool server_ino:1; /* use inode numbers from server, i.e. UniqueId */
+        bool direct_io:1;
+        bool strict_io:1; /* strict cache behavior */
+        bool remap:1;      /* set to remap seven reserved chars in filenames */
+        bool posix_paths:1; /* unset to not ask for posix pathnames. */
+        bool no_linux_ext:1;
+        bool sfu_emul:1;
+        bool nullauth:1;   /* attempt to authenticate with null user */
+        bool nocase:1;     /* request case insensitive filenames */
+        bool nobrl:1;      /* disable sending byte range locks to server */
+        bool mand_lock:1;  /* send mandatory, not posix, byte range lock requests */
+        bool seal:1;       /* request transport encryption on share */
+        bool nodfs:1;      /* Do not request DFS, even if available */
+        bool local_lease:1; /* check leases only on local system, not remote */
+        bool noblocksnd:1;
+        bool noautotune:1;
+        bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
+        bool fsc:1;     /* enable fscache */
+        bool mfsymlinks:1; /* use Minshall+French Symlinks */
+        bool multiuser:1;
+        bool rwpidforward:1; /* pid forward for read/write operations */
+        unsigned int rsize;
+        unsigned int wsize;
+        bool sockopt_tcp_nodelay:1;
+        unsigned short int port;
+        unsigned long actimeo; /* attribute cache timeout (jiffies) */
+        char *prepath; /* presumably the path under the share to mount - confirm */
+        struct sockaddr_storage srcaddr; /* allow binding to a local IP */
+        struct nls_table *local_nls;
+};
+#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
+                         CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \
+                         CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \
+                         CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \
+                         CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \
+                         CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
+                         CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
+                         CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
+                         CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) /*
+                         * NOTE(review): CIFS_MOUNT_RWPIDFORWARD and
+                         * CIFS_MOUNT_POSIX_PATHS are not part of this mask -
+                         * confirm that is intended for the mask's users.
+                         */
+#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
+                      MS_NODEV | MS_SYNCHRONOUS)
+struct cifs_mnt_data { /* passed by cifs_do_mount() to sget() as its data */
+        struct cifs_sb_info *cifs_sb; /* sb info set up for the new mount */
+        struct smb_vol *vol; /* volume info of the new mount */
+        int flags; /* flags argument of cifs_do_mount() */
+};
 struct TCP_Server_Info {
        struct list_head tcp_ses_list;
        struct list_head smb_ses_list;
@@ -179,7 +254,7 @@ struct TCP_Server_Info {
        struct mutex srv_mutex;
        struct task_struct *tsk;
        char server_GUID[16];
-        char secMode;
+        char sec_mode;
        bool session_estab; /* mark when very first sess is established */
        u16 dialect; /* dialect index that server chose */
        enum securityEnum secType;
@@ -254,7 +329,7 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
 /*
 * Session structure.  One of these for each uid session with a particular host
 */
-struct cifsSesInfo {
+struct cifs_ses {
        struct list_head smb_ses_list;
        struct list_head tcon_list;
        struct mutex session_mutex;
@@ -294,11 +369,11 @@ struct cifsSesInfo {
 * there is one of these for each connection to a resource on a particular
 * session
 */
-struct cifsTconInfo {
+struct cifs_tcon {
        struct list_head tcon_list;
        int tc_count;
        struct list_head openFileList;
-        struct cifsSesInfo *ses;        /* pointer to session associated with */
+        struct cifs_ses *ses;   /* pointer to session associated with */
        char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
        char *nativeFileSystem;
        char *password;         /* for share-level security */
@@ -380,12 +455,12 @@ struct tcon_link {
 #define TCON_LINK_IN_TREE       2
        unsigned long           tl_time;
        atomic_t                tl_count;
-        struct cifsTconInfo     *tl_tcon;
+        struct cifs_tcon        *tl_tcon;
 };
 extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
-static inline struct cifsTconInfo *
+static inline struct cifs_tcon *
 tlink_tcon(struct tcon_link *tlink)
 {
        return tlink->tl_tcon;
@@ -402,7 +477,7 @@ cifs_get_tlink(struct tcon_link *tlink)
 }
 /* This function is always expected to succeed */
-extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
+extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
 /*
 * This info hangs off the cifsFileInfo structure, pointed to by llist.
@@ -455,6 +530,14 @@ struct cifsFileInfo {
        struct work_struct oplock_break; /* work for oplock breaks */
 };
+struct cifs_io_parms {  /* NOTE(review): looks like a bundle of per-request
+                         * I/O arguments; its users are not in this chunk,
+                         * so confirm the field meanings there */
+        __u16 netfid;
+        __u32 pid;
+        __u64 offset;
+        unsigned int length;
+        struct cifs_tcon *tcon;
+};
 /*
 * Take a reference on the file private data. Must be called with
 * cifs_file_list_lock held.
@@ -509,10 +592,30 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
                return '\\';
 }
+static inline void
+convert_delimiter(char *path, char delim) /* rewrite separators in place */
+{
+        int i;
+        char old_delim;
+        if (path == NULL)
+                return; /* NULL path: nothing to convert */
+        if (delim == '/')
+                old_delim = '\\'; /* '/' requested: replace backslashes */
+        else
+                old_delim = '/'; /* any other delim: replace forward slashes */
+        for (i = 0; path[i] != '\0'; i++) { /* path must be NUL-terminated */
+                if (path[i] == old_delim)
+                        path[i] = delim;
+        }
+}
 #ifdef CONFIG_CIFS_STATS
 #define cifs_stats_inc atomic_inc
-static inline void cifs_stats_bytes_written(struct cifsTconInfo *tcon,
+static inline void cifs_stats_bytes_written(struct cifs_tcon *tcon,
                                            unsigned int bytes)
 {
        if (bytes) {
@@ -522,7 +625,7 @@ static inline void cifs_stats_bytes_written(struct cifsTconInfo *tcon,
        }
 }
-static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
+static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
                                         unsigned int bytes)
 {
        spin_lock(&tcon->stat_lock);
@@ -543,9 +646,8 @@ struct mid_q_entry;
 * This is the prototype for the mid callback function. When creating one,
 * take special care to avoid deadlocks. Things to bear in mind:
 *
- * - it will be called by cifsd
+ * - it will be called by cifsd, with no locks held
- * - the GlobalMid_Lock will be held
+ * - the mid will be removed from any lists
- * - the mid will be removed from the pending_mid_q list
 */
 typedef void (mid_callback_t)(struct mid_q_entry *mid);
@@ -573,7 +675,7 @@ struct mid_q_entry {
 struct oplock_q_entry {
        struct list_head qhead;
        struct inode *pinode;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        __u16 netfid;
 };
@@ -656,6 +758,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   MID_RESPONSE_RECEIVED 4
 #define   MID_RETRY_NEEDED      8 /* session closed while this request out */
 #define   MID_RESPONSE_MALFORMED 0x10
+#define   MID_SHUTDOWN           0x20
 /* Types of response buffer returned from SendReceive2 */
 #define   CIFS_NO_BUFFER        0    /* Response buffer not returned */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6e69e06a30b3..953f84413c77 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -57,8 +57,9 @@ extern int init_cifs_idmap(void);
 extern void exit_cifs_idmap(void);
 extern void cifs_destroy_idmaptrees(void);
 extern char *build_path_from_dentry(struct dentry *);
-extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+extern char *cifs_build_path_to_root(struct smb_vol *vol,
-                                        struct cifsTconInfo *tcon);
+                                     struct cifs_sb_info *cifs_sb,
+                                     struct cifs_tcon *tcon);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 extern char *cifs_compose_mount_options(const char *sb_mountdata,
                const char *fullpath, const struct dfs_info3_param *ref,
@@ -67,20 +68,22 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
 extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
                                        struct TCP_Server_Info *server);
 extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
-extern int cifs_call_async(struct TCP_Server_Info *server,
+extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
-                           struct smb_hdr *in_buf, mid_callback_t *callback,
+                           unsigned int nvec, mid_callback_t *callback,
-                           void *cbdata);
+                           void *cbdata, bool ignore_pend);
-extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
+extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
                        struct smb_hdr * /* input */ ,
                        struct smb_hdr * /* out */ ,
                        int * /* bytes returned */ , const int long_op);
-extern int SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
+extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
                        struct smb_hdr *in_buf, int flags);
-extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
+extern int cifs_check_receive(struct mid_q_entry *mid,
+                        struct TCP_Server_Info *server, bool log_error);
+extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
                        struct kvec *, int /* nvec to send */,
                        int * /* type of buf returned */ , const int flags);
 extern int SendReceiveBlockingLock(const unsigned int xid,
-                        struct cifsTconInfo *ptcon,
+                        struct cifs_tcon *ptcon,
                        struct smb_hdr *in_buf ,
                        struct smb_hdr *out_buf,
                        int *bytes_returned);
@@ -99,14 +102,14 @@ extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
 extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
 extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
                                const unsigned short int port);
-extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
+extern int map_smb_to_linux_error(struct smb_hdr *smb, bool logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
-                            const struct cifsTconInfo *, int /* length of
+                            const struct cifs_tcon *, int /* length of
                            fixed section (word count) in two byte units */);
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
-                                struct cifsSesInfo *ses,
+                                struct cifs_ses *ses,
                                void **request_buf);
-extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
+extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
                             const struct nls_table *nls_cp);
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -148,102 +151,108 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
 extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
                                const char *);
+extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+                               struct cifs_sb_info *cifs_sb);
+extern int cifs_match_super(struct super_block *, void *);
+extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info);
+extern int cifs_setup_volume_info(struct smb_vol **pvolume_info,
+                                  char *mount_data, const char *devname);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *,
-                        const char *);
+                      struct smb_vol *, const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
 extern void cifs_dfs_release_automount_timer(void);
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
 extern int cifs_negotiate_protocol(unsigned int xid,
-                                  struct cifsSesInfo *ses);
+                                  struct cifs_ses *ses);
-extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
+extern int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
                        struct nls_table *nls_info);
-extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
+extern int CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses);
-extern int CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
+extern int CIFSTCon(unsigned int xid, struct cifs_ses *ses,
-                        const char *tree, struct cifsTconInfo *tcon,
+                        const char *tree, struct cifs_tcon *tcon,
                        const struct nls_table *);
-extern int CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSFindFirst(const int xid, struct cifs_tcon *tcon,
                const char *searchName, const struct nls_table *nls_codepage,
                __u16 *searchHandle, struct cifs_search_info *psrch_inf,
                int map, const char dirsep);
-extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
                __u16 searchHandle, struct cifs_search_info *psrch_inf);
-extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
+extern int CIFSFindClose(const int, struct cifs_tcon *tcon,
                        const __u16 search_handle);
-extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon,
                        u16 netfid, FILE_ALL_INFO *pFindData);
-extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon,
                        const unsigned char *searchName,
                        FILE_ALL_INFO *findData,
                        int legacy /* whether to use old info level */,
                        const struct nls_table *nls_codepage, int remap);
-extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
+extern int SMBQueryInformation(const int xid, struct cifs_tcon *tcon,
                        const unsigned char *searchName,
                        FILE_ALL_INFO *findData,
                        const struct nls_table *nls_codepage, int remap);
-extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon,
                        u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
 extern int CIFSSMBUnixQPathInfo(const int xid,
-                        struct cifsTconInfo *tcon,
+                        struct cifs_tcon *tcon,
                        const unsigned char *searchName,
                        FILE_UNIX_BASIC_INFO *pFindData,
                        const struct nls_table *nls_codepage, int remap);
-extern int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
+extern int CIFSGetDFSRefer(const int xid, struct cifs_ses *ses,
                        const unsigned char *searchName,
                        struct dfs_info3_param **target_nodes,
                        unsigned int *number_of_nodes_in_array,
                        const struct nls_table *nls_codepage, int remap);
-extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
+extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo,
                        const char *old_path,
                        const struct nls_table *nls_codepage,
                        unsigned int *pnum_referrals,
                        struct dfs_info3_param **preferrals,
                        int remap);
-extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
+extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
                                 struct super_block *sb, struct smb_vol *vol);
-extern int CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon,
                        struct kstatfs *FSData);
-extern int SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon,
+extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon,
                        struct kstatfs *FSData);
-extern int CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon,
                        __u64 cap);
 extern int CIFSSMBQFSAttributeInfo(const int xid,
-                        struct cifsTconInfo *tcon);
+                        struct cifs_tcon *tcon);
-extern int CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon);
+extern int CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon);
-extern int CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon);
+extern int CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon);
-extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon,
                        struct kstatfs *FSData);
-extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon,
                        const char *fileName, const FILE_BASIC_INFO *data,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon,
                        const FILE_BASIC_INFO *data, __u16 fid,
                        __u32 pid_of_opener);
-extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon,
                        bool delete_file, __u16 fid, __u32 pid_of_opener);
 #if 0
-extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon,
                        char *fileName, __u16 dos_attributes,
                        const struct nls_table *nls_codepage);
 #endif /* possibly unneeded function */
-extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon,
                        const char *fileName, __u64 size,
                        bool setAllocationSizeFlag,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon,
                         __u64 size, __u16 fileHandle, __u32 opener_pid,
                        bool AllocSizeFlag);
@@ -257,120 +266,116 @@ struct cifs_unix_set_info_args {
        dev_t   device;
 };
-extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon,
                                  const struct cifs_unix_set_info_args *args,
                                  u16 fid, u32 pid_of_opener);
-extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *pTcon,
+extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *pTcon,
                        char *fileName,
                        const struct cifs_unix_set_info_args *args,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon,
                        const char *newName,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon,
                        const char *name, const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon,
                        const char *name, __u16 type,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBDelFile(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon,
                        const char *name,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBRename(const int xid, struct cifs_tcon *tcon,
                        const char *fromName, const char *toName,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
+extern int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
                        int netfid, const char *target_name,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
 extern int CIFSCreateHardLink(const int xid,
-                        struct cifsTconInfo *tcon,
+                        struct cifs_tcon *tcon,
                        const char *fromName, const char *toName,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
 extern int CIFSUnixCreateHardLink(const int xid,
-                        struct cifsTconInfo *tcon,
+                        struct cifs_tcon *tcon,
                        const char *fromName, const char *toName,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
 extern int CIFSUnixCreateSymLink(const int xid,
-                        struct cifsTconInfo *tcon,
+                        struct cifs_tcon *tcon,
                        const char *fromName, const char *toName,
                        const struct nls_table *nls_codepage);
 extern int CIFSSMBUnixQuerySymLink(const int xid,
-                        struct cifsTconInfo *tcon,
+                        struct cifs_tcon *tcon,
                        const unsigned char *searchName, char **syminfo,
                        const struct nls_table *nls_codepage);
 #ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
 extern int CIFSSMBQueryReparseLinkInfo(const int xid,
-                        struct cifsTconInfo *tcon,
+                        struct cifs_tcon *tcon,
                        const unsigned char *searchName,
                        char *symlinkinfo, const int buflen, __u16 fid,
                        const struct nls_table *nls_codepage);
 #endif /* temporarily unused until cifs_symlink fixed */
-extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBOpen(const int xid, struct cifs_tcon *tcon,
                        const char *fileName, const int disposition,
                        const int access_flags, const int omode,
                        __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
                        const struct nls_table *nls_codepage, int remap);
-extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
+extern int SMBLegacyOpen(const int xid, struct cifs_tcon *tcon,
                        const char *fileName, const int disposition,
                        const int access_flags, const int omode,
                        __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
                        const struct nls_table *nls_codepage, int remap);
-extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon,
                        u32 posix_flags, __u64 mode, __u16 *netfid,
                        FILE_UNIX_BASIC_INFO *pRetData,
                        __u32 *pOplock, const char *name,
                        const struct nls_table *nls_codepage, int remap);
-extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBClose(const int xid, struct cifs_tcon *tcon,
                        const int smb_file_id);
-extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBFlush(const int xid, struct cifs_tcon *tcon,
                        const int smb_file_id);
-extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms,
-                        const int netfid, unsigned int count,
+                        unsigned int *nbytes, char **buf,
-                        const __u64 lseek, unsigned int *nbytes, char **buf,
                        int *return_buf_type);
-extern int CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms,
-                        const int netfid, const unsigned int count,
+                        unsigned int *nbytes, const char *buf,
-                        const __u64 lseek, unsigned int *nbytes,
+                        const char __user *ubuf, const int long_op);
-                        const char *buf, const char __user *ubuf,
+extern int CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
+                        unsigned int *nbytes, struct kvec *iov, const int nvec,
                        const int long_op);
-extern int CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
-                        const int netfid, const unsigned int count,
-                        const __u64 offset, unsigned int *nbytes,
-                        struct kvec *iov, const int nvec, const int long_op);
-extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
                        const unsigned char *searchName, __u64 *inode_number,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
                        const __u16 netfid, const __u64 len,
                        const __u64 offset, const __u32 numUnlock,
                        const __u32 numLock, const __u8 lockType,
                        const bool waitFlag, const __u8 oplock_level);
-extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
                        const __u16 smb_file_id, const int get_flag,
                        const __u64 len, struct file_lock *,
                        const __u16 lock_type, const bool waitFlag);
-extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
+extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon);
 extern int CIFSSMBEcho(struct TCP_Server_Info *server);
-extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
+extern int CIFSSMBLogoff(const int xid, struct cifs_ses *ses);
-extern struct cifsSesInfo *sesInfoAlloc(void);
+extern struct cifs_ses *sesInfoAlloc(void);
-extern void sesInfoFree(struct cifsSesInfo *);
+extern void sesInfoFree(struct cifs_ses *);
-extern struct cifsTconInfo *tconInfoAlloc(void);
+extern struct cifs_tcon *tconInfoAlloc(void);
-extern void tconInfoFree(struct cifsTconInfo *);
+extern void tconInfoFree(struct cifs_tcon *);
 extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
@@ -379,51 +384,51 @@ extern int cifs_verify_signature(struct smb_hdr *,
                                 struct TCP_Server_Info *server,
                                __u32 expected_sequence_number);
 extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
-extern int setup_ntlm_response(struct cifsSesInfo *);
+extern int setup_ntlm_response(struct cifs_ses *);
-extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
+extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
 extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
-extern int calc_seckey(struct cifsSesInfo *);
+extern int calc_seckey(struct cifs_ses *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
                                bool encrypt, char *lnm_session_key);
 #endif /* CIFS_WEAK_PW_HASH */
 #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
-extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
                        const int notify_subdirs, const __u16 netfid,
                        __u32 filter, struct file *file, int multishot,
                        const struct nls_table *nls_codepage);
 #endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
 extern int CIFSSMBCopy(int xid,
-                        struct cifsTconInfo *source_tcon,
+                        struct cifs_tcon *source_tcon,
                        const char *fromName,
                        const __u16 target_tid,
                        const char *toName, const int flags,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
+extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon,
                        const unsigned char *searchName,
                        const unsigned char *ea_name, char *EAData,
                        size_t bufsize, const struct nls_table *nls_codepage,
                        int remap_special_chars);
-extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon,
                const char *fileName, const char *ea_name,
                const void *ea_value, const __u16 ea_value_len,
                const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon,
                        __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
-extern int CIFSSMBSetCIFSACL(const int, struct cifsTconInfo *, __u16,
+extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16,
                        struct cifs_ntsd *, __u32);
-extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
                const unsigned char *searchName,
                char *acl_inf, const int buflen, const int acl_type,
                const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon,
                const unsigned char *fileName,
                const char *local_acl, const int buflen, const int acl_type,
                const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSGetExtAttr(const int xid, struct cifs_tcon *tcon,
                        const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
 extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
@@ -434,4 +439,22 @@ extern int mdfour(unsigned char *, unsigned char *, int);
 extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
 extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
                        unsigned char *p24);
+/* asynchronous write support */
+struct cifs_writedata {
+        struct kref                     refcount;
+        enum writeback_sync_modes       sync_mode;
+        struct work_struct              work;
+        struct cifsFileInfo             *cfile;
+        __u64                           offset;
+        unsigned int                    bytes;
+        int                             result;
+        unsigned int                    nr_pages;
+        struct page                     *pages[1];
+};
+int cifs_async_writev(struct cifs_writedata *wdata);
+struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages);
+void cifs_writedata_release(struct kref *refcount);
 #endif                  /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 83df937b814e..1a9fe7f816d1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -32,6 +32,7 @@
 #include <linux/vfs.h>
 #include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/pagemap.h>
 #include <asm/uaccess.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -84,7 +85,7 @@ static struct {
 /* Mark as invalid, all open files on tree connections since they
   were closed when session to server was lost */
-static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
+static void mark_open_files_invalid(struct cifs_tcon *pTcon)
 {
        struct cifsFileInfo *open_file = NULL;
        struct list_head *tmp;
@@ -104,10 +105,10 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 /* reconnect the socket, tcon, and smb session if needed */
 static int
-cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
+cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
 {
        int rc = 0;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
        struct TCP_Server_Info *server;
        struct nls_table *nls_codepage;
@@ -226,7 +227,7 @@ out:
   SMB information in the SMB header.  If the return code is zero, this
   function must have filled in request_buf pointer */
 static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+small_smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
                void **request_buf)
 {
        int rc;
@@ -252,7 +253,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 int
 small_smb_init_no_tc(const int smb_command, const int wct,
-                     struct cifsSesInfo *ses, void **request_buf)
+                     struct cifs_ses *ses, void **request_buf)
 {
        int rc;
        struct smb_hdr *buffer;
@@ -278,7 +279,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+__smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
                        void **request_buf, void **response_buf)
 {
        *request_buf = cifs_buf_get();
@@ -304,7 +305,7 @@ __smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
         void **request_buf, void **response_buf)
 {
        int rc;
@@ -317,7 +318,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 }
 static int
-smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon,
                        void **request_buf, void **response_buf)
 {
        if (tcon->ses->need_reconnect || tcon->need_reconnect)
@@ -366,7 +367,7 @@ static inline void inc_rfc1001_len(void *pSMB, int count)
 }
 int
-CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
+CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 {
        NEGOTIATE_REQ *pSMB;
        NEGOTIATE_RSP *pSMBr;
@@ -450,7 +451,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        rc = -EOPNOTSUPP;
                        goto neg_err_exit;
                }
-                server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode);
+                server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
                server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
                server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
                                (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
@@ -504,7 +505,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                                cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
                        memcpy(ses->server->cryptkey, rsp->EncryptionKey,
                                CIFS_CRYPTO_KEY_SIZE);
-                } else if (server->secMode & SECMODE_PW_ENCRYPT) {
+                } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
                        rc = -EIO; /* need cryptkey unless plain text */
                        goto neg_err_exit;
                }
@@ -526,11 +527,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                goto neg_err_exit;
        }
        /* else wct == 17 NTLM */
-        server->secMode = pSMBr->SecurityMode;
+        server->sec_mode = pSMBr->SecurityMode;
-        if ((server->secMode & SECMODE_USER) == 0)
+        if ((server->sec_mode & SECMODE_USER) == 0)
                cFYI(1, "share mode security");
-        if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
+        if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
                if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
 #endif /* CIFS_WEAK_PW_HASH */
@@ -570,18 +571,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
                memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
                       CIFS_CRYPTO_KEY_SIZE);
-        } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
+        } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
-                        && (pSMBr->EncryptionKeyLength == 0)) {
+                        server->capabilities & CAP_EXTENDED_SECURITY) &&
+                                (pSMBr->EncryptionKeyLength == 0)) {
                /* decode security blob */
-        } else if (server->secMode & SECMODE_PW_ENCRYPT) {
-                rc = -EIO; /* no crypt key only if plain text pwd */
-                goto neg_err_exit;
-        }
-        /* BB might be helpful to save off the domain of server here */
-        if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) &&
-                (server->capabilities & CAP_EXTENDED_SECURITY)) {
                count = get_bcc(&pSMBr->hdr);
                if (count < 16) {
                        rc = -EIO;
@@ -624,6 +617,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        } else
                                        rc = -EOPNOTSUPP;
                }
+        } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+                rc = -EIO; /* no crypt key only if plain text pwd */
+                goto neg_err_exit;
        } else
                server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -634,27 +630,27 @@ signing_check:
                /* MUST_SIGN already includes the MAY_SIGN FLAG
                   so if this is zero it means that signing is disabled */
                cFYI(1, "Signing disabled");
-                if (server->secMode & SECMODE_SIGN_REQUIRED) {
+                if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
                        cERROR(1, "Server requires "
                                   "packet signing to be enabled in "
                                   "/proc/fs/cifs/SecurityFlags.");
                        rc = -EOPNOTSUPP;
                }
-                server->secMode &=
+                server->sec_mode &=
                        ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
        } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
                /* signing required */
                cFYI(1, "Must sign - secFlags 0x%x", secFlags);
-                if ((server->secMode &
+                if ((server->sec_mode &
                        (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
                        cERROR(1, "signing required but server lacks support");
                        rc = -EOPNOTSUPP;
                } else
-                        server->secMode |= SECMODE_SIGN_REQUIRED;
+                        server->sec_mode |= SECMODE_SIGN_REQUIRED;
        } else {
                /* signing optional ie CIFSSEC_MAY_SIGN */
-                if ((server->secMode & SECMODE_SIGN_REQUIRED) == 0)
+                if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0)
-                        server->secMode &=
+                        server->sec_mode &=
                                ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
        }
@@ -666,7 +662,7 @@ neg_err_exit:
 }
 int
-CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBTDis(const int xid, struct cifs_tcon *tcon)
 {
        struct smb_hdr *smb_buffer;
        int rc = 0;
@@ -725,6 +721,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 {
        ECHO_REQ *smb;
        int rc = 0;
+        struct kvec iov;
        cFYI(1, "In echo request");
@@ -739,9 +736,10 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
        put_bcc(1, &smb->hdr);
        smb->Data[0] = 'a';
        inc_rfc1001_len(smb, 3);
+        iov.iov_base = smb;
+        iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
-        rc = cifs_call_async(server, (struct smb_hdr *)smb,
+        rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true);
-                                cifs_echo_callback, server);
        if (rc)
                cFYI(1, "Echo request failed: %d", rc);
@@ -751,7 +749,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 }
 int
-CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
+CIFSSMBLogoff(const int xid, struct cifs_ses *ses)
 {
        LOGOFF_ANDX_REQ *pSMB;
        int rc = 0;
@@ -778,7 +776,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
        pSMB->hdr.Mid = GetNextMid(ses->server);
-        if (ses->server->secMode &
+        if (ses->server->sec_mode &
                   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                        pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
@@ -798,7 +796,7 @@ session_already_dead:
 }
 int
-CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
+CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName,
                 __u16 type, const struct nls_table *nls_codepage, int remap)
 {
        TRANSACTION2_SPI_REQ *pSMB = NULL;
@@ -873,7 +871,7 @@ PsxDelete:
 }
 int
-CIFSSMBDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
+CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName,
               const struct nls_table *nls_codepage, int remap)
 {
        DELETE_FILE_REQ *pSMB = NULL;
@@ -918,7 +916,7 @@ DelFileRetry:
 }
 int
-CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
+CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon, const char *dirName,
             const struct nls_table *nls_codepage, int remap)
 {
        DELETE_DIRECTORY_REQ *pSMB = NULL;
@@ -961,7 +959,7 @@ RmDirRetry:
 }
 int
-CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon,
             const char *name, const struct nls_table *nls_codepage, int remap)
 {
        int rc = 0;
@@ -1004,7 +1002,7 @@ MkDirRetry:
 }
 int
-CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
+CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon, __u32 posix_flags,
                __u64 mode, __u16 *netfid, FILE_UNIX_BASIC_INFO *pRetData,
                __u32 *pOplock, const char *name,
                const struct nls_table *nls_codepage, int remap)
@@ -1170,7 +1168,7 @@ access_flags_to_smbopen_mode(const int access_flags)
 }
 int
-SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
+SMBLegacyOpen(const int xid, struct cifs_tcon *tcon,
            const char *fileName, const int openDisposition,
            const int access_flags, const int create_options, __u16 *netfid,
            int *pOplock, FILE_ALL_INFO *pfile_info,
@@ -1277,7 +1275,7 @@ OldOpenRetry:
 }
 int
-CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBOpen(const int xid, struct cifs_tcon *tcon,
            const char *fileName, const int openDisposition,
            const int access_flags, const int create_options, __u16 *netfid,
            int *pOplock, FILE_ALL_INFO *pfile_info,
@@ -1379,8 +1377,7 @@ openRetry:
 }
 int
-CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
+CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes,
-            const unsigned int count, const __u64 lseek, unsigned int *nbytes,
            char **buf, int *pbuf_type)
 {
        int rc = -EACCES;
@@ -1390,13 +1387,18 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
        int wct;
        int resp_buf_type = 0;
        struct kvec iov[1];
+        __u32 pid = io_parms->pid;
+        __u16 netfid = io_parms->netfid;
+        __u64 offset = io_parms->offset;
+        struct cifs_tcon *tcon = io_parms->tcon;
+        unsigned int count = io_parms->length;
        cFYI(1, "Reading %d bytes on fid %d", count, netfid);
        if (tcon->ses->capabilities & CAP_LARGE_FILES)
                wct = 12;
        else {
                wct = 10; /* old style read */
-                if ((lseek >> 32) > 0)  {
+                if ((offset >> 32) > 0)  {
                        /* can not handle this big offset for old */
                        return -EIO;
                }
@@ -1407,15 +1409,18 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
        if (rc)
                return rc;
+        pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+        pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
        /* tcon and ses pointer are checked in smb_init */
        if (tcon->ses->server == NULL)
                return -ECONNABORTED;
        pSMB->AndXCommand = 0xFF;       /* none */
        pSMB->Fid = netfid;
-        pSMB->OffsetLow = cpu_to_le32(lseek & 0xFFFFFFFF);
+        pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
        if (wct == 12)
-                pSMB->OffsetHigh = cpu_to_le32(lseek >> 32);
+                pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
        pSMB->Remaining = 0;
        pSMB->MaxCount = cpu_to_le16(count & 0xFFFF);
@@ -1484,9 +1489,8 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 int
-CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms,
-             const int netfid, const unsigned int count,
+             unsigned int *nbytes, const char *buf,
-             const __u64 offset, unsigned int *nbytes, const char *buf,
             const char __user *ubuf, const int long_op)
 {
        int rc = -EACCES;
@@ -1495,6 +1499,11 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
        int bytes_returned, wct;
        __u32 bytes_sent;
        __u16 byte_count;
+        __u32 pid = io_parms->pid;
+        __u16 netfid = io_parms->netfid;
+        __u64 offset = io_parms->offset;
+        struct cifs_tcon *tcon = io_parms->tcon;
+        unsigned int count = io_parms->length;
        *nbytes = 0;
@@ -1516,6 +1525,10 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
                      (void **) &pSMBr);
        if (rc)
                return rc;
+        pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+        pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
        /* tcon and ses pointer are checked in smb_init */
        if (tcon->ses->server == NULL)
                return -ECONNABORTED;
@@ -1602,17 +1615,259 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
        return rc;
 }
+void
+cifs_writedata_release(struct kref *refcount)
+{       /* release callback handed to kref_put() for wdata->refcount */
+        struct cifs_writedata *wdata = container_of(refcount,
+                                        struct cifs_writedata, refcount);
+        if (wdata->cfile)       /* put the open-file reference only if one is attached */
+                cifsFileInfo_put(wdata->cfile);
+        kfree(wdata);   /* pairs with the kzalloc() in cifs_writedata_alloc() */
+}
+/*
+ * Write failed with a retryable error. Resend the write request. It's also
+ * possible that the page was redirtied so re-clean the page.
+ *
+ * Every page in wdata is locked and has its dirty-for-io state cleared, then
+ * cifs_async_writev() is called again for as long as it returns -EAGAIN.
+ * Once it returns anything else, the pages are unlocked (and flagged with
+ * SetPageError() if the resend failed), the final return code is recorded
+ * on the inode's mapping, and one reference on wdata is dropped.
+ */
+static void
+cifs_writev_requeue(struct cifs_writedata *wdata)
+{
+        int i, rc;
+        struct inode *inode = wdata->cfile->dentry->d_inode;
+        for (i = 0; i < wdata->nr_pages; i++) {
+                lock_page(wdata->pages[i]);
+                clear_page_dirty_for_io(wdata->pages[i]);
+        }
+        do {
+                rc = cifs_async_writev(wdata);
+        } while (rc == -EAGAIN);
+        for (i = 0; i < wdata->nr_pages; i++) {
+                if (rc != 0)
+                        SetPageError(wdata->pages[i]);
+                unlock_page(wdata->pages[i]);
+        }
+        mapping_set_error(inode->i_mapping, rc);
+        kref_put(&wdata->refcount, cifs_writedata_release);
+}
+static void
+cifs_writev_complete(struct work_struct *work)
+{       /* work handler installed by INIT_WORK() in cifs_writedata_alloc() */
+        struct cifs_writedata *wdata = container_of(work,
+                                                struct cifs_writedata, work);
+        struct inode *inode = wdata->cfile->dentry->d_inode;
+        int i = 0;
+        if (wdata->result == 0) {       /* write succeeded */
+                cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
+                cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
+                                         wdata->bytes);
+        } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
+                return cifs_writev_requeue(wdata);      /* resend; requeue drops the wdata ref */
+        for (i = 0; i < wdata->nr_pages; i++) {
+                struct page *page = wdata->pages[i];
+                if (wdata->result == -EAGAIN)
+                        __set_page_dirty_nobuffers(page);       /* retryable: mark dirty again */
+                else if (wdata->result < 0)
+                        SetPageError(page);
+                end_page_writeback(page);
+                page_cache_release(page);
+        }
+        if (wdata->result != -EAGAIN)   /* -EAGAIN is not recorded on the mapping */
+                mapping_set_error(inode->i_mapping, wdata->result);
+        kref_put(&wdata->refcount, cifs_writedata_release);
+}
+struct cifs_writedata *
+cifs_writedata_alloc(unsigned int nr_pages)
+{
+        struct cifs_writedata *wdata;
+        /*
+         * this would overflow: nr_pages is unsigned, so the (nr_pages - 1)
+         * in the size computation below would wrap around for zero.
+         */
+        if (nr_pages == 0) {
+                cERROR(1, "%s: called with nr_pages == 0!", __func__);
+                return NULL;
+        }
+        /*
+         * writedata + number of page pointers. The "- 1" presumably accounts
+         * for one page pointer slot already inside struct cifs_writedata --
+         * TODO confirm against the struct definition. The allocation is
+         * zero-filled by kzalloc(); on failure NULL is returned with the
+         * work item and refcount left untouched.
+         */
+        wdata = kzalloc(sizeof(*wdata) +
+                        sizeof(struct page *) * (nr_pages - 1), GFP_NOFS);
+        if (wdata != NULL) {
+                INIT_WORK(&wdata->work, cifs_writev_complete);
+                kref_init(&wdata->refcount);
+        }
+        return wdata;
+}
+/*
+ * Check the midState and signature on received buffer (if any), and queue the
+ * workqueue completion task.
+ *
+ * wdata->result is derived from mid->midState:
+ *   MID_RESPONSE_RECEIVED  - the cifs_check_receive() return value; when that
+ *                            is 0, -ENOSPC if the server reports fewer bytes
+ *                            written than wdata->bytes, otherwise result
+ *                            stays 0 and wdata->bytes takes the server count
+ *   MID_REQUEST_SUBMITTED,
+ *   MID_RETRY_NEEDED       - -EAGAIN
+ *   any other state        - -EIO
+ *
+ * Afterwards wdata->work is queued on system_nrt_wq, the mid is deleted, the
+ * server's inFlight counter is decremented and request_q waiters are woken.
+ */
+static void
+cifs_writev_callback(struct mid_q_entry *mid)
+{
+        struct cifs_writedata *wdata = mid->callback_data;
+        struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+        unsigned int written;
+        WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+        switch (mid->midState) {
+        case MID_RESPONSE_RECEIVED:
+                wdata->result = cifs_check_receive(mid, tcon->ses->server, 0);
+                if (wdata->result != 0)
+                        break;
+                written = le16_to_cpu(smb->CountHigh);
+                written <<= 16;
+                written += le16_to_cpu(smb->Count);
+                /*
+                 * Mask off high 16 bits when bytes written as returned
+                 * by the server is greater than bytes requested by the
+                 * client. OS/2 servers are known to set incorrect
+                 * CountHigh values.
+                 */
+                if (written > wdata->bytes)
+                        written &= 0xFFFF;
+                if (written < wdata->bytes)
+                        wdata->result = -ENOSPC;
+                else
+                        wdata->bytes = written;
+                break;
+        case MID_REQUEST_SUBMITTED:
+        case MID_RETRY_NEEDED:
+                wdata->result = -EAGAIN;
+                break;
+        default:
+                wdata->result = -EIO;
+                break;
+        }
+        queue_work(system_nrt_wq, &wdata->work);
+        DeleteMidQEntry(mid);
+        atomic_dec(&tcon->ses->server->inFlight);
+        wake_up(&tcon->ses->server->request_q);
+}
+/*
+ * cifs_async_writev - send an async write, and set up mid to handle result
+ *
+ * Builds an SMB_COM_WRITE_ANDX request whose payload is the pages listed in
+ * wdata (one kvec per page plus one for the header) and hands it to
+ * cifs_call_async() with cifs_writev_callback() as the completion callback.
+ *
+ * wdata->bytes is recomputed here: each page contributes PAGE_CACHE_SIZE
+ * bytes, or less if inode->i_size ends inside that page.
+ *
+ * A reference on wdata is taken right before the send and dropped again if
+ * cifs_call_async() fails. The pages are kmap()ed only for the duration of
+ * the call. The request buffer and the iov array are released before
+ * returning on every path that reaches the async_writev_out label.
+ *
+ * Returns the cifs_call_async() result (0 on success), or earlier:
+ *   -EIO     the offset does not fit in 32 bits and the session lacks
+ *            CAP_LARGE_FILES
+ *   -ENOMEM  the iov array could not be allocated
+ *   the small_smb_init() return value if request setup failed
+ */
+int
+cifs_async_writev(struct cifs_writedata *wdata)
+{
+        int i, rc = -EACCES;
+        WRITE_REQ *smb = NULL;
+        int wct;
+        struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+        struct inode *inode = wdata->cfile->dentry->d_inode;
+        struct kvec *iov = NULL;
+        if (tcon->ses->capabilities & CAP_LARGE_FILES) {
+                wct = 14;
+        } else {
+                wct = 12;
+                if (wdata->offset >> 32 > 0) {
+                        /* can not handle big offset for old srv */
+                        return -EIO;
+                }
+        }
+        rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **)&smb);
+        if (rc)
+                goto async_writev_out;
+        /* 1 iov per page + 1 for header */
+        iov = kzalloc((wdata->nr_pages + 1) * sizeof(*iov), GFP_NOFS);
+        if (iov == NULL) {
+                rc = -ENOMEM;
+                goto async_writev_out;
+        }
+        smb->hdr.Pid = cpu_to_le16((__u16)wdata->cfile->pid);
+        smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->cfile->pid >> 16));
+        smb->AndXCommand = 0xFF;        /* none */
+        smb->Fid = wdata->cfile->netfid;
+        smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF);
+        if (wct == 14)
+                smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32);
+        smb->Reserved = 0xFFFFFFFF;
+        smb->WriteMode = 0;
+        smb->Remaining = 0;
+        smb->DataOffset =
+            cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
+        /* 4 for RFC1001 length + 1 for BCC */
+        iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1;
+        iov[0].iov_base = smb;
+        /* marshal up the pages into iov array */
+        wdata->bytes = 0;
+        for (i = 0; i < wdata->nr_pages; i++) {
+                iov[i + 1].iov_len = min(inode->i_size -
+                                      page_offset(wdata->pages[i]),
+                                        (loff_t)PAGE_CACHE_SIZE);
+                iov[i + 1].iov_base = kmap(wdata->pages[i]);
+                wdata->bytes += iov[i + 1].iov_len;
+        }
+        cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
+        smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
+        smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
+        if (wct == 14) {
+                inc_rfc1001_len(&smb->hdr, wdata->bytes + 1);
+                put_bcc(wdata->bytes + 1, &smb->hdr);
+        } else {
+                /* wct == 12 */
+                struct smb_com_writex_req *smbw =
+                                (struct smb_com_writex_req *)smb;
+                inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
+                put_bcc(wdata->bytes + 5, &smbw->hdr);
+                iov[0].iov_len += 4; /* pad bigger by four bytes */
+        }
+        kref_get(&wdata->refcount);
+        rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1,
+                             cifs_writev_callback, wdata, false);
+        if (rc == 0)
+                cifs_stats_inc(&tcon->num_writes);
+        else
+                kref_put(&wdata->refcount, cifs_writedata_release);
+        /* send is done, unmap pages */
+        for (i = 0; i < wdata->nr_pages; i++)
+                kunmap(wdata->pages[i]);
+async_writev_out:
+        cifs_small_buf_release(smb);
+        kfree(iov);
+        return rc;
+}
 int
-CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
-             const int netfid, const unsigned int count,
+              unsigned int *nbytes, struct kvec *iov, int n_vec,
-             const __u64 offset, unsigned int *nbytes, struct kvec *iov,
+              const int long_op)
-             int n_vec, const int long_op)
 {
        int rc = -EACCES;
        WRITE_REQ *pSMB = NULL;
        int wct;
        int smb_hdr_len;
        int resp_buf_type = 0;
+        __u32 pid = io_parms->pid;
+        __u16 netfid = io_parms->netfid;
+        __u64 offset = io_parms->offset;
+        struct cifs_tcon *tcon = io_parms->tcon;
+        unsigned int count = io_parms->length;
        *nbytes = 0;
@@ -1630,6 +1885,10 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
        rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB);
        if (rc)
                return rc;
+        pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+        pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
        /* tcon and ses pointer are checked in smb_init */
        if (tcon->ses->server == NULL)
                return -ECONNABORTED;
@@ -1705,7 +1964,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 int
-CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
            const __u16 smb_file_id, const __u64 len,
            const __u64 offset, const __u32 numUnlock,
            const __u32 numLock, const __u8 lockType,
@@ -1775,7 +2034,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 }
 int
-CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
                const __u16 smb_file_id, const int get_flag, const __u64 len,
                struct file_lock *pLockData, const __u16 lock_type,
                const bool waitFlag)
@@ -1913,7 +2172,7 @@ plk_err_exit:
 int
-CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+CIFSSMBClose(const int xid, struct cifs_tcon *tcon, int smb_file_id)
 {
        int rc = 0;
        CLOSE_REQ *pSMB = NULL;
@@ -1946,7 +2205,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 }
 int
-CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, int smb_file_id)
 {
        int rc = 0;
        FLUSH_REQ *pSMB = NULL;
@@ -1967,7 +2226,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 }
 int
-CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBRename(const int xid, struct cifs_tcon *tcon,
              const char *fromName, const char *toName,
              const struct nls_table *nls_codepage, int remap)
 {
@@ -2034,7 +2293,7 @@ renameRetry:
        return rc;
 }
-int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
+int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
                int netfid, const char *target_name,
                const struct nls_table *nls_codepage, int remap)
 {
@@ -2114,7 +2373,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 }
 int
-CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
+CIFSSMBCopy(const int xid, struct cifs_tcon *tcon, const char *fromName,
            const __u16 target_tid, const char *toName, const int flags,
            const struct nls_table *nls_codepage, int remap)
 {
@@ -2182,7 +2441,7 @@ copyRetry:
 }
 int
-CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSUnixCreateSymLink(const int xid, struct cifs_tcon *tcon,
                      const char *fromName, const char *toName,
                      const struct nls_table *nls_codepage)
 {
@@ -2271,7 +2530,7 @@ createSymLinkRetry:
 }
 int
-CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
+CIFSUnixCreateHardLink(const int xid, struct cifs_tcon *tcon,
                       const char *fromName, const char *toName,
                       const struct nls_table *nls_codepage, int remap)
 {
@@ -2356,7 +2615,7 @@ createHardLinkRetry:
 }
 int
-CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
+CIFSCreateHardLink(const int xid, struct cifs_tcon *tcon,
                   const char *fromName, const char *toName,
                   const struct nls_table *nls_codepage, int remap)
 {
@@ -2428,7 +2687,7 @@ winCreateHardLinkRetry:
 }
 int
-CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQuerySymLink(const int xid, struct cifs_tcon *tcon,
                        const unsigned char *searchName, char **symlinkinfo,
                        const struct nls_table *nls_codepage)
 {
@@ -2533,7 +2792,7 @@ querySymLinkRetry:
 *      it is not compiled in by default until callers fixed up and more tested.
 */
 int
-CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon,
                        const unsigned char *searchName,
                        char *symlinkinfo, const int buflen, __u16 fid,
                        const struct nls_table *nls_codepage)
@@ -2771,7 +3030,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 }
 int
-CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
                   const unsigned char *searchName,
                   char *acl_inf, const int buflen, const int acl_type,
                   const struct nls_table *nls_codepage, int remap)
@@ -2859,7 +3118,7 @@ queryAclRetry:
 }
 int
-CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon,
                   const unsigned char *fileName,
                   const char *local_acl, const int buflen,
                   const int acl_type,
@@ -2939,7 +3198,7 @@ setACLerrorExit:
 /* BB fix tabs in this function FIXME BB */
 int
-CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
+CIFSGetExtAttr(const int xid, struct cifs_tcon *tcon,
               const int netfid, __u64 *pExtAttrBits, __u64 *pMask)
 {
        int rc = 0;
@@ -3032,7 +3291,7 @@ GetExtAttrOut:
 */
 static int
 smb_init_nttransact(const __u16 sub_command, const int setup_count,
-                   const int parm_len, struct cifsTconInfo *tcon,
+                   const int parm_len, struct cifs_tcon *tcon,
                   void **ret_buf)
 {
        int rc;
@@ -3115,7 +3374,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 /* Get Security Descriptor (by handle) from remote server for a file or dir */
 int
-CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
+CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
                  struct cifs_ntsd **acl_inf, __u32 *pbuflen)
 {
        int rc = 0;
@@ -3207,7 +3466,7 @@ qsec_out:
 }
 int
-CIFSSMBSetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
+CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
                        struct cifs_ntsd *pntsd, __u32 acllen)
 {
        __u16 byte_count, param_count, data_count, param_offset, data_offset;
@@ -3273,7 +3532,7 @@ setCifsAclRetry:
 /* Legacy Query Path Information call for lookup to old servers such
   as Win9x/WinME */
-int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
+int SMBQueryInformation(const int xid, struct cifs_tcon *tcon,
                        const unsigned char *searchName,
                        FILE_ALL_INFO *pFinfo,
                        const struct nls_table *nls_codepage, int remap)
@@ -3341,7 +3600,7 @@ QInfRetry:
 }
 int
-CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon,
                 u16 netfid, FILE_ALL_INFO *pFindData)
 {
        struct smb_t2_qfi_req *pSMB = NULL;
@@ -3408,7 +3667,7 @@ QFileInfoRetry:
 }
 int
-CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon,
                 const unsigned char *searchName,
                 FILE_ALL_INFO *pFindData,
                 int legacy /* old style infolevel */,
@@ -3509,7 +3768,7 @@ QPathInfoRetry:
 }
 int
-CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon,
                 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
 {
        struct smb_t2_qfi_req *pSMB = NULL;
@@ -3578,7 +3837,7 @@ UnixQFileInfoRetry:
 }
 int
-CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQPathInfo(const int xid, struct cifs_tcon *tcon,
                     const unsigned char *searchName,
                     FILE_UNIX_BASIC_INFO *pFindData,
                     const struct nls_table *nls_codepage, int remap)
@@ -3664,7 +3923,7 @@ UnixQPathInfoRetry:
 /* xid, tcon, searchName and codepage are input parms, rest are returned */
 int
-CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
+CIFSFindFirst(const int xid, struct cifs_tcon *tcon,
              const char *searchName,
              const struct nls_table *nls_codepage,
              __u16 *pnetfid,
@@ -3812,7 +4071,7 @@ findFirstRetry:
        return rc;
 }
-int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
+int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
                 __u16 searchHandle, struct cifs_search_info *psrch_inf)
 {
        TRANSACTION2_FNEXT_REQ *pSMB = NULL;
@@ -3950,7 +4209,7 @@ FNext2_err_exit:
 }
 int
-CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
+CIFSFindClose(const int xid, struct cifs_tcon *tcon,
              const __u16 searchHandle)
 {
        int rc = 0;
@@ -3982,7 +4241,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 }
 int
-CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
+CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
                      const unsigned char *searchName,
                      __u64 *inode_number,
                      const struct nls_table *nls_codepage, int remap)
@@ -4184,7 +4443,7 @@ parse_DFS_referrals_exit:
 }
 int
-CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
+CIFSGetDFSRefer(const int xid, struct cifs_ses *ses,
                const unsigned char *searchName,
                struct dfs_info3_param **target_nodes,
                unsigned int *num_of_nodes,
@@ -4233,7 +4492,7 @@ getDFSRetry:
        }
        if (ses->server) {
-                if (ses->server->secMode &
+                if (ses->server->sec_mode &
                   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                        pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
        }
@@ -4298,7 +4557,7 @@ GetDFSRefExit:
 /* Query File System Info such as free space to old servers such as Win 9x */
 int
-SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
+SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData)
 {
 /* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */
        TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4377,7 +4636,7 @@ oldQFSInfoRetry:
 }
 int
-CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
+CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData)
 {
 /* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */
        TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4456,7 +4715,7 @@ QFSInfoRetry:
 }
 int
-CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBQFSAttributeInfo(const int xid, struct cifs_tcon *tcon)
 {
 /* level 0x105  SMB_QUERY_FILE_SYSTEM_INFO */
        TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4526,7 +4785,7 @@ QFSAttributeRetry:
 }
 int
-CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon)
 {
 /* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */
        TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4597,7 +4856,7 @@ QFSDeviceRetry:
 }
 int
-CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon)
 {
 /* level 0x200  SMB_QUERY_CIFS_UNIX_INFO */
        TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4667,7 +4926,7 @@ QFSUnixRetry:
 }
 int
-CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
+CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon, __u64 cap)
 {
 /* level 0x200  SMB_SET_CIFS_UNIX_INFO */
        TRANSACTION2_SETFSI_REQ *pSMB = NULL;
@@ -4741,7 +5000,7 @@ SETFSUnixRetry:
 int
-CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon,
                   struct kstatfs *FSData)
 {
 /* level 0x201  SMB_QUERY_CIFS_POSIX_INFO */
@@ -4834,7 +5093,7 @@ QFSPosixRetry:
   in Samba which this routine can run into */
 int
-CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
+CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon, const char *fileName,
              __u64 size, bool SetAllocation,
              const struct nls_table *nls_codepage, int remap)
 {
@@ -4923,7 +5182,7 @@ SetEOFRetry:
 }
 int
-CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
+CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, __u64 size,
                   __u16 fid, __u32 pid_of_opener, bool SetAllocation)
 {
        struct smb_com_transaction2_sfi_req *pSMB  = NULL;
@@ -5005,7 +5264,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
   time and resort to the original setpathinfo level which takes the ancient
   DOS time format with 2 second granularity */
 int
-CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon,
                    const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener)
 {
        struct smb_com_transaction2_sfi_req *pSMB  = NULL;
@@ -5067,7 +5326,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 }
 int
-CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon,
                          bool delete_file, __u16 fid, __u32 pid_of_opener)
 {
        struct smb_com_transaction2_sfi_req *pSMB  = NULL;
@@ -5123,7 +5382,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 }
 int
-CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon,
                   const char *fileName, const FILE_BASIC_INFO *data,
                   const struct nls_table *nls_codepage, int remap)
 {
@@ -5207,7 +5466,7 @@ SetTimesRetry:
          handling it anyway and NT4 was what we thought it would be needed for
          Do not delete it until we prove whether needed for Win9x though */
 int
-CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
+CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon, char *fileName,
                __u16 dos_attrs, const struct nls_table *nls_codepage)
 {
        SETATTR_REQ *pSMB = NULL;
@@ -5295,7 +5554,7 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
 }
 int
-CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon,
                       const struct cifs_unix_set_info_args *args,
                       u16 fid, u32 pid_of_opener)
 {
@@ -5358,7 +5617,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 }
 int
-CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *tcon, char *fileName,
                       const struct cifs_unix_set_info_args *args,
                       const struct nls_table *nls_codepage, int remap)
 {
@@ -5445,7 +5704,7 @@ setPermsRetry:
 * the data isn't copied to it, but the length is returned.
 */
 ssize_t
-CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon,
                const unsigned char *searchName, const unsigned char *ea_name,
                char *EAData, size_t buf_size,
                const struct nls_table *nls_codepage, int remap)
@@ -5626,7 +5885,7 @@ QAllEAsOut:
 }
 int
-CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
+CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, const char *fileName,
             const char *ea_name, const void *ea_value,
             const __u16 ea_value_len, const struct nls_table *nls_codepage,
             int remap)
@@ -5753,7 +6012,7 @@ SetEARetry:
 *      incompatible for network fs clients, we could instead simply
 *      expose this config flag by adding a future cifs (and smb2) notify ioctl.
 */
-int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
+int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
                  const int notify_subdirs, const __u16 netfid,
                  __u32 filter, struct file *pfile, int multishot,
                  const struct nls_table *nls_codepage)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index da284e3cb653..6d88b82537c3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -57,62 +57,6 @@
 extern mempool_t *cifs_req_poolp;
-struct smb_vol {
-        char *username;
-        char *password;
-        char *domainname;
-        char *UNC;
-        char *UNCip;
-        char *iocharset;  /* local code page for mapping to and from Unicode */
-        char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
-        char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
-        uid_t cred_uid;
-        uid_t linux_uid;
-        gid_t linux_gid;
-        mode_t file_mode;
-        mode_t dir_mode;
-        unsigned secFlg;
-        bool retry:1;
-        bool intr:1;
-        bool setuids:1;
-        bool override_uid:1;
-        bool override_gid:1;
-        bool dynperm:1;
-        bool noperm:1;
-        bool no_psx_acl:1; /* set if posix acl support should be disabled */
-        bool cifs_acl:1;
-        bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
-        bool server_ino:1; /* use inode numbers from server ie UniqueId */
-        bool direct_io:1;
-        bool strict_io:1; /* strict cache behavior */
-        bool remap:1;      /* set to remap seven reserved chars in filenames */
-        bool posix_paths:1; /* unset to not ask for posix pathnames. */
-        bool no_linux_ext:1;
-        bool sfu_emul:1;
-        bool nullauth:1;   /* attempt to authenticate with null user */
-        bool nocase:1;     /* request case insensitive filenames */
-        bool nobrl:1;      /* disable sending byte range locks to srv */
-        bool mand_lock:1;  /* send mandatory not posix byte range lock reqs */
-        bool seal:1;       /* request transport encryption on share */
-        bool nodfs:1;      /* Do not request DFS, even if available */
-        bool local_lease:1; /* check leases only on local system, not remote */
-        bool noblocksnd:1;
-        bool noautotune:1;
-        bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
-        bool fsc:1;     /* enable fscache */
-        bool mfsymlinks:1; /* use Minshall+French Symlinks */
-        bool multiuser:1;
-        bool use_smb2:1; /* force smb2 use on mount instead of cifs */
-        unsigned int rsize;
-        unsigned int wsize;
-        bool sockopt_tcp_nodelay:1;
-        unsigned short int port;
-        unsigned long actimeo; /* attribute cache timeout (jiffies) */
-        char *prepath;
-        struct sockaddr_storage srcaddr; /* allow binding to a local IP */
-        struct nls_table *local_nls;
-};
 /* FIXME: should these be tunable? */
 #define TLINK_ERROR_EXPIRE      (1 * HZ)
 #define TLINK_IDLE_EXPIRE       (600 * HZ)
@@ -135,9 +79,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
 {
        int rc = 0;
        struct list_head *tmp, *tmp2;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct mid_q_entry *mid_entry;
+        struct list_head retry_list;
        spin_lock(&GlobalMid_Lock);
        if (server->tcpStatus == CifsExiting) {
@@ -157,11 +102,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
        cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &server->smb_ses_list) {
-                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+                ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
                ses->need_reconnect = true;
                ses->ipc_tid = 0;
                list_for_each(tmp2, &ses->tcon_list) {
-                        tcon = list_entry(tmp2, struct cifsTconInfo, tcon_list);
+                        tcon = list_entry(tmp2, struct cifs_tcon, tcon_list);
                        tcon->need_reconnect = true;
                }
        }
@@ -189,16 +134,23 @@ cifs_reconnect(struct TCP_Server_Info *server)
        mutex_unlock(&server->srv_mutex);
        /* mark submitted MIDs for retry and issue callback */
-        cFYI(1, "%s: issuing mid callbacks", __func__);
+        INIT_LIST_HEAD(&retry_list);
+        cFYI(1, "%s: moving mids to private list", __func__);
        spin_lock(&GlobalMid_Lock);
        list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
                mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
                if (mid_entry->midState == MID_REQUEST_SUBMITTED)
                        mid_entry->midState = MID_RETRY_NEEDED;
+                list_move(&mid_entry->qhead, &retry_list);
+        }
+        spin_unlock(&GlobalMid_Lock);
+        cFYI(1, "%s: issuing mid callbacks", __func__);
+        list_for_each_safe(tmp, tmp2, &retry_list) {
+                mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
                list_del_init(&mid_entry->qhead);
                mid_entry->callback(mid_entry);
        }
-        spin_unlock(&GlobalMid_Lock);
        while (server->tcpStatus == CifsNeedReconnect) {
                try_to_freeze();
@@ -672,12 +624,12 @@ multi_t2_fnd:
                        mid_entry->when_received = jiffies;
 #endif
                        list_del_init(&mid_entry->qhead);
-                        mid_entry->callback(mid_entry);
                        break;
                }
                spin_unlock(&GlobalMid_Lock);
                if (mid_entry != NULL) {
+                        mid_entry->callback(mid_entry);
                        /* Was previous buf put in mpx struct for multi-rsp? */
                        if (!isMultiRsp) {
                                /* smb buffer will be freed by user thread */
@@ -741,15 +693,25 @@ multi_t2_fnd:
                cifs_small_buf_release(smallbuf);
        if (!list_empty(&server->pending_mid_q)) {
+                struct list_head dispose_list;
+                INIT_LIST_HEAD(&dispose_list);
                spin_lock(&GlobalMid_Lock);
                list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
                        mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-                        cFYI(1, "Clearing Mid 0x%x - issuing callback",
+                        cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
-                                         mid_entry->mid);
+                        mid_entry->midState = MID_SHUTDOWN;
+                        list_move(&mid_entry->qhead, &dispose_list);
+                }
+                spin_unlock(&GlobalMid_Lock);
+                /* now walk dispose list and issue callbacks */
+                list_for_each_safe(tmp, tmp2, &dispose_list) {
+                        mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+                        cFYI(1, "Callback mid 0x%x", mid_entry->mid);
                        list_del_init(&mid_entry->qhead);
                        mid_entry->callback(mid_entry);
                }
-                spin_unlock(&GlobalMid_Lock);
                /* 1/8th of sec is more than enough time for them to exit */
                msleep(125);
        }
@@ -1062,13 +1024,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                                   (strnicmp(value, "1", 1) == 0)) {
                                /* this is the default */
                                continue;
-                        } else if ((strnicmp(value, "smb2", 4) == 0) ||
-                                   (strnicmp(value, "2", 1) == 0)) {
-#ifdef CONFIG_CIFS_SMB2
-                                vol->use_smb2 = true;
-#else
-                                cERROR(1, "smb2 support not enabled");
-#endif /* CONFIG_CIFS_SMB2 */
                        }
                } else if ((strnicmp(data, "unc", 3) == 0)
                           || (strnicmp(data, "target", 6) == 0)
@@ -1404,6 +1359,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                        vol->server_ino = 1;
                } else if (strnicmp(data, "noserverino", 9) == 0) {
                        vol->server_ino = 0;
+                } else if (strnicmp(data, "rwpidforward", 4) == 0) {
+                        vol->rwpidforward = 1;
                } else if (strnicmp(data, "cifsacl", 7) == 0) {
                        vol->cifs_acl = 1;
                } else if (strnicmp(data, "nocifsacl", 9) == 0) {
@@ -1640,16 +1597,35 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
        /* now check if signing mode is acceptable */
        if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
-            (server->secMode & SECMODE_SIGN_REQUIRED))
+            (server->sec_mode & SECMODE_SIGN_REQUIRED))
                        return false;
        else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
-                 (server->secMode &
+                 (server->sec_mode &
                  (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
                        return false;
        return true;
 }
+/*
+ * Decide whether an existing TCP connection may be reused for @vol.
+ *
+ * The server must live in the current task's network namespace and agree
+ * with the requested destination/source address, port and security
+ * settings. Returns 1 when every test passes, 0 otherwise.
+ */
+static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
+                         struct smb_vol *vol)
+{
+        struct sockaddr *srcaddr = (struct sockaddr *)&vol->srcaddr;
+        /* evaluated left to right, stopping at the first failing test */
+        return net_eq(cifs_net_ns(server), current->nsproxy->net_ns) &&
+               match_address(server, addr, srcaddr) &&
+               match_port(server, addr) &&
+               match_security(server, vol);
+}
 static struct TCP_Server_Info *
 cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 {
@@ -1657,17 +1633,7 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
        spin_lock(&cifs_tcp_ses_lock);
        list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
-                if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
+                if (!match_server(server, addr, vol))
-                        continue;
-                if (!match_address(server, addr,
-                                   (struct sockaddr *)&vol->srcaddr))
-                        continue;
-                if (!match_port(server, addr))
-                        continue;
-                if (!match_security(server, vol))
                        continue;
                ++server->srv_count;
@@ -1861,32 +1827,39 @@ out_err:
        return ERR_PTR(rc);
 }
-static struct cifsSesInfo *
+/*
+ * Decide whether an existing SMB session can be shared with the mount
+ * described by @vol. Returns 1 when the credentials match, 0 otherwise.
+ *
+ * Kerberos sessions are matched on the credential uid. Every other security
+ * type is matched on the username and, when the username is non-empty and
+ * the session has a stored password, on the password as well.
+ */
+static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
+{
+        /* a missing username is treated like a missing password: as "" */
+        const char *username = vol->username ? vol->username : "";
+        const char *password = vol->password ? vol->password : "";
+        switch (ses->server->secType) {
+        case Kerberos:
+                if (vol->cred_uid != ses->cred_uid)
+                        return 0;
+                break;
+        default:
+                /* anything else takes username/password */
+                if (ses->user_name == NULL)
+                        return 0;
+                if (strncmp(ses->user_name, username, MAX_USERNAME_SIZE))
+                        return 0;
+                /* the password only matters when a username was supplied */
+                if (username[0] != '\0' &&
+                    ses->password != NULL &&
+                    strncmp(ses->password, password, MAX_PASSWORD_SIZE))
+                        return 0;
+        }
+        return 1;
+}
+static struct cifs_ses *
 cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
        spin_lock(&cifs_tcp_ses_lock);
        list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
-                switch (server->secType) {
+                if (!match_session(ses, vol))
-                case Kerberos:
+                        continue;
-                        if (vol->cred_uid != ses->cred_uid)
-                                continue;
-                        break;
-                default:
-                        /* anything else takes username/password */
-                        if (ses->user_name == NULL)
-                                continue;
-                        if (strncmp(ses->user_name, vol->username,
-                                    MAX_USERNAME_SIZE))
-                                continue;
-                        if (strlen(vol->username) != 0 &&
-                            ses->password != NULL &&
-                            strncmp(ses->password,
-                                    vol->password ? vol->password : "",
-                                    MAX_PASSWORD_SIZE))
-                                continue;
-                }
                ++ses->ses_count;
                spin_unlock(&cifs_tcp_ses_lock);
                return ses;
@@ -1896,7 +1869,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 }
 static void
-cifs_put_smb_ses(struct cifsSesInfo *ses)
+cifs_put_smb_ses(struct cifs_ses *ses)
 {
        int xid;
        struct TCP_Server_Info *server = ses->server;
@@ -1922,11 +1895,11 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 static bool warned_on_ntlm;  /* globals init to false automatically */
-static struct cifsSesInfo *
+static struct cifs_ses *
 cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 {
        int rc = -ENOMEM, xid;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
        struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
@@ -2029,20 +2002,26 @@ get_ses_fail:
        return ERR_PTR(rc);
 }
-static struct cifsTconInfo *
+static int match_tcon(struct cifs_tcon *tcon, const char *unc)
-cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
+{
+        if (tcon->tidStatus == CifsExiting)
+                return 0;
+        if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
+                return 0;
+        return 1;
+}
+static struct cifs_tcon *
+cifs_find_tcon(struct cifs_ses *ses, const char *unc)
 {
        struct list_head *tmp;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &ses->tcon_list) {
-                tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
+                tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
-                if (tcon->tidStatus == CifsExiting)
+                if (!match_tcon(tcon, unc))
-                        continue;
-                if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
                        continue;
                ++tcon->tc_count;
                spin_unlock(&cifs_tcp_ses_lock);
                return tcon;
@@ -2052,10 +2031,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 }
 static void
-cifs_put_tcon(struct cifsTconInfo *tcon)
+cifs_put_tcon(struct cifs_tcon *tcon)
 {
        int xid;
-        struct cifsSesInfo *ses = tcon->ses;
+        struct cifs_ses *ses = tcon->ses;
        cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
        spin_lock(&cifs_tcp_ses_lock);
@@ -2076,11 +2055,11 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
        cifs_put_smb_ses(ses);
 }
-static struct cifsTconInfo *
+static struct cifs_tcon *
-cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
+cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 {
        int rc, xid;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        tcon = cifs_find_tcon(ses, volume_info->UNC);
        if (tcon) {
@@ -2169,8 +2148,102 @@ cifs_put_tlink(struct tcon_link *tlink)
        return;
 }
+static inline struct tcon_link *
+cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb);
+/*
+ * Compare the options of an already mounted superblock @sb with the options
+ * requested for a new mount in @mnt_data. Returns 1 when the existing
+ * superblock is compatible with the request, 0 otherwise.
+ */
+static int
+compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
+{
+        struct cifs_sb_info *cur = CIFS_SB(sb);
+        struct cifs_sb_info *req = mnt_data->cifs_sb;
+        /* superblock flags and cifs mount flags must agree under their masks */
+        if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK) ||
+            (cur->mnt_cifs_flags & CIFS_MOUNT_MASK) !=
+            (req->mnt_cifs_flags & CIFS_MOUNT_MASK))
+                return 0;
+        /*
+         * rsize has to be identical. wsize only prevents sharing when one was
+         * requested and it is smaller than the one already in use.
+         */
+        if (cur->rsize != req->rsize ||
+            (req->wsize && req->wsize < cur->wsize))
+                return 0;
+        /* default ownership and permission bits */
+        if (cur->mnt_uid != req->mnt_uid || cur->mnt_gid != req->mnt_gid ||
+            cur->mnt_file_mode != req->mnt_file_mode ||
+            cur->mnt_dir_mode != req->mnt_dir_mode)
+                return 0;
+        /* finally the charset name and the attribute cache timeout */
+        return !strcmp(cur->local_nls->charset, req->local_nls->charset) &&
+               cur->actimeo == req->actimeo;
+}
 int
-get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
+cifs_match_super(struct super_block *sb, void *data)
+{
+        /*
+         * Returns non-zero when the existing superblock @sb matches the mount
+         * request passed in @data (a struct cifs_mnt_data), 0 otherwise.
+         * Everything between taking and dropping the tlink reference runs
+         * with cifs_tcp_ses_lock held.
+         */
+        struct cifs_mnt_data *mnt_data = data;
+        struct smb_vol *vol;
+        struct sockaddr_storage dstaddr;
+        struct sockaddr *dst = (struct sockaddr *)&dstaddr;
+        struct tcon_link *tlink;
+        struct cifs_tcon *tcon;
+        struct cifs_ses *ses;
+        int matched = 0;
+        memset(&dstaddr, 0, sizeof(dstaddr));
+        spin_lock(&cifs_tcp_ses_lock);
+        tlink = cifs_get_tlink(cifs_sb_master_tlink(CIFS_SB(sb)));
+        if (IS_ERR(tlink)) {
+                spin_unlock(&cifs_tcp_ses_lock);
+                return matched;
+        }
+        tcon = tlink_tcon(tlink);
+        ses = tcon->ses;
+        vol = mnt_data->vol;
+        /* without both an address and a UNC there is nothing to compare */
+        if (!vol->UNCip || !vol->UNC)
+                goto out;
+        /* a zero result from cifs_fill_sockaddr is reported as "no match" */
+        matched = cifs_fill_sockaddr(dst, vol->UNCip, strlen(vol->UNCip),
+                                     vol->port);
+        if (!matched)
+                goto out;
+        if (match_server(ses->server, dst, vol) &&
+            match_session(ses, vol) &&
+            match_tcon(tcon, vol->UNC))
+                matched = compare_mount_options(sb, mnt_data);
+        else
+                matched = 0;
+out:
+        cifs_put_tlink(tlink);
+        spin_unlock(&cifs_tcp_ses_lock);
+        return matched;
+}
+int
+get_dfs_path(int xid, struct cifs_ses *pSesInfo, const char *old_path,
             const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
             struct dfs_info3_param **preferrals, int remap)
 {
@@ -2469,7 +2542,7 @@ ip_connect(struct TCP_Server_Info *server)
        return generic_ip_connect(server);
 }
-void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
+void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
                          struct super_block *sb, struct smb_vol *vol_info)
 {
        /* if we are reconnecting then should we check to see if
@@ -2498,7 +2571,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
        if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
                __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+                cFYI(1, "unix caps which server supports %lld", cap);
                /* check for reconnect case in which we do not
                   want to change the mount behavior if we can avoid it */
                if (vol_info == NULL) {
@@ -2516,6 +2589,9 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
                        }
                }
+                if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
+                        cERROR(1, "per-share encryption not supported yet");
                cap &= CIFS_UNIX_CAP_MASK;
                if (vol_info && vol_info->no_psx_acl)
                        cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
@@ -2534,12 +2610,6 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
                                        CIFS_MOUNT_POSIX_PATHS;
                }
-                /* We might be setting the path sep back to a different
-                form if we are reconnecting and the server switched its
-                posix path capability for this share */
-                if (sb && (CIFS_SB(sb)->prepathlen > 0))
-                        CIFS_SB(sb)->prepath[0] = CIFS_DIR_SEP(CIFS_SB(sb));
                if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
                        if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
                                CIFS_SB(sb)->rsize = 127 * 1024;
@@ -2564,6 +2634,10 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
                        cFYI(1, "very large read cap");
                if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
                        cFYI(1, "very large write cap");
+                if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP)
+                        cFYI(1, "transport encryption cap");
+                if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
+                        cFYI(1, "mandatory transport encryption cap");
 #endif /* CIFS_DEBUG2 */
                if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
                        if (vol_info == NULL) {
@@ -2580,28 +2654,8 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
        }
 }
-static void
+void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
-convert_delimiter(char *path, char delim)
+                        struct cifs_sb_info *cifs_sb)
-{
-        int i;
-        char old_delim;
-        if (path == NULL)
-                return;
-        if (delim == '/')
-                old_delim = '\\';
-        else
-                old_delim = '/';
-        for (i = 0; path[i] != '\0'; i++) {
-                if (path[i] == old_delim)
-                        path[i] = delim;
-        }
-}
-static void setup_cifs_sb(struct smb_vol *pvolume_info,
-                          struct cifs_sb_info *cifs_sb)
 {
        INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
@@ -2615,40 +2669,19 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
        else /* default */
                cifs_sb->rsize = CIFSMaxBufSize;
-        if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
-                cERROR(1, "wsize %d too large, using 4096 instead",
-                          pvolume_info->wsize);
-                cifs_sb->wsize = 4096;
-        } else if (pvolume_info->wsize)
-                cifs_sb->wsize = pvolume_info->wsize;
-        else
-                cifs_sb->wsize = min_t(const int,
-                                        PAGEVEC_SIZE * PAGE_CACHE_SIZE,
-                                        127*1024);
-                /* old default of CIFSMaxBufSize was too small now
-                   that SMB Write2 can send multiple pages in kvec.
-                   RFC1001 does not describe what happens when frame
-                   bigger than 128K is sent so use that as max in
-                   conjunction with 52K kvec constraint on arch with 4K
-                   page size  */
        if (cifs_sb->rsize < 2048) {
                cifs_sb->rsize = 2048;
                /* Windows ME may prefer this */
                cFYI(1, "readsize set to minimum: 2048");
        }
-        /* calculate prepath */
-        cifs_sb->prepath = pvolume_info->prepath;
+        /*
-        if (cifs_sb->prepath) {
+         * Temporarily set wsize for matching superblock. If we end up using
-                cifs_sb->prepathlen = strlen(cifs_sb->prepath);
+         * new sb then cifs_negotiate_wsize will later negotiate it downward
-                /* we can not convert the / to \ in the path
+         * if needed.
-                separators in the prefixpath yet because we do not
+         */
-                know (until reset_cifs_unix_caps is called later)
+        cifs_sb->wsize = pvolume_info->wsize;
-                whether POSIX PATH CAP is available. We normalize
-                the / to \ after reset_cifs_unix_caps is called */
-                pvolume_info->prepath = NULL;
-        } else
-                cifs_sb->prepathlen = 0;
        cifs_sb->mnt_uid = pvolume_info->linux_uid;
        cifs_sb->mnt_gid = pvolume_info->linux_gid;
        cifs_sb->mnt_file_mode = pvolume_info->file_mode;
@@ -2657,6 +2690,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
                cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
        cifs_sb->actimeo = pvolume_info->actimeo;
+        cifs_sb->local_nls = pvolume_info->local_nls;
        if (pvolume_info->noperm)
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2676,6 +2710,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
        if (pvolume_info->mand_lock)
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
+        if (pvolume_info->rwpidforward)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
        if (pvolume_info->cifs_acl)
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
        if (pvolume_info->override_uid)
@@ -2709,8 +2745,55 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
                           "mount option supported");
 }
+/*
+ * When the server supports very large writes via POSIX extensions, we can
+ * allow up to 2^24 - PAGE_CACHE_SIZE.
+ *
+ * Note that this might make for "interesting" allocation problems during
+ * writeback however (as we have to allocate an array of pointers for the
+ * pages). A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ */
+#define CIFS_MAX_WSIZE ((1<<24) - PAGE_CACHE_SIZE)
+/*
+ * When the server doesn't allow large posix writes, default to a wsize of
+ * 128k - PAGE_CACHE_SIZE -- one page less than the largest frame size
+ * described in RFC1001. This allows space for the header without going over
+ * that by default.
+ */
+#define CIFS_MAX_RFC1001_WSIZE (128 * 1024 - PAGE_CACHE_SIZE)
+/*
+ * The default wsize is 1M. find_get_pages seems to return a maximum of 256
+ * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
+ * a single wsize request with a single call.
+ */
+#define CIFS_DEFAULT_WSIZE (1024 * 1024)
+/*
+ * Pick the write size for @tcon: start from the wsize requested in
+ * @pvolume_info (CIFS_DEFAULT_WSIZE when none was given) and clamp it to
+ * what the share and server capabilities allow.
+ */
+static unsigned int
+cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
+{
+        __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+        /* can server support 24-bit write sizes? (via UNIX extensions) */
+        bool posix_large_write = tcon->unix_ext &&
+                                 (caps & CIFS_UNIX_LARGE_WRITE_CAP);
+        bool large_write_x =
+                (tcon->ses->server->capabilities & CAP_LARGE_WRITE_X) != 0;
+        unsigned int wsize = pvolume_info->wsize;
+        if (!wsize)
+                wsize = CIFS_DEFAULT_WSIZE;
+        if (!posix_large_write)
+                wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1001_WSIZE);
+        /* no CAP_LARGE_WRITE_X? Limit it to 16 bits */
+        if (!large_write_x)
+                wsize = min_t(unsigned int, wsize, USHRT_MAX);
+        /* hard limit of CIFS_MAX_WSIZE */
+        return min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
+}
 static int
-is_path_accessible(int xid, struct cifsTconInfo *tcon,
+is_path_accessible(int xid, struct cifs_tcon *tcon,
                   struct cifs_sb_info *cifs_sb, const char *full_path)
 {
        int rc;
@@ -2733,8 +2816,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
        return rc;
 }
-static void
+void
-cleanup_volume_info(struct smb_vol **pvolume_info)
+cifs_cleanup_volume_info(struct smb_vol **pvolume_info)
 {
        struct smb_vol *volume_info;
@@ -2764,24 +2847,13 @@ build_unc_path_to_root(const struct smb_vol *volume_info,
        char *full_path;
        int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1);
-        full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL);
+        full_path = kmalloc(unc_len + 1, GFP_KERNEL);
        if (full_path == NULL)
                return ERR_PTR(-ENOMEM);
        strncpy(full_path, volume_info->UNC, unc_len);
-        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
+        full_path[unc_len] = 0; /* add trailing null */
-                int i;
+        convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
-                for (i = 0; i < unc_len; i++) {
-                        if (full_path[i] == '\\')
-                                full_path[i] = '/';
-                }
-        }
-        if (cifs_sb->prepathlen)
-                strncpy(full_path + unc_len, cifs_sb->prepath,
-                                cifs_sb->prepathlen);
-        full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
        return full_path;
 }
@@ -2796,7 +2868,7 @@ build_unc_path_to_root(const struct smb_vol *volume_info,
 * determine whether there were referrals.
 */
 static int
-expand_dfs_referral(int xid, struct cifsSesInfo *pSesInfo,
+expand_dfs_referral(int xid, struct cifs_ses *pSesInfo,
                    struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb,
                    int check_prefix)
 {
@@ -2840,40 +2912,13 @@ expand_dfs_referral(int xid, struct cifsSesInfo *pSesInfo,
 }
 #endif
-int
+int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data,
-cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
+                           const char *devname)
-                const char *devname)
 {
-        int rc;
-        int xid;
        struct smb_vol *volume_info;
-        struct cifsSesInfo *pSesInfo;
+        int rc = 0;
-        struct cifsTconInfo *tcon;
-        struct TCP_Server_Info *srvTcp;
-        char   *full_path;
-        struct tcon_link *tlink;
-#ifdef CONFIG_CIFS_DFS_UPCALL
-        int referral_walks_count = 0;
-try_mount_again:
-        /* cleanup activities if we're chasing a referral */
-        if (referral_walks_count) {
-                if (tcon)
-                        cifs_put_tcon(tcon);
-                else if (pSesInfo)
-                        cifs_put_smb_ses(pSesInfo);
-                cleanup_volume_info(&volume_info);
-                FreeXid(xid);
-        }
-#endif
-        rc = 0;
-        tcon = NULL;
-        pSesInfo = NULL;
-        srvTcp = NULL;
-        full_path = NULL;
-        tlink = NULL;
-        xid = GetXid();
+        *pvolume_info = NULL;
        volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL);
        if (!volume_info) {
@@ -2881,7 +2926,7 @@ try_mount_again:
                goto out;
        }
-        if (cifs_parse_mount_options(cifs_sb->mountdata, devname,
+        if (cifs_parse_mount_options(mount_data, devname,
                                     volume_info)) {
                rc = -EINVAL;
                goto out;
@@ -2914,7 +2959,46 @@ try_mount_again:
                        goto out;
                }
        }
-        cifs_sb->local_nls = volume_info->local_nls;
+        *pvolume_info = volume_info;
+        return rc;
+out:
+        cifs_cleanup_volume_info(&volume_info);
+        return rc;
+}
+int
+cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
+           struct smb_vol *volume_info, const char *devname)
+{
+        int rc = 0;
+        int xid;
+        struct cifs_ses *pSesInfo;
+        struct cifs_tcon *tcon;
+        struct TCP_Server_Info *srvTcp;
+        char   *full_path;
+        struct tcon_link *tlink;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        int referral_walks_count = 0;
+try_mount_again:
+        /* cleanup activities if we're chasing a referral */
+        if (referral_walks_count) {
+                if (tcon)
+                        cifs_put_tcon(tcon);
+                else if (pSesInfo)
+                        cifs_put_smb_ses(pSesInfo);
+                cifs_cleanup_volume_info(&volume_info);
+                FreeXid(xid);
+        }
+#endif
+        tcon = NULL;
+        pSesInfo = NULL;
+        srvTcp = NULL;
+        full_path = NULL;
+        tlink = NULL;
+        xid = GetXid();
        /* get a reference to a tcp session */
        srvTcp = cifs_get_tcp_session(volume_info);
@@ -2931,7 +3015,6 @@ try_mount_again:
                goto mount_fail_check;
        }
-        setup_cifs_sb(volume_info, cifs_sb);
        if (pSesInfo->capabilities & CAP_LARGE_FILES)
                sb->s_maxbytes = MAX_LFS_FILESIZE;
        else
@@ -2948,35 +3031,36 @@ try_mount_again:
                goto remote_path_check;
        }
-        /* do not care if following two calls succeed - informational */
-        if (!tcon->ipc) {
-                CIFSSMBQFSDeviceInfo(xid, tcon);
-                CIFSSMBQFSAttributeInfo(xid, tcon);
-        }
        /* tell server which Unix caps we support */
-        if (tcon->ses->capabilities & CAP_UNIX)
+        if (tcon->ses->capabilities & CAP_UNIX) {
                /* reset of caps checks mount to see if unix extensions
                   disabled for just this mount */
                reset_cifs_unix_caps(xid, tcon, sb, volume_info);
-        else
+                if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
+                    (le64_to_cpu(tcon->fsUnixInfo.Capability) &
+                     CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
+                        rc = -EACCES;
+                        goto mount_fail_check;
+                }
+        } else
                tcon->unix_ext = 0; /* server does not support them */
-        /* convert forward to back slashes in prepath here if needed */
+        /* do not care if following two calls succeed - informational */
-        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+        if (!tcon->ipc) {
-                convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
+                CIFSSMBQFSDeviceInfo(xid, tcon);
+                CIFSSMBQFSAttributeInfo(xid, tcon);
+        }
        if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
                cifs_sb->rsize = 1024 * 127;
                cFYI(DBG2, "no very large read support, rsize now 127K");
        }
-        if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
-                cifs_sb->wsize = min(cifs_sb->wsize,
-                               (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
        if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
                cifs_sb->rsize = min(cifs_sb->rsize,
                               (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+        cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info);
 remote_path_check:
 #ifdef CONFIG_CIFS_DFS_UPCALL
        /*
@@ -2996,10 +3080,10 @@ remote_path_check:
        }
 #endif
-        /* check if a whole path (including prepath) is not remote */
+        /* check if a whole path is not remote */
        if (!rc && tcon) {
                /* build_path_to_root works only when we have a valid tcon */
-                full_path = cifs_build_path_to_root(cifs_sb, tcon);
+                full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
                if (full_path == NULL) {
                        rc = -ENOMEM;
                        goto mount_fail_check;
@@ -3025,10 +3109,6 @@ remote_path_check:
                        rc = -ELOOP;
                        goto mount_fail_check;
                }
-                /* convert forward to back slashes in prepath here if needed */
-                if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
-                        convert_delimiter(cifs_sb->prepath,
-                                        CIFS_DIR_SEP(cifs_sb));
                rc = expand_dfs_referral(xid, pSesInfo, volume_info, cifs_sb,
                                         true);
@@ -3087,14 +3167,13 @@ mount_fail_check:
        password will be freed at unmount time) */
 out:
        /* zero out password before freeing */
-        cleanup_volume_info(&volume_info);
        FreeXid(xid);
        return rc;
 }
 int
-CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
+CIFSTCon(unsigned int xid, struct cifs_ses *ses,
-         const char *tree, struct cifsTconInfo *tcon,
+         const char *tree, struct cifs_tcon *tcon,
         const struct nls_table *nls_codepage)
 {
        struct smb_hdr *smb_buffer;
@@ -3126,7 +3205,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
        pSMB->AndXCommand = 0xFF;
        pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
        bcc_ptr = &pSMB->Password[0];
-        if ((ses->server->secMode) & SECMODE_USER) {
+        if ((ses->server->sec_mode) & SECMODE_USER) {
                pSMB->PasswordLength = cpu_to_le16(1);  /* minimum */
                *bcc_ptr = 0; /* password is null byte */
                bcc_ptr++;              /* skip password */
@@ -3143,7 +3222,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
                    (ses->server->secType == LANMAN))
                        calc_lanman_hash(tcon->password, ses->server->cryptkey,
-                                         ses->server->secMode &
+                                         ses->server->sec_mode &
                                            SECMODE_PW_ENCRYPT ? true : false,
                                         bcc_ptr);
                else
@@ -3159,7 +3238,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                }
        }
-        if (ses->server->secMode &
+        if (ses->server->sec_mode &
                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
@@ -3255,7 +3334,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
        struct rb_root *root = &cifs_sb->tlink_tree;
        struct rb_node *node;
        struct tcon_link *tlink;
-        char *tmp;
        cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
@@ -3272,15 +3350,10 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
        }
        spin_unlock(&cifs_sb->tlink_tree_lock);
-        tmp = cifs_sb->prepath;
-        cifs_sb->prepathlen = 0;
-        cifs_sb->prepath = NULL;
-        kfree(tmp);
        return 0;
 }
-int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
+int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
 {
        int rc = 0;
        struct TCP_Server_Info *server = ses->server;
@@ -3310,7 +3383,7 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
 }
-int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
+int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
                        struct nls_table *nls_info)
 {
        int rc = 0;
@@ -3322,7 +3395,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
                ses->capabilities &= (~CAP_UNIX);
        cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
-                 server->secMode, server->capabilities, server->timeAdj);
+                 server->sec_mode, server->capabilities, server->timeAdj);
        rc = CIFS_SessSetup(xid, ses, nls_info);
        if (rc) {
@@ -3354,12 +3427,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
        return rc;
 }
-static struct cifsTconInfo *
+static struct cifs_tcon *
 cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 {
-        struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
-        struct cifsTconInfo *tcon = NULL;
+        struct cifs_tcon *tcon = NULL;
        struct smb_vol *vol_info;
        char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
                           /* We used to have this as MAX_USERNAME which is   */
@@ -3392,7 +3465,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
        ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
        if (IS_ERR(ses)) {
-                tcon = (struct cifsTconInfo *)ses;
+                tcon = (struct cifs_tcon *)ses;
                cifs_put_tcp_session(master_tcon->ses->server);
                goto out;
        }
@@ -3417,7 +3490,7 @@ cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
        return cifs_sb->master_tlink;
 }
-struct cifsTconInfo *
+struct cifs_tcon *
 cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
 {
        return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 9ea65cf36714..81914df47ef1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -50,12 +50,11 @@ build_path_from_dentry(struct dentry *direntry)
 {
        struct dentry *temp;
        int namelen;
-        int pplen;
        int dfsplen;
        char *full_path;
        char dirsep;
        struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
        if (direntry == NULL)
                return NULL;  /* not much we can do if dentry is freed and
@@ -63,13 +62,12 @@ build_path_from_dentry(struct dentry *direntry)
                when the server crashed */
        dirsep = CIFS_DIR_SEP(cifs_sb);
-        pplen = cifs_sb->prepathlen;
        if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
                dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
        else
                dfsplen = 0;
 cifs_bp_rename_retry:
-        namelen = pplen + dfsplen;
+        namelen = dfsplen;
        for (temp = direntry; !IS_ROOT(temp);) {
                namelen += (1 + temp->d_name.len);
                temp = temp->d_parent;
@@ -100,7 +98,7 @@ cifs_bp_rename_retry:
                        return NULL;
                }
        }
-        if (namelen != pplen + dfsplen) {
+        if (namelen != dfsplen) {
                cERROR(1, "did not end path lookup where expected namelen is %d",
                        namelen);
                /* presumably this is only possible if racing with a rename
@@ -126,7 +124,6 @@ cifs_bp_rename_retry:
                        }
                }
        }
-        strncpy(full_path + dfsplen, CIFS_SB(direntry->d_sb)->prepath, pplen);
        return full_path;
 }
@@ -152,7 +149,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        __u16 fileHandle;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        char *full_path = NULL;
        FILE_ALL_INFO *buf = NULL;
        struct inode *newinode = NULL;
@@ -356,7 +353,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
        int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
+        struct cifs_io_parms io_parms;
        char *full_path = NULL;
        struct inode *newinode = NULL;
        int oplock = 0;
@@ -439,16 +437,19 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
         * timestamps in, but we can reuse it safely */
        pdev = (struct win_dev *)buf;
+        io_parms.netfid = fileHandle;
+        io_parms.pid = current->tgid;
+        io_parms.tcon = pTcon;
+        io_parms.offset = 0;
+        io_parms.length = sizeof(struct win_dev);
        if (S_ISCHR(mode)) {
                memcpy(pdev->type, "IntxCHR", 8);
                pdev->major =
                      cpu_to_le64(MAJOR(device_number));
                pdev->minor =
                      cpu_to_le64(MINOR(device_number));
-                rc = CIFSSMBWrite(xid, pTcon,
+                rc = CIFSSMBWrite(xid, &io_parms,
-                        fileHandle,
+                        &bytes_written, (char *)pdev,
-                        sizeof(struct win_dev),
-                        0, &bytes_written, (char *)pdev,
                        NULL, 0);
        } else if (S_ISBLK(mode)) {
                memcpy(pdev->type, "IntxBLK", 8);
@@ -456,10 +457,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                      cpu_to_le64(MAJOR(device_number));
                pdev->minor =
                      cpu_to_le64(MINOR(device_number));
-                rc = CIFSSMBWrite(xid, pTcon,
+                rc = CIFSSMBWrite(xid, &io_parms,
-                        fileHandle,
+                        &bytes_written, (char *)pdev,
-                        sizeof(struct win_dev),
-                        0, &bytes_written, (char *)pdev,
                        NULL, 0);
        } /* else if (S_ISFIFO) */
        CIFSSMBClose(xid, pTcon, fileHandle);
@@ -486,7 +485,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
        bool posix_open = false;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct cifsFileInfo *cfile;
        struct inode *newInode = NULL;
        char *full_path = NULL;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c672afef0c09..bb71471a4d9d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -114,7 +114,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
        struct cifs_fattr fattr;
        struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        cFYI(1, "posix open %s", full_path);
@@ -168,7 +168,7 @@ posix_open_ret:
 static int
 cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
-             struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
+             struct cifs_tcon *tcon, unsigned int f_flags, __u32 *poplock,
             __u16 *pnetfid, int xid)
 {
        int rc;
@@ -285,7 +285,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
        struct inode *inode = cifs_file->dentry->d_inode;
-        struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
+        struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct cifsLockInfo *li, *tmp;
@@ -343,7 +343,7 @@ int cifs_open(struct inode *inode, struct file *file)
        int xid;
        __u32 oplock;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct tcon_link *tlink;
        struct cifsFileInfo *pCifsFile = NULL;
        char *full_path = NULL;
@@ -457,7 +457,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
        int xid;
        __u32 oplock;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct cifsInodeInfo *pCifsInode;
        struct inode *inode;
        char *full_path = NULL;
@@ -596,7 +596,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
        xid = GetXid();
        if (pCFileStruct) {
-                struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
+                struct cifs_tcon *pTcon = tlink_tcon(pCFileStruct->tlink);
                cFYI(1, "Freeing private data in close dir");
                spin_lock(&cifs_file_list_lock);
@@ -653,7 +653,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
        __u64 length;
        bool wait_flag = false;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        __u16 netfid;
        __u8 lockType = LOCKING_ANDX_LARGE_FILES;
        bool posix_locking = 0;
@@ -725,8 +725,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                        else
                                posix_lock_type = CIFS_WRLCK;
                        rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
-                                        length, pfLock,
+                                        length, pfLock, posix_lock_type,
-                                        posix_lock_type, wait_flag);
+                                        wait_flag);
                        FreeXid(xid);
                        return rc;
                }
@@ -797,8 +797,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                        posix_lock_type = CIFS_UNLCK;
                rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */,
-                                      length, pfLock,
+                                      length, pfLock, posix_lock_type,
-                                      posix_lock_type, wait_flag);
+                                      wait_flag);
        } else {
                struct cifsFileInfo *fid = file->private_data;
@@ -857,7 +857,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
                cifsi->server_eof = end_of_write;
 }
-static ssize_t cifs_write(struct cifsFileInfo *open_file,
+static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid,
                          const char *write_data, size_t write_size,
                          loff_t *poffset)
 {
@@ -865,10 +865,11 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
        unsigned int bytes_written = 0;
        unsigned int total_written;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        int xid;
        struct dentry *dentry = open_file->dentry;
        struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
+        struct cifs_io_parms io_parms;
        cifs_sb = CIFS_SB(dentry->d_sb);
@@ -901,8 +902,13 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
                        /* iov[0] is reserved for smb header */
                        iov[1].iov_base = (char *)write_data + total_written;
                        iov[1].iov_len = len;
-                        rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len,
+                        io_parms.netfid = open_file->netfid;
-                                           *poffset, &bytes_written, iov, 1, 0);
+                        io_parms.pid = pid;
+                        io_parms.tcon = pTcon;
+                        io_parms.offset = *poffset;
+                        io_parms.length = len;
+                        rc = CIFSSMBWrite2(xid, &io_parms, &bytes_written, iov,
+                                           1, 0);
                }
                if (rc || (bytes_written == 0)) {
                        if (total_written)
@@ -1071,8 +1077,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
        open_file = find_writable_file(CIFS_I(mapping->host), false);
        if (open_file) {
-                bytes_written = cifs_write(open_file, write_data,
+                bytes_written = cifs_write(open_file, open_file->pid,
-                                           to - from, &offset);
+                                           write_data, to - from, &offset);
                cifsFileInfo_put(open_file);
                /* Does mm or vfs already set times? */
                inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1092,58 +1098,20 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 static int cifs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
 {
-        unsigned int bytes_to_write;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
-        unsigned int bytes_written;
+        bool done = false, scanned = false, range_whole = false;
-        struct cifs_sb_info *cifs_sb;
+        pgoff_t end, index;
-        int done = 0;
+        struct cifs_writedata *wdata;
-        pgoff_t end;
-        pgoff_t index;
-        int range_whole = 0;
-        struct kvec *iov;
-        int len;
-        int n_iov = 0;
-        pgoff_t next;
-        int nr_pages;
-        __u64 offset = 0;
-        struct cifsFileInfo *open_file;
-        struct cifsTconInfo *tcon;
-        struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
        struct page *page;
-        struct pagevec pvec;
        int rc = 0;
-        int scanned = 0;
-        int xid;
-        cifs_sb = CIFS_SB(mapping->host->i_sb);
        /*
-         * If wsize is smaller that the page cache size, default to writing
+         * If wsize is smaller than the page cache size, default to writing
         * one page at a time via cifs_writepage
         */
        if (cifs_sb->wsize < PAGE_CACHE_SIZE)
                return generic_writepages(mapping, wbc);
-        iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
-        if (iov == NULL)
-                return generic_writepages(mapping, wbc);
-        /*
-         * if there's no open file, then this is likely to fail too,
-         * but it'll at least handle the return. Maybe it should be
-         * a BUG() instead?
-         */
-        open_file = find_writable_file(CIFS_I(mapping->host), false);
-        if (!open_file) {
-                kfree(iov);
-                return generic_writepages(mapping, wbc);
-        }
-        tcon = tlink_tcon(open_file->tlink);
-        cifsFileInfo_put(open_file);
-        xid = GetXid();
-        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
                end = -1;
@@ -1151,24 +1119,49 @@ static int cifs_writepages(struct address_space *mapping,
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
                end = wbc->range_end >> PAGE_CACHE_SHIFT;
                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-                        range_whole = 1;
+                        range_whole = true;
-                scanned = 1;
+                scanned = true;
        }
 retry:
-        while (!done && (index <= end) &&
+        while (!done && index <= end) {
-               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                unsigned int i, nr_pages, found_pages;
-                        PAGECACHE_TAG_DIRTY,
+                pgoff_t next = 0, tofind;
-                        min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1))) {
+                struct page **pages;
-                int first;
-                unsigned int i;
+                tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1,
+                                end - index) + 1;
-                first = -1;
-                next = 0;
+                wdata = cifs_writedata_alloc((unsigned int)tofind);
-                n_iov = 0;
+                if (!wdata) {
-                bytes_to_write = 0;
+                        rc = -ENOMEM;
+                        break;
-                for (i = 0; i < nr_pages; i++) {
+                }
-                        page = pvec.pages[i];
+                /*
+                 * find_get_pages_tag seems to return a max of 256 on each
+                 * iteration, so we must call it several times in order to
+                 * fill the array or the wsize is effectively limited to
+                 * 256 * PAGE_CACHE_SIZE.
+                 */
+                found_pages = 0;
+                pages = wdata->pages;
+                do {
+                        nr_pages = find_get_pages_tag(mapping, &index,
+                                                        PAGECACHE_TAG_DIRTY,
+                                                        tofind, pages);
+                        found_pages += nr_pages;
+                        tofind -= nr_pages;
+                        pages += nr_pages;
+                } while (nr_pages && tofind && index <= end);
+                if (found_pages == 0) {
+                        kref_put(&wdata->refcount, cifs_writedata_release);
+                        break;
+                }
+                nr_pages = 0;
+                for (i = 0; i < found_pages; i++) {
+                        page = wdata->pages[i];
                        /*
                         * At this point we hold neither mapping->tree_lock nor
                         * lock on the page itself: the page may be truncated or
@@ -1177,7 +1170,7 @@ retry:
                         * mapping
                         */
-                        if (first < 0)
+                        if (nr_pages == 0)
                                lock_page(page);
                        else if (!trylock_page(page))
                                break;
@@ -1188,7 +1181,7 @@ retry:
                        }
                        if (!wbc->range_cyclic && page->index > end) {
-                                done = 1;
+                                done = true;
                                unlock_page(page);
                                break;
                        }
@@ -1215,119 +1208,89 @@ retry:
                        set_page_writeback(page);
                        if (page_offset(page) >= mapping->host->i_size) {
-                                done = 1;
+                                done = true;
                                unlock_page(page);
                                end_page_writeback(page);
                                break;
                        }
-                        /*
+                        wdata->pages[i] = page;
-                         * BB can we get rid of this?  pages are held by pvec
+                        next = page->index + 1;
-                         */
+                        ++nr_pages;
-                        page_cache_get(page);
+                }
-                        len = min(mapping->host->i_size - page_offset(page),
+                /* reset index to refind any pages skipped */
-                                  (loff_t)PAGE_CACHE_SIZE);
+                if (nr_pages == 0)
+                        index = wdata->pages[0]->index + 1;
-                        /* reserve iov[0] for the smb header */
+                /* put any pages we aren't going to use */
-                        n_iov++;
+                for (i = nr_pages; i < found_pages; i++) {
-                        iov[n_iov].iov_base = kmap(page);
+                        page_cache_release(wdata->pages[i]);
-                        iov[n_iov].iov_len = len;
+                        wdata->pages[i] = NULL;
-                        bytes_to_write += len;
+                }
-                        if (first < 0) {
+                /* nothing to write? */
-                                first = i;
+                if (nr_pages == 0) {
-                                offset = page_offset(page);
+                        kref_put(&wdata->refcount, cifs_writedata_release);
-                        }
+                        continue;
-                        next = page->index + 1;
-                        if (bytes_to_write + PAGE_CACHE_SIZE > cifs_sb->wsize)
-                                break;
                }
-                if (n_iov) {
-retry_write:
-                        open_file = find_writable_file(CIFS_I(mapping->host),
-                                                        false);
-                        if (!open_file) {
-                                cERROR(1, "No writable handles for inode");
-                                rc = -EBADF;
-                        } else {
-                                rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
-                                                   bytes_to_write, offset,
-                                                   &bytes_written, iov, n_iov,
-                                                   0);
-                                cifsFileInfo_put(open_file);
-                        }
-                        cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
+                wdata->sync_mode = wbc->sync_mode;
+                wdata->nr_pages = nr_pages;
+                wdata->offset = page_offset(wdata->pages[0]);
-                        /*
+                do {
-                         * For now, treat a short write as if nothing got
+                        if (wdata->cfile != NULL)
-                         * written. A zero length write however indicates
+                                cifsFileInfo_put(wdata->cfile);
-                         * ENOSPC or EFBIG. We have no way to know which
+                        wdata->cfile = find_writable_file(CIFS_I(mapping->host),
-                         * though, so call it ENOSPC for now. EFBIG would
+                                                          false);
-                         * get translated to AS_EIO anyway.
+                        if (!wdata->cfile) {
-                         *
+                                cERROR(1, "No writable handles for inode");
-                         * FIXME: make it take into account the data that did
+                                rc = -EBADF;
-                         *        get written
+                                break;
-                         */
-                        if (rc == 0) {
-                                if (bytes_written == 0)
-                                        rc = -ENOSPC;
-                                else if (bytes_written < bytes_to_write)
-                                        rc = -EAGAIN;
                        }
+                        rc = cifs_async_writev(wdata);
+                } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
-                        /* retry on data-integrity flush */
+                for (i = 0; i < nr_pages; ++i)
-                        if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
+                        unlock_page(wdata->pages[i]);
-                                goto retry_write;
-                        /* fix the stats and EOF */
-                        if (bytes_written > 0) {
-                                cifs_stats_bytes_written(tcon, bytes_written);
-                                cifs_update_eof(cifsi, offset, bytes_written);
-                        }
-                        for (i = 0; i < n_iov; i++) {
+                /* send failure -- clean up the mess */
-                                page = pvec.pages[first + i];
+                if (rc != 0) {
-                                /* on retryable write error, redirty page */
+                        for (i = 0; i < nr_pages; ++i) {
                                if (rc == -EAGAIN)
-                                        redirty_page_for_writepage(wbc, page);
+                                        redirty_page_for_writepage(wbc,
-                                else if (rc != 0)
+                                                           wdata->pages[i]);
-                                        SetPageError(page);
+                                else
-                                kunmap(page);
+                                        SetPageError(wdata->pages[i]);
-                                unlock_page(page);
+                                end_page_writeback(wdata->pages[i]);
-                                end_page_writeback(page);
+                                page_cache_release(wdata->pages[i]);
-                                page_cache_release(page);
                        }
                        if (rc != -EAGAIN)
                                mapping_set_error(mapping, rc);
-                        else
+                }
-                                rc = 0;
+                kref_put(&wdata->refcount, cifs_writedata_release);
-                        if ((wbc->nr_to_write -= n_iov) <= 0)
+                wbc->nr_to_write -= nr_pages;
-                                done = 1;
+                if (wbc->nr_to_write <= 0)
-                        index = next;
+                        done = true;
-                } else
-                        /* Need to re-find the pages we skipped */
-                        index = pvec.pages[0]->index + 1;
-                pagevec_release(&pvec);
+                index = next;
        }
        if (!scanned && !done) {
                /*
                 * We hit the last page and there is more work to be done: wrap
                 * back to the start of the file
                 */
-                scanned = 1;
+                scanned = true;
                index = 0;
                goto retry;
        }
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                mapping->writeback_index = index;
-        FreeXid(xid);
-        kfree(iov);
        return rc;
 }
@@ -1383,6 +1346,14 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 {
        int rc;
        struct inode *inode = mapping->host;
+        struct cifsFileInfo *cfile = file->private_data;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+        __u32 pid;
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = cfile->pid;
+        else
+                pid = current->tgid;
        cFYI(1, "write_end for page %p from pos %lld with %d bytes",
                 page, pos, copied);
@@ -1406,8 +1377,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
                /* BB check if anything else missing out of ppw
                   such as updating last write time */
                page_data = kmap(page);
-                rc = cifs_write(file->private_data, page_data + offset,
+                rc = cifs_write(cfile, pid, page_data + offset, copied, &pos);
-                                copied, &pos);
                /* if (rc < 0) should we set writebehind rc? */
                kunmap(page);
@@ -1435,7 +1405,7 @@ int cifs_strict_fsync(struct file *file, int datasync)
 {
        int xid;
        int rc = 0;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct cifsFileInfo *smbfile = file->private_data;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -1465,7 +1435,7 @@ int cifs_fsync(struct file *file, int datasync)
 {
        int xid;
        int rc = 0;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct cifsFileInfo *smbfile = file->private_data;
        struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -1556,9 +1526,11 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
        struct iov_iter it;
        struct inode *inode;
        struct cifsFileInfo *open_file;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct cifs_sb_info *cifs_sb;
+        struct cifs_io_parms io_parms;
        int xid, rc;
+        __u32 pid;
        len = iov_length(iov, nr_segs);
        if (!len)
@@ -1590,6 +1562,12 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
        xid = GetXid();
        open_file = file->private_data;
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
        pTcon = tlink_tcon(open_file->tlink);
        inode = file->f_path.dentry->d_inode;
@@ -1616,9 +1594,13 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
                                if (rc != 0)
                                        break;
                        }
-                        rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
+                        io_parms.netfid = open_file->netfid;
-                                           cur_len, *poffset, &written,
+                        io_parms.pid = pid;
-                                           to_send, npages, 0);
+                        io_parms.tcon = pTcon;
+                        io_parms.offset = *poffset;
+                        io_parms.length = cur_len;
+                        rc = CIFSSMBWrite2(xid, &io_parms, &written, to_send,
+                                           npages, 0);
                } while (rc == -EAGAIN);
                for (i = 0; i < npages; i++)
@@ -1711,10 +1693,12 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
        size_t len, cur_len;
        int iov_offset = 0;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct cifsFileInfo *open_file;
        struct smb_com_read_rsp *pSMBr;
+        struct cifs_io_parms io_parms;
        char *read_data;
+        __u32 pid;
        if (!nr_segs)
                return 0;
@@ -1729,6 +1713,11 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
        open_file = file->private_data;
        pTcon = tlink_tcon(open_file->tlink);
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, "attempting read on write only file instance");
@@ -1744,8 +1733,12 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
                                if (rc != 0)
                                        break;
                        }
-                        rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
+                        io_parms.netfid = open_file->netfid;
-                                         cur_len, *poffset, &bytes_read,
+                        io_parms.pid = pid;
+                        io_parms.tcon = pTcon;
+                        io_parms.offset = *poffset;
+                        io_parms.length = len;
+                        rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
                                         &read_data, &buf_type);
                        pSMBr = (struct smb_com_read_rsp *)read_data;
                        if (read_data) {
@@ -1822,11 +1815,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        unsigned int total_read;
        unsigned int current_read_size;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        int xid;
        char *current_offset;
        struct cifsFileInfo *open_file;
+        struct cifs_io_parms io_parms;
        int buf_type = CIFS_NO_BUFFER;
+        __u32 pid;
        xid = GetXid();
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -1839,6 +1834,11 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        open_file = file->private_data;
        pTcon = tlink_tcon(open_file->tlink);
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, "attempting read on write only file instance");
@@ -1861,11 +1861,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
                                if (rc != 0)
                                        break;
                        }
-                        rc = CIFSSMBRead(xid, pTcon,
+                        io_parms.netfid = open_file->netfid;
-                                         open_file->netfid,
+                        io_parms.pid = pid;
-                                         current_read_size, *poffset,
+                        io_parms.tcon = pTcon;
-                                         &bytes_read, &current_offset,
+                        io_parms.offset = *poffset;
-                                         &buf_type);
+                        io_parms.length = current_read_size;
+                        rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
+                                         &current_offset, &buf_type);
                }
                if (rc || (bytes_read == 0)) {
                        if (total_read) {
@@ -1996,13 +1998,15 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
        loff_t offset;
        struct page *page;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        unsigned int bytes_read = 0;
        unsigned int read_size, i;
        char *smb_read_data = NULL;
        struct smb_com_read_rsp *pSMBr;
        struct cifsFileInfo *open_file;
+        struct cifs_io_parms io_parms;
        int buf_type = CIFS_NO_BUFFER;
+        __u32 pid;
        xid = GetXid();
        if (file->private_data == NULL) {
@@ -2024,6 +2028,11 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                goto read_complete;
        cFYI(DBG2, "rpages: num pages %d", num_pages);
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+                pid = open_file->pid;
+        else
+                pid = current->tgid;
        for (i = 0; i < num_pages; ) {
                unsigned contig_pages;
                struct page *tmp_page;
@@ -2065,12 +2074,13 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                                if (rc != 0)
                                        break;
                        }
+                        io_parms.netfid = open_file->netfid;
-                        rc = CIFSSMBRead(xid, pTcon,
+                        io_parms.pid = pid;
-                                         open_file->netfid,
+                        io_parms.tcon = pTcon;
-                                         read_size, offset,
+                        io_parms.offset = offset;
-                                         &bytes_read, &smb_read_data,
+                        io_parms.length = read_size;
-                                         &buf_type);
+                        rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
+                                         &smb_read_data, &buf_type);
                        /* BB more RC checks ? */
                        if (rc == -EAGAIN) {
                                if (smb_read_data) {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 297a43d0ff7f..d368a47ba5eb 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -40,7 +40,7 @@ void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
        server->fscache = NULL;
 }
-void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
+void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
 {
        struct TCP_Server_Info *server = tcon->ses->server;
@@ -51,7 +51,7 @@ void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
                                server->fscache, tcon->fscache);
 }
-void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
+void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
 {
        cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
        fscache_relinquish_cookie(tcon->fscache, 0);
@@ -62,7 +62,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 {
        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
        if (cifsi->fscache)
                return;
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 31b88ec2341e..63539323e0b9 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -40,8 +40,8 @@ extern void cifs_fscache_unregister(void);
 */
 extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
 extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
-extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_get_super_cookie(struct cifs_tcon *);
-extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_release_super_cookie(struct cifs_tcon *);
 extern void cifs_fscache_release_inode_cookie(struct inode *);
 extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
@@ -99,9 +99,9 @@ static inline void
 cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
 static inline void
 cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
-static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {}
 static inline void
-cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
+cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}
 static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
 static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index de02ed5e25c2..9b018c8334fa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -295,7 +295,7 @@ int cifs_get_file_info_unix(struct file *filp)
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct cifsFileInfo *cfile = filp->private_data;
-        struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
+        struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
        xid = GetXid();
        rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -318,7 +318,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
        int rc;
        FILE_UNIX_BASIC_INFO find_data;
        struct cifs_fattr fattr;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct tcon_link *tlink;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -373,7 +373,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
        int oplock = 0;
        __u16 netfid;
        struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
+        struct cifs_io_parms io_parms;
        char buf[24];
        unsigned int bytes_read;
        char *pbuf;
@@ -405,9 +406,13 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
        if (rc == 0) {
                int buf_type = CIFS_NO_BUFFER;
                        /* Read header */
-                rc = CIFSSMBRead(xid, tcon, netfid,
+                io_parms.netfid = netfid;
-                                 24 /* length */, 0 /* offset */,
+                io_parms.pid = current->tgid;
-                                 &bytes_read, &pbuf, &buf_type);
+                io_parms.tcon = tcon;
+                io_parms.offset = 0;
+                io_parms.length = 24;
+                rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf,
+                                 &buf_type);
                if ((rc == 0) && (bytes_read >= 8)) {
                        if (memcmp("IntxBLK", pbuf, 8) == 0) {
                                cFYI(1, "Block device");
@@ -468,7 +473,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
        char ea_value[4];
        __u32 mode;
        struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
@@ -502,7 +507,7 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
                       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
        memset(fattr, 0, sizeof(*fattr));
        fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
@@ -553,7 +558,7 @@ int cifs_get_file_info(struct file *filp)
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct cifsFileInfo *cfile = filp->private_data;
-        struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
+        struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
        xid = GetXid();
        rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -590,7 +595,7 @@ int cifs_get_inode_info(struct inode **pinode,
        struct super_block *sb, int xid, const __u16 *pfid)
 {
        int rc = 0, tmprc;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct tcon_link *tlink;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
        char *buf = NULL;
@@ -735,10 +740,10 @@ static const struct inode_operations cifs_ipc_inode_ops = {
        .lookup = cifs_lookup,
 };
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
-                                struct cifsTconInfo *tcon)
+                              struct cifs_tcon *tcon)
 {
-        int pplen = cifs_sb->prepathlen;
+        int pplen = vol->prepath ? strlen(vol->prepath) : 0;
        int dfsplen;
        char *full_path = NULL;
@@ -772,7 +777,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
                        }
                }
        }
-        strncpy(full_path + dfsplen, cifs_sb->prepath, pplen);
+        strncpy(full_path + dfsplen, vol->prepath, pplen);
        full_path[dfsplen + pplen] = 0; /* add trailing null */
        return full_path;
 }
@@ -884,19 +889,13 @@ struct inode *cifs_root_iget(struct super_block *sb)
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
        struct inode *inode = NULL;
        long rc;
-        char *full_path;
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-        full_path = cifs_build_path_to_root(cifs_sb, tcon);
-        if (full_path == NULL)
-                return ERR_PTR(-ENOMEM);
        xid = GetXid();
        if (tcon->unix_ext)
-                rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+                rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
        else
-                rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+                rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
-                                                xid, NULL);
        if (!inode) {
                inode = ERR_PTR(rc);
@@ -922,7 +921,6 @@ struct inode *cifs_root_iget(struct super_block *sb)
        }
 out:
-        kfree(full_path);
        /* can not call macro FreeXid here since in a void func
         * TODO: This is no longer true
         */
@@ -943,7 +941,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct tcon_link *tlink = NULL;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        FILE_BASIC_INFO info_buf;
        if (attrs == NULL)
@@ -1061,7 +1059,7 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        __u32 dosattr, origattr;
        FILE_BASIC_INFO *info_buf = NULL;
@@ -1179,7 +1177,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
        struct super_block *sb = dir->i_sb;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
        struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct iattr *attrs = NULL;
        __u32 dosattr = 0, origattr = 0;
@@ -1277,7 +1275,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
        int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
        struct cifs_fattr fattr;
@@ -1455,7 +1453,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        char *full_path = NULL;
        struct cifsInodeInfo *cifsInode;
@@ -1512,7 +1510,7 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        __u16 srcfid;
        int oplock, rc;
@@ -1564,7 +1562,7 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
        char *toName = NULL;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
        FILE_UNIX_BASIC_INFO *info_buf_target;
        int xid, rc, tmprc;
@@ -1794,7 +1792,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
                 struct kstat *stat)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
        struct inode *inode = dentry->d_inode;
        int rc;
@@ -1872,7 +1870,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct tcon_link *tlink = NULL;
-        struct cifsTconInfo *pTcon = NULL;
+        struct cifs_tcon *pTcon = NULL;
+        struct cifs_io_parms io_parms;
        /*
         * To avoid spurious oplock breaks from server, in the case of
@@ -1894,8 +1893,14 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
                cFYI(1, "SetFSize for attrs rc = %d", rc);
                if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
                        unsigned int bytes_written;
-                        rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
-                                          &bytes_written, NULL, NULL, 1);
+                        io_parms.netfid = nfid;
+                        io_parms.pid = npid;
+                        io_parms.tcon = pTcon;
+                        io_parms.offset = 0;
+                        io_parms.length = attrs->ia_size;
+                        rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
+                                          NULL, NULL, 1);
                        cFYI(1, "Wrt seteof rc %d", rc);
                }
        } else
@@ -1930,10 +1935,15 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                        if (rc == 0) {
                                unsigned int bytes_written;
-                                rc = CIFSSMBWrite(xid, pTcon, netfid, 0,
-                                                  attrs->ia_size,
+                                io_parms.netfid = netfid;
-                                                  &bytes_written, NULL,
+                                io_parms.pid = current->tgid;
-                                                  NULL, 1);
+                                io_parms.tcon = pTcon;
+                                io_parms.offset = 0;
+                                io_parms.length = attrs->ia_size;
+                                rc = CIFSSMBWrite(xid, &io_parms,
+                                                  &bytes_written,
+                                                  NULL, NULL,  1);
                                cFYI(1, "wrt seteof rc %d", rc);
                                CIFSSMBClose(xid, pTcon, netfid);
                        }
@@ -1961,7 +1971,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct cifs_unix_set_info_args *args = NULL;
        struct cifsFileInfo *open_file;
@@ -2247,7 +2257,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 {
        struct inode *inode = direntry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
        if (pTcon->unix_ext)
                return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 0c98672d0122..4221b5e48a42 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,7 +38,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
        struct cifs_sb_info *cifs_sb;
 #ifdef CONFIG_CIFS_POSIX
        struct cifsFileInfo *pSMBFile = filep->private_data;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        __u64   ExtAttrBits = 0;
        __u64   ExtAttrMask = 0;
        __u64   caps;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index ce417a9764a3..556b1a0b54de 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -175,7 +175,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 }
 static int
-CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
                    const char *fromName, const char *toName,
                    const struct nls_table *nls_codepage, int remap)
 {
@@ -184,6 +184,7 @@ CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
        __u16 netfid = 0;
        u8 *buf;
        unsigned int bytes_written = 0;
+        struct cifs_io_parms io_parms;
        buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
        if (!buf)
@@ -203,10 +204,13 @@ CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
                return rc;
        }
-        rc = CIFSSMBWrite(xid, tcon, netfid,
+        io_parms.netfid = netfid;
-                          CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+        io_parms.pid = current->tgid;
-                          0 /* offset */,
+        io_parms.tcon = tcon;
-                          &bytes_written, buf, NULL, 0);
+        io_parms.offset = 0;
+        io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+        rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0);
        CIFSSMBClose(xid, tcon, netfid);
        kfree(buf);
        if (rc != 0)
@@ -219,7 +223,7 @@ CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
 }
 static int
-CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSQueryMFSymLink(const int xid, struct cifs_tcon *tcon,
                   const unsigned char *searchName, char **symlinkinfo,
                   const struct nls_table *nls_codepage, int remap)
 {
@@ -231,6 +235,7 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
        unsigned int bytes_read = 0;
        int buf_type = CIFS_NO_BUFFER;
        unsigned int link_len = 0;
+        struct cifs_io_parms io_parms;
        FILE_ALL_INFO file_info;
        rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
@@ -249,11 +254,13 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
        if (!buf)
                return -ENOMEM;
        pbuf = buf;
+        io_parms.netfid = netfid;
+        io_parms.pid = current->tgid;
+        io_parms.tcon = tcon;
+        io_parms.offset = 0;
+        io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
-        rc = CIFSSMBRead(xid, tcon, netfid,
+        rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
-                         CIFS_MF_SYMLINK_FILE_SIZE /* length */,
-                         0 /* offset */,
-                         &bytes_read, &pbuf, &buf_type);
        CIFSSMBClose(xid, tcon, netfid);
        if (rc != 0) {
                kfree(buf);
@@ -291,7 +298,8 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
        int oplock = 0;
        __u16 netfid = 0;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
+        struct cifs_io_parms io_parms;
        u8 *buf;
        char *pbuf;
        unsigned int bytes_read = 0;
@@ -328,11 +336,13 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
                goto out;
        }
        pbuf = buf;
+        io_parms.netfid = netfid;
+        io_parms.pid = current->tgid;
+        io_parms.tcon = pTcon;
+        io_parms.offset = 0;
+        io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
-        rc = CIFSSMBRead(xid, pTcon, netfid,
+        rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
-                         CIFS_MF_SYMLINK_FILE_SIZE /* length */,
-                         0 /* offset */,
-                         &bytes_read, &pbuf, &buf_type);
        CIFSSMBClose(xid, pTcon, netfid);
        if (rc != 0) {
                kfree(buf);
@@ -370,7 +380,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
        char *toName = NULL;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct cifsInodeInfo *cifsInode;
        tlink = cifs_sb_tlink(cifs_sb);
@@ -445,7 +455,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
        char *target_path = NULL;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct tcon_link *tlink = NULL;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        xid = GetXid();
@@ -518,7 +528,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
        int xid;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 907531ac5888..03a1f491d39b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -67,12 +67,12 @@ _FreeXid(unsigned int xid)
        spin_unlock(&GlobalMid_Lock);
 }
-struct cifsSesInfo *
+struct cifs_ses *
 sesInfoAlloc(void)
 {
-        struct cifsSesInfo *ret_buf;
+        struct cifs_ses *ret_buf;
-        ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
+        ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
        if (ret_buf) {
                atomic_inc(&sesInfoAllocCount);
                ret_buf->status = CifsNew;
@@ -85,7 +85,7 @@ sesInfoAlloc(void)
 }
 void
-sesInfoFree(struct cifsSesInfo *buf_to_free)
+sesInfoFree(struct cifs_ses *buf_to_free)
 {
        if (buf_to_free == NULL) {
                cFYI(1, "Null buffer passed to sesInfoFree");
@@ -105,11 +105,11 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
        kfree(buf_to_free);
 }
-struct cifsTconInfo *
+struct cifs_tcon *
 tconInfoAlloc(void)
 {
-        struct cifsTconInfo *ret_buf;
+        struct cifs_tcon *ret_buf;
-        ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
+        ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
        if (ret_buf) {
                atomic_inc(&tconInfoAllocCount);
                ret_buf->tidStatus = CifsNew;
@@ -124,7 +124,7 @@ tconInfoAlloc(void)
 }
 void
-tconInfoFree(struct cifsTconInfo *buf_to_free)
+tconInfoFree(struct cifs_tcon *buf_to_free)
 {
        if (buf_to_free == NULL) {
                cFYI(1, "Null buffer passed to tconInfoFree");
@@ -295,11 +295,11 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
   case it is responsbility of caller to set the mid */
 void
 header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
-                const struct cifsTconInfo *treeCon, int word_count
+                const struct cifs_tcon *treeCon, int word_count
                /* length of fixed section (word count) in two byte units  */)
 {
        struct list_head *temp_item;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
        char *temp = (char *) buffer;
        memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
@@ -359,7 +359,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
                                                 "did not match tcon uid");
                                        spin_lock(&cifs_tcp_ses_lock);
                                        list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
-                                                ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
+                                                ses = list_entry(temp_item, struct cifs_ses, smb_ses_list);
                                                if (ses->linux_uid == current_fsuid()) {
                                                        if (ses->server == treeCon->ses->server) {
                                                                cFYI(1, "found matching uid substitute right smb_uid");
@@ -380,7 +380,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
                if (treeCon->nocase)
                        buffer->Flags  |= SMBFLG_CASELESS;
                if ((treeCon->ses) && (treeCon->ses->server))
-                        if (treeCon->ses->server->secMode &
+                        if (treeCon->ses->server->sec_mode &
                          (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                                buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
        }
@@ -507,8 +507,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 {
        struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
        struct list_head *tmp, *tmp1, *tmp2;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
-        struct cifsTconInfo *tcon;
+        struct cifs_tcon *tcon;
        struct cifsInodeInfo *pCifsInode;
        struct cifsFileInfo *netfile;
@@ -566,9 +566,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
        /* look up tcon based on tid & uid */
        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &srv->smb_ses_list) {
-                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+                ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
                list_for_each(tmp1, &ses->tcon_list) {
-                        tcon = list_entry(tmp1, struct cifsTconInfo, tcon_list);
+                        tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
                        if (tcon->tid != buf->Tid)
                                continue;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 79b71c2c7c9d..73e47e84b61a 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -836,7 +836,7 @@ ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode)
 }
 int
-map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
+map_smb_to_linux_error(struct smb_hdr *smb, bool logErr)
 {
        unsigned int i;
        int rc = -EIO;  /* if transport error smb error may not be set */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f8e4cd2a7912..6751e745bbc6 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -195,7 +195,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
        int len;
        int oplock = 0;
        int rc;
-        struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
+        struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb);
        char *tmpbuffer;
        rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,7 +223,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
        struct cifsFileInfo *cifsFile;
        struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
        struct tcon_link *tlink = NULL;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        if (file->private_data == NULL) {
                tlink = cifs_sb_tlink(cifs_sb);
@@ -496,7 +496,7 @@ static int cifs_save_resume_key(const char *current_entry,
   assume that they are located in the findfirst return buffer.*/
 /* We start counting in the buffer with entry 2 and increment for every
   entry (do not increment for . or .. entry) */
-static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
+static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
        struct file *file, char **ppCurrentEntry, int *num_to_ret)
 {
        int rc = 0;
@@ -764,7 +764,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 {
        int rc = 0;
        int xid, i;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct cifsFileInfo *cifsFile = NULL;
        char *current_entry;
        int num_to_fill = 0;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7dd462100378..3892ab817a36 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -37,13 +37,13 @@
 * the socket has been reestablished (so we know whether to use vc 0).
 * Called while holding the cifs_tcp_ses_lock, so do not block
 */
-static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
+static bool is_first_ses_reconnect(struct cifs_ses *ses)
 {
        struct list_head *tmp;
-        struct cifsSesInfo *tmp_ses;
+        struct cifs_ses *tmp_ses;
        list_for_each(tmp, &ses->server->smb_ses_list) {
-                tmp_ses = list_entry(tmp, struct cifsSesInfo,
+                tmp_ses = list_entry(tmp, struct cifs_ses,
                                     smb_ses_list);
                if (tmp_ses->need_reconnect == false)
                        return false;
@@ -61,11 +61,11 @@ static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
 *      any vc but zero (some servers reset the connection on vcnum zero)
 *
 */
-static __le16 get_next_vcnum(struct cifsSesInfo *ses)
+static __le16 get_next_vcnum(struct cifs_ses *ses)
 {
        __u16 vcnum = 0;
        struct list_head *tmp;
-        struct cifsSesInfo *tmp_ses;
+        struct cifs_ses *tmp_ses;
        __u16 max_vcs = ses->server->max_vcs;
        __u16 i;
        int free_vc_found = 0;
@@ -87,7 +87,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
                free_vc_found = 1;
                list_for_each(tmp, &ses->server->smb_ses_list) {
-                        tmp_ses = list_entry(tmp, struct cifsSesInfo,
+                        tmp_ses = list_entry(tmp, struct cifs_ses,
                                             smb_ses_list);
                        if (tmp_ses->vcnum == i) {
                                free_vc_found = 0;
@@ -114,7 +114,7 @@ get_vc_num_exit:
        return cpu_to_le16(vcnum);
 }
-static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
+static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
 {
        __u32 capabilities = 0;
@@ -136,7 +136,7 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
        capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
                        CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
-        if (ses->server->secMode &
+        if (ses->server->sec_mode &
            (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
@@ -181,7 +181,7 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
        *pbcc_area = bcc_ptr;
 }
-static void unicode_domain_string(char **pbcc_area, struct cifsSesInfo *ses,
+static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
                                   const struct nls_table *nls_cp)
 {
        char *bcc_ptr = *pbcc_area;
@@ -204,7 +204,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifsSesInfo *ses,
 }
-static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
+static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
                                   const struct nls_table *nls_cp)
 {
        char *bcc_ptr = *pbcc_area;
@@ -236,7 +236,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
        *pbcc_area = bcc_ptr;
 }
-static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
+static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
                                 const struct nls_table *nls_cp)
 {
        char *bcc_ptr = *pbcc_area;
@@ -276,7 +276,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 }
 static void
-decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
+decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
                      const struct nls_table *nls_cp)
 {
        int len;
@@ -310,7 +310,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 }
 static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
-                               struct cifsSesInfo *ses,
+                               struct cifs_ses *ses,
                               const struct nls_table *nls_cp)
 {
        int rc = 0;
@@ -364,7 +364,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 }
 static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
-                                    struct cifsSesInfo *ses)
+                                    struct cifs_ses *ses)
 {
        unsigned int tioffset; /* challenge message target info area */
        unsigned int tilen; /* challenge message target info area length  */
@@ -411,7 +411,7 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 /* We do not malloc the blob, it is passed in pbuffer, because
   it is fixed size, and small, making this approach cleaner */
 static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
-                                         struct cifsSesInfo *ses)
+                                         struct cifs_ses *ses)
 {
        NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
        __u32 flags;
@@ -424,7 +424,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
        flags = NTLMSSP_NEGOTIATE_56 |  NTLMSSP_REQUEST_TARGET |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
                NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-        if (ses->server->secMode &
+        if (ses->server->sec_mode &
                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
                flags |= NTLMSSP_NEGOTIATE_SIGN;
                if (!ses->server->session_estab)
@@ -449,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
   This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                                        u16 *buflen,
-                                   struct cifsSesInfo *ses,
+                                   struct cifs_ses *ses,
                                   const struct nls_table *nls_cp)
 {
        int rc;
@@ -464,10 +464,10 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
                NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-        if (ses->server->secMode &
+        if (ses->server->sec_mode &
           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                flags |= NTLMSSP_NEGOTIATE_SIGN;
-        if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
+        if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED)
                flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
        tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
@@ -551,7 +551,7 @@ setup_ntlmv2_ret:
 }
 int
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
+CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
               const struct nls_table *nls_cp)
 {
        int rc = 0;
@@ -657,7 +657,7 @@ ssetup_ntlmssp_authenticate:
                 */
                rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
-                                 ses->server->secMode & SECMODE_PW_ENCRYPT ?
+                                 ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
                                        true : false, lnm_session_key);
                ses->flags |= CIFS_SES_LANMAN;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index f2513fb8c391..147aa22c3c3a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -295,7 +295,7 @@ static int wait_for_free_request(struct TCP_Server_Info *server,
        return 0;
 }
-static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
+static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
                        struct mid_q_entry **ppmidQ)
 {
        if (ses->server->tcpStatus == CifsExiting) {
@@ -342,22 +342,24 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
 * the result. Caller is responsible for dealing with timeouts.
 */
 int
-cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
-                mid_callback_t *callback, void *cbdata)
+                unsigned int nvec, mid_callback_t *callback, void *cbdata,
+                bool ignore_pend)
 {
        int rc;
        struct mid_q_entry *mid;
+        struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base;
-        rc = wait_for_free_request(server, CIFS_ASYNC_OP);
+        rc = wait_for_free_request(server, ignore_pend ? CIFS_ASYNC_OP : 0);
        if (rc)
                return rc;
        /* enable signing if server requires it */
-        if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+        if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-                in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+                hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
        mutex_lock(&server->srv_mutex);
-        mid = AllocMidQEntry(in_buf, server);
+        mid = AllocMidQEntry(hdr, server);
        if (mid == NULL) {
                mutex_unlock(&server->srv_mutex);
                return -ENOMEM;
@@ -368,7 +370,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
        list_add_tail(&mid->qhead, &server->pending_mid_q);
        spin_unlock(&GlobalMid_Lock);
-        rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+        rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number);
        if (rc) {
                mutex_unlock(&server->srv_mutex);
                goto out_err;
@@ -380,7 +382,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 #ifdef CONFIG_CIFS_STATS2
        atomic_inc(&server->inSend);
 #endif
-        rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+        rc = smb_sendv(server, iov, nvec);
 #ifdef CONFIG_CIFS_STATS2
        atomic_dec(&server->inSend);
        mid->when_sent = jiffies;
@@ -407,7 +409,7 @@ out_err:
 *
 */
 int
-SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
+SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
                struct smb_hdr *in_buf, int flags)
 {
        int rc;
@@ -424,7 +426,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 }
 static int
-sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 {
        int rc = 0;
@@ -432,28 +434,21 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
                mid->mid, mid->midState);
        spin_lock(&GlobalMid_Lock);
-        /* ensure that it's no longer on the pending_mid_q */
-        list_del_init(&mid->qhead);
        switch (mid->midState) {
        case MID_RESPONSE_RECEIVED:
                spin_unlock(&GlobalMid_Lock);
                return rc;
-        case MID_REQUEST_SUBMITTED:
-                /* socket is going down, reject all calls */
-                if (server->tcpStatus == CifsExiting) {
-                        cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
-                               __func__, mid->mid, mid->command, mid->midState);
-                        rc = -EHOSTDOWN;
-                        break;
-                }
        case MID_RETRY_NEEDED:
                rc = -EAGAIN;
                break;
        case MID_RESPONSE_MALFORMED:
                rc = -EIO;
                break;
+        case MID_SHUTDOWN:
+                rc = -EHOSTDOWN;
+                break;
        default:
+                list_del_init(&mid->qhead);
                cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
                        mid->mid, mid->midState);
                rc = -EIO;
@@ -502,13 +497,31 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 }
 int
-SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
+cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+                   bool log_error)
+{
+        dump_smb(mid->resp_buf,
+                 min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length)));
+        /*
+         * The dump above is capped at 92 bytes, or at the length field of
+         * the response when that is smaller.
+         *
+         * When the server has signing enabled or required, check the
+         * response signature using mid->sequence_number + 1. A mismatch is
+         * only logged here; it does not alter the value returned below.
+         */
+        if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+                /* FIXME: add code to kill session */
+                if (cifs_verify_signature(mid->resp_buf, server,
+                                          mid->sequence_number + 1) != 0)
+                        cERROR(1, "Unexpected SMB signature");
+        }
+        /*
+         * BB special case reconnect tid and uid here?
+         *
+         * The return value comes solely from map_smb_to_linux_error() on
+         * the response buffer; log_error is handed to it unchanged.
+         * NOTE(review): mid->resp_buf is dereferenced without a NULL check,
+         * so callers are presumably expected to validate it first - confirm.
+         */
+        return map_smb_to_linux_error(mid->resp_buf, log_error);
+}
+int
+SendReceive2(const unsigned int xid, struct cifs_ses *ses,
             struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
             const int flags)
 {
        int rc = 0;
        int long_op;
-        unsigned int receive_len;
        struct mid_q_entry *midQ;
        struct smb_hdr *in_buf = iov[0].iov_base;
@@ -598,61 +611,31 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
        cifs_small_buf_release(in_buf);
-        rc = sync_mid_result(midQ, ses->server);
+        rc = cifs_sync_mid_result(midQ, ses->server);
        if (rc != 0) {
                atomic_dec(&ses->server->inFlight);
                wake_up(&ses->server->request_q);
                return rc;
        }
-        receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
+        if (!midQ->resp_buf || midQ->midState != MID_RESPONSE_RECEIVED) {
-        if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-                cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
-                        receive_len, xid);
                rc = -EIO;
+                cFYI(1, "Bad MID state?");
                goto out;
        }
-        /* rcvd frame is ok */
+        iov[0].iov_base = (char *)midQ->resp_buf;
+        iov[0].iov_len = be32_to_cpu(midQ->resp_buf->smb_buf_length) + 4;
-        if (midQ->resp_buf &&
+        if (midQ->largeBuf)
-            (midQ->midState == MID_RESPONSE_RECEIVED)) {
+                *pRespBufType = CIFS_LARGE_BUFFER;
+        else
-                iov[0].iov_base = (char *)midQ->resp_buf;
+                *pRespBufType = CIFS_SMALL_BUFFER;
-                if (midQ->largeBuf)
-                        *pRespBufType = CIFS_LARGE_BUFFER;
-                else
-                        *pRespBufType = CIFS_SMALL_BUFFER;
-                iov[0].iov_len = receive_len + 4;
-                dump_smb(midQ->resp_buf, 80);
-                /* convert the length into a more usable form */
-                if ((receive_len > 24) &&
-                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
-                                             SECMODE_SIGN_ENABLED))) {
-                        rc = cifs_verify_signature(midQ->resp_buf,
-                                                ses->server,
-                                                midQ->sequence_number+1);
-                        if (rc) {
-                                cERROR(1, "Unexpected SMB signature");
-                                /* BB FIXME add code to kill session */
-                        }
-                }
-                /* BB special case reconnect tid and uid here? */
-                rc = map_smb_to_linux_error(midQ->resp_buf,
-                                            flags & CIFS_LOG_ERROR);
-                if ((flags & CIFS_NO_RESP) == 0)
+        rc = cifs_check_receive(midQ, ses->server, flags & CIFS_LOG_ERROR);
-                        midQ->resp_buf = NULL;  /* mark it so buf will
-                                                   not be freed by
-                                                   delete_mid */
-        } else {
-                rc = -EIO;
-                cFYI(1, "Bad MID state?");
-        }
+        /* mark it so buf will not be freed by delete_mid */
+        if ((flags & CIFS_NO_RESP) == 0)
+                midQ->resp_buf = NULL;
 out:
        delete_mid(midQ);
        atomic_dec(&ses->server->inFlight);
@@ -662,12 +645,11 @@ out:
 }
 int
-SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
+SendReceive(const unsigned int xid, struct cifs_ses *ses,
            struct smb_hdr *in_buf, struct smb_hdr *out_buf,
            int *pbytes_returned, const int long_op)
 {
        int rc = 0;
-        unsigned int receive_len;
        struct mid_q_entry *midQ;
        if (ses == NULL) {
@@ -750,54 +732,23 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
                spin_unlock(&GlobalMid_Lock);
        }
-        rc = sync_mid_result(midQ, ses->server);
+        rc = cifs_sync_mid_result(midQ, ses->server);
        if (rc != 0) {
                atomic_dec(&ses->server->inFlight);
                wake_up(&ses->server->request_q);
                return rc;
        }
-        receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
+        if (!midQ->resp_buf || !out_buf ||
+            midQ->midState != MID_RESPONSE_RECEIVED) {
-        if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-                cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
-                        receive_len, xid);
-                rc = -EIO;
-                goto out;
-        }
-        /* rcvd frame is ok */
-        if (midQ->resp_buf && out_buf
-            && (midQ->midState == MID_RESPONSE_RECEIVED)) {
-                out_buf->smb_buf_length = cpu_to_be32(receive_len);
-                memcpy((char *)out_buf + 4,
-                       (char *)midQ->resp_buf + 4,
-                       receive_len);
-                dump_smb(out_buf, 92);
-                /* convert the length into a more usable form */
-                if ((receive_len > 24) &&
-                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
-                                             SECMODE_SIGN_ENABLED))) {
-                        rc = cifs_verify_signature(out_buf,
-                                                ses->server,
-                                                midQ->sequence_number+1);
-                        if (rc) {
-                                cERROR(1, "Unexpected SMB signature");
-                                /* BB FIXME add code to kill session */
-                        }
-                }
-                *pbytes_returned = be32_to_cpu(out_buf->smb_buf_length);
-                /* BB special case reconnect tid and uid here? */
-                rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
-        } else {
                rc = -EIO;
                cERROR(1, "Bad MID state?");
+                goto out;
        }
+        *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length);
+        memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
+        rc = cifs_check_receive(midQ, ses->server, 0);
 out:
        delete_mid(midQ);
        atomic_dec(&ses->server->inFlight);
@@ -810,12 +761,12 @@ out:
   blocking lock to return. */
 static int
-send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
+send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
                        struct smb_hdr *in_buf,
                        struct smb_hdr *out_buf)
 {
        int bytes_returned;
-        struct cifsSesInfo *ses = tcon->ses;
+        struct cifs_ses *ses = tcon->ses;
        LOCK_REQ *pSMB = (LOCK_REQ *)in_buf;
        /* We just modify the current in_buf to change
@@ -832,15 +783,14 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
 }
 int
-SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
+SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
            struct smb_hdr *in_buf, struct smb_hdr *out_buf,
            int *pbytes_returned)
 {
        int rc = 0;
        int rstart = 0;
-        unsigned int receive_len;
        struct mid_q_entry *midQ;
-        struct cifsSesInfo *ses;
+        struct cifs_ses *ses;
        if (tcon == NULL || tcon->ses == NULL) {
                cERROR(1, "Null smb session");
@@ -957,50 +907,20 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
                rstart = 1;
        }
-        rc = sync_mid_result(midQ, ses->server);
+        rc = cifs_sync_mid_result(midQ, ses->server);
        if (rc != 0)
                return rc;
-        receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
-        if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-                cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
-                        receive_len, xid);
-                rc = -EIO;
-                goto out;
-        }
        /* rcvd frame is ok */
+        if (out_buf == NULL || midQ->midState != MID_RESPONSE_RECEIVED) {
-        if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
                rc = -EIO;
                cERROR(1, "Bad MID state?");
                goto out;
        }
-        out_buf->smb_buf_length = cpu_to_be32(receive_len);
+        *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length);
-        memcpy((char *)out_buf + 4,
+        memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
-               (char *)midQ->resp_buf + 4,
+        rc = cifs_check_receive(midQ, ses->server, 0);
-               receive_len);
-        dump_smb(out_buf, 92);
-        /* convert the length into a more usable form */
-        if ((receive_len > 24) &&
-            (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
-                                     SECMODE_SIGN_ENABLED))) {
-                rc = cifs_verify_signature(out_buf,
-                                           ses->server,
-                                           midQ->sequence_number+1);
-                if (rc) {
-                        cERROR(1, "Unexpected SMB signature");
-                        /* BB FIXME add code to kill session */
-                }
-        }
-        *pbytes_returned = be32_to_cpu(out_buf->smb_buf_length);
-        /* BB special case reconnect tid and uid here? */
-        rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
 out:
        delete_mid(midQ);
        if (rstart && rc == -EACCES)
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 912995e013ec..2a22fb2989e4 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -49,7 +49,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
        int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct super_block *sb;
        char *full_path = NULL;
@@ -109,7 +109,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
        int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct super_block *sb;
        char *full_path;
        struct cifs_ntsd *pacl;
@@ -240,7 +240,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
        int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct super_block *sb;
        char *full_path;
@@ -372,7 +372,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
        int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifsTconInfo *pTcon;
+        struct cifs_tcon *pTcon;
        struct super_block *sb;
        char *full_path;
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index b80e0aa3cfa5..5a59efa0bb46 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -50,7 +50,7 @@ static int __init init_dlm(void)
        if (error)
                goto out_netlink;
-        printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
+        printk("DLM installed\n");
        return 0;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 227b409b8406..bc116b9ffcf2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -529,6 +529,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
        dget(lower_dentry);
        rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
        dput(lower_dentry);
+        if (!rc && dentry->d_inode)
+                clear_nlink(dentry->d_inode);
        fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
        dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
        unlock_dir(lower_dir_dentry);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 03e609c45012..27a7fefb83eb 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -599,8 +599,8 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
        struct mutex *tfm_mutex;
        char *block_aligned_filename;
        struct ecryptfs_auth_tok *auth_tok;
-        struct scatterlist src_sg;
+        struct scatterlist src_sg[2];
-        struct scatterlist dst_sg;
+        struct scatterlist dst_sg[2];
        struct blkcipher_desc desc;
        char iv[ECRYPTFS_MAX_IV_BYTES];
        char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
@@ -816,23 +816,21 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
        memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
               filename_size);
        rc = virt_to_scatterlist(s->block_aligned_filename,
-                                 s->block_aligned_filename_size, &s->src_sg, 1);
+                                 s->block_aligned_filename_size, s->src_sg, 2);
-        if (rc != 1) {
+        if (rc < 1) {
                printk(KERN_ERR "%s: Internal error whilst attempting to "
-                       "convert filename memory to scatterlist; "
+                       "convert filename memory to scatterlist; rc = [%d]. "
-                       "expected rc = 1; got rc = [%d]. "
                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
                       s->block_aligned_filename_size);
                goto out_release_free_unlock;
        }
        rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
-                                 &s->dst_sg, 1);
+                                 s->dst_sg, 2);
-        if (rc != 1) {
+        if (rc < 1) {
                printk(KERN_ERR "%s: Internal error whilst attempting to "
                       "convert encrypted filename memory to scatterlist; "
-                       "expected rc = 1; got rc = [%d]. "
+                       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
-                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+                       __func__, rc, s->block_aligned_filename_size);
-                       s->block_aligned_filename_size);
                goto out_release_free_unlock;
        }
        /* The characters in the first block effectively do the job
@@ -855,7 +853,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
                       mount_crypt_stat->global_default_fn_cipher_key_bytes);
                goto out_release_free_unlock;
        }
-        rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+        rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg,
                                         s->block_aligned_filename_size);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to encrypt filename; "
@@ -891,8 +889,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
        struct mutex *tfm_mutex;
        char *decrypted_filename;
        struct ecryptfs_auth_tok *auth_tok;
-        struct scatterlist src_sg;
+        struct scatterlist src_sg[2];
-        struct scatterlist dst_sg;
+        struct scatterlist dst_sg[2];
        struct blkcipher_desc desc;
        char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
        char iv[ECRYPTFS_MAX_IV_BYTES];
@@ -1008,13 +1006,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
        }
        mutex_lock(s->tfm_mutex);
        rc = virt_to_scatterlist(&data[(*packet_size)],
-                                 s->block_aligned_filename_size, &s->src_sg, 1);
+                                 s->block_aligned_filename_size, s->src_sg, 2);
-        if (rc != 1) {
+        if (rc < 1) {
                printk(KERN_ERR "%s: Internal error whilst attempting to "
                       "convert encrypted filename memory to scatterlist; "
-                       "expected rc = 1; got rc = [%d]. "
+                       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
-                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+                       __func__, rc, s->block_aligned_filename_size);
-                       s->block_aligned_filename_size);
                goto out_unlock;
        }
        (*packet_size) += s->block_aligned_filename_size;
@@ -1028,13 +1025,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
                goto out_unlock;
        }
        rc = virt_to_scatterlist(s->decrypted_filename,
-                                 s->block_aligned_filename_size, &s->dst_sg, 1);
+                                 s->block_aligned_filename_size, s->dst_sg, 2);
-        if (rc != 1) {
+        if (rc < 1) {
                printk(KERN_ERR "%s: Internal error whilst attempting to "
                       "convert decrypted filename memory to scatterlist; "
-                       "expected rc = 1; got rc = [%d]. "
+                       "rc = [%d]. block_aligned_filename_size = [%zd]\n",
-                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+                       __func__, rc, s->block_aligned_filename_size);
-                       s->block_aligned_filename_size);
                goto out_free_unlock;
        }
        /* The characters in the first block effectively do the job of
@@ -1065,7 +1061,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
                       mount_crypt_stat->global_default_fn_cipher_key_bytes);
                goto out_free_unlock;
        }
-        rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+        rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg,
                                         s->block_aligned_filename_size);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to decrypt filename; "
diff --git a/fs/exec.c b/fs/exec.c
index 936f5776655c..ea5f748906a8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -42,7 +42,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
 #include <linux/namei.h>
-#include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -1624,6 +1623,41 @@ expand_fail:
        return ret;
 }
+static int cn_print_exe_file(struct core_name *cn)
+{
+        struct file *exe_file;
+        char *pathbuf, *path, *p;
+        int ret;
+        exe_file = get_mm_exe_file(current->mm);
+        if (!exe_file)
+                return cn_printf(cn, "(unknown)");
+        pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+        if (!pathbuf) {
+                ret = -ENOMEM;
+                goto put_exe_file;
+        }
+        path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+        if (IS_ERR(path)) {
+                ret = PTR_ERR(path);
+                goto free_buf;
+        }
+        for (p = path; *p; p++)
+                if (*p == '/')
+                        *p = '!';
+        ret = cn_printf(cn, "%s", path);
+free_buf:
+        kfree(pathbuf);
+put_exe_file:
+        fput(exe_file);
+        return ret;
+}
 /* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
@@ -1695,6 +1729,9 @@ static int format_corename(struct core_name *cn, long signr)
                        case 'e':
                                err = cn_printf(cn, "%s", current->comm);
                                break;
+                        case 'E':
+                                err = cn_print_exe_file(cn);
+                                break;
                        /* core limit size */
                        case 'c':
                                err = cn_printf(cn, "%lu",
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index cfa327d33194..c2b34cd2abe0 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -146,7 +146,7 @@ static int __init init_gfs2_fs(void)
        gfs2_register_debugfs();
-        printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
+        printk("GFS2 installed\n");
        return 0;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 05f73328b28b..9a1e86fc1362 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -75,7 +75,6 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
                                   struct nameidata *nd)
 {
        struct jffs2_inode_info *dir_f;
-        struct jffs2_sb_info *c;
        struct jffs2_full_dirent *fd = NULL, *fd_list;
        uint32_t ino = 0;
        struct inode *inode = NULL;
@@ -86,7 +85,6 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
                return ERR_PTR(-ENAMETOOLONG);
        dir_f = JFFS2_INODE_INFO(dir_i);
-        c = JFFS2_SB_INFO(dir_i->i_sb);
        mutex_lock(&dir_f->sem);
@@ -119,7 +117,6 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
        struct jffs2_inode_info *f;
-        struct jffs2_sb_info *c;
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct jffs2_full_dirent *fd;
        unsigned long offset, curofs;
@@ -127,7 +124,6 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
        D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino));
        f = JFFS2_INODE_INFO(inode);
-        c = JFFS2_SB_INFO(inode->i_sb);
        offset = filp->f_pos;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index b632dddcb482..8d8cd3419d02 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -94,7 +94,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
        uint32_t buf_size = 0;
        struct jffs2_summary *s = NULL; /* summary info collected by the scan process */
 #ifndef __ECOS
-        size_t pointlen;
+        size_t pointlen, try_size;
        if (c->mtd->point) {
                ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen,
@@ -113,18 +113,21 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
                /* For NAND it's quicker to read a whole eraseblock at a time,
                   apparently */
                if (jffs2_cleanmarker_oob(c))
-                        buf_size = c->sector_size;
+                        try_size = c->sector_size;
                else
-                        buf_size = PAGE_SIZE;
+                        try_size = PAGE_SIZE;
-                /* Respect kmalloc limitations */
+                D1(printk(KERN_DEBUG "Trying to allocate readbuf of %zu "
-                if (buf_size > 128*1024)
+                        "bytes\n", try_size));
-                        buf_size = 128*1024;
-                D1(printk(KERN_DEBUG "Allocating readbuf of %d bytes\n", buf_size));
+                flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size);
-                flashbuf = kmalloc(buf_size, GFP_KERNEL);
                if (!flashbuf)
                        return -ENOMEM;
+                D1(printk(KERN_DEBUG "Allocated readbuf of %zu bytes\n",
+                        try_size));
+                buf_size = (uint32_t)try_size;
        }
        if (jffs2_sum_active()) {
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index a7c07b44b100..e5d71b27a5b0 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,6 +16,7 @@
 #include <linux/mman.h>
 #include <linux/string.h>
 #include <linux/fcntl.h>
+#include <linux/memcontrol.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
         * -- wli
         */
        count_vm_event(PGMAJFAULT);
+        mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
        return VM_FAULT_MAJOR;
 }
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 4c5488468c14..cd9427023d2e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -368,7 +368,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
                                         int *vict_bit,
                                         struct buffer_head **ret_bh)
 {
-        int ret, i, blocks_per_unit = 1;
+        int ret, i, bits_per_unit = 0;
        u64 blkno;
        char namebuf[40];
@@ -398,14 +398,14 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
        rec = &(cl->cl_recs[0]);
        if (type == GLOBAL_BITMAP_SYSTEM_INODE)
-                blocks_per_unit <<= (osb->s_clustersize_bits -
+                bits_per_unit = osb->s_clustersize_bits -
-                                                inode->i_sb->s_blocksize_bits);
+                                        inode->i_sb->s_blocksize_bits;
        /*
         * 'vict_blkno' was out of the valid range.
         */
        if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
-            (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
+            (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
-                                blocks_per_unit))) {
+                                bits_per_unit))) {
                ret = -EINVAL;
                goto out;
        }
@@ -441,8 +441,8 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
                                                le16_to_cpu(bg->bg_bits))) {
                                *ret_bh = gd_bh;
-                                *vict_bit = (vict_blkno - blkno) /
+                                *vict_bit = (vict_blkno - blkno) >>
-                                                        blocks_per_unit;
+                                                        bits_per_unit;
                                mlog(0, "find the victim group: #%llu, "
                                     "total_bits: %u, vict_bit: %u\n",
                                     blkno, le16_to_cpu(bg->bg_bits),
@@ -472,12 +472,19 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
        int ret, goal_bit = 0;
        struct buffer_head *gd_bh = NULL;
        struct ocfs2_group_desc *bg;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int c_to_b = 1 << (osb->s_clustersize_bits -
                                        inode->i_sb->s_blocksize_bits);
        /*
+         * make goal become cluster aligned before it is validated.  The
+         * group desc block check stays below: it needs 'bg', which is
+         * only valid once the victim group has been read.
+         */
+        range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
+                                                      range->me_goal);
+        /*
         * validate goal sits within global_bitmap, and return the victim
         * group desc
         */
@@ -491,19 +498,13 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
        bg = (struct ocfs2_group_desc *)gd_bh->b_data;
        /*
-         * make goal become cluster aligned.
-         */
-        if (range->me_goal % c_to_b)
-                range->me_goal = range->me_goal / c_to_b * c_to_b;
-        /*
         * moving goal is not allowd to start with a group desc blok(#0 blk)
         * let's compromise to the latter cluster.
         */
        if (range->me_goal == le64_to_cpu(bg->bg_blkno))
                range->me_goal += c_to_b;
        /*
         * movement is not gonna cross two groups.
         */
        if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8ed4d3433199..f82e762eeca2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -256,10 +256,12 @@ ssize_t part_discard_alignment_show(struct device *dev,
 {
        struct hd_struct *p = dev_to_part(dev);
        struct gendisk *disk = dev_to_disk(dev);
+        unsigned int alignment = 0;
-        return sprintf(buf, "%u\n",
+        if (disk->queue)
-                        queue_limit_discard_alignment(&disk->queue->limits,
+                alignment = queue_limit_discard_alignment(&disk->queue->limits,
-                                                        p->start_sect));
+                                                                p->start_sect);
+        return sprintf(buf, "%u\n", alignment);
 }
 ssize_t part_stat_show(struct device *dev,
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 19d6750d1d6c..6296b403c67a 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -310,6 +310,15 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
                goto fail;
        }
+        /* Check the GUID Partition Table header size */
+        if (le32_to_cpu((*gpt)->header_size) >
+                        bdev_logical_block_size(state->bdev)) {
+                pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
+                        le32_to_cpu((*gpt)->header_size),
+                        bdev_logical_block_size(state->bdev));
+                goto fail;
+        }
        /* Check the GUID Partition Table CRC */
        origcrc = le32_to_cpu((*gpt)->header_crc32);
        (*gpt)->header_crc32 = 0;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5e4f776b0917..9b45ee84fbcc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -131,7 +131,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
 * you can test for combinations of others with
 * simple bit tests.
 */
-static const char *task_state_array[] = {
+static const char * const task_state_array[] = {
        "R (running)",          /*   0 */
        "S (sleeping)",         /*   1 */
        "D (disk sleep)",       /*   2 */
@@ -147,7 +147,7 @@ static const char *task_state_array[] = {
 static inline const char *get_task_state(struct task_struct *tsk)
 {
        unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
-        const char **p = &task_state_array[0];
+        const char * const *p = &task_state_array[0];
        BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc8bca72b002..4ede550517a6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -894,20 +894,20 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
        if (!task)
                goto out_no_task;
+        copied = -ENOMEM;
+        page = (char *)__get_free_page(GFP_TEMPORARY);
+        if (!page)
+                goto out_task;
        mm = check_mem_permission(task);
        copied = PTR_ERR(mm);
        if (IS_ERR(mm))
-                goto out_task;
+                goto out_free;
        copied = -EIO;
        if (file->private_data != (void *)((long)current->self_exec_id))
                goto out_mm;
-        copied = -ENOMEM;
-        page = (char *)__get_free_page(GFP_TEMPORARY);
-        if (!page)
-                goto out_mm;
        copied = 0;
        while (count > 0) {
                int this_len, retval;
@@ -929,9 +929,11 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
                count -= retval;                        
        }
        *ppos = dst;
-        free_page((unsigned long) page);
 out_mm:
        mmput(mm);
+out_free:
+        free_page((unsigned long) page);
 out_task:
        put_task_struct(task);
 out_no_task:
@@ -1059,7 +1061,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 {
        struct task_struct *task;
        char buffer[PROC_NUMBUF];
-        long oom_adjust;
+        int oom_adjust;
        unsigned long flags;
        int err;
@@ -1071,7 +1073,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
                goto out;
        }
-        err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
+        err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
        if (err)
                goto out;
        if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
@@ -1168,7 +1170,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        struct task_struct *task;
        char buffer[PROC_NUMBUF];
        unsigned long flags;
-        long oom_score_adj;
+        int oom_score_adj;
        int err;
        memset(buffer, 0, sizeof(buffer));
@@ -1179,7 +1181,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                goto out;
        }
-        err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
+        err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
        if (err)
                goto out;
        if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
@@ -1468,7 +1470,7 @@ sched_autogroup_write(struct file *file, const char __user *buf,
        struct inode *inode = file->f_path.dentry->d_inode;
        struct task_struct *p;
        char buffer[PROC_NUMBUF];
-        long nice;
+        int nice;
        int err;
        memset(buffer, 0, sizeof(buffer));
@@ -1477,9 +1479,9 @@ sched_autogroup_write(struct file *file, const char __user *buf,
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
-        err = strict_strtol(strstrip(buffer), 0, &nice);
+        err = kstrtoint(strstrip(buffer), 0, &nice);
-        if (err)
+        if (err < 0)
-                return -EINVAL;
+                return err;
        p = get_proc_task(inode);
        if (!p)
@@ -1576,57 +1578,6 @@ static const struct file_operations proc_pid_set_comm_operations = {
        .release        = single_release,
 };
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
-        mm->num_exe_file_vmas++;
-}
-void removed_exe_file_vma(struct mm_struct *mm)
-{
-        mm->num_exe_file_vmas--;
-        if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
-                fput(mm->exe_file);
-                mm->exe_file = NULL;
-        }
-}
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
-{
-        if (new_exe_file)
-                get_file(new_exe_file);
-        if (mm->exe_file)
-                fput(mm->exe_file);
-        mm->exe_file = new_exe_file;
-        mm->num_exe_file_vmas = 0;
-}
-struct file *get_mm_exe_file(struct mm_struct *mm)
-{
-        struct file *exe_file;
-        /* We need mmap_sem to protect against races with removal of
-         * VM_EXECUTABLE vmas */
-        down_read(&mm->mmap_sem);
-        exe_file = mm->exe_file;
-        if (exe_file)
-                get_file(exe_file);
-        up_read(&mm->mmap_sem);
-        return exe_file;
-}
-void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
-        /* It's safe to write the exe_file pointer without exe_file_lock because
-         * this is called during fork when the task is not yet in /proc */
-        newmm->exe_file = get_mm_exe_file(oldmm);
-}
 static int proc_exe_link(struct inode *inode, struct path *exe_path)
 {
        struct task_struct *task;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 1cffa2b8a2fc..9758b654a1bc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -138,9 +138,9 @@ static int stat_open(struct inode *inode, struct file *file)
        struct seq_file *m;
        int res;
-        /* don't ask for more than the kmalloc() max size, currently 128 KB */
+        /* don't ask for more than the kmalloc() max size */
-        if (size > 128 * 1024)
+        if (size > KMALLOC_MAX_SIZE)
-                size = 128 * 1024;
+                size = KMALLOC_MAX_SIZE;
        buf = kmalloc(size, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index db15935fa757..25b6a887adb9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -536,15 +536,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
        char buffer[PROC_NUMBUF];
        struct mm_struct *mm;
        struct vm_area_struct *vma;
-        long type;
+        int type;
+        int rv;
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
-        if (strict_strtol(strstrip(buffer), 10, &type))
+        rv = kstrtoint(strstrip(buffer), 10, &type);
-                return -EINVAL;
+        if (rv < 0)
+                return rv;
        if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
                return -EINVAL;
        task = get_proc_task(file->f_path.dentry->d_inode);
@@ -769,18 +771,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        if (!task)
                goto out;
-        mm = mm_for_maps(task);
-        ret = PTR_ERR(mm);
-        if (!mm || IS_ERR(mm))
-                goto out_task;
        ret = -EINVAL;
        /* file position must be aligned */
        if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
                goto out_task;
        ret = 0;
        if (!count)
                goto out_task;
@@ -788,7 +784,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
        ret = -ENOMEM;
        if (!pm.buffer)
-                goto out_mm;
+                goto out_task;
+        mm = mm_for_maps(task);
+        ret = PTR_ERR(mm);
+        if (!mm || IS_ERR(mm))
+                goto out_free;
        pagemap_walk.pmd_entry = pagemap_pte_range;
        pagemap_walk.pte_hole = pagemap_pte_hole;
@@ -831,7 +832,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                len = min(count, PM_ENTRY_BYTES * pm.pos);
                if (copy_to_user(buf, pm.buffer, len)) {
                        ret = -EFAULT;
-                        goto out_free;
+                        goto out_mm;
                }
                copied += len;
                buf += len;
@@ -841,10 +842,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        if (!ret || ret == PM_END_OF_BUFFER)
                ret = copied;
-out_free:
-        kfree(pm.buffer);
 out_mm:
        mmput(mm);
+out_free:
+        kfree(pm.buffer);
 out_task:
        put_task_struct(task);
 out:
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 74802bc5ded9..cd99bf557650 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -35,6 +35,46 @@ static u64 vmcore_size;
 static struct proc_dir_entry *proc_vmcore = NULL;
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * The called function has to take care of module refcounting.
+ */
+static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+        if (oldmem_pfn_is_ram)
+                return -EBUSY;
+        oldmem_pfn_is_ram = fn;
+        return 0;
+}
+EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+void unregister_oldmem_pfn_is_ram(void)
+{
+        oldmem_pfn_is_ram = NULL;
+        wmb();
+}
+EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+static int pfn_is_ram(unsigned long pfn)
+{
+        int (*fn)(unsigned long pfn);
+        /* pfn is ram unless fn() checks pagetype */
+        int ret = 1;
+        /*
+         * Ask hypervisor if the pfn is really ram.
+         * A ballooned page contains no data and reading from such a page
+         * will cause high load in the hypervisor.
+         */
+        fn = oldmem_pfn_is_ram;
+        if (fn)
+                ret = fn(pfn);
+        return ret;
+}
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
                                u64 *ppos, int userbuf)
@@ -55,9 +95,22 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
                else
                        nr_bytes = count;
-                tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf);
-                if (tmp < 0)
-                        return tmp;
+                /* If pfn is not ram, return zeros for sparse dump files */
+                if (pfn_is_ram(pfn) == 0) {
+                        /*
+                         * When userbuf is set, buf is a user-space address
+                         * and must be zeroed with clear_user(), not memset().
+                         */
+                        if (!userbuf)
+                                memset(buf, 0, nr_bytes);
+                        else if (clear_user((void __force __user *)buf,
+                                            nr_bytes))
+                                return -EFAULT;
+                } else {
+                        tmp = copy_oldmem_page(pfn, buf, nr_bytes,
+                                                offset, userbuf);
+                        if (tmp < 0)
+                                return tmp;
+                }
                *ppos += nr_bytes;
                count -= nr_bytes;
                buf += nr_bytes;
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8ab48bc2fa7d..ed0eb2a921f4 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 4b5a3fbb1f1f..f744be98cd5a 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -393,19 +393,36 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
 /*
 * Read a filesystem table (uncompressed sequence of bytes) from disk
 */
-int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
+void *squashfs_read_table(struct super_block *sb, u64 block, int length)
-        int length)
 {
        int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        int i, res;
-        void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+        void *table, *buffer, **data;
-        if (data == NULL)
-                return -ENOMEM;
+        table = buffer = kmalloc(length, GFP_KERNEL);
+        if (table == NULL)
+                return ERR_PTR(-ENOMEM);
+        data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+        if (data == NULL) {
+                res = -ENOMEM;
+                goto failed;
+        }
        for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
                data[i] = buffer;
        res = squashfs_read_data(sb, data, block, length |
                SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
        kfree(data);
-        return res;
+        if (res < 0)
+                goto failed;
+        return table;
+failed:
+        kfree(table);
+        return ERR_PTR(res);
 }
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index e921bd213738..9f1b0bb96f13 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 099745ad5691..8ba70cff09a6 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -4,7 +4,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 3f79cd1d0c19..9dfe2ce0fb70 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 7f93d5a9ee05..730c56248c9b 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -121,30 +121,38 @@ static struct dentry *squashfs_get_parent(struct dentry *child)
 * Read uncompressed inode lookup table indexes off disk into memory
 */
 __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
-                u64 lookup_table_start, unsigned int inodes)
+                u64 lookup_table_start, u64 next_table, unsigned int inodes)
 {
        unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
-        __le64 *inode_lookup_table;
+        __le64 *table;
-        int err;
        TRACE("In read_inode_lookup_table, length %d\n", length);
-        /* Allocate inode lookup table indexes */
+        /* Sanity check values */
-        inode_lookup_table = kmalloc(length, GFP_KERNEL);
-        if (inode_lookup_table == NULL) {
+        /* there should always be at least one inode */
-                ERROR("Failed to allocate inode lookup table\n");
+        if (inodes == 0)
-                return ERR_PTR(-ENOMEM);
+                return ERR_PTR(-EINVAL);
-        }
+        /* length bytes should not extend into the next table - this check
+         * also traps instances where lookup_table_start is incorrectly larger
+         * than the next table start
+         */
+        if (lookup_table_start + length > next_table)
+                return ERR_PTR(-EINVAL);
+        table = squashfs_read_table(sb, lookup_table_start, length);
-        err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start,
+        /*
-                        length);
+         * table[0] is the little-endian location of the first inode lookup
-        if (err < 0) {
+         * table metadata block; it must be below lookup_table_start
-                ERROR("unable to read inode lookup table\n");
+         */
-                kfree(inode_lookup_table);
+        if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
-                return ERR_PTR(err);
+                kfree(table);
+                return ERR_PTR(-EINVAL);
        }
-        return inode_lookup_table;
+        return table;
 }
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index a25c5060bdcb..38bb1c640559 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7eef571443c6..1516a6490bfb 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -71,26 +71,29 @@ int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
 * Read the uncompressed fragment lookup table indexes off disk into memory
 */
 __le64 *squashfs_read_fragment_index_table(struct super_block *sb,
-        u64 fragment_table_start, unsigned int fragments)
+        u64 fragment_table_start, u64 next_table, unsigned int fragments)
 {
        unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
-        __le64 *fragment_index;
+        __le64 *table;
-        int err;
-        /* Allocate fragment lookup table indexes */
+        /*
-        fragment_index = kmalloc(length, GFP_KERNEL);
+         * Sanity check, length bytes should not extend into the next table -
-        if (fragment_index == NULL) {
+         * this check also traps instances where fragment_table_start is
-                ERROR("Failed to allocate fragment index table\n");
+         * incorrectly larger than the next table start
-                return ERR_PTR(-ENOMEM);
+         */
-        }
+        if (fragment_table_start + length > next_table)
+                return ERR_PTR(-EINVAL);
+        table = squashfs_read_table(sb, fragment_table_start, length);
-        err = squashfs_read_table(sb, fragment_index, fragment_table_start,
+        /*
-                        length);
+         * table[0] is the little-endian location of the first fragment table
-        if (err < 0) {
+         * metadata block; it must be below fragment_table_start
-                ERROR("unable to read fragment index table\n");
+         */
-                kfree(fragment_index);
+        if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
-                return ERR_PTR(err);
+                kfree(table);
+                return ERR_PTR(-EINVAL);
        }
-        return fragment_index;
+        return table;
 }
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index d8f32452638e..a70858e0fb44 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -66,27 +66,37 @@ int squashfs_get_id(struct super_block *sb, unsigned int index,
 * Read uncompressed id lookup table indexes from disk into memory
 */
 __le64 *squashfs_read_id_index_table(struct super_block *sb,
-                        u64 id_table_start, unsigned short no_ids)
+                u64 id_table_start, u64 next_table, unsigned short no_ids)
 {
        unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
-        __le64 *id_table;
+        __le64 *table;
-        int err;
        TRACE("In read_id_index_table, length %d\n", length);
-        /* Allocate id lookup table indexes */
+        /* Sanity check values */
-        id_table = kmalloc(length, GFP_KERNEL);
-        if (id_table == NULL) {
+        /* there should always be at least one id */
-                ERROR("Failed to allocate id index table\n");
+        if (no_ids == 0)
-                return ERR_PTR(-ENOMEM);
+                return ERR_PTR(-EINVAL);
-        }
+        /*
+         * length bytes should not extend into the next table - this check
+         * also traps instances where id_table_start is incorrectly larger
+         * than the next table start
+         */
+        if (id_table_start + length > next_table)
+                return ERR_PTR(-EINVAL);
+        table = squashfs_read_table(sb, id_table_start, length);
-        err = squashfs_read_table(sb, id_table, id_table_start, length);
+        /*
-        if (err < 0) {
+         * table[0] is the little-endian location of the first id lookup table
-                ERROR("unable to read id index table\n");
+         * metadata block; it must be below id_table_start
-                kfree(id_table);
+         */
-                return ERR_PTR(err);
+        if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
+                kfree(table);
+                return ERR_PTR(-EINVAL);
        }
-        return id_table;
+        return table;
 }
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 62e63ad25075..04bebcaa2373 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5d922a6701ab..4bc63ac64bc0 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 1f2e608b8785..e3be6a71cfa7 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -44,24 +44,24 @@ extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
                                u64, int);
 extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
                                u64, int);
-extern int squashfs_read_table(struct super_block *, void *, u64, int);
+extern void *squashfs_read_table(struct super_block *, u64, int);
 /* decompressor.c */
 extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
 extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
 /* export.c */
-extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
+extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
                                unsigned int);
 /* fragment.c */
 extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
 extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
-                                u64, unsigned int);
+                                u64, u64, unsigned int);
 /* id.c */
 extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
-extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
+extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64,
                                unsigned short);
 /* inode.c */
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 4582c568ef4d..b4a4e539a08c 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -4,7 +4,7 @@
 * Squashfs
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 359baefc01fc..73588e7700ed 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -4,7 +4,7 @@
 * Squashfs
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index d9037a5215f0..651f0b31d296 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -4,7 +4,7 @@
 * Squashfs
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5c8184c061a4..6f26abee3597 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -83,7 +83,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
        long long root_inode;
        unsigned short flags;
        unsigned int fragments;
-        u64 lookup_table_start, xattr_id_table_start;
+        u64 lookup_table_start, xattr_id_table_start, next_table;
        int err;
        TRACE("Entered squashfs_fill_superblock\n");
@@ -95,12 +95,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
        }
        msblk = sb->s_fs_info;
-        sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
-        if (sblk == NULL) {
-                ERROR("Failed to allocate squashfs_super_block\n");
-                goto failure;
-        }
        msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
        msblk->devblksize_log2 = ffz(~msblk->devblksize);
@@ -114,10 +108,12 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
         * of bytes_used) we need to set it to an initial sensible dummy value
         */
        msblk->bytes_used = sizeof(*sblk);
-        err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
+        sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk));
-        if (err < 0) {
+        if (IS_ERR(sblk)) {
                ERROR("unable to read squashfs_super_block\n");
+                err = PTR_ERR(sblk);
+                sblk = NULL;
                goto failed_mount;
        }
@@ -218,18 +214,61 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
+        /* Handle xattrs */
+        sb->s_xattr = squashfs_xattr_handlers;
+        xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+        if (xattr_id_table_start == SQUASHFS_INVALID_BLK) {
+                next_table = msblk->bytes_used;
+                goto allocate_id_index_table;
+        }
+        /* Allocate and read xattr id lookup table */
+        msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+                xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+        if (IS_ERR(msblk->xattr_id_table)) {
+                ERROR("unable to read xattr id index table\n");
+                err = PTR_ERR(msblk->xattr_id_table);
+                msblk->xattr_id_table = NULL;
+                if (err != -ENOTSUPP)
+                        goto failed_mount;
+        }
+        next_table = msblk->xattr_table;
+allocate_id_index_table:
        /* Allocate and read id index table */
        msblk->id_table = squashfs_read_id_index_table(sb,
-                le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
+                le64_to_cpu(sblk->id_table_start), next_table,
+                le16_to_cpu(sblk->no_ids));
        if (IS_ERR(msblk->id_table)) {
+                ERROR("unable to read id index table\n");
                err = PTR_ERR(msblk->id_table);
                msblk->id_table = NULL;
                goto failed_mount;
        }
+        next_table = le64_to_cpu(msblk->id_table[0]);
+        /* Handle inode lookup table */
+        lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
+        if (lookup_table_start == SQUASHFS_INVALID_BLK)
+                goto handle_fragments;
+        /* Allocate and read inode lookup table */
+        msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
+                lookup_table_start, next_table, msblk->inodes);
+        if (IS_ERR(msblk->inode_lookup_table)) {
+                ERROR("unable to read inode lookup table\n");
+                err = PTR_ERR(msblk->inode_lookup_table);
+                msblk->inode_lookup_table = NULL;
+                goto failed_mount;
+        }
+        next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
+        sb->s_export_op = &squashfs_export_ops;
+handle_fragments:
        fragments = le32_to_cpu(sblk->fragments);
        if (fragments == 0)
-                goto allocate_lookup_table;
+                goto check_directory_table;
        msblk->fragment_cache = squashfs_cache_init("fragment",
                SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
@@ -240,45 +279,29 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
        /* Allocate and read fragment index table */
        msblk->fragment_index = squashfs_read_fragment_index_table(sb,
-                le64_to_cpu(sblk->fragment_table_start), fragments);
+                le64_to_cpu(sblk->fragment_table_start), next_table, fragments);
        if (IS_ERR(msblk->fragment_index)) {
+                ERROR("unable to read fragment index table\n");
                err = PTR_ERR(msblk->fragment_index);
                msblk->fragment_index = NULL;
                goto failed_mount;
        }
+        next_table = le64_to_cpu(msblk->fragment_index[0]);
-allocate_lookup_table:
+check_directory_table:
-        lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
+        /* Sanity check directory_table */
-        if (lookup_table_start == SQUASHFS_INVALID_BLK)
+        if (msblk->directory_table >= next_table) {
-                goto allocate_xattr_table;
+                err = -EINVAL;
-        /* Allocate and read inode lookup table */
-        msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
-                lookup_table_start, msblk->inodes);
-        if (IS_ERR(msblk->inode_lookup_table)) {
-                err = PTR_ERR(msblk->inode_lookup_table);
-                msblk->inode_lookup_table = NULL;
                goto failed_mount;
        }
-        sb->s_export_op = &squashfs_export_ops;
+        /* Sanity check inode_table */
+        if (msblk->inode_table >= msblk->directory_table) {
-allocate_xattr_table:
+                err = -EINVAL;
-        sb->s_xattr = squashfs_xattr_handlers;
+                goto failed_mount;
-        xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
-        if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
-                goto allocate_root;
-        /* Allocate and read xattr id lookup table */
-        msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
-                xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
-        if (IS_ERR(msblk->xattr_id_table)) {
-                err = PTR_ERR(msblk->xattr_id_table);
-                msblk->xattr_id_table = NULL;
-                if (err != -ENOTSUPP)
-                        goto failed_mount;
        }
-allocate_root:
+        /* allocate root */
        root = new_inode(sb);
        if (!root) {
                err = -ENOMEM;
@@ -318,11 +341,6 @@ failed_mount:
        sb->s_fs_info = NULL;
        kfree(sblk);
        return err;
-failure:
-        kfree(sb->s_fs_info);
-        sb->s_fs_info = NULL;
-        return -ENOMEM;
 }
@@ -475,5 +493,5 @@ static const struct super_operations squashfs_super_ops = {
 module_init(init_squashfs_fs);
 module_exit(exit_squashfs_fs);
 MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
-MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
+MODULE_AUTHOR("Phillip Lougher <phillip@squashfs.org.uk>");
 MODULE_LICENSE("GPL");
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index ec86434921e1..1191817264cc 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 3876c36699a1..92fcde7b4d61 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index b634efce4bde..c83f5d9ec125 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -31,6 +31,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
                u64 start, u64 *xattr_table_start, int *xattr_ids)
 {
        ERROR("Xattrs in filesystem, these will be ignored\n");
+        *xattr_table_start = start;
        return ERR_PTR(-ENOTSUPP);
 }
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index 05385dbe1465..c89607d690c4 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
@@ -67,34 +67,29 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
                u64 *xattr_table_start, int *xattr_ids)
 {
        unsigned int len;
-        __le64 *xid_table;
+        struct squashfs_xattr_id_table *id_table;
-        struct squashfs_xattr_id_table id_table;
-        int err;
+        id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+        if (IS_ERR(id_table))
+                return (__le64 *) id_table;
+        *xattr_table_start = le64_to_cpu(id_table->xattr_table_start);
+        *xattr_ids = le32_to_cpu(id_table->xattr_ids);
+        kfree(id_table);
+        /* Sanity check values */
+        /* there is always at least one xattr id */
+        if (*xattr_ids == 0)
+                return ERR_PTR(-EINVAL);
+        /* xattr_table should be less than start */
+        if (*xattr_table_start >= start)
+                return ERR_PTR(-EINVAL);
-        err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
-        if (err < 0) {
-                ERROR("unable to read xattr id table\n");
-                return ERR_PTR(err);
-        }
-        *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
-        *xattr_ids = le32_to_cpu(id_table.xattr_ids);
        len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
        TRACE("In read_xattr_index_table, length %d\n", len);
-        /* Allocate xattr id lookup table indexes */
+        return squashfs_read_table(sb, start + sizeof(*id_table), len);
-        xid_table = kmalloc(len, GFP_KERNEL);
-        if (xid_table == NULL) {
-                ERROR("Failed to allocate xattr id index table\n");
-                return ERR_PTR(-ENOMEM);
-        }
-        err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
-        if (err < 0) {
-                ERROR("unable to read xattr id index table\n");
-                kfree(xid_table);
-                return ERR_PTR(err);
-        }
-        return xid_table;
 }
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index aa47a286d1f8..1760b7d108f6 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 517688b32ffa..55d918fd2d86 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -2,7 +2,7 @@
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
- * Phillip Lougher <phillip@lougher.demon.co.uk>
+ * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 46f7a807bbc1..42694e11c23d 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -424,8 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
                        ufs_cpu_to_data_ptr(sb, p, result);
                        *err = 0;
                        UFS_I(inode)->i_lastfrag =
-                                max_t(u32, UFS_I(inode)->i_lastfrag,
+                                max(UFS_I(inode)->i_lastfrag, fragment + count);
-                                      fragment + count);
                        ufs_clear_frags(inode, result + oldcount,
                                        newcount - oldcount, locked_page != NULL);
                }
@@ -440,7 +439,8 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
        result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
        if (result) {
                *err = 0;
-                UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
+                UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
+                                                fragment + count);
                ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
                                locked_page != NULL);
                unlock_super(sb);
@@ -479,7 +479,8 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
                                   uspi->s_sbbase + result, locked_page);
                ufs_cpu_to_data_ptr(sb, p, result);
                *err = 0;
-                UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
+                UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
+                                                fragment + count);
                unlock_super(sb);
                if (newcount < request)
                        ufs_free_fragments (inode, result + newcount, request - newcount);
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 5f821dbc0579..f04f89fbd4d9 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -84,7 +84,7 @@ static int ufs_trunc_direct(struct inode *inode)
        retry = 0;
        
        frag1 = DIRECT_FRAGMENT;
-        frag4 = min_t(u32, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
+        frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
        frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
        frag3 = frag4 & ~uspi->s_fpbmask;
        block1 = block2 = 0;