Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig                        |   27
-rw-r--r--  fs/btrfs/async-thread.c           |    4
-rw-r--r--  fs/btrfs/ctree.c                  |  121
-rw-r--r--  fs/btrfs/ctree.h                  |   29
-rw-r--r--  fs/btrfs/disk-io.c                |   15
-rw-r--r--  fs/btrfs/extent-tree.c            |  516
-rw-r--r--  fs/btrfs/free-space-cache.c       | 1003
-rw-r--r--  fs/btrfs/free-space-cache.h       |    8
-rw-r--r--  fs/btrfs/inode.c                  |    2
-rw-r--r--  fs/btrfs/print-tree.c             |    6
-rw-r--r--  fs/btrfs/relocation.c             |    3
-rw-r--r--  fs/btrfs/transaction.c            |   40
-rw-r--r--  fs/btrfs/tree-log.c               |    2
-rw-r--r--  fs/btrfs/volumes.c                |   46
-rw-r--r--  fs/cifs/connect.c                 |    8
-rw-r--r--  fs/cifs/inode.c                   |    9
-rw-r--r--  fs/ecryptfs/keystore.c            |   13
-rw-r--r--  fs/ext3/dir.c                     |    3
-rw-r--r--  fs/ext3/inode.c                   |   32
-rw-r--r--  fs/jbd/journal.c                  |   26
-rw-r--r--  fs/jbd/transaction.c              |   68
-rw-r--r--  fs/jfs/acl.c                      |    4
-rw-r--r--  fs/nfs/client.c                   |   18
-rw-r--r--  fs/nfs/dir.c                      |    2
-rw-r--r--  fs/nfs/nfs4_fs.h                  |    6
-rw-r--r--  fs/nfs/nfs4proc.c                 |   40
-rw-r--r--  fs/nfs/nfs4state.c                |    2
-rw-r--r--  fs/nilfs2/Kconfig                 |   25
-rw-r--r--  fs/notify/Kconfig                 |   12
-rw-r--r--  fs/notify/dnotify/Kconfig         |    2
-rw-r--r--  fs/notify/fsnotify.c              |    4
-rw-r--r--  fs/notify/inotify/Kconfig         |    2
-rw-r--r--  fs/notify/inotify/inotify_user.c  |  109
-rw-r--r--  fs/notify/notification.c          |   19
-rw-r--r--  fs/pipe.c                         |    4
35 files changed, 1591 insertions(+), 639 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index a97263be6a91..0e7da7bb5d93 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -186,32 +186,7 @@ source "fs/romfs/Kconfig"
186source "fs/sysv/Kconfig" 186source "fs/sysv/Kconfig"
187source "fs/ufs/Kconfig" 187source "fs/ufs/Kconfig"
188source "fs/exofs/Kconfig" 188source "fs/exofs/Kconfig"
189 189source "fs/nilfs2/Kconfig"
190config NILFS2_FS
191 tristate "NILFS2 file system support (EXPERIMENTAL)"
192 depends on BLOCK && EXPERIMENTAL
193 select CRC32
194 help
195 NILFS2 is a log-structured file system (LFS) supporting continuous
196 snapshotting. In addition to versioning capability of the entire
197 file system, users can even restore files mistakenly overwritten or
198 destroyed just a few seconds ago. Since this file system can keep
199 consistency like conventional LFS, it achieves quick recovery after
200 system crashes.
201
202 NILFS2 creates a number of checkpoints every few seconds or per
203 synchronous write basis (unless there is no change). Users can
204 select significant versions among continuously created checkpoints,
205 and can change them into snapshots which will be preserved for long
206 periods until they are changed back to checkpoints. Each
207 snapshot is mountable as a read-only file system concurrently with
208 its writable mount, and this feature is convenient for online backup.
209
210 Some features including atime, extended attributes, and POSIX ACLs,
211 are not supported yet.
212
213 To compile this file system support as a module, choose M here: the
214 module will be called nilfs2. If unsure, say N.
215 190
216endif # MISC_FILESYSTEMS 191endif # MISC_FILESYSTEMS
217 192
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 6e4f6c50a120..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
424 * list 424 * list
425 */ 425 */
426 if (worker->idle) { 426 if (worker->idle) {
427 spin_lock_irqsave(&worker->workers->lock, flags); 427 spin_lock(&worker->workers->lock);
428 worker->idle = 0; 428 worker->idle = 0;
429 list_move_tail(&worker->worker_list, 429 list_move_tail(&worker->worker_list,
430 &worker->workers->worker_list); 430 &worker->workers->worker_list);
431 spin_unlock_irqrestore(&worker->workers->lock, flags); 431 spin_unlock(&worker->workers->lock);
432 } 432 }
433 if (!worker->working) { 433 if (!worker->working) {
434 wake = 1; 434 wake = 1;
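
The hunk above replaces spin_lock_irqsave() with plain spin_lock() on workers->lock, presumably because this lock is never taken from interrupt context. For reference, the two locking forms differ roughly as follows (a minimal sketch with made-up names, not code from this patch):

	#include <linux/spinlock.h>

	struct demo {
		spinlock_t lock;
		int counter;
	};

	/* Cheaper form: leaves interrupts enabled. Only safe while no IRQ
	 * handler can try to take the same lock. */
	static void demo_bump_process_context(struct demo *d)
	{
		spin_lock(&d->lock);
		d->counter++;
		spin_unlock(&d->lock);
	}

	/* Heavier form: disables local interrupts and restores their previous
	 * state, so the lock may also be taken from interrupt context. */
	static void demo_bump_any_context(struct demo *d)
	{
		unsigned long flags;

		spin_lock_irqsave(&d->lock, flags);
		d->counter++;
		spin_unlock_irqrestore(&d->lock, flags);
	}
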
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
557 557
558 btrfs_disk_key_to_cpu(&k1, disk); 558 btrfs_disk_key_to_cpu(&k1, disk);
559 559
560 if (k1.objectid > k2->objectid) 560 return btrfs_comp_cpu_keys(&k1, k2);
561 return 1;
562 if (k1.objectid < k2->objectid)
563 return -1;
564 if (k1.type > k2->type)
565 return 1;
566 if (k1.type < k2->type)
567 return -1;
568 if (k1.offset > k2->offset)
569 return 1;
570 if (k1.offset < k2->offset)
571 return -1;
572 return 0;
573} 561}
574 562
575/* 563/*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1052 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1040 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1053 return 0; 1041 return 0;
1054 1042
1055 if (btrfs_header_nritems(mid) > 2)
1056 return 0;
1057
1058 if (btrfs_header_nritems(mid) < 2) 1043 if (btrfs_header_nritems(mid) < 2)
1059 err_on_enospc = 1; 1044 err_on_enospc = 1;
1060 1045
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1701 struct extent_buffer *b; 1686 struct extent_buffer *b;
1702 int slot; 1687 int slot;
1703 int ret; 1688 int ret;
1689 int err;
1704 int level; 1690 int level;
1705 int lowest_unlock = 1; 1691 int lowest_unlock = 1;
1706 u8 lowest_level = 0; 1692 u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
1737 p->locks[level] = 1; 1723 p->locks[level] = 1;
1738 1724
1739 if (cow) { 1725 if (cow) {
1740 int wret;
1741
1742 /* 1726 /*
1743 * if we don't really need to cow this block 1727 * if we don't really need to cow this block
1744 * then we don't want to set the path blocking, 1728 * then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
1749 1733
1750 btrfs_set_path_blocking(p); 1734 btrfs_set_path_blocking(p);
1751 1735
1752 wret = btrfs_cow_block(trans, root, b, 1736 err = btrfs_cow_block(trans, root, b,
1753 p->nodes[level + 1], 1737 p->nodes[level + 1],
1754 p->slots[level + 1], &b); 1738 p->slots[level + 1], &b);
1755 if (wret) { 1739 if (err) {
1756 free_extent_buffer(b); 1740 free_extent_buffer(b);
1757 ret = wret; 1741 ret = err;
1758 goto done; 1742 goto done;
1759 } 1743 }
1760 } 1744 }
@@ -1793,41 +1777,45 @@ cow_done:
1793 ret = bin_search(b, key, level, &slot); 1777 ret = bin_search(b, key, level, &slot);
1794 1778
1795 if (level != 0) { 1779 if (level != 0) {
1796 if (ret && slot > 0) 1780 int dec = 0;
1781 if (ret && slot > 0) {
1782 dec = 1;
1797 slot -= 1; 1783 slot -= 1;
1784 }
1798 p->slots[level] = slot; 1785 p->slots[level] = slot;
1799 ret = setup_nodes_for_search(trans, root, p, b, level, 1786 err = setup_nodes_for_search(trans, root, p, b, level,
1800 ins_len); 1787 ins_len);
1801 if (ret == -EAGAIN) 1788 if (err == -EAGAIN)
1802 goto again; 1789 goto again;
1803 else if (ret) 1790 if (err) {
1791 ret = err;
1804 goto done; 1792 goto done;
1793 }
1805 b = p->nodes[level]; 1794 b = p->nodes[level];
1806 slot = p->slots[level]; 1795 slot = p->slots[level];
1807 1796
1808 unlock_up(p, level, lowest_unlock); 1797 unlock_up(p, level, lowest_unlock);
1809 1798
1810 /* this is only true while dropping a snapshot */
1811 if (level == lowest_level) { 1799 if (level == lowest_level) {
1812 ret = 0; 1800 if (dec)
1801 p->slots[level]++;
1813 goto done; 1802 goto done;
1814 } 1803 }
1815 1804
1816 ret = read_block_for_search(trans, root, p, 1805 err = read_block_for_search(trans, root, p,
1817 &b, level, slot, key); 1806 &b, level, slot, key);
1818 if (ret == -EAGAIN) 1807 if (err == -EAGAIN)
1819 goto again; 1808 goto again;
1820 1809 if (err) {
1821 if (ret == -EIO) 1810 ret = err;
1822 goto done; 1811 goto done;
1812 }
1823 1813
1824 if (!p->skip_locking) { 1814 if (!p->skip_locking) {
1825 int lret;
1826
1827 btrfs_clear_path_blocking(p, NULL); 1815 btrfs_clear_path_blocking(p, NULL);
1828 lret = btrfs_try_spin_lock(b); 1816 err = btrfs_try_spin_lock(b);
1829 1817
1830 if (!lret) { 1818 if (!err) {
1831 btrfs_set_path_blocking(p); 1819 btrfs_set_path_blocking(p);
1832 btrfs_tree_lock(b); 1820 btrfs_tree_lock(b);
1833 btrfs_clear_path_blocking(p, b); 1821 btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
1837 p->slots[level] = slot; 1825 p->slots[level] = slot;
1838 if (ins_len > 0 && 1826 if (ins_len > 0 &&
1839 btrfs_leaf_free_space(root, b) < ins_len) { 1827 btrfs_leaf_free_space(root, b) < ins_len) {
1840 int sret;
1841
1842 btrfs_set_path_blocking(p); 1828 btrfs_set_path_blocking(p);
1843 sret = split_leaf(trans, root, key, 1829 err = split_leaf(trans, root, key,
1844 p, ins_len, ret == 0); 1830 p, ins_len, ret == 0);
1845 btrfs_clear_path_blocking(p, NULL); 1831 btrfs_clear_path_blocking(p, NULL);
1846 1832
1847 BUG_ON(sret > 0); 1833 BUG_ON(err > 0);
1848 if (sret) { 1834 if (err) {
1849 ret = sret; 1835 ret = err;
1850 goto done; 1836 goto done;
1851 } 1837 }
1852 } 1838 }
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3807 } 3793 }
3808 3794
3809 /* delete the leaf if it is mostly empty */ 3795 /* delete the leaf if it is mostly empty */
3810 if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { 3796 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3811 /* push_leaf_left fixes the path. 3797 /* push_leaf_left fixes the path.
3812 * make sure the path still points to our leaf 3798 * make sure the path still points to our leaf
3813 * for possible call to del_ptr below 3799 * for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
4042 * calling this function. 4028 * calling this function.
4043 */ 4029 */
4044int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4030int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4045 struct btrfs_key *key, int lowest_level, 4031 struct btrfs_key *key, int level,
4046 int cache_only, u64 min_trans) 4032 int cache_only, u64 min_trans)
4047{ 4033{
4048 int level = lowest_level;
4049 int slot; 4034 int slot;
4050 struct extent_buffer *c; 4035 struct extent_buffer *c;
4051 4036
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4058 c = path->nodes[level]; 4043 c = path->nodes[level];
4059next: 4044next:
4060 if (slot >= btrfs_header_nritems(c)) { 4045 if (slot >= btrfs_header_nritems(c)) {
4061 level++; 4046 int ret;
4062 if (level == BTRFS_MAX_LEVEL) 4047 int orig_lowest;
4048 struct btrfs_key cur_key;
4049 if (level + 1 >= BTRFS_MAX_LEVEL ||
4050 !path->nodes[level + 1])
4063 return 1; 4051 return 1;
4064 continue; 4052
4053 if (path->locks[level + 1]) {
4054 level++;
4055 continue;
4056 }
4057
4058 slot = btrfs_header_nritems(c) - 1;
4059 if (level == 0)
4060 btrfs_item_key_to_cpu(c, &cur_key, slot);
4061 else
4062 btrfs_node_key_to_cpu(c, &cur_key, slot);
4063
4064 orig_lowest = path->lowest_level;
4065 btrfs_release_path(root, path);
4066 path->lowest_level = level;
4067 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4068 0, 0);
4069 path->lowest_level = orig_lowest;
4070 if (ret < 0)
4071 return ret;
4072
4073 c = path->nodes[level];
4074 slot = path->slots[level];
4075 if (ret == 0)
4076 slot++;
4077 goto next;
4065 } 4078 }
4079
4066 if (level == 0) 4080 if (level == 0)
4067 btrfs_item_key_to_cpu(c, key, slot); 4081 btrfs_item_key_to_cpu(c, key, slot);
4068 else { 4082 else {
@@ -4146,7 +4160,8 @@ again:
4146 * advance the path if there are now more items available. 4160 * advance the path if there are now more items available.
4147 */ 4161 */
4148 if (nritems > 0 && path->slots[0] < nritems - 1) { 4162 if (nritems > 0 && path->slots[0] < nritems - 1) {
4149 path->slots[0]++; 4163 if (ret == 0)
4164 path->slots[0]++;
4150 ret = 0; 4165 ret = 0;
4151 goto done; 4166 goto done;
4152 } 4167 }
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
4278 path->slots[0]--; 4293 path->slots[0]--;
4279 4294
4280 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4295 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4281 if (found_key.type == type)
4282 return 0;
4283 if (found_key.objectid < min_objectid) 4296 if (found_key.objectid < min_objectid)
4284 break; 4297 break;
4298 if (found_key.type == type)
4299 return 0;
4285 if (found_key.objectid == min_objectid && 4300 if (found_key.objectid == min_objectid &&
4286 found_key.type < type) 4301 found_key.type < type)
4287 break; 4302 break;
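
Several of the ctree.c hunks are pure consolidation: comp_keys() loses its open-coded comparison and delegates to btrfs_comp_cpu_keys() instead. For reference, the ordering that the deleted lines implemented is shown standalone below (a sketch; the demo_ names are not from the patch):

	#include <linux/types.h>

	struct demo_key {
		u64 objectid;
		u8  type;
		u64 offset;
	};

	/* Lexicographic compare on (objectid, type, offset), the same ordering
	 * the removed lines in comp_keys() coded by hand. */
	static int demo_comp_keys(const struct demo_key *k1,
				  const struct demo_key *k2)
	{
		if (k1->objectid > k2->objectid)
			return 1;
		if (k1->objectid < k2->objectid)
			return -1;
		if (k1->type > k2->type)
			return 1;
		if (k1->type < k2->type)
			return -1;
		if (k1->offset > k2->offset)
			return 1;
		if (k1->offset < k2->offset)
			return -1;
		return 0;
	}
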
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a873838717..215ef8cae823 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {
481 481
482struct btrfs_extent_inline_ref { 482struct btrfs_extent_inline_ref {
483 u8 type; 483 u8 type;
484 u64 offset; 484 __le64 offset;
485} __attribute__ ((__packed__)); 485} __attribute__ ((__packed__));
486 486
487/* old style backrefs item */ 487/* old style backrefs item */
@@ -689,6 +689,7 @@ struct btrfs_space_info {
689 struct list_head block_groups; 689 struct list_head block_groups;
690 spinlock_t lock; 690 spinlock_t lock;
691 struct rw_semaphore groups_sem; 691 struct rw_semaphore groups_sem;
692 atomic_t caching_threads;
692}; 693};
693 694
694/* 695/*
@@ -707,6 +708,9 @@ struct btrfs_free_cluster {
707 /* first extent starting offset */ 708 /* first extent starting offset */
708 u64 window_start; 709 u64 window_start;
709 710
711 /* if this cluster simply points at a bitmap in the block group */
712 bool points_to_bitmap;
713
710 struct btrfs_block_group_cache *block_group; 714 struct btrfs_block_group_cache *block_group;
711 /* 715 /*
712 * when a cluster is allocated from a block group, we put the 716 * when a cluster is allocated from a block group, we put the
@@ -716,24 +720,37 @@ struct btrfs_free_cluster {
716 struct list_head block_group_list; 720 struct list_head block_group_list;
717}; 721};
718 722
723enum btrfs_caching_type {
724 BTRFS_CACHE_NO = 0,
725 BTRFS_CACHE_STARTED = 1,
726 BTRFS_CACHE_FINISHED = 2,
727};
728
719struct btrfs_block_group_cache { 729struct btrfs_block_group_cache {
720 struct btrfs_key key; 730 struct btrfs_key key;
721 struct btrfs_block_group_item item; 731 struct btrfs_block_group_item item;
732 struct btrfs_fs_info *fs_info;
722 spinlock_t lock; 733 spinlock_t lock;
723 struct mutex cache_mutex;
724 u64 pinned; 734 u64 pinned;
725 u64 reserved; 735 u64 reserved;
726 u64 flags; 736 u64 flags;
727 int cached; 737 u64 sectorsize;
738 int extents_thresh;
739 int free_extents;
740 int total_bitmaps;
728 int ro; 741 int ro;
729 int dirty; 742 int dirty;
730 743
744 /* cache tracking stuff */
745 wait_queue_head_t caching_q;
746 int cached;
747
731 struct btrfs_space_info *space_info; 748 struct btrfs_space_info *space_info;
732 749
733 /* free space cache stuff */ 750 /* free space cache stuff */
734 spinlock_t tree_lock; 751 spinlock_t tree_lock;
735 struct rb_root free_space_bytes;
736 struct rb_root free_space_offset; 752 struct rb_root free_space_offset;
753 u64 free_space;
737 754
738 /* block group cache stuff */ 755 /* block group cache stuff */
739 struct rb_node cache_node; 756 struct rb_node cache_node;
@@ -942,6 +959,9 @@ struct btrfs_root {
942 /* the node lock is held while changing the node pointer */ 959 /* the node lock is held while changing the node pointer */
943 spinlock_t node_lock; 960 spinlock_t node_lock;
944 961
962 /* taken when updating the commit root */
963 struct rw_semaphore commit_root_sem;
964
945 struct extent_buffer *commit_root; 965 struct extent_buffer *commit_root;
946 struct btrfs_root *log_root; 966 struct btrfs_root *log_root;
947 struct btrfs_root *reloc_root; 967 struct btrfs_root *reloc_root;
@@ -1988,6 +2008,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1988 u64 bytes); 2008 u64 bytes);
1989void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2009void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1990 u64 bytes); 2010 u64 bytes);
2011void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
1991/* ctree.c */ 2012/* ctree.c */
1992int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2013int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1993 int level, int *slot); 2014 int level, int *slot);
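
The ctree.h hunks turn the block group's cached flag into a small state machine (BTRFS_CACHE_NO / STARTED / FINISHED) and add caching_q and caching_threads for the async caching introduced in extent-tree.c below. The read/write pattern is roughly the following (a sketch modelled on block_group_cache_done() and cache_block_group() from this series; the demo_ names are invented):

	#include <linux/spinlock.h>

	enum demo_caching_type {
		DEMO_CACHE_NO = 0,
		DEMO_CACHE_STARTED = 1,
		DEMO_CACHE_FINISHED = 2,
	};

	struct demo_block_group {
		spinlock_t lock;
		int cached;			/* enum demo_caching_type */
	};

	/* Lockless check used on hot paths; the barrier pairs with the store
	 * made by the caching thread when it finishes. */
	static int demo_cache_done(struct demo_block_group *bg)
	{
		smp_mb();
		return bg->cached == DEMO_CACHE_FINISHED;
	}

	/* State transitions happen under the lock so only one caching thread
	 * is ever started per block group. */
	static int demo_start_caching(struct demo_block_group *bg)
	{
		spin_lock(&bg->lock);
		if (bg->cached != DEMO_CACHE_NO) {
			spin_unlock(&bg->lock);
			return 0;		/* already started or finished */
		}
		bg->cached = DEMO_CACHE_STARTED;
		spin_unlock(&bg->lock);
		return 1;			/* caller should start the kthread */
	}
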
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7c..7dcaa8138864 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -909,6 +909,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
909 spin_lock_init(&root->inode_lock); 909 spin_lock_init(&root->inode_lock);
910 mutex_init(&root->objectid_mutex); 910 mutex_init(&root->objectid_mutex);
911 mutex_init(&root->log_mutex); 911 mutex_init(&root->log_mutex);
912 init_rwsem(&root->commit_root_sem);
912 init_waitqueue_head(&root->log_writer_wait); 913 init_waitqueue_head(&root->log_writer_wait);
913 init_waitqueue_head(&root->log_commit_wait[0]); 914 init_waitqueue_head(&root->log_commit_wait[0]);
914 init_waitqueue_head(&root->log_commit_wait[1]); 915 init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1799,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1799 btrfs_super_chunk_root(disk_super), 1800 btrfs_super_chunk_root(disk_super),
1800 blocksize, generation); 1801 blocksize, generation);
1801 BUG_ON(!chunk_root->node); 1802 BUG_ON(!chunk_root->node);
1803 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1804 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1805 sb->s_id);
1806 goto fail_chunk_root;
1807 }
1802 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 1808 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1803 chunk_root->commit_root = btrfs_root_node(chunk_root); 1809 chunk_root->commit_root = btrfs_root_node(chunk_root);
1804 1810
@@ -1826,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1826 blocksize, generation); 1832 blocksize, generation);
1827 if (!tree_root->node) 1833 if (!tree_root->node)
1828 goto fail_chunk_root; 1834 goto fail_chunk_root;
1835 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1836 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1837 sb->s_id);
1838 goto fail_tree_root;
1839 }
1829 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 1840 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1830 tree_root->commit_root = btrfs_root_node(tree_root); 1841 tree_root->commit_root = btrfs_root_node(tree_root);
1831 1842
@@ -2322,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)
2322 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2333 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2323 } 2334 }
2324 2335
2336 fs_info->closing = 2;
2337 smp_mb();
2338
2325 if (fs_info->delalloc_bytes) { 2339 if (fs_info->delalloc_bytes) {
2326 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 2340 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2327 (unsigned long long)fs_info->delalloc_bytes); 2341 (unsigned long long)fs_info->delalloc_bytes);
@@ -2343,6 +2357,7 @@ int close_ctree(struct btrfs_root *root)
2343 free_extent_buffer(root->fs_info->csum_root->commit_root); 2357 free_extent_buffer(root->fs_info->csum_root->commit_root);
2344 2358
2345 btrfs_free_block_groups(root->fs_info); 2359 btrfs_free_block_groups(root->fs_info);
2360 btrfs_free_pinned_extents(root->fs_info);
2346 2361
2347 del_fs_roots(fs_info); 2362 del_fs_roots(fs_info);
2348 2363
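
Both disk-io.c hunks add the same guard: after reading the chunk or tree root, the buffer may exist without being uptodate, so open_ctree() now fails the mount instead of using it. Pieced together from the calls visible in the hunks, the pattern is roughly (a sketch, not a function from the patch):

	/* btrfs-internal headers assumed, as in disk-io.c itself */
	#include "ctree.h"
	#include "extent_io.h"
	#include "disk-io.h"

	static struct extent_buffer *demo_read_root(struct btrfs_root *root,
						    u64 bytenr, u32 blocksize,
						    u64 generation)
	{
		struct extent_buffer *eb;

		eb = read_tree_block(root, bytenr, blocksize, generation);
		if (!eb)
			return NULL;

		/* allocated but the read failed: treat it like a missing root */
		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) {
			free_extent_buffer(eb);
			return NULL;
		}
		return eb;
	}
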
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5aca3997d42..fadf69a2764b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h>
24#include "compat.h" 25#include "compat.h"
25#include "hash.h" 26#include "hash.h"
26#include "ctree.h" 27#include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61 struct btrfs_root *extent_root, u64 alloc_bytes, 62 struct btrfs_root *extent_root, u64 alloc_bytes,
62 u64 flags, int force); 63 u64 flags, int force);
63 64
65static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache)
67{
68 smp_mb();
69 return cache->cached == BTRFS_CACHE_FINISHED;
70}
71
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 72static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 73{
66 return (cache->flags & bits) == bits; 74 return (cache->flags & bits) == bits;
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
146} 154}
147 155
148/* 156/*
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because we had a block group caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{
164 u64 start, end, last = 0;
165 int ret;
166
167 while (1) {
168 ret = find_first_extent_bit(&info->pinned_extents, last,
169 &start, &end,
170 EXTENT_LOCKED|EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 clear_extent_bits(&info->pinned_extents, start, end,
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
176 last = end+1;
177 }
178}
179
180static int remove_sb_from_cache(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache)
182{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr;
185 u64 *logical;
186 int stripe_len;
187 int i, nr, ret;
188
189 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
190 bytenr = btrfs_sb_offset(i);
191 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
192 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret);
195 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents,
197 logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS);
199 }
200 kfree(logical);
201 }
202
203 return 0;
204}
205
206/*
149 * this is only called by cache_block_group, since we could have freed extents 207 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet 208 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits. 209 * since their free space will be released as soon as the transaction commits.
152 */ 210 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group, 211static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end) 212 struct btrfs_fs_info *info, u64 start, u64 end)
155{ 213{
156 u64 extent_start, extent_end, size; 214 u64 extent_start, extent_end, size, total_added = 0;
157 int ret; 215 int ret;
158 216
159 while (start < end) { 217 while (start < end) {
160 ret = find_first_extent_bit(&info->pinned_extents, start, 218 ret = find_first_extent_bit(&info->pinned_extents, start,
161 &extent_start, &extent_end, 219 &extent_start, &extent_end,
162 EXTENT_DIRTY); 220 EXTENT_DIRTY|EXTENT_LOCKED);
163 if (ret) 221 if (ret)
164 break; 222 break;
165 223
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
167 start = extent_end + 1; 225 start = extent_end + 1;
168 } else if (extent_start > start && extent_start < end) { 226 } else if (extent_start > start && extent_start < end) {
169 size = extent_start - start; 227 size = extent_start - start;
228 total_added += size;
170 ret = btrfs_add_free_space(block_group, start, 229 ret = btrfs_add_free_space(block_group, start,
171 size); 230 size);
172 BUG_ON(ret); 231 BUG_ON(ret);
@@ -178,84 +237,79 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
178 237
179 if (start < end) { 238 if (start < end) {
180 size = end - start; 239 size = end - start;
240 total_added += size;
181 ret = btrfs_add_free_space(block_group, start, size); 241 ret = btrfs_add_free_space(block_group, start, size);
182 BUG_ON(ret); 242 BUG_ON(ret);
183 } 243 }
184 244
185 return 0; 245 return total_added;
186} 246}
187 247
188static int remove_sb_from_cache(struct btrfs_root *root, 248static int caching_kthread(void *data)
189 struct btrfs_block_group_cache *cache)
190{
191 u64 bytenr;
192 u64 *logical;
193 int stripe_len;
194 int i, nr, ret;
195
196 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
197 bytenr = btrfs_sb_offset(i);
198 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
199 cache->key.objectid, bytenr, 0,
200 &logical, &nr, &stripe_len);
201 BUG_ON(ret);
202 while (nr--) {
203 btrfs_remove_free_space(cache, logical[nr],
204 stripe_len);
205 }
206 kfree(logical);
207 }
208 return 0;
209}
210
211static int cache_block_group(struct btrfs_root *root,
212 struct btrfs_block_group_cache *block_group)
213{ 249{
250 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0;
214 struct btrfs_path *path; 253 struct btrfs_path *path;
215 int ret = 0; 254 int ret = 0;
216 struct btrfs_key key; 255 struct btrfs_key key;
217 struct extent_buffer *leaf; 256 struct extent_buffer *leaf;
218 int slot; 257 int slot;
219 u64 last; 258 u64 total_found = 0;
220
221 if (!block_group)
222 return 0;
223 259
224 root = root->fs_info->extent_root; 260 BUG_ON(!fs_info);
225
226 if (block_group->cached)
227 return 0;
228 261
229 path = btrfs_alloc_path(); 262 path = btrfs_alloc_path();
230 if (!path) 263 if (!path)
231 return -ENOMEM; 264 return -ENOMEM;
232 265
233 path->reada = 2; 266 atomic_inc(&block_group->space_info->caching_threads);
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
268again:
269 /* need to make sure the commit_root doesn't disappear */
270 down_read(&fs_info->extent_root->commit_root_sem);
271
234 /* 272 /*
235 * we get into deadlocks with paths held by callers of this function. 273 * We don't want to deadlock with somebody trying to allocate a new
236 * since the alloc_mutex is protecting things right now, just 274 * extent for the extent root while also trying to search the extent
237 * skip the locking here 275 * root to add free space. So we skip locking and search the commit
276 * root, since its read-only
238 */ 277 */
239 path->skip_locking = 1; 278 path->skip_locking = 1;
240 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 279 path->search_commit_root = 1;
280 path->reada = 2;
281
241 key.objectid = last; 282 key.objectid = last;
242 key.offset = 0; 283 key.offset = 0;
243 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 284 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
244 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
245 if (ret < 0) 286 if (ret < 0)
246 goto err; 287 goto err;
247 288
248 while (1) { 289 while (1) {
290 smp_mb();
291 if (block_group->fs_info->closing > 1) {
292 last = (u64)-1;
293 break;
294 }
295
249 leaf = path->nodes[0]; 296 leaf = path->nodes[0];
250 slot = path->slots[0]; 297 slot = path->slots[0];
251 if (slot >= btrfs_header_nritems(leaf)) { 298 if (slot >= btrfs_header_nritems(leaf)) {
252 ret = btrfs_next_leaf(root, path); 299 ret = btrfs_next_leaf(fs_info->extent_root, path);
253 if (ret < 0) 300 if (ret < 0)
254 goto err; 301 goto err;
255 if (ret == 0) 302 else if (ret)
256 continue;
257 else
258 break; 303 break;
304
305 if (need_resched()) {
306 btrfs_release_path(fs_info->extent_root, path);
307 up_read(&fs_info->extent_root->commit_root_sem);
308 cond_resched();
309 goto again;
310 }
311
312 continue;
259 } 313 }
260 btrfs_item_key_to_cpu(leaf, &key, slot); 314 btrfs_item_key_to_cpu(leaf, &key, slot);
261 if (key.objectid < block_group->key.objectid) 315 if (key.objectid < block_group->key.objectid)
@@ -266,24 +320,59 @@ static int cache_block_group(struct btrfs_root *root,
266 break; 320 break;
267 321
268 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 322 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
269 add_new_free_space(block_group, root->fs_info, last, 323 total_found += add_new_free_space(block_group,
270 key.objectid); 324 fs_info, last,
271 325 key.objectid);
272 last = key.objectid + key.offset; 326 last = key.objectid + key.offset;
273 } 327 }
328
329 if (total_found > (1024 * 1024 * 2)) {
330 total_found = 0;
331 wake_up(&block_group->caching_q);
332 }
274next: 333next:
275 path->slots[0]++; 334 path->slots[0]++;
276 } 335 }
336 ret = 0;
277 337
278 add_new_free_space(block_group, root->fs_info, last, 338 total_found += add_new_free_space(block_group, fs_info, last,
279 block_group->key.objectid + 339 block_group->key.objectid +
280 block_group->key.offset); 340 block_group->key.offset);
341
342 spin_lock(&block_group->lock);
343 block_group->cached = BTRFS_CACHE_FINISHED;
344 spin_unlock(&block_group->lock);
281 345
282 block_group->cached = 1;
283 remove_sb_from_cache(root, block_group);
284 ret = 0;
285err: 346err:
286 btrfs_free_path(path); 347 btrfs_free_path(path);
348 up_read(&fs_info->extent_root->commit_root_sem);
349 atomic_dec(&block_group->space_info->caching_threads);
350 wake_up(&block_group->caching_q);
351
352 return 0;
353}
354
355static int cache_block_group(struct btrfs_block_group_cache *cache)
356{
357 struct task_struct *tsk;
358 int ret = 0;
359
360 spin_lock(&cache->lock);
361 if (cache->cached != BTRFS_CACHE_NO) {
362 spin_unlock(&cache->lock);
363 return ret;
364 }
365 cache->cached = BTRFS_CACHE_STARTED;
366 spin_unlock(&cache->lock);
367
368 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
369 cache->key.objectid);
370 if (IS_ERR(tsk)) {
371 ret = PTR_ERR(tsk);
372 printk(KERN_ERR "error running thread %d\n", ret);
373 BUG();
374 }
375
287 return ret; 376 return ret;
288} 377}
289 378
@@ -2387,13 +2476,29 @@ fail:
2387 2476
2388} 2477}
2389 2478
2479static struct btrfs_block_group_cache *
2480next_block_group(struct btrfs_root *root,
2481 struct btrfs_block_group_cache *cache)
2482{
2483 struct rb_node *node;
2484 spin_lock(&root->fs_info->block_group_cache_lock);
2485 node = rb_next(&cache->cache_node);
2486 btrfs_put_block_group(cache);
2487 if (node) {
2488 cache = rb_entry(node, struct btrfs_block_group_cache,
2489 cache_node);
2490 atomic_inc(&cache->count);
2491 } else
2492 cache = NULL;
2493 spin_unlock(&root->fs_info->block_group_cache_lock);
2494 return cache;
2495}
2496
2390int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2497int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2391 struct btrfs_root *root) 2498 struct btrfs_root *root)
2392{ 2499{
2393 struct btrfs_block_group_cache *cache, *entry; 2500 struct btrfs_block_group_cache *cache;
2394 struct rb_node *n;
2395 int err = 0; 2501 int err = 0;
2396 int werr = 0;
2397 struct btrfs_path *path; 2502 struct btrfs_path *path;
2398 u64 last = 0; 2503 u64 last = 0;
2399 2504
@@ -2402,39 +2507,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2402 return -ENOMEM; 2507 return -ENOMEM;
2403 2508
2404 while (1) { 2509 while (1) {
2405 cache = NULL; 2510 if (last == 0) {
2406 spin_lock(&root->fs_info->block_group_cache_lock); 2511 err = btrfs_run_delayed_refs(trans, root,
2407 for (n = rb_first(&root->fs_info->block_group_cache_tree); 2512 (unsigned long)-1);
2408 n; n = rb_next(n)) { 2513 BUG_ON(err);
2409 entry = rb_entry(n, struct btrfs_block_group_cache,
2410 cache_node);
2411 if (entry->dirty) {
2412 cache = entry;
2413 break;
2414 }
2415 } 2514 }
2416 spin_unlock(&root->fs_info->block_group_cache_lock);
2417 2515
2418 if (!cache) 2516 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2419 break; 2517 while (cache) {
2518 if (cache->dirty)
2519 break;
2520 cache = next_block_group(root, cache);
2521 }
2522 if (!cache) {
2523 if (last == 0)
2524 break;
2525 last = 0;
2526 continue;
2527 }
2420 2528
2421 cache->dirty = 0; 2529 cache->dirty = 0;
2422 last += cache->key.offset; 2530 last = cache->key.objectid + cache->key.offset;
2423 2531
2424 err = write_one_cache_group(trans, root, 2532 err = write_one_cache_group(trans, root, path, cache);
2425 path, cache); 2533 BUG_ON(err);
2426 /* 2534 btrfs_put_block_group(cache);
2427 * if we fail to write the cache group, we want
2428 * to keep it marked dirty in hopes that a later
2429 * write will work
2430 */
2431 if (err) {
2432 werr = err;
2433 continue;
2434 }
2435 } 2535 }
2536
2436 btrfs_free_path(path); 2537 btrfs_free_path(path);
2437 return werr; 2538 return 0;
2438} 2539}
2439 2540
2440int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2541int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2484,6 +2585,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2484 found->force_alloc = 0; 2585 found->force_alloc = 0;
2485 *space_info = found; 2586 *space_info = found;
2486 list_add_rcu(&found->list, &info->space_info); 2587 list_add_rcu(&found->list, &info->space_info);
2588 atomic_set(&found->caching_threads, 0);
2487 return 0; 2589 return 0;
2488} 2590}
2489 2591
@@ -2947,13 +3049,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2947 struct btrfs_block_group_cache *cache; 3049 struct btrfs_block_group_cache *cache;
2948 struct btrfs_fs_info *fs_info = root->fs_info; 3050 struct btrfs_fs_info *fs_info = root->fs_info;
2949 3051
2950 if (pin) { 3052 if (pin)
2951 set_extent_dirty(&fs_info->pinned_extents, 3053 set_extent_dirty(&fs_info->pinned_extents,
2952 bytenr, bytenr + num - 1, GFP_NOFS); 3054 bytenr, bytenr + num - 1, GFP_NOFS);
2953 } else {
2954 clear_extent_dirty(&fs_info->pinned_extents,
2955 bytenr, bytenr + num - 1, GFP_NOFS);
2956 }
2957 3055
2958 while (num > 0) { 3056 while (num > 0) {
2959 cache = btrfs_lookup_block_group(fs_info, bytenr); 3057 cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2969,14 +3067,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2969 spin_unlock(&cache->space_info->lock); 3067 spin_unlock(&cache->space_info->lock);
2970 fs_info->total_pinned += len; 3068 fs_info->total_pinned += len;
2971 } else { 3069 } else {
3070 int unpin = 0;
3071
3072 /*
3073 * in order to not race with the block group caching, we
3074 * only want to unpin the extent if we are cached. If
3075 * we aren't cached, we want to start async caching this
3076 * block group so we can free the extent the next time
3077 * around.
3078 */
2972 spin_lock(&cache->space_info->lock); 3079 spin_lock(&cache->space_info->lock);
2973 spin_lock(&cache->lock); 3080 spin_lock(&cache->lock);
2974 cache->pinned -= len; 3081 unpin = (cache->cached == BTRFS_CACHE_FINISHED);
2975 cache->space_info->bytes_pinned -= len; 3082 if (likely(unpin)) {
3083 cache->pinned -= len;
3084 cache->space_info->bytes_pinned -= len;
3085 fs_info->total_pinned -= len;
3086 }
2976 spin_unlock(&cache->lock); 3087 spin_unlock(&cache->lock);
2977 spin_unlock(&cache->space_info->lock); 3088 spin_unlock(&cache->space_info->lock);
2978 fs_info->total_pinned -= len; 3089
2979 if (cache->cached) 3090 if (likely(unpin))
3091 clear_extent_dirty(&fs_info->pinned_extents,
3092 bytenr, bytenr + len -1,
3093 GFP_NOFS);
3094 else
3095 cache_block_group(cache);
3096
3097 if (unpin)
2980 btrfs_add_free_space(cache, bytenr, len); 3098 btrfs_add_free_space(cache, bytenr, len);
2981 } 3099 }
2982 btrfs_put_block_group(cache); 3100 btrfs_put_block_group(cache);
@@ -3030,6 +3148,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
3030 &start, &end, EXTENT_DIRTY); 3148 &start, &end, EXTENT_DIRTY);
3031 if (ret) 3149 if (ret)
3032 break; 3150 break;
3151
3033 set_extent_dirty(copy, start, end, GFP_NOFS); 3152 set_extent_dirty(copy, start, end, GFP_NOFS);
3034 last = end + 1; 3153 last = end + 1;
3035 } 3154 }
@@ -3058,6 +3177,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3058 3177
3059 cond_resched(); 3178 cond_resched();
3060 } 3179 }
3180
3061 return ret; 3181 return ret;
3062} 3182}
3063 3183
@@ -3436,6 +3556,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
3436} 3556}
3437 3557
3438/* 3558/*
3559 * when we wait for progress in the block group caching, its because
3560 * our allocation attempt failed at least once. So, we must sleep
3561 * and let some progress happen before we try again.
3562 *
3563 * This function will sleep at least once waiting for new free space to
3564 * show up, and then it will check the block group free space numbers
3565 * for our min num_bytes. Another option is to have it go ahead
3566 * and look in the rbtree for a free extent of a given size, but this
3567 * is a good start.
3568 */
3569static noinline int
3570wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3571 u64 num_bytes)
3572{
3573 DEFINE_WAIT(wait);
3574
3575 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3576
3577 if (block_group_cache_done(cache)) {
3578 finish_wait(&cache->caching_q, &wait);
3579 return 0;
3580 }
3581 schedule();
3582 finish_wait(&cache->caching_q, &wait);
3583
3584 wait_event(cache->caching_q, block_group_cache_done(cache) ||
3585 (cache->free_space >= num_bytes));
3586 return 0;
3587}
3588
3589enum btrfs_loop_type {
3590 LOOP_CACHED_ONLY = 0,
3591 LOOP_CACHING_NOWAIT = 1,
3592 LOOP_CACHING_WAIT = 2,
3593 LOOP_ALLOC_CHUNK = 3,
3594 LOOP_NO_EMPTY_SIZE = 4,
3595};
3596
3597/*
3439 * walks the btree of allocated extents and find a hole of a given size. 3598 * walks the btree of allocated extents and find a hole of a given size.
3440 * The key ins is changed to record the hole: 3599 * The key ins is changed to record the hole:
3441 * ins->objectid == block start 3600 * ins->objectid == block start
@@ -3460,6 +3619,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3460 struct btrfs_space_info *space_info; 3619 struct btrfs_space_info *space_info;
3461 int last_ptr_loop = 0; 3620 int last_ptr_loop = 0;
3462 int loop = 0; 3621 int loop = 0;
3622 bool found_uncached_bg = false;
3463 3623
3464 WARN_ON(num_bytes < root->sectorsize); 3624 WARN_ON(num_bytes < root->sectorsize);
3465 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 3625 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
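
The find_free_extent() retry counter is no longer a bare 0..3; the LOOP_* values added above name each escalation step, from scanning only fully cached block groups up to retrying with empty_size and empty_cluster forced to zero. Stripped to its shape, the control flow looks like this (purely illustrative; the demo_ helpers are stand-ins):

	#include <linux/errno.h>

	enum demo_loop_type {
		DEMO_LOOP_CACHED_ONLY = 0,
		DEMO_LOOP_CACHING_NOWAIT,
		DEMO_LOOP_CACHING_WAIT,
		DEMO_LOOP_ALLOC_CHUNK,
		DEMO_LOOP_NO_EMPTY_SIZE,
	};

	/* Stand-in for one pass over the block groups at a given strictness. */
	static int demo_try_alloc(int loop)
	{
		return -1;			/* always "not found" in this sketch */
	}

	static int demo_allocate(void)
	{
		int loop;

		for (loop = DEMO_LOOP_CACHED_ONLY;
		     loop <= DEMO_LOOP_NO_EMPTY_SIZE; loop++) {
			if (demo_try_alloc(loop) == 0)
				return 0;
			/* LOOP_CACHING_WAIT would block on the caching thread,
			 * LOOP_ALLOC_CHUNK would force a chunk allocation,
			 * LOOP_NO_EMPTY_SIZE drops the cluster/empty_size slack. */
		}
		return -ENOSPC;
	}
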
@@ -3491,15 +3651,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3491 search_start = max(search_start, first_logical_byte(root, 0)); 3651 search_start = max(search_start, first_logical_byte(root, 0));
3492 search_start = max(search_start, hint_byte); 3652 search_start = max(search_start, hint_byte);
3493 3653
3494 if (!last_ptr) { 3654 if (!last_ptr)
3495 empty_cluster = 0; 3655 empty_cluster = 0;
3496 loop = 1;
3497 }
3498 3656
3499 if (search_start == hint_byte) { 3657 if (search_start == hint_byte) {
3500 block_group = btrfs_lookup_block_group(root->fs_info, 3658 block_group = btrfs_lookup_block_group(root->fs_info,
3501 search_start); 3659 search_start);
3502 if (block_group && block_group_bits(block_group, data)) { 3660 /*
3661 * we don't want to use the block group if it doesn't match our
3662 * allocation bits, or if its not cached.
3663 */
3664 if (block_group && block_group_bits(block_group, data) &&
3665 block_group_cache_done(block_group)) {
3503 down_read(&space_info->groups_sem); 3666 down_read(&space_info->groups_sem);
3504 if (list_empty(&block_group->list) || 3667 if (list_empty(&block_group->list) ||
3505 block_group->ro) { 3668 block_group->ro) {
@@ -3522,21 +3685,35 @@ search:
3522 down_read(&space_info->groups_sem); 3685 down_read(&space_info->groups_sem);
3523 list_for_each_entry(block_group, &space_info->block_groups, list) { 3686 list_for_each_entry(block_group, &space_info->block_groups, list) {
3524 u64 offset; 3687 u64 offset;
3688 int cached;
3525 3689
3526 atomic_inc(&block_group->count); 3690 atomic_inc(&block_group->count);
3527 search_start = block_group->key.objectid; 3691 search_start = block_group->key.objectid;
3528 3692
3529have_block_group: 3693have_block_group:
3530 if (unlikely(!block_group->cached)) { 3694 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
3531 mutex_lock(&block_group->cache_mutex); 3695 /*
3532 ret = cache_block_group(root, block_group); 3696 * we want to start caching kthreads, but not too many
3533 mutex_unlock(&block_group->cache_mutex); 3697 * right off the bat so we don't overwhelm the system,
3534 if (ret) { 3698 * so only start them if there are less than 2 and we're
3535 btrfs_put_block_group(block_group); 3699 * in the initial allocation phase.
3536 break; 3700 */
3701 if (loop > LOOP_CACHING_NOWAIT ||
3702 atomic_read(&space_info->caching_threads) < 2) {
3703 ret = cache_block_group(block_group);
3704 BUG_ON(ret);
3537 } 3705 }
3538 } 3706 }
3539 3707
3708 cached = block_group_cache_done(block_group);
3709 if (unlikely(!cached)) {
3710 found_uncached_bg = true;
3711
3712 /* if we only want cached bgs, loop */
3713 if (loop == LOOP_CACHED_ONLY)
3714 goto loop;
3715 }
3716
3540 if (unlikely(block_group->ro)) 3717 if (unlikely(block_group->ro))
3541 goto loop; 3718 goto loop;
3542 3719
@@ -3615,14 +3792,21 @@ refill_cluster:
3615 spin_unlock(&last_ptr->refill_lock); 3792 spin_unlock(&last_ptr->refill_lock);
3616 goto checks; 3793 goto checks;
3617 } 3794 }
3795 } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3796 spin_unlock(&last_ptr->refill_lock);
3797
3798 wait_block_group_cache_progress(block_group,
3799 num_bytes + empty_cluster + empty_size);
3800 goto have_block_group;
3618 } 3801 }
3802
3619 /* 3803 /*
3620 * at this point we either didn't find a cluster 3804 * at this point we either didn't find a cluster
3621 * or we weren't able to allocate a block from our 3805 * or we weren't able to allocate a block from our
3622 * cluster. Free the cluster we've been trying 3806 * cluster. Free the cluster we've been trying
3623 * to use, and go to the next block group 3807 * to use, and go to the next block group
3624 */ 3808 */
3625 if (loop < 2) { 3809 if (loop < LOOP_NO_EMPTY_SIZE) {
3626 btrfs_return_cluster_to_free_space(NULL, 3810 btrfs_return_cluster_to_free_space(NULL,
3627 last_ptr); 3811 last_ptr);
3628 spin_unlock(&last_ptr->refill_lock); 3812 spin_unlock(&last_ptr->refill_lock);
@@ -3633,11 +3817,17 @@ refill_cluster:
3633 3817
3634 offset = btrfs_find_space_for_alloc(block_group, search_start, 3818 offset = btrfs_find_space_for_alloc(block_group, search_start,
3635 num_bytes, empty_size); 3819 num_bytes, empty_size);
3636 if (!offset) 3820 if (!offset && (cached || (!cached &&
3821 loop == LOOP_CACHING_NOWAIT))) {
3637 goto loop; 3822 goto loop;
3823 } else if (!offset && (!cached &&
3824 loop > LOOP_CACHING_NOWAIT)) {
3825 wait_block_group_cache_progress(block_group,
3826 num_bytes + empty_size);
3827 goto have_block_group;
3828 }
3638checks: 3829checks:
3639 search_start = stripe_align(root, offset); 3830 search_start = stripe_align(root, offset);
3640
3641 /* move on to the next group */ 3831 /* move on to the next group */
3642 if (search_start + num_bytes >= search_end) { 3832 if (search_start + num_bytes >= search_end) {
3643 btrfs_add_free_space(block_group, offset, num_bytes); 3833 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3683,13 +3873,26 @@ loop:
3683 } 3873 }
3684 up_read(&space_info->groups_sem); 3874 up_read(&space_info->groups_sem);
3685 3875
3686 /* loop == 0, try to find a clustered alloc in every block group 3876 /* LOOP_CACHED_ONLY, only search fully cached block groups
3687 * loop == 1, try again after forcing a chunk allocation 3877 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
3688 * loop == 2, set empty_size and empty_cluster to 0 and try again 3878 * don't wait for them to finish caching
3879 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3880 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3881 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3882 * again
3689 */ 3883 */
3690 if (!ins->objectid && loop < 3 && 3884 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
3691 (empty_size || empty_cluster || allowed_chunk_alloc)) { 3885 (found_uncached_bg || empty_size || empty_cluster ||
3692 if (loop >= 2) { 3886 allowed_chunk_alloc)) {
3887 if (found_uncached_bg) {
3888 found_uncached_bg = false;
3889 if (loop < LOOP_CACHING_WAIT) {
3890 loop++;
3891 goto search;
3892 }
3893 }
3894
3895 if (loop == LOOP_ALLOC_CHUNK) {
3693 empty_size = 0; 3896 empty_size = 0;
3694 empty_cluster = 0; 3897 empty_cluster = 0;
3695 } 3898 }
@@ -3702,7 +3905,7 @@ loop:
3702 space_info->force_alloc = 1; 3905 space_info->force_alloc = 1;
3703 } 3906 }
3704 3907
3705 if (loop < 3) { 3908 if (loop < LOOP_NO_EMPTY_SIZE) {
3706 loop++; 3909 loop++;
3707 goto search; 3910 goto search;
3708 } 3911 }
@@ -3798,7 +4001,7 @@ again:
3798 num_bytes, data, 1); 4001 num_bytes, data, 1);
3799 goto again; 4002 goto again;
3800 } 4003 }
3801 if (ret) { 4004 if (ret == -ENOSPC) {
3802 struct btrfs_space_info *sinfo; 4005 struct btrfs_space_info *sinfo;
3803 4006
3804 sinfo = __find_space_info(root->fs_info, data); 4007 sinfo = __find_space_info(root->fs_info, data);
@@ -3806,7 +4009,6 @@ again:
3806 "wanted %llu\n", (unsigned long long)data, 4009 "wanted %llu\n", (unsigned long long)data,
3807 (unsigned long long)num_bytes); 4010 (unsigned long long)num_bytes);
3808 dump_space_info(sinfo, num_bytes); 4011 dump_space_info(sinfo, num_bytes);
3809 BUG();
3810 } 4012 }
3811 4013
3812 return ret; 4014 return ret;
@@ -3844,7 +4046,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3844 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, 4046 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3845 empty_size, hint_byte, search_end, ins, 4047 empty_size, hint_byte, search_end, ins,
3846 data); 4048 data);
3847 update_reserved_extents(root, ins->objectid, ins->offset, 1); 4049 if (!ret)
4050 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4051
3848 return ret; 4052 return ret;
3849} 4053}
3850 4054
@@ -4006,9 +4210,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4006 struct btrfs_block_group_cache *block_group; 4210 struct btrfs_block_group_cache *block_group;
4007 4211
4008 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4212 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4009 mutex_lock(&block_group->cache_mutex); 4213 cache_block_group(block_group);
4010 cache_block_group(root, block_group); 4214 wait_event(block_group->caching_q,
4011 mutex_unlock(&block_group->cache_mutex); 4215 block_group_cache_done(block_group));
4012 4216
4013 ret = btrfs_remove_free_space(block_group, ins->objectid, 4217 ret = btrfs_remove_free_space(block_group, ins->objectid,
4014 ins->offset); 4218 ins->offset);
@@ -4039,7 +4243,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4039 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4243 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4040 empty_size, hint_byte, search_end, 4244 empty_size, hint_byte, search_end,
4041 ins, 0); 4245 ins, 0);
4042 BUG_ON(ret); 4246 if (ret)
4247 return ret;
4043 4248
4044 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4249 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4045 if (parent == 0) 4250 if (parent == 0)
@@ -6955,11 +7160,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6955 &info->block_group_cache_tree); 7160 &info->block_group_cache_tree);
6956 spin_unlock(&info->block_group_cache_lock); 7161 spin_unlock(&info->block_group_cache_lock);
6957 7162
6958 btrfs_remove_free_space_cache(block_group);
6959 down_write(&block_group->space_info->groups_sem); 7163 down_write(&block_group->space_info->groups_sem);
6960 list_del(&block_group->list); 7164 list_del(&block_group->list);
6961 up_write(&block_group->space_info->groups_sem); 7165 up_write(&block_group->space_info->groups_sem);
6962 7166
7167 if (block_group->cached == BTRFS_CACHE_STARTED)
7168 wait_event(block_group->caching_q,
7169 block_group_cache_done(block_group));
7170
7171 btrfs_remove_free_space_cache(block_group);
7172
6963 WARN_ON(atomic_read(&block_group->count) != 1); 7173 WARN_ON(atomic_read(&block_group->count) != 1);
6964 kfree(block_group); 7174 kfree(block_group);
6965 7175
@@ -7025,9 +7235,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7025 atomic_set(&cache->count, 1); 7235 atomic_set(&cache->count, 1);
7026 spin_lock_init(&cache->lock); 7236 spin_lock_init(&cache->lock);
7027 spin_lock_init(&cache->tree_lock); 7237 spin_lock_init(&cache->tree_lock);
7028 mutex_init(&cache->cache_mutex); 7238 cache->fs_info = info;
7239 init_waitqueue_head(&cache->caching_q);
7029 INIT_LIST_HEAD(&cache->list); 7240 INIT_LIST_HEAD(&cache->list);
7030 INIT_LIST_HEAD(&cache->cluster_list); 7241 INIT_LIST_HEAD(&cache->cluster_list);
7242
7243 /*
7244 * we only want to have 32k of ram per block group for keeping
7245 * track of free space, and if we pass 1/2 of that we want to
7246 * start converting things over to using bitmaps
7247 */
7248 cache->extents_thresh = ((1024 * 32) / 2) /
7249 sizeof(struct btrfs_free_space);
7250
7031 read_extent_buffer(leaf, &cache->item, 7251 read_extent_buffer(leaf, &cache->item,
7032 btrfs_item_ptr_offset(leaf, path->slots[0]), 7252 btrfs_item_ptr_offset(leaf, path->slots[0]),
7033 sizeof(cache->item)); 7253 sizeof(cache->item));
@@ -7036,6 +7256,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7036 key.objectid = found_key.objectid + found_key.offset; 7256 key.objectid = found_key.objectid + found_key.offset;
7037 btrfs_release_path(root, path); 7257 btrfs_release_path(root, path);
7038 cache->flags = btrfs_block_group_flags(&cache->item); 7258 cache->flags = btrfs_block_group_flags(&cache->item);
7259 cache->sectorsize = root->sectorsize;
7260
7261 remove_sb_from_cache(root, cache);
7262
7263 /*
7264 * check for two cases, either we are full, and therefore
7265 * don't need to bother with the caching work since we won't
7266 * find any space, or we are empty, and we can just add all
7267 * the space in and be done with it. This saves us _alot_ of
7268 * time, particularly in the full case.
7269 */
7270 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7271 cache->cached = BTRFS_CACHE_FINISHED;
7272 } else if (btrfs_block_group_used(&cache->item) == 0) {
7273 cache->cached = BTRFS_CACHE_FINISHED;
7274 add_new_free_space(cache, root->fs_info,
7275 found_key.objectid,
7276 found_key.objectid +
7277 found_key.offset);
7278 }
7039 7279
7040 ret = update_space_info(info, cache->flags, found_key.offset, 7280 ret = update_space_info(info, cache->flags, found_key.offset,
7041 btrfs_block_group_used(&cache->item), 7281 btrfs_block_group_used(&cache->item),
@@ -7079,10 +7319,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7079 cache->key.objectid = chunk_offset; 7319 cache->key.objectid = chunk_offset;
7080 cache->key.offset = size; 7320 cache->key.offset = size;
7081 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7321 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7322 cache->sectorsize = root->sectorsize;
7323
7324 /*
7325 * we only want to have 32k of ram per block group for keeping track
7326 * of free space, and if we pass 1/2 of that we want to start
7327 * converting things over to using bitmaps
7328 */
7329 cache->extents_thresh = ((1024 * 32) / 2) /
7330 sizeof(struct btrfs_free_space);
7082 atomic_set(&cache->count, 1); 7331 atomic_set(&cache->count, 1);
7083 spin_lock_init(&cache->lock); 7332 spin_lock_init(&cache->lock);
7084 spin_lock_init(&cache->tree_lock); 7333 spin_lock_init(&cache->tree_lock);
7085 mutex_init(&cache->cache_mutex); 7334 init_waitqueue_head(&cache->caching_q);
7086 INIT_LIST_HEAD(&cache->list); 7335 INIT_LIST_HEAD(&cache->list);
7087 INIT_LIST_HEAD(&cache->cluster_list); 7336 INIT_LIST_HEAD(&cache->cluster_list);
7088 7337
@@ -7091,6 +7340,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7091 cache->flags = type; 7340 cache->flags = type;
7092 btrfs_set_block_group_flags(&cache->item, type); 7341 btrfs_set_block_group_flags(&cache->item, type);
7093 7342
7343 cache->cached = BTRFS_CACHE_FINISHED;
7344 remove_sb_from_cache(root, cache);
7345
7346 add_new_free_space(cache, root->fs_info, chunk_offset,
7347 chunk_offset + size);
7348
7094 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7349 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7095 &cache->space_info); 7350 &cache->space_info);
7096 BUG_ON(ret); 7351 BUG_ON(ret);
@@ -7149,7 +7404,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7149 rb_erase(&block_group->cache_node, 7404 rb_erase(&block_group->cache_node,
7150 &root->fs_info->block_group_cache_tree); 7405 &root->fs_info->block_group_cache_tree);
7151 spin_unlock(&root->fs_info->block_group_cache_lock); 7406 spin_unlock(&root->fs_info->block_group_cache_lock);
7152 btrfs_remove_free_space_cache(block_group); 7407
7153 down_write(&block_group->space_info->groups_sem); 7408 down_write(&block_group->space_info->groups_sem);
7154 /* 7409 /*
7155 * we must use list_del_init so people can check to see if they 7410 * we must use list_del_init so people can check to see if they
@@ -7158,11 +7413,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7158 list_del_init(&block_group->list); 7413 list_del_init(&block_group->list);
7159 up_write(&block_group->space_info->groups_sem); 7414 up_write(&block_group->space_info->groups_sem);
7160 7415
7416 if (block_group->cached == BTRFS_CACHE_STARTED)
7417 wait_event(block_group->caching_q,
7418 block_group_cache_done(block_group));
7419
7420 btrfs_remove_free_space_cache(block_group);
7421
7161 spin_lock(&block_group->space_info->lock); 7422 spin_lock(&block_group->space_info->lock);
7162 block_group->space_info->total_bytes -= block_group->key.offset; 7423 block_group->space_info->total_bytes -= block_group->key.offset;
7163 block_group->space_info->bytes_readonly -= block_group->key.offset; 7424 block_group->space_info->bytes_readonly -= block_group->key.offset;
7164 spin_unlock(&block_group->space_info->lock); 7425 spin_unlock(&block_group->space_info->lock);
7165 block_group->space_info->full = 0; 7426
7427 btrfs_clear_space_info_full(root->fs_info);
7166 7428
7167 btrfs_put_block_group(block_group); 7429 btrfs_put_block_group(block_group);
7168 btrfs_put_block_group(block_group); 7430 btrfs_put_block_group(block_group);
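
Taken together, the extent-tree.c changes move block group caching into a kernel thread: cache_block_group() starts a "btrfs-cache-<objectid>" kthread, the thread publishes progress through caching_q, and allocators either skip uncached groups or wait on that queue. The bare synchronisation skeleton is roughly the following (hypothetical names, heavily simplified):

	#include <linux/kthread.h>
	#include <linux/wait.h>
	#include <linux/err.h>
	#include <linux/types.h>

	struct demo_group {
		wait_queue_head_t caching_q;
		int cached;			/* 0 = caching, 1 = finished */
		u64 free_space;
	};

	/* Stand-in for one scan pass over the extent tree; returns nonzero
	 * once the whole block group has been processed. */
	static int demo_scan_some(struct demo_group *g)
	{
		g->free_space += 4096;
		return g->free_space >= (1024 * 1024);
	}

	/* Background side: publish progress and wake waiters periodically,
	 * then once more when finished, as caching_kthread() does above. */
	static int demo_caching_thread(void *data)
	{
		struct demo_group *g = data;

		while (!demo_scan_some(g))
			wake_up(&g->caching_q);

		g->cached = 1;
		smp_mb();
		wake_up(&g->caching_q);
		return 0;
	}

	/* Allocator side: sleep until caching is done or enough free space
	 * has shown up, like wait_block_group_cache_progress() above. */
	static void demo_wait_for_space(struct demo_group *g, u64 bytes)
	{
		wait_event(g->caching_q, g->cached || g->free_space >= bytes);
	}

	static int demo_start(struct demo_group *g)
	{
		struct task_struct *tsk;

		init_waitqueue_head(&g->caching_q);
		tsk = kthread_run(demo_caching_thread, g, "demo-cache");
		return IS_ERR(tsk) ? PTR_ERR(tsk) : 0;
	}
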
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..af99b78b288e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/pagemap.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h>
20#include "ctree.h" 22#include "ctree.h"
21#include "free-space-cache.h" 23#include "free-space-cache.h"
22#include "transaction.h" 24#include "transaction.h"
23 25
24struct btrfs_free_space { 26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
25 struct rb_node bytes_index; 27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
30 28
31static int tree_insert_offset(struct rb_root *root, u64 offset, 29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
32 struct rb_node *node) 30 u64 offset)
33{ 31{
34 struct rb_node **p = &root->rb_node; 32 BUG_ON(offset < bitmap_start);
35 struct rb_node *parent = NULL; 33 offset -= bitmap_start;
36 struct btrfs_free_space *info; 34 return (unsigned long)(div64_u64(offset, sectorsize));
35}
37 36
38 while (*p) { 37static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
39 parent = *p; 38{
40 info = rb_entry(parent, struct btrfs_free_space, offset_index); 39 return (unsigned long)(div64_u64(bytes, sectorsize));
40}
41 41
42 if (offset < info->offset) 42static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 p = &(*p)->rb_left; 43 u64 offset)
44 else if (offset > info->offset) 44{
45 p = &(*p)->rb_right; 45 u64 bitmap_start;
46 else 46 u64 bytes_per_bitmap;
47 return -EEXIST;
48 }
49 47
50 rb_link_node(node, parent, p); 48 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
51 rb_insert_color(node, root); 49 bitmap_start = offset - block_group->key.objectid;
50 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 bitmap_start *= bytes_per_bitmap;
52 bitmap_start += block_group->key.objectid;
52 53
53 return 0; 54 return bitmap_start;
54} 55}
55 56
56static int tree_insert_bytes(struct rb_root *root, u64 bytes, 57static int tree_insert_offset(struct rb_root *root, u64 offset,
57 struct rb_node *node) 58 struct rb_node *node, int bitmap)
58{ 59{
59 struct rb_node **p = &root->rb_node; 60 struct rb_node **p = &root->rb_node;
60 struct rb_node *parent = NULL; 61 struct rb_node *parent = NULL;
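The helpers added in the hunk above (offset_to_bit(), bytes_to_bits(), offset_to_bitmap()) are plain integer arithmetic: each bitmap covers BITS_PER_BITMAP * sectorsize bytes, and a byte offset maps to the bitmap window it falls in plus a bit index within it. A standalone sketch of that arithmetic, assuming a 4 KiB page size and a 4 KiB sectorsize (both assumptions for illustration, not values taken from the patch):

#include <stdio.h>
#include <stdint.h>

#define PAGE_BYTES	4096ULL			/* assumed page size */
#define BITS_PER_BITMAP	(PAGE_BYTES * 8)	/* 32768 bits per bitmap page */
#define SECTORSIZE	4096ULL			/* assumed block group sectorsize */

/* bit index of 'offset' within the bitmap that starts at 'bitmap_start' */
static unsigned long offset_to_bit(uint64_t bitmap_start, uint64_t offset)
{
	return (unsigned long)((offset - bitmap_start) / SECTORSIZE);
}

/* start of the bitmap window covering 'offset' in a block group at 'bg_start' */
static uint64_t offset_to_bitmap(uint64_t bg_start, uint64_t offset)
{
	uint64_t bytes_per_bitmap = BITS_PER_BITMAP * SECTORSIZE;	/* 128 MiB here */
	uint64_t rel = offset - bg_start;

	return bg_start + (rel / bytes_per_bitmap) * bytes_per_bitmap;
}

int main(void)
{
	uint64_t bg_start = 1024ULL * 1024 * 1024;		/* block group at 1 GiB */
	uint64_t offset = bg_start + 200ULL * 1024 * 1024 + 8192;
	uint64_t bmap = offset_to_bitmap(bg_start, offset);

	printf("bitmap window starts at %llu, offset is bit %lu\n",
	       (unsigned long long)bmap, offset_to_bit(bmap, offset));
	return 0;
}

With those sizes one bitmap page covers 128 MiB of the block group.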
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
62 63
63 while (*p) { 64 while (*p) {
64 parent = *p; 65 parent = *p;
65 info = rb_entry(parent, struct btrfs_free_space, bytes_index); 66 info = rb_entry(parent, struct btrfs_free_space, offset_index);
66 67
67 if (bytes < info->bytes) 68 if (offset < info->offset) {
68 p = &(*p)->rb_left; 69 p = &(*p)->rb_left;
69 else 70 } else if (offset > info->offset) {
70 p = &(*p)->rb_right; 71 p = &(*p)->rb_right;
72 } else {
73 /*
74 * we could have a bitmap entry and an extent entry
75 * share the same offset. If this is the case, we want
76 * the extent entry to always be found first if we do a
77 * linear search through the tree, since we want to have
78 * the quickest allocation time, and allocating from an
79 * extent is faster than allocating from a bitmap. So
80 * if we're inserting a bitmap and we find an entry at
81 * this offset, we want to go right, or after this entry
82 * logically. If we are inserting an extent and we've
83 * found a bitmap, we want to go left, or before
84 * logically.
85 */
86 if (bitmap) {
87 WARN_ON(info->bitmap);
88 p = &(*p)->rb_right;
89 } else {
90 WARN_ON(!info->bitmap);
91 p = &(*p)->rb_left;
92 }
93 }
71 } 94 }
72 95
73 rb_link_node(node, parent, p); 96 rb_link_node(node, parent, p);
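The long comment in the hunk above spells out the ordering rule: an extent entry and a bitmap entry may share an offset, and the extent entry must sort first so a linear walk reaches the cheaper allocation before the bitmap. The same tie-break expressed as a plain comparator over an invented flat structure (not the rb-tree code), just to show the ordering:

#include <stdio.h>
#include <stdlib.h>

struct fs_entry {
	unsigned long long offset;
	int is_bitmap;		/* 0 = extent entry, 1 = bitmap entry */
};

/* extent entries sort before bitmap entries that share the same offset */
static int entry_cmp(const void *pa, const void *pb)
{
	const struct fs_entry *a = pa, *b = pb;

	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return a->is_bitmap - b->is_bitmap;
}

int main(void)
{
	struct fs_entry e[] = {
		{ 4096, 1 },	/* bitmap entry at 4096 */
		{ 4096, 0 },	/* extent entry at 4096 */
		{ 0, 0 },
	};
	size_t i;

	qsort(e, 3, sizeof(e[0]), entry_cmp);
	for (i = 0; i < 3; i++)
		printf("%llu %s\n", e[i].offset,
		       e[i].is_bitmap ? "bitmap" : "extent");
	return 0;
}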
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
79/* 102/*
80 * searches the tree for the given offset. 103 * searches the tree for the given offset.
81 * 104 *
82 * fuzzy == 1: this is used for allocations where we are given a hint of where 105 * fuzzy - If this is set, then we are trying to make an allocation, and we just
83 * to look for free space. Because the hint may not be completely on an offset 106 * want a section that has at least bytes size and comes at or after the given
84 * mark, or the hint may no longer point to free space we need to fudge our 107 * offset.
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
94 */ 108 */
95static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 109static struct btrfs_free_space *
96 u64 offset, u64 bytes, 110tree_search_offset(struct btrfs_block_group_cache *block_group,
97 int fuzzy) 111 u64 offset, int bitmap_only, int fuzzy)
98{ 112{
99 struct rb_node *n = root->rb_node; 113 struct rb_node *n = block_group->free_space_offset.rb_node;
100 struct btrfs_free_space *entry, *ret = NULL; 114 struct btrfs_free_space *entry, *prev = NULL;
115
116 /* find entry that is closest to the 'offset' */
117 while (1) {
118 if (!n) {
119 entry = NULL;
120 break;
121 }
101 122
102 while (n) {
103 entry = rb_entry(n, struct btrfs_free_space, offset_index); 123 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 prev = entry;
104 125
105 if (offset < entry->offset) { 126 if (offset < entry->offset)
106 if (fuzzy &&
107 (!ret || entry->offset < ret->offset) &&
108 (bytes <= entry->bytes))
109 ret = entry;
110 n = n->rb_left; 127 n = n->rb_left;
111 } else if (offset > entry->offset) { 128 else if (offset > entry->offset)
112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
114 bytes <= entry->bytes) {
115 ret = entry;
116 break;
117 }
118 n = n->rb_right; 129 n = n->rb_right;
119 } else { 130 else
120 if (bytes > entry->bytes) {
121 n = n->rb_right;
122 continue;
123 }
124 ret = entry;
125 break; 131 break;
126 }
127 } 132 }
128 133
129 return ret; 134 if (bitmap_only) {
130} 135 if (!entry)
131 136 return NULL;
132/* 137 if (entry->bitmap)
133 * return a chunk at least bytes size, as close to offset that we can get. 138 return entry;
134 */
135static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
136 u64 offset, u64 bytes)
137{
138 struct rb_node *n = root->rb_node;
139 struct btrfs_free_space *entry, *ret = NULL;
140 139
141 while (n) { 140 /*
142 entry = rb_entry(n, struct btrfs_free_space, bytes_index); 141 * bitmap entry and extent entry may share same offset,
142 * in that case, bitmap entry comes after extent entry.
143 */
144 n = rb_next(n);
145 if (!n)
146 return NULL;
147 entry = rb_entry(n, struct btrfs_free_space, offset_index);
148 if (entry->offset != offset)
149 return NULL;
143 150
144 if (bytes < entry->bytes) { 151 WARN_ON(!entry->bitmap);
152 return entry;
153 } else if (entry) {
154 if (entry->bitmap) {
145 /* 155 /*
146 * We prefer to get a hole size as close to the size we 156 * if previous extent entry covers the offset,
147 * are asking for so we don't take small slivers out of 157 * we should return it instead of the bitmap entry
148 * huge holes, but we also want to get as close to the
149 * offset as possible so we don't have a whole lot of
150 * fragmentation.
151 */ 158 */
152 if (offset <= entry->offset) { 159 n = &entry->offset_index;
153 if (!ret) 160 while (1) {
154 ret = entry; 161 n = rb_prev(n);
155 else if (entry->bytes < ret->bytes) 162 if (!n)
156 ret = entry; 163 break;
157 else if (entry->offset < ret->offset) 164 prev = rb_entry(n, struct btrfs_free_space,
158 ret = entry; 165 offset_index);
166 if (!prev->bitmap) {
167 if (prev->offset + prev->bytes > offset)
168 entry = prev;
169 break;
170 }
159 } 171 }
160 n = n->rb_left; 172 }
161 } else if (bytes > entry->bytes) { 173 return entry;
162 n = n->rb_right; 174 }
175
176 if (!prev)
177 return NULL;
178
179 /* find last entry before the 'offset' */
180 entry = prev;
181 if (entry->offset > offset) {
182 n = rb_prev(&entry->offset_index);
183 if (n) {
184 entry = rb_entry(n, struct btrfs_free_space,
185 offset_index);
186 BUG_ON(entry->offset > offset);
163 } else { 187 } else {
164 /* 188 if (fuzzy)
165 * Ok we may have multiple chunks of the wanted size, 189 return entry;
166 * so we don't want to take the first one we find, we 190 else
167 * want to take the one closest to our given offset, so 191 return NULL;
168 * keep searching just in case theres a better match.
169 */
170 n = n->rb_right;
171 if (offset > entry->offset)
172 continue;
173 else if (!ret || entry->offset < ret->offset)
174 ret = entry;
175 } 192 }
176 } 193 }
177 194
178 return ret; 195 if (entry->bitmap) {
196 n = &entry->offset_index;
197 while (1) {
198 n = rb_prev(n);
199 if (!n)
200 break;
201 prev = rb_entry(n, struct btrfs_free_space,
202 offset_index);
203 if (!prev->bitmap) {
204 if (prev->offset + prev->bytes > offset)
205 return prev;
206 break;
207 }
208 }
209 if (entry->offset + BITS_PER_BITMAP *
210 block_group->sectorsize > offset)
211 return entry;
212 } else if (entry->offset + entry->bytes > offset)
213 return entry;
214
215 if (!fuzzy)
216 return NULL;
217
218 while (1) {
219 if (entry->bitmap) {
220 if (entry->offset + BITS_PER_BITMAP *
221 block_group->sectorsize > offset)
222 break;
223 } else {
224 if (entry->offset + entry->bytes > offset)
225 break;
226 }
227
228 n = rb_next(&entry->offset_index);
229 if (!n)
230 return NULL;
231 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 }
233 return entry;
179} 234}
180 235
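The rewritten tree_search_offset() above repeatedly asks whether an entry still reaches past the requested offset, where a bitmap entry spans a fixed BITS_PER_BITMAP * sectorsize window and an extent entry spans exactly its byte count. A small sketch of that predicate, with the struct and the 4 KiB sizes invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define PAGE_BYTES	4096ULL
#define BITS_PER_BITMAP	(PAGE_BYTES * 8)
#define SECTORSIZE	4096ULL			/* assumed */

struct fs_entry {
	uint64_t offset;
	uint64_t bytes;		/* meaningful for extent entries only */
	int is_bitmap;
};

/* does this entry still reach past 'offset'?  A bitmap entry spans a fixed
 * BITS_PER_BITMAP * sectorsize window, an extent entry spans its byte count */
static int entry_covers(const struct fs_entry *e, uint64_t offset)
{
	uint64_t end = e->is_bitmap ?
		e->offset + BITS_PER_BITMAP * SECTORSIZE :
		e->offset + e->bytes;

	return e->offset <= offset && offset < end;
}

int main(void)
{
	struct fs_entry bitmap = { 0, 0, 1 };	/* covers 0 .. 128 MiB */
	struct fs_entry extent = { 0, 8192, 0 };

	printf("%d %d\n", entry_covers(&bitmap, 1 << 20),
	       entry_covers(&extent, 1 << 20));
	return 0;
}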
181static void unlink_free_space(struct btrfs_block_group_cache *block_group, 236static void unlink_free_space(struct btrfs_block_group_cache *block_group,
182 struct btrfs_free_space *info) 237 struct btrfs_free_space *info)
183{ 238{
184 rb_erase(&info->offset_index, &block_group->free_space_offset); 239 rb_erase(&info->offset_index, &block_group->free_space_offset);
185 rb_erase(&info->bytes_index, &block_group->free_space_bytes); 240 block_group->free_extents--;
241 block_group->free_space -= info->bytes;
186} 242}
187 243
188static int link_free_space(struct btrfs_block_group_cache *block_group, 244static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,314 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
190{ 246{
191 int ret = 0; 247 int ret = 0;
192 248
193 249 BUG_ON(!info->bitmap && !info->bytes);
194 BUG_ON(!info->bytes);
195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 250 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
196 &info->offset_index); 251 &info->offset_index, (info->bitmap != NULL));
197 if (ret) 252 if (ret)
198 return ret; 253 return ret;
199 254
200 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, 255 block_group->free_space += info->bytes;
201 &info->bytes_index); 256 block_group->free_extents++;
202 if (ret) 257 return ret;
203 return ret; 258}
259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{
262 u64 max_bytes, possible_bytes;
263
264 /*
265 * The goal is to keep the total amount of memory used per 1gb of space
266 * at or below 32k, so we need to adjust how much memory we allow to be
267 * used by extent based free space tracking
268 */
269 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
273 (sizeof(struct btrfs_free_space) *
274 block_group->extents_thresh);
275
276 if (possible_bytes > max_bytes) {
277 int extent_bytes = max_bytes -
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE);
279
280 if (extent_bytes <= 0) {
281 block_group->extents_thresh = 0;
282 return;
283 }
284
285 block_group->extents_thresh = extent_bytes /
286 (sizeof(struct btrfs_free_space));
287 }
288}
289
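recalculate_thresholds() above keeps the cache to roughly MAX_CACHE_BYTES_PER_GIG (32 KiB) of memory per GiB of block group: bitmap pages are charged first, and whatever budget remains is divided among extent entries. A simplified standalone version of that arithmetic; ENTRY_SIZE stands in for sizeof(struct btrfs_free_space) and is an assumption:

#include <stdio.h>
#include <stdint.h>

#define PAGE_BYTES		4096ULL
#define MAX_CACHE_BYTES_PER_GIG	(32ULL * 1024)	/* 32 KiB of cache per GiB */
#define ENTRY_SIZE		64ULL		/* assumed entry size */

/* extent entries that still fit once 'bitmaps' bitmap pages are charged */
static uint64_t extents_thresh(uint64_t bg_bytes, uint64_t bitmaps)
{
	uint64_t max_bytes = MAX_CACHE_BYTES_PER_GIG *
			     (bg_bytes / (1024ULL * 1024 * 1024));
	uint64_t bitmap_bytes = bitmaps * PAGE_BYTES;

	if (bitmap_bytes >= max_bytes)
		return 0;
	return (max_bytes - bitmap_bytes) / ENTRY_SIZE;
}

int main(void)
{
	/* a 1 GiB block group that already carries two bitmap pages */
	printf("%llu extent entries allowed\n",
	       (unsigned long long)extents_thresh(1024ULL * 1024 * 1024, 2));
	return 0;
}

For a 1 GiB block group with two bitmap pages this leaves room for 384 extent entries under the assumed sizes.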
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
291 struct btrfs_free_space *info, u64 offset,
292 u64 bytes)
293{
294 unsigned long start, end;
295 unsigned long i;
296
297 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
298 end = start + bytes_to_bits(bytes, block_group->sectorsize);
299 BUG_ON(end > BITS_PER_BITMAP);
300
301 for (i = start; i < end; i++)
302 clear_bit(i, info->bitmap);
303
304 info->bytes -= bytes;
305 block_group->free_space -= bytes;
306}
307
308static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
309 struct btrfs_free_space *info, u64 offset,
310 u64 bytes)
311{
312 unsigned long start, end;
313 unsigned long i;
314
315 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
316 end = start + bytes_to_bits(bytes, block_group->sectorsize);
317 BUG_ON(end > BITS_PER_BITMAP);
318
319 for (i = start; i < end; i++)
320 set_bit(i, info->bitmap);
321
322 info->bytes += bytes;
323 block_group->free_space += bytes;
324}
325
326static int search_bitmap(struct btrfs_block_group_cache *block_group,
327 struct btrfs_free_space *bitmap_info, u64 *offset,
328 u64 *bytes)
329{
330 unsigned long found_bits = 0;
331 unsigned long bits, i;
332 unsigned long next_zero;
333
334 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
335 max_t(u64, *offset, bitmap_info->offset));
336 bits = bytes_to_bits(*bytes, block_group->sectorsize);
337
338 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
339 i < BITS_PER_BITMAP;
340 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
341 next_zero = find_next_zero_bit(bitmap_info->bitmap,
342 BITS_PER_BITMAP, i);
343 if ((next_zero - i) >= bits) {
344 found_bits = next_zero - i;
345 break;
346 }
347 i = next_zero;
348 }
349
350 if (found_bits) {
351 *offset = (u64)(i * block_group->sectorsize) +
352 bitmap_info->offset;
353 *bytes = (u64)(found_bits) * block_group->sectorsize;
354 return 0;
355 }
356
357 return -1;
358}
359
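search_bitmap() above walks the bitmap with find_next_bit()/find_next_zero_bit() looking for a run of set bits at least as long as the request. The same scan written against a plain byte-array bitmap so it can be compiled and poked at outside the kernel; the bitmap contents are made up:

#include <stdio.h>

#define NBITS 32

static int test_bit(const unsigned char *map, unsigned long i)
{
	return (map[i / 8] >> (i % 8)) & 1;
}

/* first run of at least 'need' consecutive set bits at or after 'start';
 * returns the start bit or -1 and reports the run length via *len */
static long find_run(const unsigned char *map, unsigned long start,
		     unsigned long need, unsigned long *len)
{
	unsigned long i = start;

	while (i < NBITS) {
		unsigned long j;

		while (i < NBITS && !test_bit(map, i))	/* skip the zero gap */
			i++;
		j = i;
		while (j < NBITS && test_bit(map, j))	/* measure the run */
			j++;
		if (j - i >= need) {
			*len = j - i;
			return (long)i;
		}
		i = j;
	}
	return -1;
}

int main(void)
{
	unsigned char map[NBITS / 8] = { 0x70, 0x00, 0xff, 0x00 };	/* bits 4-6, 16-23 */
	unsigned long len;
	long at = find_run(map, 0, 5, &len);

	printf("run of %lu bits at %ld\n", len, at);
	return 0;
}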
360static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
361 *block_group, u64 *offset,
362 u64 *bytes, int debug)
363{
364 struct btrfs_free_space *entry;
365 struct rb_node *node;
366 int ret;
367
368 if (!block_group->free_space_offset.rb_node)
369 return NULL;
370
371 entry = tree_search_offset(block_group,
372 offset_to_bitmap(block_group, *offset),
373 0, 1);
374 if (!entry)
375 return NULL;
376
377 for (node = &entry->offset_index; node; node = rb_next(node)) {
378 entry = rb_entry(node, struct btrfs_free_space, offset_index);
379 if (entry->bytes < *bytes)
380 continue;
381
382 if (entry->bitmap) {
383 ret = search_bitmap(block_group, entry, offset, bytes);
384 if (!ret)
385 return entry;
386 continue;
387 }
388
389 *offset = entry->offset;
390 *bytes = entry->bytes;
391 return entry;
392 }
393
394 return NULL;
395}
396
397static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
398 struct btrfs_free_space *info, u64 offset)
399{
400 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
401 int max_bitmaps = (int)div64_u64(block_group->key.offset +
402 bytes_per_bg - 1, bytes_per_bg);
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404
405 info->offset = offset_to_bitmap(block_group, offset);
406 link_free_space(block_group, info);
407 block_group->total_bitmaps++;
408
409 recalculate_thresholds(block_group);
410}
411
412static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
413 struct btrfs_free_space *bitmap_info,
414 u64 *offset, u64 *bytes)
415{
416 u64 end;
417
418again:
419 end = bitmap_info->offset +
420 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
421
422 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
423 bitmap_clear_bits(block_group, bitmap_info, *offset,
424 end - *offset + 1);
425 *bytes -= end - *offset + 1;
426 *offset = end + 1;
427 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
428 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
429 *bytes = 0;
430 }
431
432 if (*bytes) {
433 if (!bitmap_info->bytes) {
434 unlink_free_space(block_group, bitmap_info);
435 kfree(bitmap_info->bitmap);
436 kfree(bitmap_info);
437 block_group->total_bitmaps--;
438 recalculate_thresholds(block_group);
439 }
440
441 bitmap_info = tree_search_offset(block_group,
442 offset_to_bitmap(block_group,
443 *offset),
444 1, 0);
445 if (!bitmap_info)
446 return -EINVAL;
447
448 if (!bitmap_info->bitmap)
449 return -EAGAIN;
450
451 goto again;
452 } else if (!bitmap_info->bytes) {
453 unlink_free_space(block_group, bitmap_info);
454 kfree(bitmap_info->bitmap);
455 kfree(bitmap_info);
456 block_group->total_bitmaps--;
457 recalculate_thresholds(block_group);
458 }
459
460 return 0;
461}
462
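remove_from_bitmap() above clamps each clear operation to the end of the current bitmap and loops into the next one when the freed range straddles a bitmap boundary. A sketch of that clamping loop with a stubbed-out clear_range() and an assumed 128 MiB window per bitmap:

#include <stdio.h>
#include <stdint.h>

#define WINDOW	(128ULL * 1024 * 1024)	/* bytes covered by one bitmap (assumed) */

static void clear_range(uint64_t off, uint64_t len)
{
	printf("clear %llu .. %llu\n", (unsigned long long)off,
	       (unsigned long long)(off + len - 1));
}

/* chop the removal into per-bitmap pieces, as the loop above does */
static void remove_span(uint64_t offset, uint64_t bytes)
{
	while (bytes) {
		uint64_t win_end = (offset / WINDOW + 1) * WINDOW;
		uint64_t chunk = bytes;

		if (offset + bytes > win_end)
			chunk = win_end - offset;	/* clamp to this bitmap */

		clear_range(offset, chunk);
		offset += chunk;
		bytes -= chunk;
	}
}

int main(void)
{
	remove_span(WINDOW - 4096, 3 * 4096);	/* straddles the first boundary */
	return 0;
}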
463static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
464 struct btrfs_free_space *info)
465{
466 struct btrfs_free_space *bitmap_info;
467 int added = 0;
468 u64 bytes, offset, end;
469 int ret;
470
471 /*
472 * If we are below the extents threshold then we can add this as an
473 * extent, and don't have to deal with the bitmap
474 */
475 if (block_group->free_extents < block_group->extents_thresh &&
476 info->bytes > block_group->sectorsize * 4)
477 return 0;
478
479 /*
480 * some block groups are so tiny they can't be enveloped by a bitmap, so
481 * don't even bother to create a bitmap for this
482 */
483 if (BITS_PER_BITMAP * block_group->sectorsize >
484 block_group->key.offset)
485 return 0;
486
487 bytes = info->bytes;
488 offset = info->offset;
489
490again:
491 bitmap_info = tree_search_offset(block_group,
492 offset_to_bitmap(block_group, offset),
493 1, 0);
494 if (!bitmap_info) {
495 BUG_ON(added);
496 goto new_bitmap;
497 }
498
499 end = bitmap_info->offset +
500 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
501
502 if (offset >= bitmap_info->offset && offset + bytes > end) {
503 bitmap_set_bits(block_group, bitmap_info, offset,
504 end - offset);
505 bytes -= end - offset;
506 offset = end;
507 added = 0;
508 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
509 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
510 bytes = 0;
511 } else {
512 BUG();
513 }
514
515 if (!bytes) {
516 ret = 1;
517 goto out;
518 } else
519 goto again;
520
521new_bitmap:
522 if (info && info->bitmap) {
523 add_new_bitmap(block_group, info, offset);
524 added = 1;
525 info = NULL;
526 goto again;
527 } else {
528 spin_unlock(&block_group->tree_lock);
529
530 /* no pre-allocated info, allocate a new one */
531 if (!info) {
532 info = kzalloc(sizeof(struct btrfs_free_space),
533 GFP_NOFS);
534 if (!info) {
535 spin_lock(&block_group->tree_lock);
536 ret = -ENOMEM;
537 goto out;
538 }
539 }
540
541 /* allocate the bitmap */
542 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
543 spin_lock(&block_group->tree_lock);
544 if (!info->bitmap) {
545 ret = -ENOMEM;
546 goto out;
547 }
548 goto again;
549 }
550
551out:
552 if (info) {
553 if (info->bitmap)
554 kfree(info->bitmap);
555 kfree(info);
556 }
204 557
205 return ret; 558 return ret;
206} 559}
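Note the allocation dance in insert_into_bitmap() above: tree_lock is dropped before the GFP_NOFS allocations, retaken afterwards, and the search restarts ("goto again") because the tree may have changed in between. A userspace sketch of the same drop-allocate-retake-revalidate shape using a mutex; the shared cached_bitmap pointer is invented for the example (build with cc -pthread):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached_bitmap;		/* illustrative shared state */

static int add_bitmap(void)
{
	void *bitmap;

	pthread_mutex_lock(&tree_lock);
	while (!cached_bitmap) {
		pthread_mutex_unlock(&tree_lock);	/* never allocate under the lock */
		bitmap = calloc(1, 4096);
		pthread_mutex_lock(&tree_lock);
		if (!bitmap)
			break;				/* caller reports -ENOMEM */
		if (cached_bitmap) {			/* someone beat us to it */
			free(bitmap);
			continue;
		}
		cached_bitmap = bitmap;
	}
	pthread_mutex_unlock(&tree_lock);
	return cached_bitmap ? 0 : -1;
}

int main(void)
{
	return add_bitmap() ? 1 : 0;
}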
@@ -208,8 +561,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 561int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
209 u64 offset, u64 bytes) 562 u64 offset, u64 bytes)
210{ 563{
211 struct btrfs_free_space *right_info; 564 struct btrfs_free_space *right_info = NULL;
212 struct btrfs_free_space *left_info; 565 struct btrfs_free_space *left_info = NULL;
213 struct btrfs_free_space *info = NULL; 566 struct btrfs_free_space *info = NULL;
214 int ret = 0; 567 int ret = 0;
215 568
@@ -227,18 +580,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
227 * are adding, if there is remove that struct and add a new one to 580 * are adding, if there is remove that struct and add a new one to
228 * cover the entire range 581 * cover the entire range
229 */ 582 */
230 right_info = tree_search_offset(&block_group->free_space_offset, 583 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
231 offset+bytes, 0, 0); 584 if (right_info && rb_prev(&right_info->offset_index))
232 left_info = tree_search_offset(&block_group->free_space_offset, 585 left_info = rb_entry(rb_prev(&right_info->offset_index),
233 offset-1, 0, 1); 586 struct btrfs_free_space, offset_index);
587 else
588 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
234 589
235 if (right_info) { 590 /*
591 * If there was no extent directly to the left or right of this new
592 * extent then we know we're going to have to allocate a new extent, so
593 * before we do that see if we need to drop this into a bitmap
594 */
595 if ((!left_info || left_info->bitmap) &&
596 (!right_info || right_info->bitmap)) {
597 ret = insert_into_bitmap(block_group, info);
598
599 if (ret < 0) {
600 goto out;
601 } else if (ret) {
602 ret = 0;
603 goto out;
604 }
605 }
606
607 if (right_info && !right_info->bitmap) {
236 unlink_free_space(block_group, right_info); 608 unlink_free_space(block_group, right_info);
237 info->bytes += right_info->bytes; 609 info->bytes += right_info->bytes;
238 kfree(right_info); 610 kfree(right_info);
239 } 611 }
240 612
241 if (left_info && left_info->offset + left_info->bytes == offset) { 613 if (left_info && !left_info->bitmap &&
614 left_info->offset + left_info->bytes == offset) {
242 unlink_free_space(block_group, left_info); 615 unlink_free_space(block_group, left_info);
243 info->offset = left_info->offset; 616 info->offset = left_info->offset;
244 info->bytes += left_info->bytes; 617 info->bytes += left_info->bytes;
@@ -248,11 +621,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
248 ret = link_free_space(block_group, info); 621 ret = link_free_space(block_group, info);
249 if (ret) 622 if (ret)
250 kfree(info); 623 kfree(info);
251 624out:
252 spin_unlock(&block_group->tree_lock); 625 spin_unlock(&block_group->tree_lock);
253 626
254 if (ret) { 627 if (ret) {
255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 628 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
256 BUG_ON(ret == -EEXIST); 629 BUG_ON(ret == -EEXIST);
257 } 630 }
258 631
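btrfs_add_free_space() above first tries to glue the freed range onto an extent entry ending exactly at its start (left_info) or beginning exactly at its end (right_info), and only falls back to a bitmap when neither neighbour is a plain extent. The coalescing step in isolation, over throwaway structs:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t offset, bytes; };

/* if b starts exactly where a ends, fold b into a and report success */
static int try_merge(struct range *a, const struct range *b)
{
	if (a->offset + a->bytes != b->offset)
		return 0;
	a->bytes += b->bytes;
	return 1;
}

int main(void)
{
	struct range left  = { 0, 4096 };	/* existing free extent */
	struct range freed = { 4096, 8192 };	/* range being returned */
	struct range right = { 12288, 4096 };	/* existing free extent */

	try_merge(&left, &freed);	/* left now covers 0 .. 12287 */
	try_merge(&left, &right);	/* left now covers 0 .. 16383 */
	printf("merged: offset %llu bytes %llu\n",
	       (unsigned long long)left.offset,
	       (unsigned long long)left.bytes);
	return 0;
}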
@@ -263,40 +636,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
263 u64 offset, u64 bytes) 636 u64 offset, u64 bytes)
264{ 637{
265 struct btrfs_free_space *info; 638 struct btrfs_free_space *info;
639 struct btrfs_free_space *next_info = NULL;
266 int ret = 0; 640 int ret = 0;
267 641
268 spin_lock(&block_group->tree_lock); 642 spin_lock(&block_group->tree_lock);
269 643
270 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 644again:
271 1); 645 info = tree_search_offset(block_group, offset, 0, 0);
272 if (info && info->offset == offset) { 646 if (!info) {
273 if (info->bytes < bytes) { 647 WARN_ON(1);
274 printk(KERN_ERR "Found free space at %llu, size %llu," 648 goto out_lock;
275 "trying to use %llu\n", 649 }
276 (unsigned long long)info->offset, 650
277 (unsigned long long)info->bytes, 651 if (info->bytes < bytes && rb_next(&info->offset_index)) {
278 (unsigned long long)bytes); 652 u64 end;
653 next_info = rb_entry(rb_next(&info->offset_index),
654 struct btrfs_free_space,
655 offset_index);
656
657 if (next_info->bitmap)
658 end = next_info->offset + BITS_PER_BITMAP *
659 block_group->sectorsize - 1;
660 else
661 end = next_info->offset + next_info->bytes;
662
663 if (next_info->bytes < bytes ||
664 next_info->offset > offset || offset > end) {
665 printk(KERN_CRIT "Found free space at %llu, size %llu,"
666 " trying to use %llu\n",
667 (unsigned long long)info->offset,
668 (unsigned long long)info->bytes,
669 (unsigned long long)bytes);
279 WARN_ON(1); 670 WARN_ON(1);
280 ret = -EINVAL; 671 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock); 672 goto out_lock;
282 goto out;
283 } 673 }
284 unlink_free_space(block_group, info);
285 674
286 if (info->bytes == bytes) { 675 info = next_info;
287 kfree(info); 676 }
288 spin_unlock(&block_group->tree_lock); 677
289 goto out; 678 if (info->bytes == bytes) {
679 unlink_free_space(block_group, info);
680 if (info->bitmap) {
681 kfree(info->bitmap);
682 block_group->total_bitmaps--;
290 } 683 }
684 kfree(info);
685 goto out_lock;
686 }
291 687
688 if (!info->bitmap && info->offset == offset) {
689 unlink_free_space(block_group, info);
292 info->offset += bytes; 690 info->offset += bytes;
293 info->bytes -= bytes; 691 info->bytes -= bytes;
692 link_free_space(block_group, info);
693 goto out_lock;
694 }
294 695
295 ret = link_free_space(block_group, info); 696 if (!info->bitmap && info->offset <= offset &&
296 spin_unlock(&block_group->tree_lock); 697 info->offset + info->bytes >= offset + bytes) {
297 BUG_ON(ret);
298 } else if (info && info->offset < offset &&
299 info->offset + info->bytes >= offset + bytes) {
300 u64 old_start = info->offset; 698 u64 old_start = info->offset;
301 /* 699 /*
302 * we're freeing space in the middle of the info, 700 * we're freeing space in the middle of the info,
@@ -312,7 +710,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
312 info->offset = offset + bytes; 710 info->offset = offset + bytes;
313 info->bytes = old_end - info->offset; 711 info->bytes = old_end - info->offset;
314 ret = link_free_space(block_group, info); 712 ret = link_free_space(block_group, info);
315 BUG_ON(ret); 713 WARN_ON(ret);
714 if (ret)
715 goto out_lock;
316 } else { 716 } else {
317 /* the hole we're creating ends at the end 717 /* the hole we're creating ends at the end
318 * of the info struct, just free the info 718 * of the info struct, just free the info
@@ -320,32 +720,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
320 kfree(info); 720 kfree(info);
321 } 721 }
322 spin_unlock(&block_group->tree_lock); 722 spin_unlock(&block_group->tree_lock);
323 /* step two, insert a new info struct to cover anything 723
324 * before the hole 724 /* step two, insert a new info struct to cover
725 * anything before the hole
325 */ 726 */
326 ret = btrfs_add_free_space(block_group, old_start, 727 ret = btrfs_add_free_space(block_group, old_start,
327 offset - old_start); 728 offset - old_start);
328 BUG_ON(ret); 729 WARN_ON(ret);
329 } else { 730 goto out;
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
347 WARN_ON(1);
348 } 731 }
732
733 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
734 if (ret == -EAGAIN)
735 goto again;
736 BUG_ON(ret);
737out_lock:
738 spin_unlock(&block_group->tree_lock);
349out: 739out:
350 return ret; 740 return ret;
351} 741}
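When the removed range sits in the middle of an extent entry, the code above shrinks the entry to the piece after the hole and re-adds the piece before it via btrfs_add_free_space(). The splitting arithmetic on its own, with invented types:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t offset, bytes; };

/* carve [offset, offset + bytes) out of *info; *front gets the piece before
 * the hole, *info keeps the piece after it */
static int carve(struct range *info, uint64_t offset, uint64_t bytes,
		 struct range *front)
{
	uint64_t old_end = info->offset + info->bytes;

	if (offset < info->offset || offset + bytes > old_end)
		return -1;		/* removal not fully contained */

	front->offset = info->offset;
	front->bytes = offset - info->offset;

	info->offset = offset + bytes;
	info->bytes = old_end - info->offset;
	return 0;
}

int main(void)
{
	struct range info = { 0, 1 << 20 }, front;

	carve(&info, 4096, 8192, &front);
	printf("front %llu+%llu, back %llu+%llu\n",
	       (unsigned long long)front.offset, (unsigned long long)front.bytes,
	       (unsigned long long)info.offset, (unsigned long long)info.bytes);
	return 0;
}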
@@ -361,10 +751,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
361 info = rb_entry(n, struct btrfs_free_space, offset_index); 751 info = rb_entry(n, struct btrfs_free_space, offset_index);
362 if (info->bytes >= bytes) 752 if (info->bytes >= bytes)
363 count++; 753 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n", 754 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
365 (unsigned long long)info->offset, 755 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes); 756 (unsigned long long)info->bytes,
757 (info->bitmap) ? "yes" : "no");
367 } 758 }
759 printk(KERN_INFO "block group has cluster?: %s\n",
760 list_empty(&block_group->cluster_list) ? "no" : "yes");
368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 761 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
369 "\n", count); 762 "\n", count);
370} 763}
@@ -397,26 +790,35 @@ __btrfs_return_cluster_to_free_space(
397{ 790{
398 struct btrfs_free_space *entry; 791 struct btrfs_free_space *entry;
399 struct rb_node *node; 792 struct rb_node *node;
793 bool bitmap;
400 794
401 spin_lock(&cluster->lock); 795 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group) 796 if (cluster->block_group != block_group)
403 goto out; 797 goto out;
404 798
799 bitmap = cluster->points_to_bitmap;
800 cluster->block_group = NULL;
405 cluster->window_start = 0; 801 cluster->window_start = 0;
802 list_del_init(&cluster->block_group_list);
803 cluster->points_to_bitmap = false;
804
805 if (bitmap)
806 goto out;
807
406 node = rb_first(&cluster->root); 808 node = rb_first(&cluster->root);
407 while(node) { 809 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index); 810 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index); 811 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root); 812 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry); 813 BUG_ON(entry->bitmap);
814 tree_insert_offset(&block_group->free_space_offset,
815 entry->offset, &entry->offset_index, 0);
412 } 816 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL; 817 cluster->root.rb_node = NULL;
818
418out: 819out:
419 spin_unlock(&cluster->lock); 820 spin_unlock(&cluster->lock);
821 btrfs_put_block_group(block_group);
420 return 0; 822 return 0;
421} 823}
422 824
@@ -425,20 +827,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
425 struct btrfs_free_space *info; 827 struct btrfs_free_space *info;
426 struct rb_node *node; 828 struct rb_node *node;
427 struct btrfs_free_cluster *cluster; 829 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe; 830 struct list_head *head;
429 831
430 spin_lock(&block_group->tree_lock); 832 spin_lock(&block_group->tree_lock);
431 833 while ((head = block_group->cluster_list.next) !=
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 834 &block_group->cluster_list) {
433 block_group_list) { 835 cluster = list_entry(head, struct btrfs_free_cluster,
836 block_group_list);
434 837
435 WARN_ON(cluster->block_group != block_group); 838 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster); 839 __btrfs_return_cluster_to_free_space(block_group, cluster);
840 if (need_resched()) {
841 spin_unlock(&block_group->tree_lock);
842 cond_resched();
843 spin_lock(&block_group->tree_lock);
844 }
437 } 845 }
438 846
439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 847 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
440 info = rb_entry(node, struct btrfs_free_space, bytes_index); 848 info = rb_entry(node, struct btrfs_free_space, offset_index);
441 unlink_free_space(block_group, info); 849 unlink_free_space(block_group, info);
850 if (info->bitmap)
851 kfree(info->bitmap);
442 kfree(info); 852 kfree(info);
443 if (need_resched()) { 853 if (need_resched()) {
444 spin_unlock(&block_group->tree_lock); 854 spin_unlock(&block_group->tree_lock);
@@ -446,6 +856,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
446 spin_lock(&block_group->tree_lock); 856 spin_lock(&block_group->tree_lock);
447 } 857 }
448 } 858 }
859
449 spin_unlock(&block_group->tree_lock); 860 spin_unlock(&block_group->tree_lock);
450} 861}
451 862
@@ -453,25 +864,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
453 u64 offset, u64 bytes, u64 empty_size) 864 u64 offset, u64 bytes, u64 empty_size)
454{ 865{
455 struct btrfs_free_space *entry = NULL; 866 struct btrfs_free_space *entry = NULL;
867 u64 bytes_search = bytes + empty_size;
456 u64 ret = 0; 868 u64 ret = 0;
457 869
458 spin_lock(&block_group->tree_lock); 870 spin_lock(&block_group->tree_lock);
459 entry = tree_search_offset(&block_group->free_space_offset, offset, 871 entry = find_free_space(block_group, &offset, &bytes_search, 0);
460 bytes + empty_size, 1);
461 if (!entry) 872 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes, 873 goto out;
463 offset, bytes + empty_size); 874
464 if (entry) { 875 ret = offset;
876 if (entry->bitmap) {
877 bitmap_clear_bits(block_group, entry, offset, bytes);
878 if (!entry->bytes) {
879 unlink_free_space(block_group, entry);
880 kfree(entry->bitmap);
881 kfree(entry);
882 block_group->total_bitmaps--;
883 recalculate_thresholds(block_group);
884 }
885 } else {
465 unlink_free_space(block_group, entry); 886 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes; 887 entry->offset += bytes;
468 entry->bytes -= bytes; 888 entry->bytes -= bytes;
469
470 if (!entry->bytes) 889 if (!entry->bytes)
471 kfree(entry); 890 kfree(entry);
472 else 891 else
473 link_free_space(block_group, entry); 892 link_free_space(block_group, entry);
474 } 893 }
894
895out:
475 spin_unlock(&block_group->tree_lock); 896 spin_unlock(&block_group->tree_lock);
476 897
477 return ret; 898 return ret;
@@ -517,6 +938,47 @@ int btrfs_return_cluster_to_free_space(
517 return ret; 938 return ret;
518} 939}
519 940
941static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
942 struct btrfs_free_cluster *cluster,
943 u64 bytes, u64 min_start)
944{
945 struct btrfs_free_space *entry;
946 int err;
947 u64 search_start = cluster->window_start;
948 u64 search_bytes = bytes;
949 u64 ret = 0;
950
951 spin_lock(&block_group->tree_lock);
952 spin_lock(&cluster->lock);
953
954 if (!cluster->points_to_bitmap)
955 goto out;
956
957 if (cluster->block_group != block_group)
958 goto out;
959
960 entry = tree_search_offset(block_group, search_start, 0, 0);
961
962 if (!entry || !entry->bitmap)
963 goto out;
964
965 search_start = min_start;
966 search_bytes = bytes;
967
968 err = search_bitmap(block_group, entry, &search_start,
969 &search_bytes);
970 if (err)
971 goto out;
972
973 ret = search_start;
974 bitmap_clear_bits(block_group, entry, ret, bytes);
975out:
976 spin_unlock(&cluster->lock);
977 spin_unlock(&block_group->tree_lock);
978
979 return ret;
980}
981
520/* 982/*
521 * given a cluster, try to allocate 'bytes' from it, returns 0 983 * given a cluster, try to allocate 'bytes' from it, returns 0
522 * if it couldn't find anything suitably large, or a logical disk offset 984 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +992,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
530 struct rb_node *node; 992 struct rb_node *node;
531 u64 ret = 0; 993 u64 ret = 0;
532 994
995 if (cluster->points_to_bitmap)
996 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
997 min_start);
998
533 spin_lock(&cluster->lock); 999 spin_lock(&cluster->lock);
534 if (bytes > cluster->max_size) 1000 if (bytes > cluster->max_size)
535 goto out; 1001 goto out;
@@ -567,9 +1033,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
567 } 1033 }
568out: 1034out:
569 spin_unlock(&cluster->lock); 1035 spin_unlock(&cluster->lock);
1036
570 return ret; 1037 return ret;
571} 1038}
572 1039
1040static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1041 struct btrfs_free_space *entry,
1042 struct btrfs_free_cluster *cluster,
1043 u64 offset, u64 bytes, u64 min_bytes)
1044{
1045 unsigned long next_zero;
1046 unsigned long i;
1047 unsigned long search_bits;
1048 unsigned long total_bits;
1049 unsigned long found_bits;
1050 unsigned long start = 0;
1051 unsigned long total_found = 0;
1052 bool found = false;
1053
1054 i = offset_to_bit(entry->offset, block_group->sectorsize,
1055 max_t(u64, offset, entry->offset));
1056 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1057 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1058
1059again:
1060 found_bits = 0;
1061 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1062 i < BITS_PER_BITMAP;
1063 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1064 next_zero = find_next_zero_bit(entry->bitmap,
1065 BITS_PER_BITMAP, i);
1066 if (next_zero - i >= search_bits) {
1067 found_bits = next_zero - i;
1068 break;
1069 }
1070 i = next_zero;
1071 }
1072
1073 if (!found_bits)
1074 return -1;
1075
1076 if (!found) {
1077 start = i;
1078 found = true;
1079 }
1080
1081 total_found += found_bits;
1082
1083 if (cluster->max_size < found_bits * block_group->sectorsize)
1084 cluster->max_size = found_bits * block_group->sectorsize;
1085
1086 if (total_found < total_bits) {
1087 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1088 if (i - start > total_bits * 2) {
1089 total_found = 0;
1090 cluster->max_size = 0;
1091 found = false;
1092 }
1093 goto again;
1094 }
1095
1096 cluster->window_start = start * block_group->sectorsize +
1097 entry->offset;
1098 cluster->points_to_bitmap = true;
1099
1100 return 0;
1101}
1102
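btrfs_bitmap_cluster() above accumulates successive runs of set bits into a window until enough bits are gathered, and starts over when the window spreads out to more than twice the target. A rough standalone model of that accumulate-and-reset loop; the run table and numbers are made up and the reset rule is simplified:

#include <stdio.h>

int main(void)
{
	/* start bit and length of successive runs of set bits (invented) */
	struct { unsigned start, len; } runs[] = {
		{ 0, 2 }, { 40, 3 }, { 44, 4 }, { 50, 6 }
	};
	unsigned target = 10, total = 0, win_start = 0, i;
	int found = 0;

	for (i = 0; i < 4; i++) {
		if (!found) {
			win_start = runs[i].start;
			found = 1;
		}
		total += runs[i].len;
		if (total >= target)
			break;
		/* window spread out too far: forget it and start over */
		if (runs[i].start + runs[i].len - win_start > target * 2) {
			total = 0;
			found = 0;
		}
	}
	printf("window starts at bit %u with %u bits gathered\n",
	       win_start, total);
	return 0;
}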
573/* 1103/*
574 * here we try to find a cluster of blocks in a block group. The goal 1104 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free. 1105 * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1117,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
587 struct btrfs_free_space *entry = NULL; 1117 struct btrfs_free_space *entry = NULL;
588 struct rb_node *node; 1118 struct rb_node *node;
589 struct btrfs_free_space *next; 1119 struct btrfs_free_space *next;
590 struct btrfs_free_space *last; 1120 struct btrfs_free_space *last = NULL;
591 u64 min_bytes; 1121 u64 min_bytes;
592 u64 window_start; 1122 u64 window_start;
593 u64 window_free; 1123 u64 window_free;
594 u64 max_extent = 0; 1124 u64 max_extent = 0;
595 int total_retries = 0; 1125 bool found_bitmap = false;
596 int ret; 1126 int ret;
597 1127
598 /* for metadata, allow allocates with more holes */ 1128 /* for metadata, allow allocates with more holes */
@@ -620,31 +1150,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
620 goto out; 1150 goto out;
621 } 1151 }
622again: 1152again:
623 min_bytes = min(min_bytes, bytes + empty_size); 1153 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
624 entry = tree_search_bytes(&block_group->free_space_bytes,
625 offset, min_bytes);
626 if (!entry) { 1154 if (!entry) {
627 ret = -ENOSPC; 1155 ret = -ENOSPC;
628 goto out; 1156 goto out;
629 } 1157 }
1158
1159 /*
1160 * If found_bitmap is true, we exhausted our search for extent entries,
1161 * and we just want to search all of the bitmaps that we can find, and
1162 * ignore any extent entries we find.
1163 */
1164 while (entry->bitmap || found_bitmap ||
1165 (!entry->bitmap && entry->bytes < min_bytes)) {
1166 struct rb_node *node = rb_next(&entry->offset_index);
1167
1168 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1169 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1170 offset, bytes + empty_size,
1171 min_bytes);
1172 if (!ret)
1173 goto got_it;
1174 }
1175
1176 if (!node) {
1177 ret = -ENOSPC;
1178 goto out;
1179 }
1180 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1181 }
1182
1183 /*
1184 * We already searched all the extent entries from the passed in offset
1185 * to the end and didn't find enough space for the cluster, and we also
1186 * didn't find any bitmaps that met our criteria, just go ahead and exit
1187 */
1188 if (found_bitmap) {
1189 ret = -ENOSPC;
1190 goto out;
1191 }
1192
1193 cluster->points_to_bitmap = false;
630 window_start = entry->offset; 1194 window_start = entry->offset;
631 window_free = entry->bytes; 1195 window_free = entry->bytes;
632 last = entry; 1196 last = entry;
633 max_extent = entry->bytes; 1197 max_extent = entry->bytes;
634 1198
635 while(1) { 1199 while (1) {
636 /* out window is just right, lets fill it */ 1200 /* out window is just right, lets fill it */
637 if (window_free >= bytes + empty_size) 1201 if (window_free >= bytes + empty_size)
638 break; 1202 break;
639 1203
640 node = rb_next(&last->offset_index); 1204 node = rb_next(&last->offset_index);
641 if (!node) { 1205 if (!node) {
1206 if (found_bitmap)
1207 goto again;
642 ret = -ENOSPC; 1208 ret = -ENOSPC;
643 goto out; 1209 goto out;
644 } 1210 }
645 next = rb_entry(node, struct btrfs_free_space, offset_index); 1211 next = rb_entry(node, struct btrfs_free_space, offset_index);
646 1212
647 /* 1213 /*
1214 * we found a bitmap, so if this search doesn't result in a
1215 * cluster, we know to go and search again for the bitmaps and
1216 * start looking for space there
1217 */
1218 if (next->bitmap) {
1219 if (!found_bitmap)
1220 offset = next->offset;
1221 found_bitmap = true;
1222 last = next;
1223 continue;
1224 }
1225
1226 /*
648 * we haven't filled the empty size and the window is 1227 * we haven't filled the empty size and the window is
649 * very large. reset and try again 1228 * very large. reset and try again
650 */ 1229 */
@@ -655,19 +1234,6 @@ again:
655 window_free = entry->bytes; 1234 window_free = entry->bytes;
656 last = entry; 1235 last = entry;
657 max_extent = 0; 1236 max_extent = 0;
658 total_retries++;
659 if (total_retries % 64 == 0) {
660 if (min_bytes >= (bytes + empty_size)) {
661 ret = -ENOSPC;
662 goto out;
663 }
664 /*
665 * grow our allocation a bit, we're not having
666 * much luck
667 */
668 min_bytes *= 2;
669 goto again;
670 }
671 } else { 1237 } else {
672 last = next; 1238 last = next;
673 window_free += next->bytes; 1239 window_free += next->bytes;
@@ -685,11 +1251,19 @@ again:
685 * The cluster includes an rbtree, but only uses the offset index 1251 * The cluster includes an rbtree, but only uses the offset index
686 * of each free space cache entry. 1252 * of each free space cache entry.
687 */ 1253 */
688 while(1) { 1254 while (1) {
689 node = rb_next(&entry->offset_index); 1255 node = rb_next(&entry->offset_index);
690 unlink_free_space(block_group, entry); 1256 if (entry->bitmap && node) {
1257 entry = rb_entry(node, struct btrfs_free_space,
1258 offset_index);
1259 continue;
1260 } else if (entry->bitmap && !node) {
1261 break;
1262 }
1263
1264 rb_erase(&entry->offset_index, &block_group->free_space_offset);
691 ret = tree_insert_offset(&cluster->root, entry->offset, 1265 ret = tree_insert_offset(&cluster->root, entry->offset,
692 &entry->offset_index); 1266 &entry->offset_index, 0);
693 BUG_ON(ret); 1267 BUG_ON(ret);
694 1268
695 if (!node || entry == last) 1269 if (!node || entry == last)
@@ -697,8 +1271,10 @@ again:
697 1271
698 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1272 entry = rb_entry(node, struct btrfs_free_space, offset_index);
699 } 1273 }
700 ret = 0; 1274
701 cluster->max_size = max_extent; 1275 cluster->max_size = max_extent;
1276got_it:
1277 ret = 0;
702 atomic_inc(&block_group->count); 1278 atomic_inc(&block_group->count);
703 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 1279 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
704 cluster->block_group = block_group; 1280 cluster->block_group = block_group;
@@ -718,6 +1294,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
718 spin_lock_init(&cluster->refill_lock); 1294 spin_lock_init(&cluster->refill_lock);
719 cluster->root.rb_node = NULL; 1295 cluster->root.rb_node = NULL;
720 cluster->max_size = 0; 1296 cluster->max_size = 0;
1297 cluster->points_to_bitmap = false;
721 INIT_LIST_HEAD(&cluster->block_group_list); 1298 INIT_LIST_HEAD(&cluster->block_group_list);
722 cluster->block_group = NULL; 1299 cluster->block_group = NULL;
723} 1300}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
19#ifndef __BTRFS_FREE_SPACE_CACHE 19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE 20#define __BTRFS_FREE_SPACE_CACHE
21 21
22struct btrfs_free_space {
23 struct rb_node offset_index;
24 u64 offset;
25 u64 bytes;
26 unsigned long *bitmap;
27 struct list_head list;
28};
29
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size); 31 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 791eab19e330..56fe83fa60c4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2603 if (root->ref_cows) 2603 if (root->ref_cows)
2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2605 path = btrfs_alloc_path(); 2605 path = btrfs_alloc_path();
2606 path->reada = -1;
2607 BUG_ON(!path); 2606 BUG_ON(!path);
2607 path->reada = -1;
2608 2608
2609 /* FIXME, add redo link to tree so we don't leak on crash */ 2609 /* FIXME, add redo link to tree so we don't leak on crash */
2610 key.objectid = inode->i_ino; 2610 key.objectid = inode->i_ino;
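The two-line reorder above only ensures the freshly allocated path is checked before its reada field is written. The same ordering in miniature, with a graceful error return standing in for the BUG_ON():

#include <stdlib.h>

struct path { int reada; };

static struct path *alloc_path(void)
{
	return calloc(1, sizeof(struct path));
}

int main(void)
{
	struct path *path = alloc_path();

	if (!path)		/* check the allocation first ...   */
		return 1;
	path->reada = -1;	/* ... and only then dereference it */
	free(path);
	return 0;
}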
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
309 } 309 }
310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", 310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
311 (unsigned long long)btrfs_header_bytenr(c), 311 (unsigned long long)btrfs_header_bytenr(c),
312 btrfs_header_level(c), nr, 312 level, nr,
313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); 313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
314 for (i = 0; i < nr; i++) { 314 for (i = 0; i < nr; i++) {
315 btrfs_node_key_to_cpu(c, &key, i); 315 btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
326 btrfs_level_size(root, level - 1), 326 btrfs_level_size(root, level - 1),
327 btrfs_node_ptr_generation(c, i)); 327 btrfs_node_ptr_generation(c, i));
328 if (btrfs_is_leaf(next) && 328 if (btrfs_is_leaf(next) &&
329 btrfs_header_level(c) != 1) 329 level != 1)
330 BUG(); 330 BUG();
331 if (btrfs_header_level(next) != 331 if (btrfs_header_level(next) !=
332 btrfs_header_level(c) - 1) 332 level - 1)
333 BUG(); 333 BUG();
334 btrfs_print_tree(root, next); 334 btrfs_print_tree(root, next);
335 free_extent_buffer(next); 335 free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 008397934778..e71264d1c2c9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
670 err = ret; 670 err = ret;
671 goto out; 671 goto out;
672 } 672 }
673 if (ret > 0 && path2->slots[level] > 0)
674 path2->slots[level]--;
673 675
674 eb = path2->nodes[level]; 676 eb = path2->nodes[level];
675 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != 677 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1609 BUG_ON(level == 0); 1611 BUG_ON(level == 0);
1610 path->lowest_level = level; 1612 path->lowest_level = level;
1611 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 1613 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1614 path->lowest_level = 0;
1612 if (ret < 0) { 1615 if (ret < 0) {
1613 btrfs_free_path(path); 1616 btrfs_free_path(path);
1614 return ret; 1617 return ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2dbf1c1f56ee..e51d2bc532f8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,14 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
40 } 40 }
41} 41}
42 42
43static noinline void switch_commit_root(struct btrfs_root *root)
44{
45 down_write(&root->commit_root_sem);
46 free_extent_buffer(root->commit_root);
47 root->commit_root = btrfs_root_node(root);
48 up_write(&root->commit_root_sem);
49}
50
43/* 51/*
44 * either allocate a new transaction or hop into the existing one 52 * either allocate a new transaction or hop into the existing one
45 */ 53 */
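The new switch_commit_root() helper swaps the commit root pointer under commit_root_sem so readers never observe the swap halfway, dropping the reference on the old root and pinning the new one. A userspace analogue with a pthread rwlock and a toy refcount; every name here is illustrative, not the btrfs API (build with cc -pthread):

#include <pthread.h>
#include <stdlib.h>

struct snapshot { int refs; /* ... tree state ... */ };

static pthread_rwlock_t commit_sem = PTHREAD_RWLOCK_INITIALIZER;
static struct snapshot *commit_root;

static void put_snapshot(struct snapshot *s)
{
	if (s && --s->refs == 0)
		free(s);
}

static struct snapshot *get_snapshot(struct snapshot *s)
{
	s->refs++;
	return s;
}

/* swap the published commit root under the write lock, dropping the old
 * reference and pinning the new one, as the helper above does */
static void switch_commit_root(struct snapshot *current_root)
{
	pthread_rwlock_wrlock(&commit_sem);
	put_snapshot(commit_root);
	commit_root = get_snapshot(current_root);
	pthread_rwlock_unlock(&commit_sem);
}

int main(void)
{
	struct snapshot *root = calloc(1, sizeof(*root));

	if (!root)
		return 1;
	root->refs = 1;
	switch_commit_root(root);
	/* readers would take commit_sem for read before using commit_root */
	pthread_rwlock_wrlock(&commit_sem);
	put_snapshot(commit_root);
	commit_root = NULL;
	pthread_rwlock_unlock(&commit_sem);
	put_snapshot(root);
	return 0;
}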
@@ -444,9 +452,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
444 452
445 btrfs_write_dirty_block_groups(trans, root); 453 btrfs_write_dirty_block_groups(trans, root);
446 454
447 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
448 BUG_ON(ret);
449
450 while (1) { 455 while (1) {
451 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 456 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
452 if (old_root_bytenr == root->node->start) 457 if (old_root_bytenr == root->node->start)
@@ -457,13 +462,11 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
457 &root->root_key, 462 &root->root_key,
458 &root->root_item); 463 &root->root_item);
459 BUG_ON(ret); 464 BUG_ON(ret);
460 btrfs_write_dirty_block_groups(trans, root);
461 465
462 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 466 ret = btrfs_write_dirty_block_groups(trans, root);
463 BUG_ON(ret); 467 BUG_ON(ret);
464 } 468 }
465 free_extent_buffer(root->commit_root); 469 switch_commit_root(root);
466 root->commit_root = btrfs_root_node(root);
467 return 0; 470 return 0;
468} 471}
469 472
@@ -495,9 +498,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
495 root = list_entry(next, struct btrfs_root, dirty_list); 498 root = list_entry(next, struct btrfs_root, dirty_list);
496 499
497 update_cowonly_root(trans, root); 500 update_cowonly_root(trans, root);
498
499 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
500 BUG_ON(ret);
501 } 501 }
502 return 0; 502 return 0;
503} 503}
@@ -544,8 +544,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
544 btrfs_update_reloc_root(trans, root); 544 btrfs_update_reloc_root(trans, root);
545 545
546 if (root->commit_root != root->node) { 546 if (root->commit_root != root->node) {
547 free_extent_buffer(root->commit_root); 547 switch_commit_root(root);
548 root->commit_root = btrfs_root_node(root);
549 btrfs_set_root_node(&root->root_item, 548 btrfs_set_root_node(&root->root_item,
550 root->node); 549 root->node);
551 } 550 }
@@ -943,9 +942,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
943 942
944 mutex_unlock(&root->fs_info->trans_mutex); 943 mutex_unlock(&root->fs_info->trans_mutex);
945 944
946 if (flush_on_commit || snap_pending) { 945 if (flush_on_commit) {
947 if (flush_on_commit) 946 btrfs_start_delalloc_inodes(root);
948 btrfs_start_delalloc_inodes(root); 947 ret = btrfs_wait_ordered_extents(root, 0);
948 BUG_ON(ret);
949 } else if (snap_pending) {
949 ret = btrfs_wait_ordered_extents(root, 1); 950 ret = btrfs_wait_ordered_extents(root, 1);
950 BUG_ON(ret); 951 BUG_ON(ret);
951 } 952 }
@@ -1009,15 +1010,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1009 1010
1010 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1011 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1011 root->fs_info->tree_root->node); 1012 root->fs_info->tree_root->node);
1012 free_extent_buffer(root->fs_info->tree_root->commit_root); 1013 switch_commit_root(root->fs_info->tree_root);
1013 root->fs_info->tree_root->commit_root =
1014 btrfs_root_node(root->fs_info->tree_root);
1015 1014
1016 btrfs_set_root_node(&root->fs_info->chunk_root->root_item, 1015 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1017 root->fs_info->chunk_root->node); 1016 root->fs_info->chunk_root->node);
1018 free_extent_buffer(root->fs_info->chunk_root->commit_root); 1017 switch_commit_root(root->fs_info->chunk_root);
1019 root->fs_info->chunk_root->commit_root =
1020 btrfs_root_node(root->fs_info->chunk_root);
1021 1018
1022 update_super_roots(root); 1019 update_super_roots(root);
1023 1020
@@ -1057,6 +1054,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1057 cur_trans->commit_done = 1; 1054 cur_trans->commit_done = 1;
1058 1055
1059 root->fs_info->last_trans_committed = cur_trans->transid; 1056 root->fs_info->last_trans_committed = cur_trans->transid;
1057
1060 wake_up(&cur_trans->commit_wait); 1058 wake_up(&cur_trans->commit_wait);
1061 1059
1062 put_transaction(cur_trans); 1060 put_transaction(cur_trans);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 return -ENOENT; 797 return -ENOENT;
798 798
799 inode = read_one_inode(root, key->objectid); 799 inode = read_one_inode(root, key->objectid);
800 BUG_ON(!dir); 800 BUG_ON(!inode);
801 801
802 ref_ptr = btrfs_item_ptr_offset(eb, slot); 802 ref_ptr = btrfs_item_ptr_offset(eb, slot);
803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -721,7 +721,8 @@ error:
721 */ 721 */
722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
723 struct btrfs_device *device, 723 struct btrfs_device *device,
724 u64 num_bytes, u64 *start) 724 u64 num_bytes, u64 *start,
725 u64 *max_avail)
725{ 726{
726 struct btrfs_key key; 727 struct btrfs_key key;
727 struct btrfs_root *root = device->dev_root; 728 struct btrfs_root *root = device->dev_root;
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
758 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 759 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
759 if (ret < 0) 760 if (ret < 0)
760 goto error; 761 goto error;
761 ret = btrfs_previous_item(root, path, 0, key.type); 762 if (ret > 0) {
762 if (ret < 0) 763 ret = btrfs_previous_item(root, path, key.objectid, key.type);
763 goto error; 764 if (ret < 0)
765 goto error;
766 if (ret > 0)
767 start_found = 1;
768 }
764 l = path->nodes[0]; 769 l = path->nodes[0];
765 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
766 while (1) { 771 while (1) {
@@ -803,6 +808,10 @@ no_more_items:
803 if (last_byte < search_start) 808 if (last_byte < search_start)
804 last_byte = search_start; 809 last_byte = search_start;
805 hole_size = key.offset - last_byte; 810 hole_size = key.offset - last_byte;
811
812 if (hole_size > *max_avail)
813 *max_avail = hole_size;
814
806 if (key.offset > last_byte && 815 if (key.offset > last_byte &&
807 hole_size >= num_bytes) { 816 hole_size >= num_bytes) {
808 *start = last_byte; 817 *start = last_byte;
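find_free_dev_extent() now also reports, via the new *max_avail argument, the largest hole it saw even when no hole was big enough, so the chunk allocator can retry with a smaller stripe. A compact model of that scan over an invented table of allocated device extents:

#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, len; };	/* allocated device extents, sorted */

static int find_hole(const struct ext *e, int n, uint64_t dev_size,
		     uint64_t need, uint64_t *start, uint64_t *max_avail)
{
	uint64_t last_end = 0;
	int i;

	*max_avail = 0;
	for (i = 0; i <= n; i++) {
		uint64_t hole_end = (i < n) ? e[i].start : dev_size;
		uint64_t hole = hole_end > last_end ? hole_end - last_end : 0;

		if (hole > *max_avail)
			*max_avail = hole;	/* remember the biggest hole seen */
		if (hole >= need) {
			*start = last_end;
			return 0;
		}
		if (i < n)
			last_end = e[i].start + e[i].len;
	}
	return -1;	/* nothing big enough, but *max_avail still tells the caller how close it got */
}

int main(void)
{
	struct ext used[] = { { 0, 4096 }, { 8192, 4096 } };
	uint64_t start = 0, max_avail = 0;
	int ret = find_hole(used, 2, 1 << 20, 64 * 1024, &start, &max_avail);

	printf("ret %d start %llu max_avail %llu\n", ret,
	       (unsigned long long)start, (unsigned long long)max_avail);
	return 0;
}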
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1621 device->fs_devices->total_rw_bytes += diff; 1630 device->fs_devices->total_rw_bytes += diff;
1622 1631
1623 device->total_bytes = new_size; 1632 device->total_bytes = new_size;
1633 device->disk_total_bytes = new_size;
1624 btrfs_clear_space_info_full(device->dev_root->fs_info); 1634 btrfs_clear_space_info_full(device->dev_root->fs_info);
1625 1635
1626 return btrfs_update_device(trans, device); 1636 return btrfs_update_device(trans, device);
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2007 goto done; 2017 goto done;
2008 if (ret) { 2018 if (ret) {
2009 ret = 0; 2019 ret = 0;
2010 goto done; 2020 break;
2011 } 2021 }
2012 2022
2013 l = path->nodes[0]; 2023 l = path->nodes[0];
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2015 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2016 2026
2017 if (key.objectid != device->devid) 2027 if (key.objectid != device->devid)
2018 goto done; 2028 break;
2019 2029
2020 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2021 length = btrfs_dev_extent_length(l, dev_extent); 2031 length = btrfs_dev_extent_length(l, dev_extent);
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2171 max_chunk_size); 2181 max_chunk_size);
2172 2182
2173again: 2183again:
2184 max_avail = 0;
2174 if (!map || map->num_stripes != num_stripes) { 2185 if (!map || map->num_stripes != num_stripes) {
2175 kfree(map); 2186 kfree(map);
2176 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2187 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2230,8 @@ again:
2219 2230
2220 if (device->in_fs_metadata && avail >= min_free) { 2231 if (device->in_fs_metadata && avail >= min_free) {
2221 ret = find_free_dev_extent(trans, device, 2232 ret = find_free_dev_extent(trans, device,
2222 min_free, &dev_offset); 2233 min_free, &dev_offset,
2234 &max_avail);
2223 if (ret == 0) { 2235 if (ret == 0) {
2224 list_move_tail(&device->dev_alloc_list, 2236 list_move_tail(&device->dev_alloc_list,
2225 &private_devs); 2237 &private_devs);
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2795 } 2807 }
2796 } 2808 }
2797 2809
2798 for (i = 0; i > nr; i++) {
2799 struct btrfs_multi_bio *multi;
2800 struct btrfs_bio_stripe *stripe;
2801 int ret;
2802
2803 length = 1;
2804 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2805 &length, &multi, 0);
2806 BUG_ON(ret);
2807
2808 stripe = multi->stripes;
2809 for (j = 0; j < multi->num_stripes; j++) {
2810 if (stripe->physical >= physical &&
2811 physical < stripe->physical + length)
2812 break;
2813 }
2814 BUG_ON(j >= multi->num_stripes);
2815 kfree(multi);
2816 }
2817
2818 *logical = buf; 2810 *logical = buf;
2819 *naddrs = nr; 2811 *naddrs = nr;
2820 *stripe_len = map->stripe_len; 2812 *stripe_len = map->stripe_len;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9bb5c8750736..fc44d316d0bb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2452,10 +2452,10 @@ try_mount_again:
2452 tcon->local_lease = volume_info->local_lease; 2452 tcon->local_lease = volume_info->local_lease;
2453 } 2453 }
2454 if (pSesInfo) { 2454 if (pSesInfo) {
2455 if (pSesInfo->capabilities & CAP_LARGE_FILES) { 2455 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2456 sb->s_maxbytes = (u64) 1 << 63; 2456 sb->s_maxbytes = MAX_LFS_FILESIZE;
2457 } else 2457 else
2458 sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ 2458 sb->s_maxbytes = MAX_NON_LFS;
2459 } 2459 }
2460 2460
2461 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2461 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
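[Editorial note] Replacing the open-coded shifts with the named limits MAX_LFS_FILESIZE and MAX_NON_LFS documents intent rather than magic numbers. A small standalone sketch of the same pattern; the constant values below are illustrative stand-ins, not the kernel's definitions.

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative stand-ins for MAX_LFS_FILESIZE / MAX_NON_LFS. */
    #define SKETCH_MAX_LFS_FILESIZE  ((uint64_t)0x7fffffffffffffffULL)
    #define SKETCH_MAX_NON_LFS       ((uint64_t)0x7fffffffULL)   /* ~2 GB */

    struct sketch_super { uint64_t s_maxbytes; };

    static void set_maxbytes(struct sketch_super *sb, int large_files_capable)
    {
        /* A named limit reads better than (u64)1 << 63 style shifts. */
        sb->s_maxbytes = large_files_capable ? SKETCH_MAX_LFS_FILESIZE
                                             : SKETCH_MAX_NON_LFS;
    }

    int main(void)
    {
        struct sketch_super sb;
        set_maxbytes(&sb, 1);
        printf("max file size: %llu\n", (unsigned long long)sb.s_maxbytes);
        return 0;
    }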
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 18afe57b2461..82d83839655e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
212 * junction to the new submount (ie to setup the fake directory 212 * junction to the new submount (ie to setup the fake directory
213 * which represents a DFS referral). 213 * which represents a DFS referral).
214 */ 214 */
215void 215static void
216cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) 216cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
217{ 217{
218 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 218 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
388} 388}
389 389
390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ 390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
391void 391static void
392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, 392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
393 struct cifs_sb_info *cifs_sb, bool adjust_tz) 393 struct cifs_sb_info *cifs_sb, bool adjust_tz)
394{ 394{
@@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode,
513 cifs_sb->mnt_cifs_flags & 513 cifs_sb->mnt_cifs_flags &
514 CIFS_MOUNT_MAP_SPECIAL_CHR); 514 CIFS_MOUNT_MAP_SPECIAL_CHR);
515 if (rc1) { 515 if (rc1) {
516 /* BB EOPNOSUPP disable SERVER_INUM? */
517 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 516 cFYI(1, ("GetSrvInodeNum rc %d", rc1));
518 fattr.cf_uniqueid = iunique(sb, ROOT_I); 517 fattr.cf_uniqueid = iunique(sb, ROOT_I);
518 /* disable serverino if call not supported */
519 if (rc1 == -EINVAL)
520 cifs_sb->mnt_cifs_flags &=
521 ~CIFS_MOUNT_SERVER_INUM;
519 } 522 }
520 } else { 523 } else {
521 fattr.cf_uniqueid = iunique(sb, ROOT_I); 524 fattr.cf_uniqueid = iunique(sb, ROOT_I);
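[Editorial note] When the server inode-number query fails with -EINVAL, the mount now stops asking and keeps using locally generated numbers. A toy version of that fallback; the flag name and helper functions are invented, only the control flow mirrors the hunk above.

    #include <stdio.h>
    #include <errno.h>

    #define MNT_SERVER_INUM (1u << 0)      /* toy stand-in for the mount flag */

    static unsigned long next_local;       /* toy stand-in for iunique() */
    static unsigned long local_inum(void) { return ++next_local; }

    /* Pretend server call; -EINVAL models "call not supported". */
    static int query_server_inum(unsigned long *num) { (void)num; return -EINVAL; }

    static unsigned long get_inum(unsigned int *mnt_flags)
    {
        unsigned long num;
        if (*mnt_flags & MNT_SERVER_INUM) {
            int rc = query_server_inum(&num);
            if (rc == 0)
                return num;
            if (rc == -EINVAL)             /* server can't do it: stop asking */
                *mnt_flags &= ~MNT_SERVER_INUM;
        }
        return local_inum();               /* fall back to a local number */
    }

    int main(void)
    {
        unsigned int flags = MNT_SERVER_INUM;
        printf("inum=%lu flags=0x%x\n", get_inum(&flags), flags);
        printf("inum=%lu flags=0x%x\n", get_inum(&flags), flags);
        return 0;
    }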
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
1303 } 1303 }
1304 (*new_auth_tok)->session_key.encrypted_key_size = 1304 (*new_auth_tok)->session_key.encrypted_key_size =
1305 (body_size - (ECRYPTFS_SALT_SIZE + 5)); 1305 (body_size - (ECRYPTFS_SALT_SIZE + 5));
1306 if ((*new_auth_tok)->session_key.encrypted_key_size
1307 > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
1308 printk(KERN_WARNING "Tag 3 packet contains key larger "
1309 "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
1310 rc = -EINVAL;
1311 goto out_free;
1312 }
1306 if (unlikely(data[(*packet_size)++] != 0x04)) { 1313 if (unlikely(data[(*packet_size)++] != 0x04)) {
1307 printk(KERN_WARNING "Unknown version number [%d]\n", 1314 printk(KERN_WARNING "Unknown version number [%d]\n",
1308 data[(*packet_size) - 1]); 1315 data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
1449 rc = -EINVAL; 1456 rc = -EINVAL;
1450 goto out; 1457 goto out;
1451 } 1458 }
1459 if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
1460 printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
1461 "expected size\n");
1462 rc = -EINVAL;
1463 goto out;
1464 }
1452 if (data[(*packet_size)++] != 0x62) { 1465 if (data[(*packet_size)++] != 0x62) {
1453 printk(KERN_WARNING "Unrecognizable packet\n"); 1466 printk(KERN_WARNING "Unrecognizable packet\n");
1454 rc = -EINVAL; 1467 rc = -EINVAL;
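[Editorial note] Both new checks refuse to honour a length field that is larger than the buffer it will later be copied into. A self-contained sketch of that kind of validation; the packet layout and KEY_BUF_MAX are invented for illustration, not the eCryptfs format.

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    #define KEY_BUF_MAX 64   /* stands in for ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES */

    /* Copy a length-prefixed field out of a packet, refusing oversized claims. */
    static int parse_key(const unsigned char *pkt, size_t pkt_len,
                         unsigned char *out, size_t *out_len)
    {
        if (pkt_len < 1)
            return -EINVAL;
        size_t claimed = pkt[0];           /* length byte supplied by the peer */
        if (claimed > KEY_BUF_MAX || claimed > pkt_len - 1)
            return -EINVAL;                /* never trust it beyond our buffer */
        memcpy(out, pkt + 1, claimed);
        *out_len = claimed;
        return 0;
    }

    int main(void)
    {
        unsigned char bad[2] = { 0xff, 0x00 };  /* claims 255 bytes, has 1 */
        unsigned char buf[KEY_BUF_MAX];
        size_t n;
        printf("oversized packet -> %d\n", parse_key(bad, sizeof(bad), buf, &n));
        return 0;
    }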
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
130 struct buffer_head *bh = NULL; 130 struct buffer_head *bh = NULL;
131 131
132 map_bh.b_state = 0; 132 map_bh.b_state = 0;
133 err = ext3_get_blocks_handle(NULL, inode, blk, 1, 133 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
134 &map_bh, 0, 0);
135 if (err > 0) { 134 if (err > 0) {
136 pgoff_t index = map_bh.b_blocknr >> 135 pgoff_t index = map_bh.b_blocknr >>
137 (PAGE_CACHE_SHIFT - inode->i_blkbits); 136 (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5f51fed5c750..b49908a167ae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -788,7 +788,7 @@ err_out:
788int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, 788int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
789 sector_t iblock, unsigned long maxblocks, 789 sector_t iblock, unsigned long maxblocks,
790 struct buffer_head *bh_result, 790 struct buffer_head *bh_result,
791 int create, int extend_disksize) 791 int create)
792{ 792{
793 int err = -EIO; 793 int err = -EIO;
794 int offsets[4]; 794 int offsets[4];
@@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
911 if (!err) 911 if (!err)
912 err = ext3_splice_branch(handle, inode, iblock, 912 err = ext3_splice_branch(handle, inode, iblock,
913 partial, indirect_blks, count); 913 partial, indirect_blks, count);
914 /*
915 * i_disksize growing is protected by truncate_mutex. Don't forget to
916 * protect it if you're about to implement concurrent
917 * ext3_get_block() -bzzz
918 */
919 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
920 ei->i_disksize = inode->i_size;
921 mutex_unlock(&ei->truncate_mutex); 914 mutex_unlock(&ei->truncate_mutex);
922 if (err) 915 if (err)
923 goto cleanup; 916 goto cleanup;
@@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
972 } 965 }
973 966
974 ret = ext3_get_blocks_handle(handle, inode, iblock, 967 ret = ext3_get_blocks_handle(handle, inode, iblock,
975 max_blocks, bh_result, create, 0); 968 max_blocks, bh_result, create);
976 if (ret > 0) { 969 if (ret > 0) {
977 bh_result->b_size = (ret << inode->i_blkbits); 970 bh_result->b_size = (ret << inode->i_blkbits);
978 ret = 0; 971 ret = 0;
@@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1005 dummy.b_blocknr = -1000; 998 dummy.b_blocknr = -1000;
1006 buffer_trace_init(&dummy.b_history); 999 buffer_trace_init(&dummy.b_history);
1007 err = ext3_get_blocks_handle(handle, inode, block, 1, 1000 err = ext3_get_blocks_handle(handle, inode, block, 1,
1008 &dummy, create, 1); 1001 &dummy, create);
1009 /* 1002 /*
1010 * ext3_get_blocks_handle() returns number of blocks 1003 * ext3_get_blocks_handle() returns number of blocks
1011 * mapped. 0 in case of a HOLE. 1004 * mapped. 0 in case of a HOLE.
@@ -1193,15 +1186,16 @@ write_begin_failed:
1193 * i_size_read because we hold i_mutex. 1186 * i_size_read because we hold i_mutex.
1194 * 1187 *
1195 * Add inode to orphan list in case we crash before truncate 1188 * Add inode to orphan list in case we crash before truncate
1196 * finishes. 1189 * finishes. Do this only if ext3_can_truncate() agrees so
1190 * that orphan processing code is happy.
1197 */ 1191 */
1198 if (pos + len > inode->i_size) 1192 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1199 ext3_orphan_add(handle, inode); 1193 ext3_orphan_add(handle, inode);
1200 ext3_journal_stop(handle); 1194 ext3_journal_stop(handle);
1201 unlock_page(page); 1195 unlock_page(page);
1202 page_cache_release(page); 1196 page_cache_release(page);
1203 if (pos + len > inode->i_size) 1197 if (pos + len > inode->i_size)
1204 vmtruncate(inode, inode->i_size); 1198 ext3_truncate(inode);
1205 } 1199 }
1206 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1200 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1207 goto retry; 1201 goto retry;
@@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file,
1287 * There may be allocated blocks outside of i_size because 1281 * There may be allocated blocks outside of i_size because
1288 * we failed to copy some data. Prepare for truncate. 1282 * we failed to copy some data. Prepare for truncate.
1289 */ 1283 */
1290 if (pos + len > inode->i_size) 1284 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1291 ext3_orphan_add(handle, inode); 1285 ext3_orphan_add(handle, inode);
1292 ret2 = ext3_journal_stop(handle); 1286 ret2 = ext3_journal_stop(handle);
1293 if (!ret) 1287 if (!ret)
@@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file,
1296 page_cache_release(page); 1290 page_cache_release(page);
1297 1291
1298 if (pos + len > inode->i_size) 1292 if (pos + len > inode->i_size)
1299 vmtruncate(inode, inode->i_size); 1293 ext3_truncate(inode);
1300 return ret ? ret : copied; 1294 return ret ? ret : copied;
1301} 1295}
1302 1296
@@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file,
1315 * There may be allocated blocks outside of i_size because 1309 * There may be allocated blocks outside of i_size because
1316 * we failed to copy some data. Prepare for truncate. 1310 * we failed to copy some data. Prepare for truncate.
1317 */ 1311 */
1318 if (pos + len > inode->i_size) 1312 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1319 ext3_orphan_add(handle, inode); 1313 ext3_orphan_add(handle, inode);
1320 ret = ext3_journal_stop(handle); 1314 ret = ext3_journal_stop(handle);
1321 unlock_page(page); 1315 unlock_page(page);
1322 page_cache_release(page); 1316 page_cache_release(page);
1323 1317
1324 if (pos + len > inode->i_size) 1318 if (pos + len > inode->i_size)
1325 vmtruncate(inode, inode->i_size); 1319 ext3_truncate(inode);
1326 return ret ? ret : copied; 1320 return ret ? ret : copied;
1327} 1321}
1328 1322
@@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file,
1358 * There may be allocated blocks outside of i_size because 1352 * There may be allocated blocks outside of i_size because
1359 * we failed to copy some data. Prepare for truncate. 1353 * we failed to copy some data. Prepare for truncate.
1360 */ 1354 */
1361 if (pos + len > inode->i_size) 1355 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1362 ext3_orphan_add(handle, inode); 1356 ext3_orphan_add(handle, inode);
1363 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1357 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1364 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1358 if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file,
1375 page_cache_release(page); 1369 page_cache_release(page);
1376 1370
1377 if (pos + len > inode->i_size) 1371 if (pos + len > inode->i_size)
1378 vmtruncate(inode, inode->i_size); 1372 ext3_truncate(inode);
1379 return ret ? ret : copied; 1373 return ret ? ret : copied;
1380} 1374}
1381 1375
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..f96f85092d1c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
287 struct page *new_page; 287 struct page *new_page;
288 unsigned int new_offset; 288 unsigned int new_offset;
289 struct buffer_head *bh_in = jh2bh(jh_in); 289 struct buffer_head *bh_in = jh2bh(jh_in);
290 journal_t *journal = transaction->t_journal;
290 291
291 /* 292 /*
292 * The buffer really shouldn't be locked: only the current committing 293 * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
300 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 301 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
301 302
302 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 303 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
304 /* keep subsequent assertions sane */
305 new_bh->b_state = 0;
306 init_buffer(new_bh, NULL, NULL);
307 atomic_set(&new_bh->b_count, 1);
308 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
303 309
304 /* 310 /*
305 * If a new transaction has already done a buffer copy-out, then 311 * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
361 kunmap_atomic(mapped_data, KM_USER0); 367 kunmap_atomic(mapped_data, KM_USER0);
362 } 368 }
363 369
364 /* keep subsequent assertions sane */
365 new_bh->b_state = 0;
366 init_buffer(new_bh, NULL, NULL);
367 atomic_set(&new_bh->b_count, 1);
368 jbd_unlock_bh_state(bh_in);
369
370 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
371
372 set_bh_page(new_bh, new_page, new_offset); 370 set_bh_page(new_bh, new_page, new_offset);
373 new_jh->b_transaction = NULL; 371 new_jh->b_transaction = NULL;
374 new_bh->b_size = jh2bh(jh_in)->b_size; 372 new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
385 * copying is moved to the transaction's shadow queue. 383 * copying is moved to the transaction's shadow queue.
386 */ 384 */
387 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 385 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
388 journal_file_buffer(jh_in, transaction, BJ_Shadow); 386 spin_lock(&journal->j_list_lock);
387 __journal_file_buffer(jh_in, transaction, BJ_Shadow);
388 spin_unlock(&journal->j_list_lock);
389 jbd_unlock_bh_state(bh_in);
390
389 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 391 JBUFFER_TRACE(new_jh, "file as BJ_IO");
390 journal_file_buffer(new_jh, transaction, BJ_IO); 392 journal_file_buffer(new_jh, transaction, BJ_IO);
391 393
@@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal)
848 850
849 first = be32_to_cpu(sb->s_first); 851 first = be32_to_cpu(sb->s_first);
850 last = be32_to_cpu(sb->s_maxlen); 852 last = be32_to_cpu(sb->s_maxlen);
853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
854 printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
855 first, last);
856 journal_fail_superblock(journal);
857 return -EINVAL;
858 }
851 859
852 journal->j_first = first; 860 journal->j_first = first;
853 journal->j_last = last; 861 journal->j_last = last;
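[Editorial note] The new journal_reset() check rejects a journal with fewer usable blocks than the minimum: `first + JFS_MIN_JOURNAL_BLOCKS > last + 1` is a rearrangement of `last - first + 1 < JFS_MIN_JOURNAL_BLOCKS`. A standalone sketch of the same inequality; the 1024 minimum below is an illustrative stand-in for the kernel constant.

    #include <stdio.h>

    #define MIN_JOURNAL_BLOCKS 1024UL   /* illustrative stand-in */

    /* Blocks first..last inclusive are usable; reject journals that are too
     * short.  first + MIN > last + 1  <=>  (last - first + 1) < MIN. */
    static int journal_long_enough(unsigned long first, unsigned long last)
    {
        return !(first + MIN_JOURNAL_BLOCKS > last + 1);
    }

    int main(void)
    {
        printf("first=1 last=1023 -> %d\n", journal_long_enough(1, 1023)); /* 0 */
        printf("first=1 last=1024 -> %d\n", journal_long_enough(1, 1024)); /* 1 */
        return 0;
    }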
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73242ba7c7b1..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
489 wake_up(&journal->j_wait_transaction_locked); 489 wake_up(&journal->j_wait_transaction_locked);
490} 490}
491 491
492/* 492static void warn_dirty_buffer(struct buffer_head *bh)
493 * Report any unexpected dirty buffers which turn up. Normally those
494 * indicate an error, but they can occur if the user is running (say)
495 * tune2fs to modify the live filesystem, so we need the option of
496 * continuing as gracefully as possible. #
497 *
498 * The caller should already hold the journal lock and
499 * j_list_lock spinlock: most callers will need those anyway
500 * in order to probe the buffer's journaling state safely.
501 */
502static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
503{ 493{
504 int jlist; 494 char b[BDEVNAME_SIZE];
505
506 /* If this buffer is one which might reasonably be dirty
507 * --- ie. data, or not part of this journal --- then
508 * we're OK to leave it alone, but otherwise we need to
509 * move the dirty bit to the journal's own internal
510 * JBDDirty bit. */
511 jlist = jh->b_jlist;
512 495
513 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 496 printk(KERN_WARNING
514 jlist == BJ_Shadow || jlist == BJ_Forget) { 497 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
515 struct buffer_head *bh = jh2bh(jh); 498 "There's a risk of filesystem corruption in case of system "
516 499 "crash.\n",
517 if (test_clear_buffer_dirty(bh)) 500 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
518 set_buffer_jbddirty(bh);
519 }
520} 501}
521 502
522/* 503/*
@@ -583,14 +564,16 @@ repeat:
583 if (jh->b_next_transaction) 564 if (jh->b_next_transaction)
584 J_ASSERT_JH(jh, jh->b_next_transaction == 565 J_ASSERT_JH(jh, jh->b_next_transaction ==
585 transaction); 566 transaction);
567 warn_dirty_buffer(bh);
586 } 568 }
587 /* 569 /*
588 * In any case we need to clean the dirty flag and we must 570 * In any case we need to clean the dirty flag and we must
589 * do it under the buffer lock to be sure we don't race 571 * do it under the buffer lock to be sure we don't race
590 * with running write-out. 572 * with running write-out.
591 */ 573 */
592 JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 574 JBUFFER_TRACE(jh, "Journalling dirty buffer");
593 jbd_unexpected_dirty_buffer(jh); 575 clear_buffer_dirty(bh);
576 set_buffer_jbddirty(bh);
594 } 577 }
595 578
596 unlock_buffer(bh); 579 unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
826 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 809 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
827 810
828 if (jh->b_transaction == NULL) { 811 if (jh->b_transaction == NULL) {
812 /*
813 * Previous journal_forget() could have left the buffer
814 * with jbddirty bit set because it was being committed. When
815 * the commit finished, we've filed the buffer for
816 * checkpointing and marked it dirty. Now we are reallocating
817 * the buffer so the transaction freeing it must have
818 * committed and so it's safe to clear the dirty bit.
819 */
820 clear_buffer_dirty(jh2bh(jh));
829 jh->b_transaction = transaction; 821 jh->b_transaction = transaction;
830 822
831 /* first access by this transaction */ 823 /* first access by this transaction */
@@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1782 1774
1783 if (jh->b_cp_transaction) { 1775 if (jh->b_cp_transaction) {
1784 JBUFFER_TRACE(jh, "on running+cp transaction"); 1776 JBUFFER_TRACE(jh, "on running+cp transaction");
1777 /*
1778 * We don't want to write the buffer anymore, clear the
1779 * bit so that we don't confuse checks in
1780 * __journal_file_buffer
1781 */
1782 clear_buffer_dirty(bh);
1785 __journal_file_buffer(jh, transaction, BJ_Forget); 1783 __journal_file_buffer(jh, transaction, BJ_Forget);
1786 clear_buffer_jbddirty(bh);
1787 may_free = 0; 1784 may_free = 0;
1788 } else { 1785 } else {
1789 JBUFFER_TRACE(jh, "on running transaction"); 1786 JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
2041 if (jh->b_transaction && jh->b_jlist == jlist) 2038 if (jh->b_transaction && jh->b_jlist == jlist)
2042 return; 2039 return;
2043 2040
2044 /* The following list of buffer states needs to be consistent
2045 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
2046 * state. */
2047
2048 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 2041 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2049 jlist == BJ_Shadow || jlist == BJ_Forget) { 2042 jlist == BJ_Shadow || jlist == BJ_Forget) {
2043 /*
2044 * For metadata buffers, we track dirty bit in buffer_jbddirty
2045 * instead of buffer_dirty. We should not see a dirty bit set
2046 * here because we clear it in do_get_write_access but e.g.
2047 * tune2fs can modify the sb and set the dirty bit at any time
2048 * so we try to gracefully handle that.
2049 */
2050 if (buffer_dirty(bh))
2051 warn_dirty_buffer(bh);
2050 if (test_clear_buffer_dirty(bh) || 2052 if (test_clear_buffer_dirty(bh) ||
2051 test_clear_buffer_jbddirty(bh)) 2053 test_clear_buffer_jbddirty(bh))
2052 was_dirty = 1; 2054 was_dirty = 1;
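[Editorial note] The warn_dirty_buffer()/__journal_file_buffer() changes concentrate one rule in one place: journaled metadata carries its dirtiness in the journal-private jbddirty bit, and an unexpected plain dirty bit only earns a warning before being transferred. A toy model of that flag transfer; the flag bits and struct are invented, not the kernel buffer_head API.

    #include <stdio.h>

    #define BH_DIRTY     (1u << 0)   /* generic "needs writeback" bit */
    #define BH_JBDDIRTY  (1u << 1)   /* journal-private dirty bit */

    struct sketch_bh { unsigned flags; unsigned long long blocknr; };

    static void warn_dirty(struct sketch_bh *bh)
    {
        fprintf(stderr, "sketch: dirty metadata buffer at block %llu, "
                "risk of corruption after a crash\n", bh->blocknr);
    }

    /* File a metadata buffer on a journal list: the generic dirty bit must not
     * stay set, so warn about it and carry the state in the journal's own bit. */
    static void file_metadata_buffer(struct sketch_bh *bh)
    {
        if (bh->flags & BH_DIRTY)
            warn_dirty(bh);
        if (bh->flags & (BH_DIRTY | BH_JBDDIRTY)) {
            bh->flags &= ~BH_DIRTY;
            bh->flags |= BH_JBDDIRTY;
        }
    }

    int main(void)
    {
        struct sketch_bh bh = { .flags = BH_DIRTY, .blocknr = 42 };
        file_metadata_buffer(&bh);
        printf("flags after filing: 0x%x\n", bh.flags);  /* 0x2: jbddirty only */
        return 0;
    }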
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 91fa3ad6e8c2..a29c7c3e3fb8 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
67 acl = posix_acl_from_xattr(value, size); 67 acl = posix_acl_from_xattr(value, size);
68 } 68 }
69 kfree(value); 69 kfree(value);
70 if (!IS_ERR(acl)) { 70 if (!IS_ERR(acl))
71 set_cached_acl(inode, type, acl); 71 set_cached_acl(inode, type, acl);
72 posix_acl_release(acl);
73 }
74 return acl; 72 return acl;
75} 73}
76 74
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c2d061675d80..8d25ccb2d51d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1242,20 +1242,6 @@ error:
1242 return error; 1242 return error;
1243} 1243}
1244 1244
1245/*
1246 * Initialize a session.
1247 * Note: save the mount rsize and wsize for create_server negotiation.
1248 */
1249static void nfs4_init_session(struct nfs_client *clp,
1250 unsigned int wsize, unsigned int rsize)
1251{
1252#if defined(CONFIG_NFS_V4_1)
1253 if (nfs4_has_session(clp)) {
1254 clp->cl_session->fc_attrs.max_rqst_sz = wsize;
1255 clp->cl_session->fc_attrs.max_resp_sz = rsize;
1256 }
1257#endif /* CONFIG_NFS_V4_1 */
1258}
1259 1245
1260/* 1246/*
1261 * Session has been established, and the client marked ready. 1247 * Session has been established, and the client marked ready.
@@ -1350,7 +1336,9 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1350 BUG_ON(!server->nfs_client->rpc_ops); 1336 BUG_ON(!server->nfs_client->rpc_ops);
1351 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1337 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1352 1338
1353 nfs4_init_session(server->nfs_client, server->wsize, server->rsize); 1339 error = nfs4_init_session(server);
1340 if (error < 0)
1341 goto error;
1354 1342
1355 /* Probe the root fh to retrieve its FSID */ 1343 /* Probe the root fh to retrieve its FSID */
1356 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); 1344 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 38d42c29fb92..32062c33c859 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1025 res = NULL;
1026 goto out; 1026 goto out;
1027 /* This turned out not to be a regular file */ 1027 /* This turned out not to be a regular file */
1028 case -EISDIR:
1029 case -ENOTDIR: 1028 case -ENOTDIR:
1030 goto no_open; 1029 goto no_open;
1031 case -ELOOP: 1030 case -ELOOP:
1032 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1031 if (!(nd->intent.open.flags & O_NOFOLLOW))
1033 goto no_open; 1032 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1034 /* case -EINVAL: */
1035 default: 1035 default:
1036 goto out; 1036 goto out;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 61bc3a32e1e2..6ea07a3c75d4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -220,6 +220,7 @@ extern void nfs4_destroy_session(struct nfs4_session *session);
220extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 220extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
221extern int nfs4_proc_create_session(struct nfs_client *, int reset); 221extern int nfs4_proc_create_session(struct nfs_client *, int reset);
222extern int nfs4_proc_destroy_session(struct nfs4_session *); 222extern int nfs4_proc_destroy_session(struct nfs4_session *);
223extern int nfs4_init_session(struct nfs_server *server);
223#else /* CONFIG_NFS_v4_1 */ 224#else /* CONFIG_NFS_v4_1 */
224static inline int nfs4_setup_sequence(struct nfs_client *clp, 225static inline int nfs4_setup_sequence(struct nfs_client *clp,
225 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 226 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
@@ -227,6 +228,11 @@ static inline int nfs4_setup_sequence(struct nfs_client *clp,
227{ 228{
228 return 0; 229 return 0;
229} 230}
231
232static inline int nfs4_init_session(struct nfs_server *server)
233{
234 return 0;
235}
230#endif /* CONFIG_NFS_V4_1 */ 236#endif /* CONFIG_NFS_V4_1 */
231 237
232extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; 238extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ff0c080db59b..6917311f201c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2040,15 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2040 .rpc_argp = &args, 2040 .rpc_argp = &args,
2041 .rpc_resp = &res, 2041 .rpc_resp = &res,
2042 }; 2042 };
2043 int status;
2044 2043
2045 nfs_fattr_init(info->fattr); 2044 nfs_fattr_init(info->fattr);
2046 status = nfs4_recover_expired_lease(server); 2045 return nfs4_call_sync(server, &msg, &args, &res, 0);
2047 if (!status)
2048 status = nfs4_check_client_ready(server->nfs_client);
2049 if (!status)
2050 status = nfs4_call_sync(server, &msg, &args, &res, 0);
2051 return status;
2052} 2046}
2053 2047
2054static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2048static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -4099,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
4099 if (request->fl_start < 0 || request->fl_end < 0) 4093 if (request->fl_start < 0 || request->fl_end < 0)
4100 return -EINVAL; 4094 return -EINVAL;
4101 4095
4102 if (IS_GETLK(cmd)) 4096 if (IS_GETLK(cmd)) {
4103 return nfs4_proc_getlk(state, F_GETLK, request); 4097 if (state != NULL)
4098 return nfs4_proc_getlk(state, F_GETLK, request);
4099 return 0;
4100 }
4104 4101
4105 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) 4102 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
4106 return -EINVAL; 4103 return -EINVAL;
4107 4104
4108 if (request->fl_type == F_UNLCK) 4105 if (request->fl_type == F_UNLCK) {
4109 return nfs4_proc_unlck(state, cmd, request); 4106 if (state != NULL)
4107 return nfs4_proc_unlck(state, cmd, request);
4108 return 0;
4109 }
4110 4110
4111 if (state == NULL)
4112 return -ENOLCK;
4111 do { 4113 do {
4112 status = nfs4_proc_setlk(state, cmd, request); 4114 status = nfs4_proc_setlk(state, cmd, request);
4113 if ((status != -EAGAIN) || IS_SETLK(cmd)) 4115 if ((status != -EAGAIN) || IS_SETLK(cmd))
@@ -4793,6 +4795,22 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
4793 return status; 4795 return status;
4794} 4796}
4795 4797
4798int nfs4_init_session(struct nfs_server *server)
4799{
4800 struct nfs_client *clp = server->nfs_client;
4801 int ret;
4802
4803 if (!nfs4_has_session(clp))
4804 return 0;
4805
4806 clp->cl_session->fc_attrs.max_rqst_sz = server->wsize;
4807 clp->cl_session->fc_attrs.max_resp_sz = server->rsize;
4808 ret = nfs4_recover_expired_lease(server);
4809 if (!ret)
4810 ret = nfs4_check_client_ready(clp);
4811 return ret;
4812}
4813
4796/* 4814/*
4797 * Renew the cl_session lease. 4815 * Renew the cl_session lease.
4798 */ 4816 */
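[Editorial note] nfs4_proc_lock now tolerates a NULL open state: GETLK and F_UNLCK become no-ops and only a new lock request fails, with -ENOLCK. A standalone sketch of that dispatch shape; it is a simplification for illustration, not the NFS client code.

    #include <stdio.h>
    #include <errno.h>

    enum lk_cmd { GETLK, SETLK, UNLCK };

    /* Guard a lock request that arrives with no open state: queries and
     * unlocks succeed trivially, taking a new lock cannot. */
    static int do_lock(enum lk_cmd cmd, const void *state)
    {
        switch (cmd) {
        case GETLK:
        case UNLCK:
            if (!state)
                return 0;        /* nothing is held, nothing to report or undo */
            break;
        case SETLK:
            if (!state)
                return -ENOLCK;  /* cannot take a lock without open state */
            break;
        }
        return 0;                /* real work would happen here */
    }

    int main(void)
    {
        printf("GETLK, no state: %d\n", do_lock(GETLK, NULL));
        printf("SETLK, no state: %d\n", do_lock(SETLK, NULL));
        return 0;
    }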
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index b73c5a728655..65ca8c18476f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -553,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
553 INIT_LIST_HEAD(&lsp->ls_sequence.list); 553 INIT_LIST_HEAD(&lsp->ls_sequence.list);
554 lsp->ls_seqid.sequence = &lsp->ls_sequence; 554 lsp->ls_seqid.sequence = &lsp->ls_sequence;
555 atomic_set(&lsp->ls_count, 1); 555 atomic_set(&lsp->ls_count, 1);
556 lsp->ls_state = state;
556 lsp->ls_owner = fl_owner; 557 lsp->ls_owner = fl_owner;
557 spin_lock(&clp->cl_lock); 558 spin_lock(&clp->cl_lock);
558 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 559 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
@@ -587,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
587 if (lsp != NULL) 588 if (lsp != NULL)
588 break; 589 break;
589 if (new != NULL) { 590 if (new != NULL) {
590 new->ls_state = state;
591 list_add(&new->ls_locks, &state->lock_states); 591 list_add(&new->ls_locks, &state->lock_states);
592 set_bit(LK_STATE_IN_USE, &state->flags); 592 set_bit(LK_STATE_IN_USE, &state->flags);
593 lsp = new; 593 lsp = new;
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
new file mode 100644
index 000000000000..72da095d4009
--- /dev/null
+++ b/fs/nilfs2/Kconfig
@@ -0,0 +1,25 @@
1config NILFS2_FS
2 tristate "NILFS2 file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select CRC32
5 help
6 NILFS2 is a log-structured file system (LFS) supporting continuous
7 snapshotting. In addition to versioning capability of the entire
8 file system, users can even restore files mistakenly overwritten or
9 destroyed just a few seconds ago. Since this file system can keep
10 consistency like conventional LFS, it achieves quick recovery after
11 system crashes.
12
13 NILFS2 creates a number of checkpoints every few seconds or per
14 synchronous write basis (unless there is no change). Users can
15 select significant versions among continuously created checkpoints,
16 and can change them into snapshots which will be preserved for long
17 periods until they are changed back to checkpoints. Each
18 snapshot is mountable as a read-only file system concurrently with
19 its writable mount, and this feature is convenient for online backup.
20
21 Some features including atime, extended attributes, and POSIX ACLs,
22 are not supported yet.
23
24 To compile this file system support as a module, choose M here: the
25 module will be called nilfs2. If unsure, say N.
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 31dac7e3b0f1..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,15 +1,5 @@
1config FSNOTIFY 1config FSNOTIFY
2 bool "Filesystem notification backend" 2 def_bool n
3 default y
4 ---help---
5 fsnotify is a backend for filesystem notification. fsnotify does
6 not provide any userspace interface but does provide the basis
7 needed for other notification schemes such as dnotify, inotify,
8 and fanotify.
9
10 Say Y here to enable fsnotify suport.
11
12 If unsure, say Y.
13 3
14source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
15source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 904ff8d5405a..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,6 +1,6 @@
1config DNOTIFY 1config DNOTIFY
2 bool "Dnotify support" 2 bool "Dnotify support"
3 depends on FSNOTIFY 3 select FSNOTIFY
4 default y 4 default y
5 help 5 help
6 Dnotify is a directory-based per-fd file change notification system 6 Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ec2f7bd76818..037e878e03fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
159 if (!group->ops->should_send_event(group, to_tell, mask)) 159 if (!group->ops->should_send_event(group, to_tell, mask))
160 continue; 160 continue;
161 if (!event) { 161 if (!event) {
162 event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); 162 event = fsnotify_create_event(to_tell, mask, data,
163 data_is, file_name, cookie,
164 GFP_KERNEL);
163 /* shit, we OOM'd and now we can't tell, maybe 165 /* shit, we OOM'd and now we can't tell, maybe
164 * someday someone else will want to do something 166 * someday someone else will want to do something
165 * here */ 167 * here */
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 5356884289a1..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,7 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 depends on FSNOTIFY 18 select FSNOTIFY
19 default y 19 default y
20 ---help--- 20 ---help---
21 Say Y here to enable inotify support for userspace, including the 21 Say Y here to enable inotify support for userspace, including the
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff27a2965844..f30d9bbc2e1b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly;
57 57
58static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 58static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
59struct kmem_cache *event_priv_cachep __read_mostly; 59struct kmem_cache *event_priv_cachep __read_mostly;
60static struct fsnotify_event *inotify_ignored_event;
61 60
62/* 61/*
63 * When inotify registers a new group it increments this and uses that 62 * When inotify registers a new group it increments this and uses that
@@ -365,6 +364,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
365 return error; 364 return error;
366} 365}
367 366
367static void inotify_remove_from_idr(struct fsnotify_group *group,
368 struct inotify_inode_mark_entry *ientry)
369{
370 struct idr *idr;
371
372 spin_lock(&group->inotify_data.idr_lock);
373 idr = &group->inotify_data.idr;
374 idr_remove(idr, ientry->wd);
375 spin_unlock(&group->inotify_data.idr_lock);
376 ientry->wd = -1;
377}
368/* 378/*
369 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the 379 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
370 * internal reference help on the mark because it is in the idr. 380 * internal reference help on the mark because it is in the idr.
@@ -373,13 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
373 struct fsnotify_group *group) 383 struct fsnotify_group *group)
374{ 384{
375 struct inotify_inode_mark_entry *ientry; 385 struct inotify_inode_mark_entry *ientry;
386 struct fsnotify_event *ignored_event;
376 struct inotify_event_private_data *event_priv; 387 struct inotify_event_private_data *event_priv;
377 struct fsnotify_event_private_data *fsn_event_priv; 388 struct fsnotify_event_private_data *fsn_event_priv;
378 struct idr *idr; 389
390 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
391 FSNOTIFY_EVENT_NONE, NULL, 0,
392 GFP_NOFS);
393 if (!ignored_event)
394 return;
379 395
380 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 396 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
381 397
382 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 398 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
383 if (unlikely(!event_priv)) 399 if (unlikely(!event_priv))
384 goto skip_send_ignore; 400 goto skip_send_ignore;
385 401
@@ -388,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
388 fsn_event_priv->group = group; 404 fsn_event_priv->group = group;
389 event_priv->wd = ientry->wd; 405 event_priv->wd = ientry->wd;
390 406
391 fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); 407 fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
392 408
393 /* did the private data get added? */ 409 /* did the private data get added? */
394 if (list_empty(&fsn_event_priv->event_list)) 410 if (list_empty(&fsn_event_priv->event_list))
@@ -396,14 +412,16 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
396 412
397skip_send_ignore: 413skip_send_ignore:
398 414
415 /* matches the reference taken when the event was created */
416 fsnotify_put_event(ignored_event);
417
399 /* remove this entry from the idr */ 418 /* remove this entry from the idr */
400 spin_lock(&group->inotify_data.idr_lock); 419 inotify_remove_from_idr(group, ientry);
401 idr = &group->inotify_data.idr;
402 idr_remove(idr, ientry->wd);
403 spin_unlock(&group->inotify_data.idr_lock);
404 420
405 /* removed from idr, drop that reference */ 421 /* removed from idr, drop that reference */
406 fsnotify_put_mark(entry); 422 fsnotify_put_mark(entry);
423
424 atomic_dec(&group->inotify_data.user->inotify_watches);
407} 425}
408 426
409/* ding dong the mark is dead */ 427/* ding dong the mark is dead */
@@ -418,6 +436,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
418{ 436{
419 struct fsnotify_mark_entry *entry = NULL; 437 struct fsnotify_mark_entry *entry = NULL;
420 struct inotify_inode_mark_entry *ientry; 438 struct inotify_inode_mark_entry *ientry;
439 struct inotify_inode_mark_entry *tmp_ientry;
421 int ret = 0; 440 int ret = 0;
422 int add = (arg & IN_MASK_ADD); 441 int add = (arg & IN_MASK_ADD);
423 __u32 mask; 442 __u32 mask;
@@ -428,54 +447,66 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
428 if (unlikely(!mask)) 447 if (unlikely(!mask))
429 return -EINVAL; 448 return -EINVAL;
430 449
431 ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); 450 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
432 if (unlikely(!ientry)) 451 if (unlikely(!tmp_ientry))
433 return -ENOMEM; 452 return -ENOMEM;
434 /* we set the mask at the end after attaching it */ 453 /* we set the mask at the end after attaching it */
435 fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); 454 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
436 ientry->wd = 0; 455 tmp_ientry->wd = -1;
437 456
438find_entry: 457find_entry:
439 spin_lock(&inode->i_lock); 458 spin_lock(&inode->i_lock);
440 entry = fsnotify_find_mark_entry(group, inode); 459 entry = fsnotify_find_mark_entry(group, inode);
441 spin_unlock(&inode->i_lock); 460 spin_unlock(&inode->i_lock);
442 if (entry) { 461 if (entry) {
443 kmem_cache_free(inotify_inode_mark_cachep, ientry);
444 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 462 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
445 } else { 463 } else {
446 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { 464 ret = -ENOSPC;
447 ret = -ENOSPC; 465 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
448 goto out_err;
449 }
450
451 ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
452 if (ret == -EEXIST)
453 goto find_entry;
454 else if (ret)
455 goto out_err; 466 goto out_err;
456
457 entry = &ientry->fsn_entry;
458retry: 467retry:
459 ret = -ENOMEM; 468 ret = -ENOMEM;
460 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) 469 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
461 goto out_err; 470 goto out_err;
462 471
463 spin_lock(&group->inotify_data.idr_lock); 472 spin_lock(&group->inotify_data.idr_lock);
464 /* if entry is added to the idr we keep the reference obtained 473 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
465 * through fsnotify_mark_add. remember to drop this reference 474 group->inotify_data.last_wd,
466 * when entry is removed from idr */ 475 &tmp_ientry->wd);
467 ret = idr_get_new_above(&group->inotify_data.idr, entry,
468 ++group->inotify_data.last_wd,
469 &ientry->wd);
470 spin_unlock(&group->inotify_data.idr_lock); 476 spin_unlock(&group->inotify_data.idr_lock);
471 if (ret) { 477 if (ret) {
472 if (ret == -EAGAIN) 478 if (ret == -EAGAIN)
473 goto retry; 479 goto retry;
474 goto out_err; 480 goto out_err;
475 } 481 }
482
483 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
484 if (ret) {
485 inotify_remove_from_idr(group, tmp_ientry);
486 if (ret == -EEXIST)
487 goto find_entry;
488 goto out_err;
489 }
490
491 /* tmp_ientry has been added to the inode, so we are all set up.
492 * now we just need to make sure tmp_ientry doesn't get freed and
493 * we need to set up entry and ientry so the generic code can
494 * do its thing. */
495 ientry = tmp_ientry;
496 entry = &ientry->fsn_entry;
497 tmp_ientry = NULL;
498
476 atomic_inc(&group->inotify_data.user->inotify_watches); 499 atomic_inc(&group->inotify_data.user->inotify_watches);
500
501 /* update the idr hint */
502 group->inotify_data.last_wd = ientry->wd;
503
504 /* we put the mark on the idr, take a reference */
505 fsnotify_get_mark(entry);
477 } 506 }
478 507
508 ret = ientry->wd;
509
479 spin_lock(&entry->lock); 510 spin_lock(&entry->lock);
480 511
481 old_mask = entry->mask; 512 old_mask = entry->mask;
@@ -506,14 +537,19 @@ retry:
506 fsnotify_recalc_group_mask(group); 537 fsnotify_recalc_group_mask(group);
507 } 538 }
508 539
509 return ientry->wd; 540 /* this either matches fsnotify_find_mark_entry, or init_mark_entry
541 * depending on which path we took... */
542 fsnotify_put_mark(entry);
510 543
511out_err: 544out_err:
512 /* see this isn't supposed to happen, just kill the watch */ 545 /* could be an error, could be that we found an existing mark */
513 if (entry) { 546 if (tmp_ientry) {
514 fsnotify_destroy_mark_by_entry(entry); 547 /* on the idr but didn't make it on the inode */
515 fsnotify_put_mark(entry); 548 if (tmp_ientry->wd != -1)
549 inotify_remove_from_idr(group, tmp_ientry);
550 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
516 } 551 }
552
517 return ret; 553 return ret;
518} 554}
519 555
@@ -721,9 +757,6 @@ static int __init inotify_user_setup(void)
721 757
722 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 758 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
723 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 759 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
724 inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
725 if (!inotify_ignored_event)
726 panic("unable to allocate the inotify ignored event\n");
727 760
728 inotify_max_queued_events = 16384; 761 inotify_max_queued_events = 16384;
729 inotify_max_user_instances = 128; 762 inotify_max_user_instances = 128;
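[Editorial note] The reworked inotify_update_watch() publishes the watch descriptor in the idr first, attaches the mark to the inode second, and unwinds the idr entry if the attach fails (wd == -1 marks "not in the idr"). A toy two-step registration with the same rollback shape; all helpers and the fixed-size slot table are invented for illustration.

    #include <stdio.h>
    #include <errno.h>

    #define MAX_WD 8

    static void *idr_slots[MAX_WD];          /* toy stand-in for the kernel idr */

    static int toy_idr_get_new(void *ptr)     /* returns a wd or -ENOSPC */
    {
        for (int wd = 0; wd < MAX_WD; wd++)
            if (!idr_slots[wd]) { idr_slots[wd] = ptr; return wd; }
        return -ENOSPC;
    }

    static void toy_idr_remove(int wd) { idr_slots[wd] = NULL; }

    /* Pretend attach to the inode; always fails to show the unwind path. */
    static int toy_attach_to_inode(void *mark) { (void)mark; return -EEXIST; }

    static int add_watch(void *mark)
    {
        int wd = toy_idr_get_new(mark);       /* step 1: publish in the idr */
        if (wd < 0)
            return wd;

        int ret = toy_attach_to_inode(mark);  /* step 2: attach to the inode */
        if (ret) {
            toy_idr_remove(wd);               /* unwind step 1 on failure */
            return ret;
        }
        return wd;                            /* success: the wd is the handle */
    }

    int main(void)
    {
        int mark;                             /* stand-in object */
        printf("add_watch -> %d\n", add_watch(&mark));
        printf("slot 0 after unwind: %p\n", idr_slots[0]);   /* back to NULL */
        return 0;
    }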
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 959b73e756fd..521368574e97 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -136,18 +136,24 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
136{ 136{
137 if ((old->mask == new->mask) && 137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) && 138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type)) { 139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
140 switch (old->data_type) { 141 switch (old->data_type) {
141 case (FSNOTIFY_EVENT_INODE): 142 case (FSNOTIFY_EVENT_INODE):
142 if (old->inode == new->inode) 143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */
146 if (old->name_len &&
147 !strcmp(old->file_name, new->file_name))
143 return true; 148 return true;
144 break; 149 break;
145 case (FSNOTIFY_EVENT_PATH): 150 case (FSNOTIFY_EVENT_PATH):
146 if ((old->path.mnt == new->path.mnt) && 151 if ((old->path.mnt == new->path.mnt) &&
147 (old->path.dentry == new->path.dentry)) 152 (old->path.dentry == new->path.dentry))
148 return true; 153 return true;
154 break;
149 case (FSNOTIFY_EVENT_NONE): 155 case (FSNOTIFY_EVENT_NONE):
150 return true; 156 return false;
151 }; 157 };
152 } 158 }
153 return false; 159 return false;
@@ -339,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event)
339 * @name the filename, if available 345 * @name the filename, if available
340 */ 346 */
341struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 347struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
342 int data_type, const char *name, u32 cookie) 348 int data_type, const char *name, u32 cookie,
349 gfp_t gfp)
343{ 350{
344 struct fsnotify_event *event; 351 struct fsnotify_event *event;
345 352
346 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); 353 event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
347 if (!event) 354 if (!event)
348 return NULL; 355 return NULL;
349 356
350 initialize_event(event); 357 initialize_event(event);
351 358
352 if (name) { 359 if (name) {
353 event->file_name = kstrdup(name, GFP_KERNEL); 360 event->file_name = kstrdup(name, gfp);
354 if (!event->file_name) { 361 if (!event->file_name) {
355 kmem_cache_free(fsnotify_event_cachep, event); 362 kmem_cache_free(fsnotify_event_cachep, event);
356 return NULL; 363 return NULL;
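[Editorial note] event_compare() now also requires equal name_len and, for inode events, an identical file_name, because once the old event sits on the queue its inode may no longer be safe to dereference. A small sketch of merging queued events only on fields that stay valid; the types are toys, not the fsnotify structs.

    #include <stdio.h>
    #include <string.h>

    struct toy_event {
        unsigned mask;
        size_t name_len;
        const char *file_name;   /* may be "" when no name was recorded */
    };

    /* Decide whether a new event can be merged with one already queued.
     * Only fields that remain valid later are compared, which is why the
     * recorded name is used rather than the object it referred to. */
    static int can_merge(const struct toy_event *old, const struct toy_event *new)
    {
        if (old->mask != new->mask || old->name_len != new->name_len)
            return 0;
        return old->name_len && strcmp(old->file_name, new->file_name) == 0;
    }

    int main(void)
    {
        struct toy_event a = { 0x100, 5, "a.txt" };
        struct toy_event b = { 0x100, 5, "a.txt" };
        struct toy_event c = { 0x100, 5, "b.txt" };
        printf("a/b merge: %d\n", can_merge(&a, &b));   /* 1 */
        printf("a/c merge: %d\n", can_merge(&a, &c));   /* 0 */
        return 0;
    }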
diff --git a/fs/pipe.c b/fs/pipe.c
index f7dd21ad85a6..52c415114838 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else { 70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 71 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 72 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
73 } 73 }
74} 74}
75 75
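[Editorial note] The pipe_double_lock change keeps the nesting annotation tied to acquisition order: whichever lock is taken first is annotated as the parent. The sketch below shows the related userspace discipline of always taking two locks in one global order; it uses plain pthread mutexes and is an analogy, not the kernel's lockdep annotation mechanism.

    #include <pthread.h>
    #include <stdio.h>

    /* Lock two pipe-like objects in a single global order (here: by address),
     * so every caller nests them the same way and cannot deadlock. */
    struct toy_pipe { pthread_mutex_t lock; };

    static void toy_double_lock(struct toy_pipe *a, struct toy_pipe *b)
    {
        struct toy_pipe *first  = a < b ? a : b;
        struct toy_pipe *second = a < b ? b : a;
        pthread_mutex_lock(&first->lock);    /* always the "parent" */
        pthread_mutex_lock(&second->lock);   /* always the "child"  */
    }

    static void toy_double_unlock(struct toy_pipe *a, struct toy_pipe *b)
    {
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct toy_pipe p1 = { PTHREAD_MUTEX_INITIALIZER };
        struct toy_pipe p2 = { PTHREAD_MUTEX_INITIALIZER };
        toy_double_lock(&p2, &p1);           /* argument order does not matter */
        puts("both locked in a consistent order");
        toy_double_unlock(&p1, &p2);
        return 0;
    }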