Merge branch 'master' into next

author: James Morris <jmorris@namei.org> 2009-08-05 18:55:03 -0400
committer: James Morris <jmorris@namei.org> 2009-08-05 18:55:03 -0400
commit: 012a5299a29672039f42944a37984558393ef769 (patch)
tree: de0815c67cf4156c32c8b552cd7448387cc391b0 /fs
parent: da34d4248bd2013ee64ce51e63ec0ebd1f32b46c (diff)
parent: 90bc1a658a53f8832ee799685703977a450e5af9 (diff)
77 files changed, 2198 insertions, 1118 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6fcb1e7095cf..92828281a30b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -57,7 +57,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
        buffer = kmap(page);
        offset = page_offset(page);
-        retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE);
+        retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
        if (retval < 0)
                goto done;
diff --git a/fs/Kconfig b/fs/Kconfig
index a97263be6a91..0e7da7bb5d93 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -186,32 +186,7 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
+source "fs/nilfs2/Kconfig"
-config NILFS2_FS
-        tristate "NILFS2 file system support (EXPERIMENTAL)"
-        depends on BLOCK && EXPERIMENTAL
-        select CRC32
-        help
-          NILFS2 is a log-structured file system (LFS) supporting continuous
-          snapshotting.  In addition to versioning capability of the entire
-          file system, users can even restore files mistakenly overwritten or
-          destroyed just a few seconds ago.  Since this file system can keep
-          consistency like conventional LFS, it achieves quick recovery after
-          system crashes.
-          NILFS2 creates a number of checkpoints every few seconds or per
-          synchronous write basis (unless there is no change).  Users can
-          select significant versions among continuously created checkpoints,
-          and can change them into snapshots which will be preserved for long
-          periods until they are changed back to checkpoints.  Each
-          snapshot is mountable as a read-only file system concurrently with
-          its writable mount, and this feature is convenient for online backup.
-          Some features including atime, extended attributes, and POSIX ACLs,
-          are not supported yet.
-          To compile this file system support as a module, choose M here: the
-          module will be called nilfs2.  If unsure, say N.
 endif # MISC_FILESYSTEMS
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3a6d4fb2a329..94dfda24c06e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -564,6 +564,16 @@ struct block_device *bdget(dev_t dev)
 EXPORT_SYMBOL(bdget);
+/**
+ * bdgrab -- Grab a reference to an already referenced block device
+ * @bdev:       Block device to grab a reference to.
+ *
+ * Increments the i_count of the inode behind @bdev (bdev->bd_inode) and
+ * returns the same @bdev pointer.  Nothing is looked up or allocated here.
+ */
+struct block_device *bdgrab(struct block_device *bdev)
+{
+        atomic_inc(&bdev->bd_inode->i_count);
+        return bdev;
+}
 long nr_blockdev_pages(void)
 {
        struct block_device *bdev;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 6e4f6c50a120..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
         * list
         */
        if (worker->idle) {
-                spin_lock_irqsave(&worker->workers->lock, flags);
+                spin_lock(&worker->workers->lock);
                worker->idle = 0;
                list_move_tail(&worker->worker_list,
                               &worker->workers->worker_list);
-                spin_unlock_irqrestore(&worker->workers->lock, flags);
+                spin_unlock(&worker->workers->lock);
        }
        if (!worker->working) {
                wake = 1;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
        btrfs_disk_key_to_cpu(&k1, disk);
-        if (k1.objectid > k2->objectid)
+        return btrfs_comp_cpu_keys(&k1, k2);
-                return 1;
-        if (k1.objectid < k2->objectid)
-                return -1;
-        if (k1.type > k2->type)
-                return 1;
-        if (k1.type < k2->type)
-                return -1;
-        if (k1.offset > k2->offset)
-                return 1;
-        if (k1.offset < k2->offset)
-                return -1;
-        return 0;
 }
 /*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
                return 0;
-        if (btrfs_header_nritems(mid) > 2)
-                return 0;
        if (btrfs_header_nritems(mid) < 2)
                err_on_enospc = 1;
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct extent_buffer *b;
        int slot;
        int ret;
+        int err;
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
                        p->locks[level] = 1;
                if (cow) {
-                        int wret;
                        /*
                         * if we don't really need to cow this block
                         * then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
                        btrfs_set_path_blocking(p);
-                        wret = btrfs_cow_block(trans, root, b,
+                        err = btrfs_cow_block(trans, root, b,
-                                               p->nodes[level + 1],
+                                              p->nodes[level + 1],
-                                               p->slots[level + 1], &b);
+                                              p->slots[level + 1], &b);
-                        if (wret) {
+                        if (err) {
                                free_extent_buffer(b);
-                                ret = wret;
+                                ret = err;
                                goto done;
                        }
                }
@@ -1793,41 +1777,45 @@ cow_done:
                ret = bin_search(b, key, level, &slot);
                if (level != 0) {
-                        if (ret && slot > 0)
+                        int dec = 0;
+                        if (ret && slot > 0) {
+                                dec = 1;
                                slot -= 1;
+                        }
                        p->slots[level] = slot;
-                        ret = setup_nodes_for_search(trans, root, p, b, level,
+                        err = setup_nodes_for_search(trans, root, p, b, level,
                                                     ins_len);
-                        if (ret == -EAGAIN)
+                        if (err == -EAGAIN)
                                goto again;
-                        else if (ret)
+                        if (err) {
+                                ret = err;
                                goto done;
+                        }
                        b = p->nodes[level];
                        slot = p->slots[level];
                        unlock_up(p, level, lowest_unlock);
-                        /* this is only true while dropping a snapshot */
                        if (level == lowest_level) {
-                                ret = 0;
+                                if (dec)
+                                        p->slots[level]++;
                                goto done;
                        }
-                        ret = read_block_for_search(trans, root, p,
+                        err = read_block_for_search(trans, root, p,
                                                    &b, level, slot, key);
-                        if (ret == -EAGAIN)
+                        if (err == -EAGAIN)
                                goto again;
+                        if (err) {
-                        if (ret == -EIO)
+                                ret = err;
                                goto done;
+                        }
                        if (!p->skip_locking) {
-                                int lret;
                                btrfs_clear_path_blocking(p, NULL);
-                                lret = btrfs_try_spin_lock(b);
+                                err = btrfs_try_spin_lock(b);
-                                if (!lret) {
+                                if (!err) {
                                        btrfs_set_path_blocking(p);
                                        btrfs_tree_lock(b);
                                        btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
                        p->slots[level] = slot;
                        if (ins_len > 0 &&
                            btrfs_leaf_free_space(root, b) < ins_len) {
-                                int sret;
                                btrfs_set_path_blocking(p);
-                                sret = split_leaf(trans, root, key,
+                                err = split_leaf(trans, root, key,
-                                                      p, ins_len, ret == 0);
+                                                 p, ins_len, ret == 0);
                                btrfs_clear_path_blocking(p, NULL);
-                                BUG_ON(sret > 0);
+                                BUG_ON(err > 0);
-                                if (sret) {
+                                if (err) {
-                                        ret = sret;
+                                        ret = err;
                                        goto done;
                                }
                        }
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                }
                /* delete the leaf if it is mostly empty */
-                if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
+                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
 * calling this function.
 */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-                        struct btrfs_key *key, int lowest_level,
+                        struct btrfs_key *key, int level,
                        int cache_only, u64 min_trans)
 {
-        int level = lowest_level;
        int slot;
        struct extent_buffer *c;
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                c = path->nodes[level];
 next:
                if (slot >= btrfs_header_nritems(c)) {
-                        level++;
+                        int ret;
-                        if (level == BTRFS_MAX_LEVEL)
+                        int orig_lowest;
+                        struct btrfs_key cur_key;
+                        if (level + 1 >= BTRFS_MAX_LEVEL ||
+                            !path->nodes[level + 1])
                                return 1;
-                        continue;
+                        if (path->locks[level + 1]) {
+                                level++;
+                                continue;
+                        }
+                        slot = btrfs_header_nritems(c) - 1;
+                        if (level == 0)
+                                btrfs_item_key_to_cpu(c, &cur_key, slot);
+                        else
+                                btrfs_node_key_to_cpu(c, &cur_key, slot);
+                        orig_lowest = path->lowest_level;
+                        btrfs_release_path(root, path);
+                        path->lowest_level = level;
+                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
+                                                0, 0);
+                        path->lowest_level = orig_lowest;
+                        if (ret < 0)
+                                return ret;
+                        c = path->nodes[level];
+                        slot = path->slots[level];
+                        if (ret == 0)
+                                slot++;
+                        goto next;
                }
                if (level == 0)
                        btrfs_item_key_to_cpu(c, key, slot);
                else {
@@ -4146,7 +4160,8 @@ again:
         * advance the path if there are now more items available.
         */
        if (nritems > 0 && path->slots[0] < nritems - 1) {
-                path->slots[0]++;
+                if (ret == 0)
+                        path->slots[0]++;
                ret = 0;
                goto done;
        }
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
                        path->slots[0]--;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                if (found_key.type == type)
-                        return 0;
                if (found_key.objectid < min_objectid)
                        break;
+                if (found_key.type == type)
+                        return 0;
                if (found_key.objectid == min_objectid &&
                    found_key.type < type)
                        break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a873838717..837435ce84ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {
 struct btrfs_extent_inline_ref {
        u8 type;
-        u64 offset;
+        __le64 offset;
 } __attribute__ ((__packed__));
 /* old style backrefs item */
@@ -689,6 +689,7 @@ struct btrfs_space_info {
        struct list_head block_groups;
        spinlock_t lock;
        struct rw_semaphore groups_sem;
+        atomic_t caching_threads;
 };
 /*
@@ -707,6 +708,9 @@ struct btrfs_free_cluster {
        /* first extent starting offset */
        u64 window_start;
+        /* if this cluster simply points at a bitmap in the block group */
+        bool points_to_bitmap;
        struct btrfs_block_group_cache *block_group;
        /*
         * when a cluster is allocated from a block group, we put the
@@ -716,24 +720,37 @@ struct btrfs_free_cluster {
        struct list_head block_group_list;
 };
+/*
+ * Free space caching state of a block group, kept in
+ * btrfs_block_group_cache->cached.
+ */
+enum btrfs_caching_type {
+        /* no caching thread has been started for the block group */
+        BTRFS_CACHE_NO          = 0,
+        /* set by cache_block_group() right before it spawns the thread */
+        BTRFS_CACHE_STARTED     = 1,
+        /* set by caching_kthread() once it has walked the block group */
+        BTRFS_CACHE_FINISHED    = 2,
+};
 struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
+        struct btrfs_fs_info *fs_info;
        spinlock_t lock;
-        struct mutex cache_mutex;
        u64 pinned;
        u64 reserved;
        u64 flags;
-        int cached;
+        u64 sectorsize;
+        int extents_thresh;
+        int free_extents;
+        int total_bitmaps;
        int ro;
        int dirty;
+        /* cache tracking stuff */
+        wait_queue_head_t caching_q;
+        int cached;
        struct btrfs_space_info *space_info;
        /* free space cache stuff */
        spinlock_t tree_lock;
-        struct rb_root free_space_bytes;
        struct rb_root free_space_offset;
+        u64 free_space;
        /* block group cache stuff */
        struct rb_node cache_node;
@@ -808,6 +825,7 @@ struct btrfs_fs_info {
        struct mutex drop_mutex;
        struct mutex volume_mutex;
        struct mutex tree_reloc_mutex;
+        struct rw_semaphore extent_commit_sem;
        /*
         * this protects the ordered operations list only while we are
@@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
                                 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
                              u64 bytes);
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7c..e83be2e4602c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1639,6 +1639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        mutex_init(&fs_info->cleaner_mutex);
        mutex_init(&fs_info->volume_mutex);
        mutex_init(&fs_info->tree_reloc_mutex);
+        init_rwsem(&fs_info->extent_commit_sem);
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1799,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                           btrfs_super_chunk_root(disk_super),
                                           blocksize, generation);
        BUG_ON(!chunk_root->node);
+        if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+                printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+                       sb->s_id);
+                goto fail_chunk_root;
+        }
        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
        chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1826,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                          blocksize, generation);
        if (!tree_root->node)
                goto fail_chunk_root;
+        if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+                printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+                       sb->s_id);
+                goto fail_tree_root;
+        }
        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
        tree_root->commit_root = btrfs_root_node(tree_root);
@@ -2322,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
+        fs_info->closing = 2;
+        smp_mb();
        if (fs_info->delalloc_bytes) {
                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                       (unsigned long long)fs_info->delalloc_bytes);
@@ -2343,6 +2357,7 @@ int close_ctree(struct btrfs_root *root)
        free_extent_buffer(root->fs_info->csum_root->commit_root);
        btrfs_free_block_groups(root->fs_info);
+        btrfs_free_pinned_extents(root->fs_info);
        del_fs_roots(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5aca3997d42..dc84daee6bc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
+#include <linux/kthread.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
+/*
+ * Tell whether free space caching of @cache has completed.  A full memory
+ * barrier is issued before ->cached is read.  Returns non-zero only when
+ * the state is BTRFS_CACHE_FINISHED.
+ */
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+        smp_mb();
+        return cache->cached == BTRFS_CACHE_FINISHED;
+}
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
        return (cache->flags & bits) == bits;
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 }
 /*
+ * We always set EXTENT_LOCKED for the super mirror extents so we don't
+ * overwrite them, so those bits need to be unset.  Also, if we are unmounting
+ * with pinned extents still sitting there because a block group was still
+ * being cached, we need to clear those now, since we are done.
+ *
+ * Scans info->pinned_extents starting at offset 0 and clears both
+ * EXTENT_LOCKED and EXTENT_DIRTY from each range that
+ * find_first_extent_bit() reports.
+ */
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+{
+        u64 start, end, last = 0;
+        int ret;
+        while (1) {
+                ret = find_first_extent_bit(&info->pinned_extents, last,
+                                            &start, &end,
+                                            EXTENT_LOCKED|EXTENT_DIRTY);
+                /* any non-zero return ends the scan */
+                if (ret)
+                        break;
+                clear_extent_bits(&info->pinned_extents, start, end,
+                                  EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
+                /* continue just past the range that was cleared */
+                last = end+1;
+        }
+}
+/*
+ * For each of the BTRFS_SUPER_MIRROR_MAX super block copies, hand
+ * cache->key.objectid and btrfs_sb_offset(i) to btrfs_rmap_block(), which
+ * gives back an array of nr logical addresses plus a stripe length, and
+ * call try_lock_extent() on every [logical, logical + stripe_len - 1] range
+ * in fs_info->pinned_extents.
+ *
+ * NOTE(review): presumably this is what sets EXTENT_LOCKED on the super
+ * mirror extents so that add_new_free_space() skips them -- confirm against
+ * try_lock_extent().  Its return value is not checked here.
+ *
+ * A btrfs_rmap_block() failure trips BUG_ON(); otherwise 0 is returned.
+ */
+static int remove_sb_from_cache(struct btrfs_root *root,
+                                struct btrfs_block_group_cache *cache)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        u64 bytenr;
+        u64 *logical;
+        int stripe_len;
+        int i, nr, ret;
+        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+                bytenr = btrfs_sb_offset(i);
+                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+                                       cache->key.objectid, bytenr,
+                                       0, &logical, &nr, &stripe_len);
+                BUG_ON(ret);
+                /* walk the returned addresses from the last one down */
+                while (nr--) {
+                        try_lock_extent(&fs_info->pinned_extents,
+                                        logical[nr],
+                                        logical[nr] + stripe_len - 1, GFP_NOFS);
+                }
+                /* the address array is released here, once per mirror */
+                kfree(logical);
+        }
+        return 0;
+}
+/*
 * this is only called by cache_block_group, since we could have freed extents
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
-static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
 {
-        u64 extent_start, extent_end, size;
+        u64 extent_start, extent_end, size, total_added = 0;
        int ret;
        while (start < end) {
                ret = find_first_extent_bit(&info->pinned_extents, start,
                                            &extent_start, &extent_end,
-                                            EXTENT_DIRTY);
+                                            EXTENT_DIRTY|EXTENT_LOCKED);
                if (ret)
                        break;
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
+                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
@@ -178,84 +237,80 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
        if (start < end) {
                size = end - start;
+                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }
-        return 0;
+        return total_added;
 }
-static int remove_sb_from_cache(struct btrfs_root *root,
+static int caching_kthread(void *data)
-                                struct btrfs_block_group_cache *cache)
-{
-        u64 bytenr;
-        u64 *logical;
-        int stripe_len;
-        int i, nr, ret;
-        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-                bytenr = btrfs_sb_offset(i);
-                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
-                                       cache->key.objectid, bytenr, 0,
-                                       &logical, &nr, &stripe_len);
-                BUG_ON(ret);
-                while (nr--) {
-                        btrfs_remove_free_space(cache, logical[nr],
-                                                stripe_len);
-                }
-                kfree(logical);
-        }
-        return 0;
-}
-static int cache_block_group(struct btrfs_root *root,
-                             struct btrfs_block_group_cache *block_group)
 {
+        struct btrfs_block_group_cache *block_group = data;
+        struct btrfs_fs_info *fs_info = block_group->fs_info;
+        u64 last = 0;
        struct btrfs_path *path;
        int ret = 0;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        int slot;
-        u64 last;
+        u64 total_found = 0;
-        if (!block_group)
-                return 0;
-        root = root->fs_info->extent_root;
+        BUG_ON(!fs_info);
-        if (block_group->cached)
-                return 0;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        path->reada = 2;
+        atomic_inc(&block_group->space_info->caching_threads);
+        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+again:
+        /* need to make sure the commit_root doesn't disappear */
+        down_read(&fs_info->extent_commit_sem);
        /*
-         * we get into deadlocks with paths held by callers of this function.
+         * We don't want to deadlock with somebody trying to allocate a new
-         * since the alloc_mutex is protecting things right now, just
+         * extent for the extent root while also trying to search the extent
-         * skip the locking here
+         * root to add free space.  So we skip locking and search the commit
+         * root, since it's read-only
         */
        path->skip_locking = 1;
-        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+        path->search_commit_root = 1;
+        path->reada = 2;
        key.objectid = last;
        key.offset = 0;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
        while (1) {
+                smp_mb();
+                if (block_group->fs_info->closing > 1) {
+                        last = (u64)-1;
+                        break;
+                }
                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
-                        ret = btrfs_next_leaf(root, path);
+                        ret = btrfs_next_leaf(fs_info->extent_root, path);
                        if (ret < 0)
                                goto err;
-                        if (ret == 0)
+                        else if (ret)
-                                continue;
-                        else
                                break;
+                        if (need_resched() ||
+                            btrfs_transaction_in_commit(fs_info)) {
+                                btrfs_release_path(fs_info->extent_root, path);
+                                up_read(&fs_info->extent_commit_sem);
+                                schedule_timeout(1);
+                                goto again;
+                        }
+                        continue;
                }
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid < block_group->key.objectid)
@@ -266,24 +321,59 @@ static int cache_block_group(struct btrfs_root *root,
                        break;
                if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-                        add_new_free_space(block_group, root->fs_info, last,
+                        total_found += add_new_free_space(block_group,
-                                           key.objectid);
+                                                          fs_info, last,
+                                                          key.objectid);
                        last = key.objectid + key.offset;
                }
+                if (total_found > (1024 * 1024 * 2)) {
+                        total_found = 0;
+                        wake_up(&block_group->caching_q);
+                }
 next:
                path->slots[0]++;
        }
+        ret = 0;
-        add_new_free_space(block_group, root->fs_info, last,
+        total_found += add_new_free_space(block_group, fs_info, last,
-                           block_group->key.objectid +
+                                          block_group->key.objectid +
-                           block_group->key.offset);
+                                          block_group->key.offset);
+        spin_lock(&block_group->lock);
+        block_group->cached = BTRFS_CACHE_FINISHED;
+        spin_unlock(&block_group->lock);
-        block_group->cached = 1;
-        remove_sb_from_cache(root, block_group);
-        ret = 0;
 err:
        btrfs_free_path(path);
+        up_read(&fs_info->extent_commit_sem);
+        atomic_dec(&block_group->space_info->caching_threads);
+        wake_up(&block_group->caching_q);
+        return 0;
+}
+/*
+ * Start background free space caching for @cache.
+ *
+ * Under cache->lock the state is moved from BTRFS_CACHE_NO to
+ * BTRFS_CACHE_STARTED; if it was already in any other state the call is a
+ * no-op.  A kthread running caching_kthread() is then spawned with @cache
+ * as its argument.
+ *
+ * Returns 0 when the thread was started or caching was already under way.
+ * Failure to create the thread is logged and hits BUG().
+ */
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+        struct task_struct *tsk;
+        int ret = 0;
+        spin_lock(&cache->lock);
+        if (cache->cached != BTRFS_CACHE_NO) {
+                spin_unlock(&cache->lock);
+                return ret;
+        }
+        cache->cached = BTRFS_CACHE_STARTED;
+        spin_unlock(&cache->lock);
+        /*
+         * The format string becomes the thread's name, so it must not end
+         * in a newline.  objectid is a u64; cast it for %llu.
+         */
+        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
+                          (unsigned long long)cache->key.objectid);
+        if (IS_ERR(tsk)) {
+                ret = PTR_ERR(tsk);
+                printk(KERN_ERR "error running thread %d\n", ret);
+                BUG();
+        }
         return ret;
 }
@@ -2387,13 +2477,29 @@ fail:
 }
+/*
+ * Step from @cache to the block group that follows it in the rbtree that
+ * links block groups through ->cache_node.
+ *
+ * The reference held on @cache is dropped with btrfs_put_block_group().
+ * If a following node exists, its ->count is incremented and it is
+ * returned; otherwise NULL is returned.  The whole step runs under
+ * root->fs_info->block_group_cache_lock.
+ */
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+                 struct btrfs_block_group_cache *cache)
+{
+        struct rb_node *node;
+        spin_lock(&root->fs_info->block_group_cache_lock);
+        /* find the successor before the reference on @cache is dropped */
+        node = rb_next(&cache->cache_node);
+        btrfs_put_block_group(cache);
+        if (node) {
+                cache = rb_entry(node, struct btrfs_block_group_cache,
+                                 cache_node);
+                atomic_inc(&cache->count);
+        } else
+                cache = NULL;
+        spin_unlock(&root->fs_info->block_group_cache_lock);
+        return cache;
+}
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
-        struct btrfs_block_group_cache *cache, *entry;
+        struct btrfs_block_group_cache *cache;
-        struct rb_node *n;
        int err = 0;
-        int werr = 0;
        struct btrfs_path *path;
        u64 last = 0;
@@ -2402,39 +2508,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        while (1) {
-                cache = NULL;
+                if (last == 0) {
-                spin_lock(&root->fs_info->block_group_cache_lock);
+                        err = btrfs_run_delayed_refs(trans, root,
-                for (n = rb_first(&root->fs_info->block_group_cache_tree);
+                                                     (unsigned long)-1);
-                     n; n = rb_next(n)) {
+                        BUG_ON(err);
-                        entry = rb_entry(n, struct btrfs_block_group_cache,
-                                         cache_node);
-                        if (entry->dirty) {
-                                cache = entry;
-                                break;
-                        }
                }
-                spin_unlock(&root->fs_info->block_group_cache_lock);
-                if (!cache)
+                cache = btrfs_lookup_first_block_group(root->fs_info, last);
-                        break;
+                while (cache) {
+                        if (cache->dirty)
+                                break;
+                        cache = next_block_group(root, cache);
+                }
+                if (!cache) {
+                        if (last == 0)
+                                break;
+                        last = 0;
+                        continue;
+                }
                cache->dirty = 0;
-                last += cache->key.offset;
+                last = cache->key.objectid + cache->key.offset;
-                err = write_one_cache_group(trans, root,
+                err = write_one_cache_group(trans, root, path, cache);
-                                            path, cache);
+                BUG_ON(err);
-                /*
+                btrfs_put_block_group(cache);
-                 * if we fail to write the cache group, we want
-                 * to keep it marked dirty in hopes that a later
-                 * write will work
-                 */
-                if (err) {
-                        werr = err;
-                        continue;
-                }
        }
        btrfs_free_path(path);
-        return werr;
+        return 0;
 }
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2484,6 +2586,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->force_alloc = 0;
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
+        atomic_set(&found->caching_threads, 0);
        return 0;
 }
@@ -2947,13 +3050,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;
-        if (pin) {
+        if (pin)
                set_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
-        } else {
-                clear_extent_dirty(&fs_info->pinned_extents,
-                                bytenr, bytenr + num - 1, GFP_NOFS);
-        }
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2969,14 +3068,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                        spin_unlock(&cache->space_info->lock);
                        fs_info->total_pinned += len;
                } else {
+                        int unpin = 0;
+                        /*
+                         * in order to not race with the block group caching, we
+                         * only want to unpin the extent if we are cached.  If
+                         * we aren't cached, we want to start async caching this
+                         * block group so we can free the extent the next time
+                         * around.
+                         */
                        spin_lock(&cache->space_info->lock);
                        spin_lock(&cache->lock);
-                        cache->pinned -= len;
+                        unpin = (cache->cached == BTRFS_CACHE_FINISHED);
-                        cache->space_info->bytes_pinned -= len;
+                        if (likely(unpin)) {
+                                cache->pinned -= len;
+                                cache->space_info->bytes_pinned -= len;
+                                fs_info->total_pinned -= len;
+                        }
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
-                        fs_info->total_pinned -= len;
-                        if (cache->cached)
+                        if (likely(unpin))
+                                clear_extent_dirty(&fs_info->pinned_extents,
+                                                   bytenr, bytenr + len -1,
+                                                   GFP_NOFS);
+                        else
+                                cache_block_group(cache);
+                        if (unpin)
                                btrfs_add_free_space(cache, bytenr, len);
                }
                btrfs_put_block_group(cache);
@@ -3030,6 +3149,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
                                            &start, &end, EXTENT_DIRTY);
                if (ret)
                        break;
                set_extent_dirty(copy, start, end, GFP_NOFS);
                last = end + 1;
        }
@@ -3058,6 +3178,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                cond_resched();
        }
        return ret;
 }
@@ -3436,6 +3557,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
 }
 /*
+ * when we wait for progress in the block group caching, it's because
+ * our allocation attempt failed at least once.  So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes.  Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+                                u64 num_bytes)
+{
+        DEFINE_WAIT(wait);
+        prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); /* get on caching_q before testing the done condition */
+        if (block_group_cache_done(cache)) { /* caching already complete: nothing to wait for */
+                finish_wait(&cache->caching_q, &wait);
+                return 0;
+        }
+        schedule(); /* the unconditional first sleep; ends when caching_q is woken */
+        finish_wait(&cache->caching_q, &wait);
+        wait_event(cache->caching_q, block_group_cache_done(cache) || /* then wait until caching is done ... */
+                   (cache->free_space >= num_bytes)); /* ... or free_space reaches num_bytes */
+        return 0; /* every path returns 0 */
+}
+enum btrfs_loop_type { /* search passes of find_free_extent(), stepped through with loop++ */
+        LOOP_CACHED_ONLY = 0, /* only search fully cached block groups */
+        LOOP_CACHING_NOWAIT = 1, /* also search partially cached groups, without waiting on them */
+        LOOP_CACHING_WAIT = 2, /* search everything, waiting if the group is still caching */
+        LOOP_ALLOC_CHUNK = 3, /* force a chunk allocation and try again */
+        LOOP_NO_EMPTY_SIZE = 4, /* try again with empty_size and empty_cluster set to 0 */
+};
+/*
 * walks the btree of allocated extents and find a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == block start
@@ -3460,6 +3620,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_space_info *space_info;
        int last_ptr_loop = 0;
        int loop = 0;
+        bool found_uncached_bg = false;
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3491,15 +3652,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
-        if (!last_ptr) {
+        if (!last_ptr)
                empty_cluster = 0;
-                loop = 1;
-        }
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
-                if (block_group && block_group_bits(block_group, data)) {
+                /*
+                 * we don't want to use the block group if it doesn't match our
+                 * allocation bits, or if it's not cached.
+                 */
+                if (block_group && block_group_bits(block_group, data) &&
+                    block_group_cache_done(block_group)) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
                            block_group->ro) {
@@ -3522,21 +3686,35 @@ search:
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups, list) {
                u64 offset;
+                int cached;
                atomic_inc(&block_group->count);
                search_start = block_group->key.objectid;
 have_block_group:
-                if (unlikely(!block_group->cached)) {
+                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
-                        mutex_lock(&block_group->cache_mutex);
+                        /*
-                        ret = cache_block_group(root, block_group);
+                         * we want to start caching kthreads, but not too many
-                        mutex_unlock(&block_group->cache_mutex);
+                         * right off the bat so we don't overwhelm the system,
-                        if (ret) {
+                         * so only start them if there are less than 2 and we're
-                                btrfs_put_block_group(block_group);
+                         * in the initial allocation phase.
-                                break;
+                         */
+                        if (loop > LOOP_CACHING_NOWAIT ||
+                            atomic_read(&space_info->caching_threads) < 2) {
+                                ret = cache_block_group(block_group);
+                                BUG_ON(ret);
                        }
                }
+                cached = block_group_cache_done(block_group);
+                if (unlikely(!cached)) {
+                        found_uncached_bg = true;
+                        /* if we only want cached bgs, loop */
+                        if (loop == LOOP_CACHED_ONLY)
+                                goto loop;
+                }
                if (unlikely(block_group->ro))
                        goto loop;
@@ -3615,14 +3793,21 @@ refill_cluster:
                                        spin_unlock(&last_ptr->refill_lock);
                                        goto checks;
                                }
+                        } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+                                spin_unlock(&last_ptr->refill_lock);
+                                wait_block_group_cache_progress(block_group,
+                                       num_bytes + empty_cluster + empty_size);
+                                goto have_block_group;
                        }
                        /*
                         * at this point we either didn't find a cluster
                         * or we weren't able to allocate a block from our
                         * cluster.  Free the cluster we've been trying
                         * to use, and go to the next block group
                         */
-                        if (loop < 2) {
+                        if (loop < LOOP_NO_EMPTY_SIZE) {
                                btrfs_return_cluster_to_free_space(NULL,
                                                                   last_ptr);
                                spin_unlock(&last_ptr->refill_lock);
@@ -3633,11 +3818,17 @@ refill_cluster:
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
-                if (!offset)
+                if (!offset && (cached || (!cached &&
+                                           loop == LOOP_CACHING_NOWAIT))) {
                        goto loop;
+                } else if (!offset && (!cached &&
+                                       loop > LOOP_CACHING_NOWAIT)) {
+                        wait_block_group_cache_progress(block_group,
+                                        num_bytes + empty_size);
+                        goto have_block_group;
+                }
 checks:
                search_start = stripe_align(root, offset);
                /* move on to the next group */
                if (search_start + num_bytes >= search_end) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3683,13 +3874,26 @@ loop:
        }
        up_read(&space_info->groups_sem);
-        /* loop == 0, try to find a clustered alloc in every block group
+        /* LOOP_CACHED_ONLY, only search fully cached block groups
-         * loop == 1, try again after forcing a chunk allocation
+         * LOOP_CACHING_NOWAIT, search partially cached block groups, but
-         * loop == 2, set empty_size and empty_cluster to 0 and try again
+         *                      don't wait for them to finish caching
+         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+         *                      again
         */
-        if (!ins->objectid && loop < 3 &&
+        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
-            (empty_size || empty_cluster || allowed_chunk_alloc)) {
+            (found_uncached_bg || empty_size || empty_cluster ||
-                if (loop >= 2) {
+             allowed_chunk_alloc)) {
+                if (found_uncached_bg) {
+                        found_uncached_bg = false;
+                        if (loop < LOOP_CACHING_WAIT) {
+                                loop++;
+                                goto search;
+                        }
+                }
+                if (loop == LOOP_ALLOC_CHUNK) {
                        empty_size = 0;
                        empty_cluster = 0;
                }
@@ -3702,7 +3906,7 @@ loop:
                        space_info->force_alloc = 1;
                }
-                if (loop < 3) {
+                if (loop < LOOP_NO_EMPTY_SIZE) {
                        loop++;
                        goto search;
                }
@@ -3798,7 +4002,7 @@ again:
                               num_bytes, data, 1);
                goto again;
        }
-        if (ret) {
+        if (ret == -ENOSPC) {
                struct btrfs_space_info *sinfo;
                sinfo = __find_space_info(root->fs_info, data);
@@ -3806,7 +4010,6 @@ again:
                       "wanted %llu\n", (unsigned long long)data,
                       (unsigned long long)num_bytes);
                dump_space_info(sinfo, num_bytes);
-                BUG();
        }
        return ret;
@@ -3844,7 +4047,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
        ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
                                     empty_size, hint_byte, search_end, ins,
                                     data);
-        update_reserved_extents(root, ins->objectid, ins->offset, 1);
+        if (!ret)
+                update_reserved_extents(root, ins->objectid, ins->offset, 1);
        return ret;
 }
@@ -4006,9 +4211,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *block_group;
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-        mutex_lock(&block_group->cache_mutex);
+        cache_block_group(block_group);
-        cache_block_group(root, block_group);
+        wait_event(block_group->caching_q,
-        mutex_unlock(&block_group->cache_mutex);
+                   block_group_cache_done(block_group));
        ret = btrfs_remove_free_space(block_group, ins->objectid,
                                      ins->offset);
@@ -4039,7 +4244,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
        ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
                                     empty_size, hint_byte, search_end,
                                     ins, 0);
-        BUG_ON(ret);
+        if (ret)
+                return ret;
        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent == 0)
@@ -6955,11 +7161,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                         &info->block_group_cache_tree);
                spin_unlock(&info->block_group_cache_lock);
-                btrfs_remove_free_space_cache(block_group);
                down_write(&block_group->space_info->groups_sem);
                list_del(&block_group->list);
                up_write(&block_group->space_info->groups_sem);
+                if (block_group->cached == BTRFS_CACHE_STARTED)
+                        wait_event(block_group->caching_q,
+                                   block_group_cache_done(block_group));
+                btrfs_remove_free_space_cache(block_group);
                WARN_ON(atomic_read(&block_group->count) != 1);
                kfree(block_group);
@@ -7025,9 +7236,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                atomic_set(&cache->count, 1);
                spin_lock_init(&cache->lock);
                spin_lock_init(&cache->tree_lock);
-                mutex_init(&cache->cache_mutex);
+                cache->fs_info = info;
+                init_waitqueue_head(&cache->caching_q);
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
+                /*
+                 * we only want to have 32k of ram per block group for keeping
+                 * track of free space, and if we pass 1/2 of that we want to
+                 * start converting things over to using bitmaps
+                 */
+                cache->extents_thresh = ((1024 * 32) / 2) /
+                        sizeof(struct btrfs_free_space);
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
                                   sizeof(cache->item));
@@ -7036,6 +7257,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                key.objectid = found_key.objectid + found_key.offset;
                btrfs_release_path(root, path);
                cache->flags = btrfs_block_group_flags(&cache->item);
+                cache->sectorsize = root->sectorsize;
+                remove_sb_from_cache(root, cache);
+                /*
+                 * check for two cases, either we are full, and therefore
+                 * don't need to bother with the caching work since we won't
+                 * find any space, or we are empty, and we can just add all
+                 * the space in and be done with it.  This saves us _a lot_ of
+                 * time, particularly in the full case.
+                 */
+                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+                        cache->cached = BTRFS_CACHE_FINISHED;
+                } else if (btrfs_block_group_used(&cache->item) == 0) {
+                        cache->cached = BTRFS_CACHE_FINISHED;
+                        add_new_free_space(cache, root->fs_info,
+                                           found_key.objectid,
+                                           found_key.objectid +
+                                           found_key.offset);
+                }
                ret = update_space_info(info, cache->flags, found_key.offset,
                                        btrfs_block_group_used(&cache->item),
@@ -7079,10 +7320,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.objectid = chunk_offset;
        cache->key.offset = size;
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+        cache->sectorsize = root->sectorsize;
+        /*
+         * we only want to have 32k of ram per block group for keeping track
+         * of free space, and if we pass 1/2 of that we want to start
+         * converting things over to using bitmaps
+         */
+        cache->extents_thresh = ((1024 * 32) / 2) /
+                sizeof(struct btrfs_free_space);
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        spin_lock_init(&cache->tree_lock);
-        mutex_init(&cache->cache_mutex);
+        init_waitqueue_head(&cache->caching_q);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
@@ -7091,6 +7341,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->flags = type;
        btrfs_set_block_group_flags(&cache->item, type);
+        cache->cached = BTRFS_CACHE_FINISHED;
+        remove_sb_from_cache(root, cache);
+        add_new_free_space(cache, root->fs_info, chunk_offset,
+                           chunk_offset + size);
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        BUG_ON(ret);
@@ -7149,7 +7405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
        spin_unlock(&root->fs_info->block_group_cache_lock);
-        btrfs_remove_free_space_cache(block_group);
        down_write(&block_group->space_info->groups_sem);
        /*
         * we must use list_del_init so people can check to see if they
@@ -7158,11 +7414,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        list_del_init(&block_group->list);
        up_write(&block_group->space_info->groups_sem);
+        if (block_group->cached == BTRFS_CACHE_STARTED)
+                wait_event(block_group->caching_q,
+                           block_group_cache_done(block_group));
+        btrfs_remove_free_space_cache(block_group);
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        spin_unlock(&block_group->space_info->lock);
-        block_group->space_info->full = 0;
+        btrfs_clear_space_info_full(root->fs_info);
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..af99b78b288e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
 * Boston, MA 021110-1307, USA.
 */
+#include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/math64.h>
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
-struct btrfs_free_space {
+#define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
-        struct rb_node bytes_index;
+#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
-        struct rb_node offset_index;
-        u64 offset;
-        u64 bytes;
-};
-static int tree_insert_offset(struct rb_root *root, u64 offset,
+static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, /* bit index of offset within the bitmap starting at bitmap_start */
-                              struct rb_node *node)
+                                          u64 offset)
 {
-        struct rb_node **p = &root->rb_node;
+        BUG_ON(offset < bitmap_start); /* offset must not lie before the bitmap's start */
-        struct rb_node *parent = NULL;
+        offset -= bitmap_start; /* make offset relative to the bitmap's start */
-        struct btrfs_free_space *info;
+        return (unsigned long)(div64_u64(offset, sectorsize)); /* one bit per sectorsize bytes */
+}
-        while (*p) {
+static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) /* convert a byte count to a count of sectorsize units */
-                parent = *p;
+{
-                info = rb_entry(parent, struct btrfs_free_space, offset_index);
+        return (unsigned long)(div64_u64(bytes, sectorsize)); /* bytes / sectorsize, narrowed to unsigned long */
+}
-                if (offset < info->offset)
+static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, /* start offset of the bitmap-sized window containing offset */
-                        p = &(*p)->rb_left;
+                                   u64 offset)
-                else if (offset > info->offset)
+{
-                        p = &(*p)->rb_right;
+        u64 bitmap_start;
-                else
+        u64 bytes_per_bitmap;
-                        return -EEXIST;
-        }
-        rb_link_node(node, parent, p);
+        bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; /* bytes covered by one bitmap: one bit per sectorsize bytes */
-        rb_insert_color(node, root);
+        bitmap_start = offset - block_group->key.objectid; /* make offset relative to key.objectid */
+        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); /* index of the window holding offset */
+        bitmap_start *= bytes_per_bitmap; /* i.e. rounded down to a multiple of bytes_per_bitmap */
+        bitmap_start += block_group->key.objectid; /* add key.objectid back to undo the subtraction above */
-        return 0;
+        return bitmap_start;
 }
-static int tree_insert_bytes(struct rb_root *root, u64 bytes,
+static int tree_insert_offset(struct rb_root *root, u64 offset,
-                             struct rb_node *node)
+                              struct rb_node *node, int bitmap)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
        while (*p) {
                parent = *p;
-                info = rb_entry(parent, struct btrfs_free_space, bytes_index);
+                info = rb_entry(parent, struct btrfs_free_space, offset_index);
-                if (bytes < info->bytes)
+                if (offset < info->offset) {
                        p = &(*p)->rb_left;
-                else
+                } else if (offset > info->offset) {
                        p = &(*p)->rb_right;
+                } else {
+                        /*
+                         * we could have a bitmap entry and an extent entry
+                         * share the same offset.  If this is the case, we want
+                         * the extent entry to always be found first if we do a
+                         * linear search through the tree, since we want to have
+                         * the quickest allocation time, and allocating from an
+                         * extent is faster than allocating from a bitmap.  So
+                         * if we're inserting a bitmap and we find an entry at
+                         * this offset, we want to go right, or after this entry
+                         * logically.  If we are inserting an extent and we've
+                         * found a bitmap, we want to go left, or before
+                         * logically.
+                         */
+                        if (bitmap) {
+                                WARN_ON(info->bitmap);
+                                p = &(*p)->rb_right;
+                        } else {
+                                WARN_ON(!info->bitmap);
+                                p = &(*p)->rb_left;
+                        }
+                }
        }
        rb_link_node(node, parent, p);
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
 /*
 * searches the tree for the given offset.
 *
- * fuzzy == 1: this is used for allocations where we are given a hint of where
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
- * to look for free space.  Because the hint may not be completely on an offset
+ * want a section that has at least bytes size and comes at or after the given
- * mark, or the hint may no longer point to free space we need to fudge our
+ * offset.
- * results a bit.  So we look for free space starting at or after offset with at
- * least bytes size.  We prefer to find as close to the given offset as we can.
- * Also if the offset is within a free space range, then we will return the free
- * space that contains the given offset, which means we can return a free space
- * chunk with an offset before the provided offset.
- *
- * fuzzy == 0: this is just a normal tree search.  Give us the free space that
- * starts at the given offset which is at least bytes size, and if its not there
- * return NULL.
 */
-static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
+static struct btrfs_free_space *
-                                                   u64 offset, u64 bytes,
+tree_search_offset(struct btrfs_block_group_cache *block_group,
-                                                   int fuzzy)
+                   u64 offset, int bitmap_only, int fuzzy)
 {
-        struct rb_node *n = root->rb_node;
+        struct rb_node *n = block_group->free_space_offset.rb_node;
-        struct btrfs_free_space *entry, *ret = NULL;
+        struct btrfs_free_space *entry, *prev = NULL;
+        /* find entry that is closest to the 'offset' */
+        while (1) {
+                if (!n) {
+                        entry = NULL;
+                        break;
+                }
-        while (n) {
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
+                prev = entry;
-                if (offset < entry->offset) {
+                if (offset < entry->offset)
-                        if (fuzzy &&
-                            (!ret || entry->offset < ret->offset) &&
-                            (bytes <= entry->bytes))
-                                ret = entry;
                        n = n->rb_left;
-                } else if (offset > entry->offset) {
+                else if (offset > entry->offset)
-                        if (fuzzy &&
-                            (entry->offset + entry->bytes - 1) >= offset &&
-                            bytes <= entry->bytes) {
-                                ret = entry;
-                                break;
-                        }
                        n = n->rb_right;
-                } else {
+                else
-                        if (bytes > entry->bytes) {
-                                n = n->rb_right;
-                                continue;
-                        }
-                        ret = entry;
                        break;
-                }
        }
-        return ret;
+        if (bitmap_only) {
-}
+                if (!entry)
+                        return NULL;
-/*
+                if (entry->bitmap)
- * return a chunk at least bytes size, as close to offset that we can get.
+                        return entry;
- */
-static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
-                                                  u64 offset, u64 bytes)
-{
-        struct rb_node *n = root->rb_node;
-        struct btrfs_free_space *entry, *ret = NULL;
-        while (n) {
+                /*
-                entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+                 * bitmap entry and extent entry may share same offset,
+                 * in that case, bitmap entry comes after extent entry.
+                 */
+                n = rb_next(n);
+                if (!n)
+                        return NULL;
+                entry = rb_entry(n, struct btrfs_free_space, offset_index);
+                if (entry->offset != offset)
+                        return NULL;
-                if (bytes < entry->bytes) {
+                WARN_ON(!entry->bitmap);
+                return entry;
+        } else if (entry) {
+                if (entry->bitmap) {
                        /*
-                         * We prefer to get a hole size as close to the size we
+                         * if previous extent entry covers the offset,
-                         * are asking for so we don't take small slivers out of
+                         * we should return it instead of the bitmap entry
-                         * huge holes, but we also want to get as close to the
-                         * offset as possible so we don't have a whole lot of
-                         * fragmentation.
                         */
-                        if (offset <= entry->offset) {
+                        n = &entry->offset_index;
-                                if (!ret)
+                        while (1) {
-                                        ret = entry;
+                                n = rb_prev(n);
-                                else if (entry->bytes < ret->bytes)
+                                if (!n)
-                                        ret = entry;
+                                        break;
-                                else if (entry->offset < ret->offset)
+                                prev = rb_entry(n, struct btrfs_free_space,
-                                        ret = entry;
+                                                offset_index);
+                                if (!prev->bitmap) {
+                                        if (prev->offset + prev->bytes > offset)
+                                                entry = prev;
+                                        break;
+                                }
                        }
-                        n = n->rb_left;
+                }
-                } else if (bytes > entry->bytes) {
+                return entry;
-                        n = n->rb_right;
+        }
+        if (!prev)
+                return NULL;
+        /* find last entry before the 'offset' */
+        entry = prev;
+        if (entry->offset > offset) {
+                n = rb_prev(&entry->offset_index);
+                if (n) {
+                        entry = rb_entry(n, struct btrfs_free_space,
+                                        offset_index);
+                        BUG_ON(entry->offset > offset);
                } else {
-                        /*
+                        if (fuzzy)
-                         * Ok we may have multiple chunks of the wanted size,
+                                return entry;
-                         * so we don't want to take the first one we find, we
+                        else
-                         * want to take the one closest to our given offset, so
+                                return NULL;
-                         * keep searching just in case theres a better match.
-                         */
-                        n = n->rb_right;
-                        if (offset > entry->offset)
-                                continue;
-                        else if (!ret || entry->offset < ret->offset)
-                                ret = entry;
                }
        }
-        return ret;
+        if (entry->bitmap) {
+                n = &entry->offset_index;
+                while (1) {
+                        n = rb_prev(n);
+                        if (!n)
+                                break;
+                        prev = rb_entry(n, struct btrfs_free_space,
+                                        offset_index);
+                        if (!prev->bitmap) {
+                                if (prev->offset + prev->bytes > offset)
+                                        return prev;
+                                break;
+                        }
+                }
+                if (entry->offset + BITS_PER_BITMAP *
+                    block_group->sectorsize > offset)
+                        return entry;
+        } else if (entry->offset + entry->bytes > offset)
+                return entry;
+        if (!fuzzy)
+                return NULL;
+        while (1) {
+                if (entry->bitmap) {
+                        if (entry->offset + BITS_PER_BITMAP *
+                            block_group->sectorsize > offset)
+                                break;
+                } else {
+                        if (entry->offset + entry->bytes > offset)
+                                break;
+                }
+                n = rb_next(&entry->offset_index);
+                if (!n)
+                        return NULL;
+                entry = rb_entry(n, struct btrfs_free_space, offset_index);
+        }
+        return entry;
 }
 static void unlink_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_free_space *info)
 {
        rb_erase(&info->offset_index, &block_group->free_space_offset);
-        rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+        block_group->free_extents--;
+        block_group->free_space -= info->bytes;
 }
 static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,314 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 {
        int ret = 0;
+        BUG_ON(!info->bitmap && !info->bytes);
-        BUG_ON(!info->bytes);
        ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
-                                 &info->offset_index);
+                                 &info->offset_index, (info->bitmap != NULL));
        if (ret)
                return ret;
-        ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
+        block_group->free_space += info->bytes;
-                                &info->bytes_index);
+        block_group->free_extents++;
-        if (ret)
+        return ret;
-                return ret;
+}
+static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+{
+        u64 max_bytes, possible_bytes;
+        /*
+         * The goal is to keep the total amount of memory used per 1gb of space
+         * at or below 32k, so we need to adjust how much memory we allow to be
+         * used by extent based free space tracking
+         */
+        max_bytes = MAX_CACHE_BYTES_PER_GIG *
+                (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+        possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
+                (sizeof(struct btrfs_free_space) *
+                 block_group->extents_thresh);
+        if (possible_bytes > max_bytes) {
+                int extent_bytes = max_bytes -
+                        (block_group->total_bitmaps * PAGE_CACHE_SIZE);
+                if (extent_bytes <= 0) {
+                        block_group->extents_thresh = 0;
+                        return;
+                }
+                block_group->extents_thresh = extent_bytes /
+                        (sizeof(struct btrfs_free_space));
+        }
+}
+static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_space *info, u64 offset,
+                              u64 bytes)
+{
+        unsigned long start, end;
+        unsigned long i;
+        start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+        end = start + bytes_to_bits(bytes, block_group->sectorsize);
+        BUG_ON(end > BITS_PER_BITMAP);
+        for (i = start; i < end; i++)
+                clear_bit(i, info->bitmap);
+        info->bytes -= bytes;
+        block_group->free_space -= bytes;
+}
+static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+                            struct btrfs_free_space *info, u64 offset,
+                            u64 bytes)
+{
+        unsigned long start, end;
+        unsigned long i;
+        start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+        end = start + bytes_to_bits(bytes, block_group->sectorsize);
+        BUG_ON(end > BITS_PER_BITMAP);
+        for (i = start; i < end; i++)
+                set_bit(i, info->bitmap);
+        info->bytes += bytes;
+        block_group->free_space += bytes;
+}
+static int search_bitmap(struct btrfs_block_group_cache *block_group,
+                         struct btrfs_free_space *bitmap_info, u64 *offset,
+                         u64 *bytes)
+{
+        unsigned long found_bits = 0;
+        unsigned long bits, i;
+        unsigned long next_zero;
+        i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+                          max_t(u64, *offset, bitmap_info->offset));
+        bits = bytes_to_bits(*bytes, block_group->sectorsize);
+        for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
+             i < BITS_PER_BITMAP;
+             i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+                next_zero = find_next_zero_bit(bitmap_info->bitmap,
+                                               BITS_PER_BITMAP, i);
+                if ((next_zero - i) >= bits) {
+                        found_bits = next_zero - i;
+                        break;
+                }
+                i = next_zero;
+        }
+        if (found_bits) {
+                *offset = (u64)(i * block_group->sectorsize) +
+                        bitmap_info->offset;
+                *bytes = (u64)(found_bits) * block_group->sectorsize;
+                return 0;
+        }
+        return -1;
+}
+static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
+                                                *block_group, u64 *offset,
+                                                u64 *bytes, int debug)
+{
+        struct btrfs_free_space *entry;
+        struct rb_node *node;
+        int ret;
+        if (!block_group->free_space_offset.rb_node)
+                return NULL;
+        entry = tree_search_offset(block_group,
+                                   offset_to_bitmap(block_group, *offset),
+                                   0, 1);
+        if (!entry)
+                return NULL;
+        for (node = &entry->offset_index; node; node = rb_next(node)) {
+                entry = rb_entry(node, struct btrfs_free_space, offset_index);
+                if (entry->bytes < *bytes)
+                        continue;
+                if (entry->bitmap) {
+                        ret = search_bitmap(block_group, entry, offset, bytes);
+                        if (!ret)
+                                return entry;
+                        continue;
+                }
+                *offset = entry->offset;
+                *bytes = entry->bytes;
+                return entry;
+        }
+        return NULL;
+}
+static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+                           struct btrfs_free_space *info, u64 offset)
+{
+        u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+        int max_bitmaps = (int)div64_u64(block_group->key.offset +
+                                         bytes_per_bg - 1, bytes_per_bg);
+        BUG_ON(block_group->total_bitmaps >= max_bitmaps);
+        info->offset = offset_to_bitmap(block_group, offset);
+        link_free_space(block_group, info);
+        block_group->total_bitmaps++;
+        recalculate_thresholds(block_group);
+}
+static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_space *bitmap_info,
+                              u64 *offset, u64 *bytes)
+{
+        u64 end;
+again:
+        end = bitmap_info->offset +
+                (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+        if (*offset > bitmap_info->offset && *offset + *bytes > end) {
+                bitmap_clear_bits(block_group, bitmap_info, *offset,
+                                  end - *offset + 1);
+                *bytes -= end - *offset + 1;
+                *offset = end + 1;
+        } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
+                bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
+                *bytes = 0;
+        }
+        if (*bytes) {
+                if (!bitmap_info->bytes) {
+                        unlink_free_space(block_group, bitmap_info);
+                        kfree(bitmap_info->bitmap);
+                        kfree(bitmap_info);
+                        block_group->total_bitmaps--;
+                        recalculate_thresholds(block_group);
+                }
+                bitmap_info = tree_search_offset(block_group,
+                                                 offset_to_bitmap(block_group,
+                                                                  *offset),
+                                                 1, 0);
+                if (!bitmap_info)
+                        return -EINVAL;
+                if (!bitmap_info->bitmap)
+                        return -EAGAIN;
+                goto again;
+        } else if (!bitmap_info->bytes) {
+                unlink_free_space(block_group, bitmap_info);
+                kfree(bitmap_info->bitmap);
+                kfree(bitmap_info);
+                block_group->total_bitmaps--;
+                recalculate_thresholds(block_group);
+        }
+        return 0;
+}
+static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_space *info)
+{
+        struct btrfs_free_space *bitmap_info;
+        int added = 0;
+        u64 bytes, offset, end;
+        int ret;
+        /*
+         * If we are below the extents threshold then we can add this as an
+         * extent, and don't have to deal with the bitmap
+         */
+        if (block_group->free_extents < block_group->extents_thresh &&
+            info->bytes > block_group->sectorsize * 4)
+                return 0;
+        /*
+         * some block groups are so tiny they can't be enveloped by a bitmap, so
+         * don't even bother to create a bitmap for this
+         */
+        if (BITS_PER_BITMAP * block_group->sectorsize >
+            block_group->key.offset)
+                return 0;
+        bytes = info->bytes;
+        offset = info->offset;
+again:
+        bitmap_info = tree_search_offset(block_group,
+                                         offset_to_bitmap(block_group, offset),
+                                         1, 0);
+        if (!bitmap_info) {
+                BUG_ON(added);
+                goto new_bitmap;
+        }
+        end = bitmap_info->offset +
+                (u64)(BITS_PER_BITMAP * block_group->sectorsize);
+        if (offset >= bitmap_info->offset && offset + bytes > end) {
+                bitmap_set_bits(block_group, bitmap_info, offset,
+                                end - offset);
+                bytes -= end - offset;
+                offset = end;
+                added = 0;
+        } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
+                bitmap_set_bits(block_group, bitmap_info, offset, bytes);
+                bytes = 0;
+        } else {
+                BUG();
+        }
+        if (!bytes) {
+                ret = 1;
+                goto out;
+        } else
+                goto again;
+new_bitmap:
+        if (info && info->bitmap) {
+                add_new_bitmap(block_group, info, offset);
+                added = 1;
+                info = NULL;
+                goto again;
+        } else {
+                spin_unlock(&block_group->tree_lock);
+                /* no pre-allocated info, allocate a new one */
+                if (!info) {
+                        info = kzalloc(sizeof(struct btrfs_free_space),
+                                       GFP_NOFS);
+                        if (!info) {
+                                spin_lock(&block_group->tree_lock);
+                                ret = -ENOMEM;
+                                goto out;
+                        }
+                }
+                /* allocate the bitmap */
+                info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+                spin_lock(&block_group->tree_lock);
+                if (!info->bitmap) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                goto again;
+        }
+out:
+        if (info) {
+                if (info->bitmap)
+                        kfree(info->bitmap);
+                kfree(info);
+        }
        return ret;
 }
@@ -208,8 +561,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 offset, u64 bytes)
 {
-        struct btrfs_free_space *right_info;
+        struct btrfs_free_space *right_info = NULL;
-        struct btrfs_free_space *left_info;
+        struct btrfs_free_space *left_info = NULL;
        struct btrfs_free_space *info = NULL;
        int ret = 0;
@@ -227,18 +580,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
         * are adding, if there is remove that struct and add a new one to
         * cover the entire range
         */
-        right_info = tree_search_offset(&block_group->free_space_offset,
+        right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
-                                        offset+bytes, 0, 0);
+        if (right_info && rb_prev(&right_info->offset_index))
-        left_info = tree_search_offset(&block_group->free_space_offset,
+                left_info = rb_entry(rb_prev(&right_info->offset_index),
-                                       offset-1, 0, 1);
+                                     struct btrfs_free_space, offset_index);
+        else
+                left_info = tree_search_offset(block_group, offset - 1, 0, 0);
-        if (right_info) {
+        /*
+         * If there was no extent directly to the left or right of this new
+         * extent then we know we're going to have to allocate a new extent, so
+         * before we do that see if we need to drop this into a bitmap
+         */
+        if ((!left_info || left_info->bitmap) &&
+            (!right_info || right_info->bitmap)) {
+                ret = insert_into_bitmap(block_group, info);
+                if (ret < 0) {
+                        goto out;
+                } else if (ret) {
+                        ret = 0;
+                        goto out;
+                }
+        }
+        if (right_info && !right_info->bitmap) {
                unlink_free_space(block_group, right_info);
                info->bytes += right_info->bytes;
                kfree(right_info);
        }
-        if (left_info && left_info->offset + left_info->bytes == offset) {
+        if (left_info && !left_info->bitmap &&
+            left_info->offset + left_info->bytes == offset) {
                unlink_free_space(block_group, left_info);
                info->offset = left_info->offset;
                info->bytes += left_info->bytes;
@@ -248,11 +621,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
        ret = link_free_space(block_group, info);
        if (ret)
                kfree(info);
+out:
        spin_unlock(&block_group->tree_lock);
        if (ret) {
-                printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+                printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
                BUG_ON(ret == -EEXIST);
        }
@@ -263,40 +636,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                            u64 offset, u64 bytes)
 {
        struct btrfs_free_space *info;
+        struct btrfs_free_space *next_info = NULL;
        int ret = 0;
        spin_lock(&block_group->tree_lock);
-        info = tree_search_offset(&block_group->free_space_offset, offset, 0,
+again:
-                                  1);
+        info = tree_search_offset(block_group, offset, 0, 0);
-        if (info && info->offset == offset) {
+        if (!info) {
-                if (info->bytes < bytes) {
+                WARN_ON(1);
-                        printk(KERN_ERR "Found free space at %llu, size %llu,"
+                goto out_lock;
-                               "trying to use %llu\n",
+        }
-                               (unsigned long long)info->offset,
-                               (unsigned long long)info->bytes,
+        if (info->bytes < bytes && rb_next(&info->offset_index)) {
-                               (unsigned long long)bytes);
+                u64 end;
+                next_info = rb_entry(rb_next(&info->offset_index),
+                                             struct btrfs_free_space,
+                                             offset_index);
+                if (next_info->bitmap)
+                        end = next_info->offset + BITS_PER_BITMAP *
+                                block_group->sectorsize - 1;
+                else
+                        end = next_info->offset + next_info->bytes;
+                if (next_info->bytes < bytes ||
+                    next_info->offset > offset || offset > end) {
+                        printk(KERN_CRIT "Found free space at %llu, size %llu,"
+                              " trying to use %llu\n",
+                              (unsigned long long)info->offset,
+                              (unsigned long long)info->bytes,
+                              (unsigned long long)bytes);
                        WARN_ON(1);
                        ret = -EINVAL;
-                        spin_unlock(&block_group->tree_lock);
+                        goto out_lock;
-                        goto out;
                }
-                unlink_free_space(block_group, info);
-                if (info->bytes == bytes) {
+                info = next_info;
-                        kfree(info);
+        }
-                        spin_unlock(&block_group->tree_lock);
-                        goto out;
+        if (info->bytes == bytes) {
+                unlink_free_space(block_group, info);
+                if (info->bitmap) {
+                        kfree(info->bitmap);
+                        block_group->total_bitmaps--;
                }
+                kfree(info);
+                goto out_lock;
+        }
+        if (!info->bitmap && info->offset == offset) {
+                unlink_free_space(block_group, info);
                info->offset += bytes;
                info->bytes -= bytes;
+                link_free_space(block_group, info);
+                goto out_lock;
+        }
-                ret = link_free_space(block_group, info);
+        if (!info->bitmap && info->offset <= offset &&
-                spin_unlock(&block_group->tree_lock);
+            info->offset + info->bytes >= offset + bytes) {
-                BUG_ON(ret);
-        } else if (info && info->offset < offset &&
-                   info->offset + info->bytes >= offset + bytes) {
                u64 old_start = info->offset;
                /*
                 * we're freeing space in the middle of the info,
@@ -312,7 +710,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                        info->offset = offset + bytes;
                        info->bytes = old_end - info->offset;
                        ret = link_free_space(block_group, info);
-                        BUG_ON(ret);
+                        WARN_ON(ret);
+                        if (ret)
+                                goto out_lock;
                } else {
                        /* the hole we're creating ends at the end
                         * of the info struct, just free the info
@@ -320,32 +720,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                        kfree(info);
                }
                spin_unlock(&block_group->tree_lock);
-                /* step two, insert a new info struct to cover anything
-                 * before the hole
+                /* step two, insert a new info struct to cover
+                 * anything before the hole
                 */
                ret = btrfs_add_free_space(block_group, old_start,
                                           offset - old_start);
-                BUG_ON(ret);
+                WARN_ON(ret);
-        } else {
+                goto out;
-                spin_unlock(&block_group->tree_lock);
-                if (!info) {
-                        printk(KERN_ERR "couldn't find space %llu to free\n",
-                               (unsigned long long)offset);
-                        printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
-                               block_group->cached,
-                               (unsigned long long)block_group->key.objectid,
-                               (unsigned long long)block_group->key.offset);
-                        btrfs_dump_free_space(block_group, bytes);
-                } else if (info) {
-                        printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
-                               "but wanted offset=%llu bytes=%llu\n",
-                               (unsigned long long)info->offset,
-                               (unsigned long long)info->bytes,
-                               (unsigned long long)offset,
-                               (unsigned long long)bytes);
-                }
-                WARN_ON(1);
        }
+        ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+        if (ret == -EAGAIN)
+                goto again;
+        BUG_ON(ret);
+out_lock:
+        spin_unlock(&block_group->tree_lock);
 out:
        return ret;
 }
@@ -361,10 +751,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                if (info->bytes >= bytes)
                        count++;
-                printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+                printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
                       (unsigned long long)info->offset,
-                       (unsigned long long)info->bytes);
+                       (unsigned long long)info->bytes,
+                       (info->bitmap) ? "yes" : "no");
        }
+        printk(KERN_INFO "block group has cluster?: %s\n",
+               list_empty(&block_group->cluster_list) ? "no" : "yes");
        printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
               "\n", count);
 }
@@ -397,26 +790,35 @@ __btrfs_return_cluster_to_free_space(
 {
        struct btrfs_free_space *entry;
        struct rb_node *node;
+        bool bitmap;
        spin_lock(&cluster->lock);
        if (cluster->block_group != block_group)
                goto out;
+        bitmap = cluster->points_to_bitmap;
+        cluster->block_group = NULL;
        cluster->window_start = 0;
+        list_del_init(&cluster->block_group_list);
+        cluster->points_to_bitmap = false;
+        if (bitmap)
+                goto out;
        node = rb_first(&cluster->root);
-        while(node) {
+        while (node) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                node = rb_next(&entry->offset_index);
                rb_erase(&entry->offset_index, &cluster->root);
-                link_free_space(block_group, entry);
+                BUG_ON(entry->bitmap);
+                tree_insert_offset(&block_group->free_space_offset,
+                                   entry->offset, &entry->offset_index, 0);
        }
-        list_del_init(&cluster->block_group_list);
-        btrfs_put_block_group(cluster->block_group);
-        cluster->block_group = NULL;
        cluster->root.rb_node = NULL;
 out:
        spin_unlock(&cluster->lock);
+        btrfs_put_block_group(block_group);
        return 0;
 }
@@ -425,20 +827,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
        struct btrfs_free_space *info;
        struct rb_node *node;
        struct btrfs_free_cluster *cluster;
-        struct btrfs_free_cluster *safe;
+        struct list_head *head;
        spin_lock(&block_group->tree_lock);
+        while ((head = block_group->cluster_list.next) !=
-        list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
+               &block_group->cluster_list) {
-                                 block_group_list) {
+                cluster = list_entry(head, struct btrfs_free_cluster,
+                                     block_group_list);
                WARN_ON(cluster->block_group != block_group);
                __btrfs_return_cluster_to_free_space(block_group, cluster);
+                if (need_resched()) {
+                        spin_unlock(&block_group->tree_lock);
+                        cond_resched();
+                        spin_lock(&block_group->tree_lock);
+                }
        }
-        while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
+        while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
-                info = rb_entry(node, struct btrfs_free_space, bytes_index);
+                info = rb_entry(node, struct btrfs_free_space, offset_index);
                unlink_free_space(block_group, info);
+                if (info->bitmap)
+                        kfree(info->bitmap);
                kfree(info);
                if (need_resched()) {
                        spin_unlock(&block_group->tree_lock);
@@ -446,6 +856,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
                        spin_lock(&block_group->tree_lock);
                }
        }
        spin_unlock(&block_group->tree_lock);
 }
@@ -453,25 +864,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
                               u64 offset, u64 bytes, u64 empty_size)
 {
        struct btrfs_free_space *entry = NULL;
+        u64 bytes_search = bytes + empty_size;
        u64 ret = 0;
        spin_lock(&block_group->tree_lock);
-        entry = tree_search_offset(&block_group->free_space_offset, offset,
+        entry = find_free_space(block_group, &offset, &bytes_search, 0);
-                                   bytes + empty_size, 1);
        if (!entry)
-                entry = tree_search_bytes(&block_group->free_space_bytes,
+                goto out;
-                                          offset, bytes + empty_size);
-        if (entry) {
+        ret = offset;
+        if (entry->bitmap) {
+                bitmap_clear_bits(block_group, entry, offset, bytes);
+                if (!entry->bytes) {
+                        unlink_free_space(block_group, entry);
+                        kfree(entry->bitmap);
+                        kfree(entry);
+                        block_group->total_bitmaps--;
+                        recalculate_thresholds(block_group);
+                }
+        } else {
                unlink_free_space(block_group, entry);
-                ret = entry->offset;
                entry->offset += bytes;
                entry->bytes -= bytes;
                if (!entry->bytes)
                        kfree(entry);
                else
                        link_free_space(block_group, entry);
        }
+out:
        spin_unlock(&block_group->tree_lock);
        return ret;
@@ -517,6 +938,47 @@ int btrfs_return_cluster_to_free_space(
        return ret;
 }
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+                                   struct btrfs_free_cluster *cluster,
+                                   u64 bytes, u64 min_start)
+{
+        struct btrfs_free_space *entry;
+        int err;
+        u64 search_start = cluster->window_start;
+        u64 search_bytes = bytes;
+        u64 ret = 0;
+        spin_lock(&block_group->tree_lock);
+        spin_lock(&cluster->lock);
+        if (!cluster->points_to_bitmap)
+                goto out;
+        if (cluster->block_group != block_group)
+                goto out;
+        entry = tree_search_offset(block_group, search_start, 0, 0);
+        if (!entry || !entry->bitmap)
+                goto out;
+        search_start = min_start;
+        search_bytes = bytes;
+        err = search_bitmap(block_group, entry, &search_start,
+                            &search_bytes);
+        if (err)
+                goto out;
+        ret = search_start;
+        bitmap_clear_bits(block_group, entry, ret, bytes);
+out:
+        spin_unlock(&cluster->lock);
+        spin_unlock(&block_group->tree_lock);
+        return ret;
+}
 /*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +992,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
        struct rb_node *node;
        u64 ret = 0;
+        if (cluster->points_to_bitmap)
+                return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
+                                               min_start);
        spin_lock(&cluster->lock);
        if (bytes > cluster->max_size)
                goto out;
@@ -567,9 +1033,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
        }
 out:
        spin_unlock(&cluster->lock);
        return ret;
 }
+/*
+ * Try to set up 'cluster' as a window into the bitmap entry 'entry'.
+ *
+ * The bitmap is scanned from the bit for max(offset, entry->offset) for
+ * runs of set bits that are at least min_bytes long (presumably set bits
+ * mark free sectors -- allocation clears them).  Runs are accumulated until
+ * together they cover at least 'bytes'.
+ *
+ * Returns 0 on success, with cluster->window_start, cluster->max_size and
+ * cluster->points_to_bitmap filled in, or -1 if the scan reaches the end of
+ * the bitmap without finding another long-enough run.
+ */
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+                                struct btrfs_free_space *entry,
+                                struct btrfs_free_cluster *cluster,
+                                u64 offset, u64 bytes, u64 min_bytes)
+{
+        unsigned long next_zero;
+        unsigned long i;
+        unsigned long search_bits;
+        unsigned long total_bits;
+        unsigned long found_bits;
+        unsigned long start = 0;
+        unsigned long total_found = 0;
+        bool found = false;
+        /* never start scanning before the first bit of this bitmap */
+        i = offset_to_bit(entry->offset, block_group->sectorsize,
+                          max_t(u64, offset, entry->offset));
+        /* minimum length of a single run, and total wanted, both in bits */
+        search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+        total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+again:
+        found_bits = 0;
+        /* walk runs of set bits starting at i until one is long enough */
+        for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
+             i < BITS_PER_BITMAP;
+             i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+                next_zero = find_next_zero_bit(entry->bitmap,
+                                               BITS_PER_BITMAP, i);
+                if (next_zero - i >= search_bits) {
+                        found_bits = next_zero - i;
+                        break;
+                }
+                /* run too short: resume the scan after its end */
+                i = next_zero;
+        }
+        if (!found_bits)
+                return -1;
+        /* the first accepted run fixes where the window begins */
+        if (!found) {
+                start = i;
+                found = true;
+        }
+        total_found += found_bits;
+        /* max_size tracks the largest single run seen, in bytes */
+        if (cluster->max_size < found_bits * block_group->sectorsize)
+                cluster->max_size = found_bits * block_group->sectorsize;
+        if (total_found < total_bits) {
+                i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
+                /*
+                 * The next run begins more than twice the wanted size past
+                 * the window start: drop what was collected so the next
+                 * accepted run starts a fresh window.
+                 */
+                if (i - start > total_bits * 2) {
+                        total_found = 0;
+                        cluster->max_size = 0;
+                        found = false;
+                }
+                goto again;
+        }
+        /* convert the starting bit back into a byte offset */
+        cluster->window_start = start * block_group->sectorsize +
+                entry->offset;
+        cluster->points_to_bitmap = true;
+        return 0;
+}
 /*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1117,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        struct btrfs_free_space *next;
-        struct btrfs_free_space *last;
+        struct btrfs_free_space *last = NULL;
        u64 min_bytes;
        u64 window_start;
        u64 window_free;
        u64 max_extent = 0;
-        int total_retries = 0;
+        bool found_bitmap = false;
        int ret;
        /* for metadata, allow allocates with more holes */
@@ -620,31 +1150,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                goto out;
        }
 again:
-        min_bytes = min(min_bytes, bytes + empty_size);
+        entry = tree_search_offset(block_group, offset, found_bitmap, 1);
-        entry = tree_search_bytes(&block_group->free_space_bytes,
-                                  offset, min_bytes);
        if (!entry) {
                ret = -ENOSPC;
                goto out;
        }
+        /*
+         * If found_bitmap is true, we exhausted our search for extent entries,
+         * and we just want to search all of the bitmaps that we can find, and
+         * ignore any extent entries we find.
+         */
+        while (entry->bitmap || found_bitmap ||
+               (!entry->bitmap && entry->bytes < min_bytes)) {
+                struct rb_node *node = rb_next(&entry->offset_index);
+                if (entry->bitmap && entry->bytes > bytes + empty_size) {
+                        ret = btrfs_bitmap_cluster(block_group, entry, cluster,
+                                                   offset, bytes + empty_size,
+                                                   min_bytes);
+                        if (!ret)
+                                goto got_it;
+                }
+                if (!node) {
+                        ret = -ENOSPC;
+                        goto out;
+                }
+                entry = rb_entry(node, struct btrfs_free_space, offset_index);
+        }
+        /*
+         * We already searched all the extent entries from the passed in offset
+         * to the end and didn't find enough space for the cluster, and we also
+         * didn't find any bitmaps that met our criteria, just go ahead and exit
+         */
+        if (found_bitmap) {
+                ret = -ENOSPC;
+                goto out;
+        }
+        cluster->points_to_bitmap = false;
        window_start = entry->offset;
        window_free = entry->bytes;
        last = entry;
        max_extent = entry->bytes;
-        while(1) {
+        while (1) {
                /* out window is just right, lets fill it */
                if (window_free >= bytes + empty_size)
                        break;
                node = rb_next(&last->offset_index);
                if (!node) {
+                        if (found_bitmap)
+                                goto again;
                        ret = -ENOSPC;
                        goto out;
                }
                next = rb_entry(node, struct btrfs_free_space, offset_index);
                /*
+                 * we found a bitmap, so if this search doesn't result in a
+                 * cluster, we know to go and search again for the bitmaps and
+                 * start looking for space there
+                 */
+                if (next->bitmap) {
+                        if (!found_bitmap)
+                                offset = next->offset;
+                        found_bitmap = true;
+                        last = next;
+                        continue;
+                }
+                /*
                 * we haven't filled the empty size and the window is
                 * very large.  reset and try again
                 */
@@ -655,19 +1234,6 @@ again:
                        window_free = entry->bytes;
                        last = entry;
                        max_extent = 0;
-                        total_retries++;
-                        if (total_retries % 64 == 0) {
-                                if (min_bytes >= (bytes + empty_size)) {
-                                        ret = -ENOSPC;
-                                        goto out;
-                                }
-                                /*
-                                 * grow our allocation a bit, we're not having
-                                 * much luck
-                                 */
-                                min_bytes *= 2;
-                                goto again;
-                        }
                } else {
                        last = next;
                        window_free += next->bytes;
@@ -685,11 +1251,19 @@ again:
         * The cluster includes an rbtree, but only uses the offset index
         * of each free space cache entry.
         */
-        while(1) {
+        while (1) {
                node = rb_next(&entry->offset_index);
-                unlink_free_space(block_group, entry);
+                if (entry->bitmap && node) {
+                        entry = rb_entry(node, struct btrfs_free_space,
+                                         offset_index);
+                        continue;
+                } else if (entry->bitmap && !node) {
+                        break;
+                }
+                rb_erase(&entry->offset_index, &block_group->free_space_offset);
                ret = tree_insert_offset(&cluster->root, entry->offset,
-                                         &entry->offset_index);
+                                         &entry->offset_index, 0);
                BUG_ON(ret);
                if (!node || entry == last)
@@ -697,8 +1271,10 @@ again:
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
        }
-        ret = 0;
        cluster->max_size = max_extent;
+got_it:
+        ret = 0;
        atomic_inc(&block_group->count);
        list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
        cluster->block_group = block_group;
@@ -718,6 +1294,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
        spin_lock_init(&cluster->refill_lock);
        cluster->root.rb_node = NULL;
        cluster->max_size = 0;
+        cluster->points_to_bitmap = false;
        INIT_LIST_HEAD(&cluster->block_group_list);
        cluster->block_group = NULL;
 }
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
 #ifndef __BTRFS_FREE_SPACE_CACHE
 #define __BTRFS_FREE_SPACE_CACHE
+/*
+ * One free space entry of a block group.  An entry is either a plain
+ * extent (bitmap == NULL) or a bitmap entry (bitmap != NULL).
+ */
+struct btrfs_free_space {
+        /* links the entry into an offset-sorted rbtree (block group or cluster) */
+        struct rb_node offset_index;
+        /* start of the extent, or the offset bit 0 of the bitmap maps to */
+        u64 offset;
+        /* amount of free space described by this entry, in bytes */
+        u64 bytes;
+        /* kfree()able bitmap for bitmap entries, NULL for extent entries */
+        unsigned long *bitmap;
+        /* NOTE(review): list membership is not visible in this patch -- confirm use */
+        struct list_head list;
+};
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 791eab19e330..56fe83fa60c4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        if (root->ref_cows)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
        path = btrfs_alloc_path();
-        path->reada = -1;
        BUG_ON(!path);
+        path->reada = -1;
        /* FIXME, add redo link to tree so we don't leak on crash */
        key.objectid = inode->i_ino;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
        }
        printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
               (unsigned long long)btrfs_header_bytenr(c),
-               btrfs_header_level(c), nr,
+              level, nr,
               (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
        for (i = 0; i < nr; i++) {
                btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
                                        btrfs_level_size(root, level - 1),
                                        btrfs_node_ptr_generation(c, i));
                if (btrfs_is_leaf(next) &&
-                    btrfs_header_level(c) != 1)
+                   level != 1)
                        BUG();
                if (btrfs_header_level(next) !=
-                        btrfs_header_level(c) - 1)
+                       level - 1)
                        BUG();
                btrfs_print_tree(root, next);
                free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 008397934778..e71264d1c2c9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
                        err = ret;
                        goto out;
                }
+                if (ret > 0 && path2->slots[level] > 0)
+                        path2->slots[level]--;
                eb = path2->nodes[level];
                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                BUG_ON(level == 0);
                path->lowest_level = level;
                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+                path->lowest_level = 0;
                if (ret < 0) {
                        btrfs_free_path(path);
                        return ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2dbf1c1f56ee..cdbb5022da52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
        }
 }
+/*
+ * Replace root->commit_root with the value of btrfs_root_node(root),
+ * calling free_extent_buffer() on the previous commit root first.
+ * (btrfs_root_node() presumably returns a referenced buffer -- confirm.)
+ *
+ * No locking is done here; commit_cowonly_roots() wraps the extent root
+ * switch in a write lock on fs_info->extent_commit_sem.
+ */
+static noinline void switch_commit_root(struct btrfs_root *root)
+{
+        free_extent_buffer(root->commit_root);
+        root->commit_root = btrfs_root_node(root);
+}
 /*
 * either allocate a new transaction or hop into the existing one
 */
@@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        btrfs_write_dirty_block_groups(trans, root);
-        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-        BUG_ON(ret);
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start)
@@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);
-                btrfs_write_dirty_block_groups(trans, root);
-                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+                ret = btrfs_write_dirty_block_groups(trans, root);
                BUG_ON(ret);
        }
-        free_extent_buffer(root->commit_root);
-        root->commit_root = btrfs_root_node(root);
+        if (root != root->fs_info->extent_root)
+                switch_commit_root(root);
        return 0;
 }
@@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                root = list_entry(next, struct btrfs_root, dirty_list);
                update_cowonly_root(trans, root);
-                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-                BUG_ON(ret);
        }
+        down_write(&fs_info->extent_commit_sem);
+        switch_commit_root(fs_info->extent_root);
+        up_write(&fs_info->extent_commit_sem);
        return 0;
 }
@@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        btrfs_update_reloc_root(trans, root);
                        if (root->commit_root != root->node) {
-                                free_extent_buffer(root->commit_root);
+                                switch_commit_root(root);
-                                root->commit_root = btrfs_root_node(root);
                                btrfs_set_root_node(&root->root_item,
                                                    root->node);
                        }
@@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root)
        super->root_level = root_item->level;
 }
+/*
+ * Return the in_commit value of the currently running transaction, or 0
+ * when there is no running transaction.  The check is made under
+ * info->new_trans_lock.
+ */
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+        int ret = 0;
+        spin_lock(&info->new_trans_lock);
+        /* running_transaction may be NULL, in which case ret stays 0 */
+        if (info->running_transaction)
+                ret = info->running_transaction->in_commit;
+        spin_unlock(&info->new_trans_lock);
+        return ret;
+}
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
@@ -943,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                mutex_unlock(&root->fs_info->trans_mutex);
-                if (flush_on_commit || snap_pending) {
+                if (flush_on_commit) {
-                        if (flush_on_commit)
+                        btrfs_start_delalloc_inodes(root);
-                                btrfs_start_delalloc_inodes(root);
+                        ret = btrfs_wait_ordered_extents(root, 0);
+                        BUG_ON(ret);
+                } else if (snap_pending) {
                        ret = btrfs_wait_ordered_extents(root, 1);
                        BUG_ON(ret);
                }
@@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_set_root_node(&root->fs_info->tree_root->root_item,
                            root->fs_info->tree_root->node);
-        free_extent_buffer(root->fs_info->tree_root->commit_root);
+        switch_commit_root(root->fs_info->tree_root);
-        root->fs_info->tree_root->commit_root =
-                                btrfs_root_node(root->fs_info->tree_root);
        btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
                            root->fs_info->chunk_root->node);
-        free_extent_buffer(root->fs_info->chunk_root->commit_root);
+        switch_commit_root(root->fs_info->chunk_root);
-        root->fs_info->chunk_root->commit_root =
-                                btrfs_root_node(root->fs_info->chunk_root);
        update_super_roots(root);
@@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        cur_trans->commit_done = 1;
        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);
        put_transaction(cur_trans);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 961c3ee5a2e1..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
                                        struct extent_io_tree *dirty_pages);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                return -ENOENT;
        inode = read_one_inode(root, key->objectid);
-        BUG_ON(!dir);
+        BUG_ON(!inode);
        ref_ptr = btrfs_item_ptr_offset(eb, slot);
        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -721,7 +721,8 @@ error:
 */
 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_device *device,
-                                         u64 num_bytes, u64 *start)
+                                         u64 num_bytes, u64 *start,
+                                         u64 *max_avail)
 {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
-        ret = btrfs_previous_item(root, path, 0, key.type);
+        if (ret > 0) {
-        if (ret < 0)
+                ret = btrfs_previous_item(root, path, key.objectid, key.type);
-                goto error;
+                if (ret < 0)
+                        goto error;
+                if (ret > 0)
+                        start_found = 1;
+        }
        l = path->nodes[0];
        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
        while (1) {
@@ -803,6 +808,10 @@ no_more_items:
                        if (last_byte < search_start)
                                last_byte = search_start;
                        hole_size = key.offset - last_byte;
+                        if (hole_size > *max_avail)
+                                *max_avail = hole_size;
                        if (key.offset > last_byte &&
                            hole_size >= num_bytes) {
                                *start = last_byte;
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
        device->fs_devices->total_rw_bytes += diff;
        device->total_bytes = new_size;
+        device->disk_total_bytes = new_size;
        btrfs_clear_space_info_full(device->dev_root->fs_info);
        return btrfs_update_device(trans, device);
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
                        goto done;
                if (ret) {
                        ret = 0;
-                        goto done;
+                        break;
                }
                l = path->nodes[0];
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
                if (key.objectid != device->devid)
-                        goto done;
+                        break;
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                length = btrfs_dev_extent_length(l, dev_extent);
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                             max_chunk_size);
 again:
+        max_avail = 0;
        if (!map || map->num_stripes != num_stripes) {
                kfree(map);
                map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2230,8 @@ again:
                if (device->in_fs_metadata && avail >= min_free) {
                        ret = find_free_dev_extent(trans, device,
-                                                   min_free, &dev_offset);
+                                                   min_free, &dev_offset,
+                                                   &max_avail);
                        if (ret == 0) {
                                list_move_tail(&device->dev_alloc_list,
                                               &private_devs);
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                }
        }
-        for (i = 0; i > nr; i++) {
-                struct btrfs_multi_bio *multi;
-                struct btrfs_bio_stripe *stripe;
-                int ret;
-                length = 1;
-                ret = btrfs_map_block(map_tree, WRITE, buf[i],
-                                      &length, &multi, 0);
-                BUG_ON(ret);
-                stripe = multi->stripes;
-                for (j = 0; j < multi->num_stripes; j++) {
-                        if (stripe->physical >= physical &&
-                            physical < stripe->physical + length)
-                                break;
-                }
-                BUG_ON(j >= multi->num_stripes);
-                kfree(multi);
-        }
        *logical = buf;
        *naddrs = nr;
        *stripe_len = map->stripe_len;
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 92888aa90749..e85b1e4389e0 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,10 @@
+Version 1.60
+-------------
+Fix memory leak in reconnect.  Fix oops in DFS mount error path.
+Set s_maxbytes to smaller (the max that vfs can handle) so that
+sendfile will now work over cifs mounts again.  Add noforcegid
+and noforceuid mount parameters.
 Version 1.59
 ------------
 Client uses server inode numbers (which are persistent) rather than
diff --git a/fs/cifs/README b/fs/cifs/README
index ad92921dbde4..79c1a93400be 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,11 +262,11 @@ A partial list of the supported mount options follows:
                mount.  
  domain        Set the SMB/CIFS workgroup name prepended to the
                username during CIFS session establishment
-  forceuid      Set the default uid for inodes based on the uid
+  forceuid      Set the default uid for inodes to the uid
-                passed in. For mounts to servers
+                passed in on mount. For mounts to servers
                which do support the CIFS Unix extensions, such as a
                properly configured Samba server, the server provides
-                the uid, gid and mode so this parameter should  not be
+                the uid, gid and mode so this parameter should not be
                specified unless the server and clients uid and gid
                numbering differ.  If the server and client are in the
                same domain (e.g. running winbind or nss_ldap) and
@@ -278,11 +278,7 @@ A partial list of the supported mount options follows:
                of existing files will be the uid (gid) of the person
                who executed the mount (root, except when mount.cifs
                is configured setuid for user mounts) unless the "uid=" 
-                (gid) mount option is specified.  For the uid (gid) of newly
+                (gid) mount option is specified. Also note that permission
-                created files and directories, ie files created since 
-                the last mount of the server share, the expected uid 
-                (gid) is cached as long as the inode remains in 
-                memory on the client.   Also note that permission
                checks (authorization checks) on accesses to a file occur
                at the server, but there are cases in which an administrator
                may want to restrict at the client as well.  For those
@@ -290,12 +286,15 @@ A partial list of the supported mount options follows:
                (such as Windows), permissions can also be checked at the
                client, and a crude form of client side permission checking 
                can be enabled by specifying file_mode and dir_mode on 
-                the client.  Note that the mount.cifs helper must be
+                the client.  (default)
-                at version 1.10 or higher to support specifying the uid
+  forcegid      (similar to above but for the groupid instead of uid) (default)
-                (or gid) in non-numeric form.
+  noforceuid    Fill in file owner information (uid) by requesting it from
-  forcegid      (similar to above but for the groupid instead of uid)
+                the server if possible. With this option, the value given in
+                the uid= option (on mount) will only be used if the server
+                can not support returning uids on inodes.
+  noforcegid    (similar to above but for the group owner, gid, instead of uid)
  uid           Set the default uid for inodes, and indicate to the
-                cifs kernel driver which local user mounted . If the server
+                cifs kernel driver which local user mounted. If the server
                supports the unix extensions the default uid is
                not used to fill in the owner fields of inodes (files)
                unless the "forceuid" parameter is specified.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7f19fefd3d45..42cec2a7c0cf 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -261,6 +261,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
                                        atomic_set(&tcon->num_reads, 0);
                                        atomic_set(&tcon->num_oplock_brks, 0);
                                        atomic_set(&tcon->num_opens, 0);
+                                        atomic_set(&tcon->num_posixopens, 0);
+                                        atomic_set(&tcon->num_posixmkdirs, 0);
                                        atomic_set(&tcon->num_closes, 0);
                                        atomic_set(&tcon->num_deletes, 0);
                                        atomic_set(&tcon->num_mkdirs, 0);
@@ -347,11 +349,15 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
                                        atomic_read(&tcon->num_locks),
                                        atomic_read(&tcon->num_hardlinks),
                                        atomic_read(&tcon->num_symlinks));
-                                seq_printf(m, "\nOpens: %d Closes: %d"
+                                seq_printf(m, "\nOpens: %d Closes: %d "
                                              "Deletes: %d",
                                        atomic_read(&tcon->num_opens),
                                        atomic_read(&tcon->num_closes),
                                        atomic_read(&tcon->num_deletes));
+                                seq_printf(m, "\nPosix Opens: %d "
+                                              "Posix Mkdirs: %d",
+                                        atomic_read(&tcon->num_posixopens),
+                                        atomic_read(&tcon->num_posixmkdirs));
                                seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
                                        atomic_read(&tcon->num_mkdirs),
                                        atomic_read(&tcon->num_rmdirs));
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 3bb11be8b6a8..606912d8f2a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
 * i.e. strips from UNC trailing path that is not part of share
 * name and fixup missing '\' in the begining of DFS node refferal
 * if neccessary.
- * Returns pointer to share name on success or NULL on error.
+ * Returns pointer to share name on success or ERR_PTR on error.
 * Caller is responsible for freeing returned string.
 */
 static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
        UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
                         GFP_KERNEL);
        if (!UNC)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
        /* get share name and server name */
        if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
                cERROR(1, ("%s: no server name end in node name: %s",
                        __func__, node_name));
                kfree(UNC);
-                return NULL;
+                return ERR_PTR(-EINVAL);
        }
        /* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
                return ERR_PTR(-EINVAL);
        *devname = cifs_get_share_name(ref->node_name);
+        if (IS_ERR(*devname)) {
+                rc = PTR_ERR(*devname);
+                *devname = NULL;
+                goto compose_mount_options_err;
+        }
        rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
        if (rc != 0) {
                cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de0..714a542cbafc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
        int maxwords = maxbytes / 2;
        char tmp[NLS_MAX_CHARSET_SIZE];
-        for (i = 0; from[i] && i < maxwords; i++) {
+        for (i = 0; i < maxwords && from[i]; i++) {
                charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
                                             NLS_MAX_CHARSET_SIZE);
                if (charlen > 0)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 44f30504b82d..84b75253b05a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -376,10 +376,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
        seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
                seq_printf(s, ",forceuid");
+        else
+                seq_printf(s, ",noforceuid");
        seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
                seq_printf(s, ",forcegid");
+        else
+                seq_printf(s, ",noforcegid");
        cifs_show_address(s, tcon->ses->server);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 63f6cdfa5638..6084d6379c03 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -260,6 +260,8 @@ struct cifsTconInfo {
        atomic_t num_closes;
        atomic_t num_deletes;
        atomic_t num_mkdirs;
+        atomic_t num_posixopens;
+        atomic_t num_posixmkdirs;
        atomic_t num_rmdirs;
        atomic_t num_renames;
        atomic_t num_t2renames;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 922f5fe2084c..1866bc2927d4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1113,7 +1113,10 @@ PsxCreat:
 psx_create_err:
        cifs_buf_release(pSMB);
-        cifs_stats_inc(&tcon->num_mkdirs);
+        if (posix_flags & SMB_O_DIRECTORY)
+                cifs_stats_inc(&tcon->num_posixmkdirs);
+        else
+                cifs_stats_inc(&tcon->num_posixopens);
        if (rc == -EAGAIN)
                goto PsxCreat;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e16d7592116a..1f3345d7fa79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
        char *data;
        unsigned int  temp_len, i, j;
        char separator[2];
+        short int override_uid = -1;
+        short int override_gid = -1;
+        bool uid_specified = false;
+        bool gid_specified = false;
        separator[0] = ',';
        separator[1] = 0;
@@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
                                                    "too long.\n");
                                return 1;
                        }
-                } else if (strnicmp(data, "uid", 3) == 0) {
+                } else if (!strnicmp(data, "uid", 3) && value && *value) {
-                        if (value && *value)
+                        vol->linux_uid = simple_strtoul(value, &value, 0);
-                                vol->linux_uid =
+                        uid_specified = true;
-                                        simple_strtoul(value, &value, 0);
+                } else if (!strnicmp(data, "forceuid", 8)) {
-                } else if (strnicmp(data, "forceuid", 8) == 0) {
+                        override_uid = 1;
-                                vol->override_uid = 1;
+                } else if (!strnicmp(data, "noforceuid", 10)) {
-                } else if (strnicmp(data, "gid", 3) == 0) {
+                        override_uid = 0;
-                        if (value && *value)
+                } else if (!strnicmp(data, "gid", 3) && value && *value) {
-                                vol->linux_gid =
+                        vol->linux_gid = simple_strtoul(value, &value, 0);
-                                        simple_strtoul(value, &value, 0);
+                        gid_specified = true;
-                } else if (strnicmp(data, "forcegid", 8) == 0) {
+                } else if (!strnicmp(data, "forcegid", 8)) {
-                                vol->override_gid = 1;
+                        override_gid = 1;
+                } else if (!strnicmp(data, "noforcegid", 10)) {
+                        override_gid = 0;
                } else if (strnicmp(data, "file_mode", 4) == 0) {
                        if (value && *value) {
                                vol->file_mode =
@@ -1355,6 +1361,18 @@ cifs_parse_mount_options(char *options, const char *devname,
        if (vol->UNCip == NULL)
                vol->UNCip = &vol->UNC[2];
+        if (uid_specified)
+                vol->override_uid = override_uid;
+        else if (override_uid == 1)
+                printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
+                                   "specified with no uid= option.\n");
+        if (gid_specified)
+                vol->override_gid = override_gid;
+        else if (override_gid == 1)
+                printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
+                                   "specified with no gid= option.\n");
        return 0;
 }
@@ -2452,10 +2470,10 @@ try_mount_again:
                tcon->local_lease = volume_info->local_lease;
        }
        if (pSesInfo) {
-                if (pSesInfo->capabilities & CAP_LARGE_FILES) {
+                if (pSesInfo->capabilities & CAP_LARGE_FILES)
-                        sb->s_maxbytes = (u64) 1 << 63;
+                        sb->s_maxbytes = MAX_LFS_FILESIZE;
-                } else
+                else
-                        sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */
+                        sb->s_maxbytes = MAX_NON_LFS;
        }
        /* BB FIXME fix time_gran to be larger for LANMAN sessions */
@@ -2544,11 +2562,20 @@ remote_path_check:
                        if (mount_data != mount_data_global)
                                kfree(mount_data);
                        mount_data = cifs_compose_mount_options(
                                        cifs_sb->mountdata, full_path + 1,
                                        referrals, &fake_devname);
-                        kfree(fake_devname);
                        free_dfs_info_array(referrals, num_referrals);
+                        kfree(fake_devname);
+                        kfree(full_path);
+                        if (IS_ERR(mount_data)) {
+                                rc = PTR_ERR(mount_data);
+                                mount_data = NULL;
+                                goto mount_fail_check;
+                        }
                        if (tcon)
                                cifs_put_tcon(tcon);
@@ -2556,8 +2583,6 @@ remote_path_check:
                                cifs_put_smb_ses(pSesInfo);
                        cleanup_volume_info(&volume_info);
-                        FreeXid(xid);
-                        kfree(full_path);
                        referral_walks_count++;
                        goto try_mount_again;
                }
@@ -2726,6 +2751,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
                /* mostly informational -- no need to fail on error here */
+                kfree(tcon->nativeFileSystem);
                tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
                                                      bytes_left, is_unicode,
                                                      nls_codepage);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 18afe57b2461..82d83839655e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
 * junction to the new submount (ie to setup the fake directory
 * which represents a DFS referral).
 */
-void
+static void
 cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 }
 /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
-void
+static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
                       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
@@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode,
                                        cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                        if (rc1) {
-                                /* BB EOPNOSUPP disable SERVER_INUM? */
                                cFYI(1, ("GetSrvInodeNum rc %d", rc1));
                                fattr.cf_uniqueid = iunique(sb, ROOT_I);
+                                /* disable serverino if call not supported */
+                                if (rc1 == -EINVAL)
+                                        cifs_sb->mnt_cifs_flags &=
+                                                        ~CIFS_MOUNT_SERVER_INUM;
                        }
                } else {
                        fattr.cf_uniqueid = iunique(sb, ROOT_I);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 205ec95b347e..eb507c453c5f 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -435,7 +435,7 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
 static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
                    unsigned int flags, struct dlm_rsb **r_ret)
 {
-        struct dlm_rsb *r, *tmp;
+        struct dlm_rsb *r = NULL, *tmp;
        uint32_t hash, bucket;
        int error = -EINVAL;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index cdb580a9c7a2..618a60f03886 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -902,7 +902,7 @@ static void tcp_connect_to_sock(struct connection *con)
        int result = -EHOSTUNREACH;
        struct sockaddr_storage saddr, src_addr;
        int addr_len;
-        struct socket *sock;
+        struct socket *sock = NULL;
        if (con->nodeid == 0) {
                log_print("attempt to connect sock 0 foiled");
@@ -962,6 +962,8 @@ out_err:
        if (con->sock) {
                sock_release(con->sock);
                con->sock = NULL;
+        } else if (sock) {
+                sock_release(sock);
        }
        /*
         * Some errors are fatal and this list might need adjusting. For other
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 894a32d438d5..16f682e26c07 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -353,7 +353,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 {
        struct dlm_plock_info info;
        struct plock_op *op;
-        int found = 0;
+        int found = 0, do_callback = 0;
        if (count != sizeof(info))
                return -EINVAL;
@@ -366,21 +366,24 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
        spin_lock(&ops_lock);
        list_for_each_entry(op, &recv_list, list) {
-                if (op->info.fsid == info.fsid && op->info.number == info.number &&
+                if (op->info.fsid == info.fsid &&
+                    op->info.number == info.number &&
                    op->info.owner == info.owner) {
+                        struct plock_xop *xop = (struct plock_xop *)op;
                        list_del_init(&op->list);
-                        found = 1;
-                        op->done = 1;
                        memcpy(&op->info, &info, sizeof(info));
+                        if (xop->callback)
+                                do_callback = 1;
+                        else
+                                op->done = 1;
+                        found = 1;
                        break;
                }
        }
        spin_unlock(&ops_lock);
        if (found) {
-                struct plock_xop *xop;
+                if (do_callback)
-                xop = (struct plock_xop *)op;
-                if (xop->callback)
                        dlm_plock_callback(op);
                else
                        wake_up(&recv_wq);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
        }
        (*new_auth_tok)->session_key.encrypted_key_size =
                (body_size - (ECRYPTFS_SALT_SIZE + 5));
+        if ((*new_auth_tok)->session_key.encrypted_key_size
+            > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+                printk(KERN_WARNING "Tag 3 packet contains key larger "
+                       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
+                rc = -EINVAL;
+                goto out_free;
+        }
        if (unlikely(data[(*packet_size)++] != 0x04)) {
                printk(KERN_WARNING "Unknown version number [%d]\n",
                       data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
                rc = -EINVAL;
                goto out;
        }
+        if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
+                printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
+                       "expected size\n");
+                rc = -EINVAL;
+                goto out;
+        }
        if (data[(*packet_size)++] != 0x62) {
                printk(KERN_WARNING "Unrecognizable packet\n");
                rc = -EINVAL;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
                struct buffer_head *bh = NULL;
                map_bh.b_state = 0;
-                err = ext3_get_blocks_handle(NULL, inode, blk, 1,
+                err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
-                                                &map_bh, 0, 0);
                if (err > 0) {
                        pgoff_t index = map_bh.b_blocknr >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5f51fed5c750..b49908a167ae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -788,7 +788,7 @@ err_out:
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
                sector_t iblock, unsigned long maxblocks,
                struct buffer_head *bh_result,
-                int create, int extend_disksize)
+                int create)
 {
        int err = -EIO;
        int offsets[4];
@@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
        if (!err)
                err = ext3_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
-        /*
-         * i_disksize growing is protected by truncate_mutex.  Don't forget to
-         * protect it if you're about to implement concurrent
-         * ext3_get_block() -bzzz
-        */
-        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-                ei->i_disksize = inode->i_size;
        mutex_unlock(&ei->truncate_mutex);
        if (err)
                goto cleanup;
@@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
        }
        ret = ext3_get_blocks_handle(handle, inode, iblock,
-                                        max_blocks, bh_result, create, 0);
+                                        max_blocks, bh_result, create);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
        err = ext3_get_blocks_handle(handle, inode, block, 1,
-                                        &dummy, create, 1);
+                                        &dummy, create);
        /*
         * ext3_get_blocks_handle() returns number of blocks
         * mapped. 0 in case of a HOLE.
@@ -1193,15 +1186,16 @@ write_begin_failed:
                 * i_size_read because we hold i_mutex.
                 *
                 * Add inode to orphan list in case we crash before truncate
-                 * finishes.
+                 * finishes. Do this only if ext3_can_truncate() agrees so
+                 * that orphan processing code is happy.
                 */
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext3_can_truncate(inode))
                        ext3_orphan_add(handle, inode);
                ext3_journal_stop(handle);
                unlock_page(page);
                page_cache_release(page);
                if (pos + len > inode->i_size)
-                        vmtruncate(inode, inode->i_size);
+                        ext3_truncate(inode);
        }
        if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
@@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file,
         * There may be allocated blocks outside of i_size because
         * we failed to copy some data. Prepare for truncate.
         */
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext3_can_truncate(inode))
                ext3_orphan_add(handle, inode);
        ret2 = ext3_journal_stop(handle);
        if (!ret)
@@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file,
        page_cache_release(page);
        if (pos + len > inode->i_size)
-                vmtruncate(inode, inode->i_size);
+                ext3_truncate(inode);
        return ret ? ret : copied;
 }
@@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file,
         * There may be allocated blocks outside of i_size because
         * we failed to copy some data. Prepare for truncate.
         */
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext3_can_truncate(inode))
                ext3_orphan_add(handle, inode);
        ret = ext3_journal_stop(handle);
        unlock_page(page);
        page_cache_release(page);
        if (pos + len > inode->i_size)
-                vmtruncate(inode, inode->i_size);
+                ext3_truncate(inode);
        return ret ? ret : copied;
 }
@@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file,
         * There may be allocated blocks outside of i_size because
         * we failed to copy some data. Prepare for truncate.
         */
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext3_can_truncate(inode))
                ext3_orphan_add(handle, inode);
        EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
        if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file,
        page_cache_release(page);
        if (pos + len > inode->i_size)
-                vmtruncate(inode, inode->i_size);
+                ext3_truncate(inode);
        return ret ? ret : copied;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0ddf7e55abe1..9714db393efe 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -93,20 +93,20 @@ typedef unsigned int ext4_group_t;
 struct ext4_allocation_request {
        /* target inode for block we're allocating */
        struct inode *inode;
+        /* how many blocks we want to allocate */
+        unsigned int len;
        /* logical block in target inode */
        ext4_lblk_t logical;
-        /* phys. target (a hint) */
-        ext4_fsblk_t goal;
        /* the closest logical allocated block to the left */
        ext4_lblk_t lleft;
-        /* phys. block for ^^^ */
-        ext4_fsblk_t pleft;
        /* the closest logical allocated block to the right */
        ext4_lblk_t lright;
-        /* phys. block for ^^^ */
+        /* phys. target (a hint) */
+        ext4_fsblk_t goal;
+        /* phys. block for the closest logical allocated block to the left */
+        ext4_fsblk_t pleft;
+        /* phys. block for the closest logical allocated block to the right */
        ext4_fsblk_t pright;
-        /* how many blocks we want to allocate */
-        unsigned int len;
        /* flags. see above EXT4_MB_HINT_* */
        unsigned int flags;
 };
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index ad13a84644e1..eb27fd0f2ee8 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
                        ext4_journal_abort_handle(where, __func__, bh,
                                                  handle, err);
        }
+        else
+                brelse(bh);
        return err;
 }
@@ -57,6 +59,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
                        ext4_journal_abort_handle(where, __func__, bh,
                                                  handle, err);
        }
+        else
+                brelse(bh);
        return err;
 }
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index be2f426f6805..139fb8cb87e4 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -131,9 +131,11 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
                                struct buffer_head *bh);
+/* When called with an invalid handle, this will still do a put on the BH */
 int __ext4_journal_forget(const char *where, handle_t *handle,
                                struct buffer_head *bh);
+/* When called with an invalid handle, this will still do a put on the BH */
 int __ext4_journal_revoke(const char *where, handle_t *handle,
                                ext4_fsblk_t blocknr, struct buffer_head *bh);
@@ -281,10 +283,10 @@ static inline int ext4_should_order_data(struct inode *inode)
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-        if (EXT4_JOURNAL(inode) == NULL)
-                return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
+        if (EXT4_JOURNAL(inode) == NULL)
+                return 1;
        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
                return 0;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 50322a09bd01..73ebfb44ad75 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1977,6 +1977,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
                         */
                        /* 1 bitmap, 1 block group descriptor */
                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+                        return ret;
                }
        }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2f645732e3b7..29e6dc7299b8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -833,7 +833,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
        if (!goal)
                goal = sbi->s_inode_goal;
-        if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) {
+        if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
                group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
                ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
                ret2 = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 60a26f3a6f8b..f9c642b22efa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -78,16 +78,14 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 *
- * If the handle isn't valid we're not journaling so there's nothing to do.
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
 */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
        int err;
-        if (!ext4_handle_valid(handle))
-                return 0;
        might_sleep();
        BUFFER_TRACE(bh, "enter");
@@ -1513,14 +1511,14 @@ retry:
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);
                ext4_journal_stop(handle);
                if (pos + len > inode->i_size) {
-                        vmtruncate(inode, inode->i_size);
+                        ext4_truncate(inode);
                        /*
-                         * If vmtruncate failed early the inode might
+                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
@@ -1614,7 +1612,7 @@ static int ext4_ordered_write_end(struct file *file,
                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        /* if we have allocated more blocks and copied
                         * less. We will have blocks allocated outside
                         * inode->i_size. So truncate them
@@ -1628,9 +1626,9 @@ static int ext4_ordered_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate(inode);
                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1655,7 +1653,7 @@ static int ext4_writeback_write_end(struct file *file,
        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
@@ -1670,9 +1668,9 @@ static int ext4_writeback_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate(inode);
                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1722,7 +1720,7 @@ static int ext4_journalled_write_end(struct file *file,
        unlock_page(page);
        page_cache_release(page);
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
@@ -1733,9 +1731,9 @@ static int ext4_journalled_write_end(struct file *file,
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate(inode);
                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -2305,15 +2303,9 @@ flush_it:
        return;
 }
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
-        /*
+        return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
-         * unmapped buffer is possible for holes.
-         * delay buffer is possible with delayed allocation.
-         * We also need to consider unwritten buffer as unmapped.
-         */
-        return (!buffer_mapped(bh) || buffer_delay(bh) ||
-                                buffer_unwritten(bh)) && buffer_dirty(bh);
 }
 /*
@@ -2398,9 +2390,9 @@ static int __mpage_da_writepage(struct page *page,
                         * We need to try to allocate
                         * unmapped blocks in the same page.
                         * Otherwise we won't make progress
-                         * with the page in ext4_da_writepage
+                         * with the page in ext4_writepage
                         */
-                        if (ext4_bh_unmapped_or_delay(NULL, bh)) {
+                        if (ext4_bh_delay_or_unwritten(NULL, bh)) {
                                mpage_add_bh_to_extent(mpd, logical,
                                                       bh->b_size,
                                                       bh->b_state);
@@ -2517,7 +2509,6 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
         * so call get_block_wrap with create = 0
         */
        ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-        BUG_ON(create && ret == 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -2525,15 +2516,102 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
        return ret;
 }
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+        get_bh(bh);
+        return 0;
+}
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+        put_bh(bh);
+        return 0;
+}
+static int __ext4_journalled_writepage(struct page *page,
+                                       struct writeback_control *wbc,
+                                       unsigned int len)
+{
+        struct address_space *mapping = page->mapping;
+        struct inode *inode = mapping->host;
+        struct buffer_head *page_bufs;
+        handle_t *handle = NULL;
+        int ret = 0;
+        int err;
+        page_bufs = page_buffers(page);
+        BUG_ON(!page_bufs);
+        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+        /* As soon as we unlock the page, it can go away, but we have
+         * references to buffers so we are safe */
+        unlock_page(page);
+        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+        ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                do_journal_get_write_access);
+        err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                write_end_fn);
+        if (ret == 0)
+                ret = err;
+        err = ext4_journal_stop(handle);
+        if (!ret)
+                ret = err;
+        walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+out:
+        return ret;
+}
 /*
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
 * This function can get called via...
 *   - ext4_da_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
 *   - shrink_page_list via pdflush (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmaped based write. So if we do with blocksize 1K
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * we have in the page first buffer_head mapped via page_mkwrite call back
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if we have any buffer_heads that is either delay or
+ * unwritten in the page.
+ *
+ * We can get recursively called as shown below.
+ *
+ *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *              ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * Page also has the dirty flag cleared so we don't get recursive page_lock.
 */
-static int ext4_da_writepage(struct page *page,
+static int ext4_writepage(struct page *page,
-                                struct writeback_control *wbc)
+                          struct writeback_control *wbc)
 {
        int ret = 0;
        loff_t size;
@@ -2541,7 +2619,7 @@ static int ext4_da_writepage(struct page *page,
        struct buffer_head *page_bufs;
        struct inode *inode = page->mapping->host;
-        trace_ext4_da_writepage(inode, page);
+        trace_ext4_writepage(inode, page);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2551,7 +2629,7 @@ static int ext4_da_writepage(struct page *page,
        if (page_has_buffers(page)) {
                page_bufs = page_buffers(page);
                if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay)) {
+                                        ext4_bh_delay_or_unwritten)) {
                        /*
                         * We don't want to do  block allocation
                         * So redirty the page and return
@@ -2578,13 +2656,13 @@ static int ext4_da_writepage(struct page *page,
                 * all are mapped and non delay. We don't want to
                 * do block allocation here.
                 */
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                ret = block_prepare_write(page, 0, len,
                                          noalloc_get_block_write);
                if (!ret) {
                        page_bufs = page_buffers(page);
                        /* check whether all are mapped and non delay */
                        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                                ext4_bh_unmapped_or_delay)) {
+                                                ext4_bh_delay_or_unwritten)) {
                                redirty_page_for_writepage(wbc, page);
                                unlock_page(page);
                                return 0;
@@ -2600,7 +2678,16 @@ static int ext4_da_writepage(struct page *page,
                        return 0;
                }
                /* now mark the buffer_heads as dirty and uptodate */
-                block_commit_write(page, 0, PAGE_CACHE_SIZE);
+                block_commit_write(page, 0, len);
+        }
+        if (PageChecked(page) && ext4_should_journal_data(inode)) {
+                /*
+                 * It's mmapped pagecache.  Add buffers and journal it.  There
+                 * doesn't seem much point in redirtying the page here.
+                 */
+                ClearPageChecked(page);
+                return __ext4_journalled_writepage(page, wbc, len);
        }
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2907,7 +2994,7 @@ retry:
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
-                        vmtruncate(inode, inode->i_size);
+                        ext4_truncate(inode);
        }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3130,222 +3217,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping, block, ext4_get_block);
 }
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
-        get_bh(bh);
-        return 0;
-}
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
-        put_bh(bh);
-        return 0;
-}
-/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * In all journaling modes block_write_full_page() will start the I/O.
- *
- * Problem:
- *
- *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- *              ext4_writepage()
- *
- * Similar for:
- *
- *      ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_data_sem
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- *          non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- *   In journalled data mode, a data buffer may be metadata against the
- *   current transaction.  But the same file is part of a shared mapping
- *   and someone does a writepage() on it.
- *
- *   We will move the buffer onto the async_data list, but *after* it has
- *   been dirtied. So there's a small window where we have dirty data on
- *   BJ_Metadata.
- *
- *   Note that this only applies to the last partial page in the file.  The
- *   bit which block_write_full_page() uses prepare/commit for.  (That's
- *   broken code anyway: it's wrong for msync()).
- *
- *   It's a rare case: affects the final partial page, for journalled data
- *   where the file is subject to bith write() and writepage() in the same
- *   transction.  To fix it we'll need a custom block_write_full_page().
- *   We'll probably need that anyway for journalling writepage() output.
- *
- * We don't honour synchronous mounts for writepage().  That would be
- * disastrous.  Any write() or metadata operation will sync the fs for
- * us.
- *
- */
-static int __ext4_normal_writepage(struct page *page,
-                                   struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        if (test_opt(inode->i_sb, NOBH))
-                return nobh_writepage(page, noalloc_get_block_write, wbc);
-        else
-                return block_write_full_page(page, noalloc_get_block_write,
-                                             wbc);
-}
-static int ext4_normal_writepage(struct page *page,
-                                 struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        loff_t size = i_size_read(inode);
-        loff_t len;
-        trace_ext4_normal_writepage(inode, page);
-        J_ASSERT(PageLocked(page));
-        if (page->index == size >> PAGE_CACHE_SHIFT)
-                len = size & ~PAGE_CACHE_MASK;
-        else
-                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
-                /* if page has buffers it should all be mapped
-                 * and allocated. If there are not buffers attached
-                 * to the page we know the page is dirty but it lost
-                 * buffers. That means that at some moment in time
-                 * after write_begin() / write_end() has been called
-                 * all buffers have been clean and thus they must have been
-                 * written at least once. So they are all mapped and we can
-                 * happily proceed with mapping them and writing the page.
-                 */
-                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay));
-        }
-        if (!ext4_journal_current_handle())
-                return __ext4_normal_writepage(page, wbc);
-        redirty_page_for_writepage(wbc, page);
-        unlock_page(page);
-        return 0;
-}
-static int __ext4_journalled_writepage(struct page *page,
-                                       struct writeback_control *wbc)
-{
-        struct address_space *mapping = page->mapping;
-        struct inode *inode = mapping->host;
-        struct buffer_head *page_bufs;
-        handle_t *handle = NULL;
-        int ret = 0;
-        int err;
-        ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                                  noalloc_get_block_write);
-        if (ret != 0)
-                goto out_unlock;
-        page_bufs = page_buffers(page);
-        walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
-                                                                bget_one);
-        /* As soon as we unlock the page, it can go away, but we have
-         * references to buffers so we are safe */
-        unlock_page(page);
-        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                goto out;
-        }
-        ret = walk_page_buffers(handle, page_bufs, 0,
-                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-        err = walk_page_buffers(handle, page_bufs, 0,
-                                PAGE_CACHE_SIZE, NULL, write_end_fn);
-        if (ret == 0)
-                ret = err;
-        err = ext4_journal_stop(handle);
-        if (!ret)
-                ret = err;
-        walk_page_buffers(handle, page_bufs, 0,
-                                PAGE_CACHE_SIZE, NULL, bput_one);
-        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-        goto out;
-out_unlock:
-        unlock_page(page);
-out:
-        return ret;
-}
-static int ext4_journalled_writepage(struct page *page,
-                                     struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        loff_t size = i_size_read(inode);
-        loff_t len;
-        trace_ext4_journalled_writepage(inode, page);
-        J_ASSERT(PageLocked(page));
-        if (page->index == size >> PAGE_CACHE_SHIFT)
-                len = size & ~PAGE_CACHE_MASK;
-        else
-                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
-                /* if page has buffers it should all be mapped
-                 * and allocated. If there are not buffers attached
-                 * to the page we know the page is dirty but it lost
-                 * buffers. That means that at some moment in time
-                 * after write_begin() / write_end() has been called
-                 * all buffers have been clean and thus they must have been
-                 * written at least once. So they are all mapped and we can
-                 * happily proceed with mapping them and writing the page.
-                 */
-                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay));
-        }
-        if (ext4_journal_current_handle())
-                goto no_write;
-        if (PageChecked(page)) {
-                /*
-                 * It's mmapped pagecache.  Add buffers and journal it.  There
-                 * doesn't seem much point in redirtying the page here.
-                 */
-                ClearPageChecked(page);
-                return __ext4_journalled_writepage(page, wbc);
-        } else {
-                /*
-                 * It may be a page full of checkpoint-mode buffers.  We don't
-                 * really know unless we go poke around in the buffer_heads.
-                 * But block_write_full_page will do the right thing.
-                 */
-                return block_write_full_page(page, noalloc_get_block_write,
-                                             wbc);
-        }
-no_write:
-        redirty_page_for_writepage(wbc, page);
-        unlock_page(page);
-        return 0;
-}
 static int ext4_readpage(struct file *file, struct page *page)
 {
        return mpage_readpage(page, ext4_get_block);
@@ -3492,7 +3363,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 static const struct address_space_operations ext4_ordered_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_normal_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_ordered_write_end,
@@ -3507,7 +3378,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 static const struct address_space_operations ext4_writeback_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_normal_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_writeback_write_end,
@@ -3522,7 +3393,7 @@ static const struct address_space_operations ext4_writeback_aops = {
 static const struct address_space_operations ext4_journalled_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_journalled_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_journalled_write_end,
@@ -3536,7 +3407,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 static const struct address_space_operations ext4_da_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_da_writepage,
+        .writepage              = ext4_writepage,
        .writepages             = ext4_da_writepages,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_da_write_begin,
@@ -3583,7 +3454,8 @@ int ext4_block_truncate_page(handle_t *handle,
        struct page *page;
        int err = 0;
-        page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+                                   mapping_gfp_mask(mapping) & ~__GFP_FS);
        if (!page)
                return -EINVAL;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 24a6abb2aef5..7050a9cd04a4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -191,7 +191,7 @@ setversion_out:
        case EXT4_IOC_GROUP_EXTEND: {
                ext4_fsblk_t n_blocks_count;
                struct super_block *sb = inode->i_sb;
-                int err, err2;
+                int err, err2=0;
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
@@ -204,9 +204,11 @@ setversion_out:
                        return err;
                err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
-                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+                if (EXT4_SB(sb)->s_journal) {
-                err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                        err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                }
                if (err == 0)
                        err = err2;
                mnt_drop_write(filp->f_path.mnt);
@@ -251,7 +253,7 @@ setversion_out:
        case EXT4_IOC_GROUP_ADD: {
                struct ext4_new_group_data input;
                struct super_block *sb = inode->i_sb;
-                int err, err2;
+                int err, err2=0;
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
@@ -265,9 +267,11 @@ setversion_out:
                        return err;
                err = ext4_group_add(sb, &input);
-                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+                if (EXT4_SB(sb)->s_journal) {
-                err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                        err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+                        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                }
                if (err == 0)
                        err = err2;
                mnt_drop_write(filp->f_path.mnt);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 519a0a686d94..cd258463e2a9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -657,7 +657,8 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        }
 }
-static void ext4_mb_generate_buddy(struct super_block *sb,
+static noinline_for_stack
+void ext4_mb_generate_buddy(struct super_block *sb,
                                void *buddy, void *bitmap, ext4_group_t group)
 {
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -1480,7 +1481,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
        ext4_mb_check_limits(ac, e4b, 0);
 }
-static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
 {
        struct ext4_free_extent ex = ac->ac_b_ex;
@@ -1507,7 +1509,8 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
        return 0;
 }
-static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
                                struct ext4_buddy *e4b)
 {
        ext4_group_t group = ac->ac_g_ex.fe_group;
@@ -1566,7 +1569,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 * The routine scans buddy structures (not bitmap!) from given order
 * to max order and tries to find big enough chunk to satisfy the req
 */
-static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
 {
        struct super_block *sb = ac->ac_sb;
@@ -1609,7 +1613,8 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 * In order to optimize scanning, caller must pass number of
 * free blocks in the group, so the routine can know upper limit.
 */
-static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
 {
        struct super_block *sb = ac->ac_sb;
@@ -1668,7 +1673,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 * we try to find stripe-aligned chunks for stripe-size requests
 * XXX should do so at least for multiples of stripe size as well
 */
-static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
                                 struct ext4_buddy *e4b)
 {
        struct super_block *sb = ac->ac_sb;
@@ -1831,7 +1837,8 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
 }
-static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
        int ret;
@@ -2902,7 +2909,11 @@ int __init init_ext4_mballoc(void)
 void exit_ext4_mballoc(void)
 {
-        /* XXX: synchronize_rcu(); */
+        /* 
+         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+         * before destroying the slab cache.
+         */
+        rcu_barrier();
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_ext_cachep);
@@ -3457,7 +3468,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 * used in in-core bitmap. buddy must be generated from this bitmap
 * Need to be called with ext4 group lock held
 */
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+static noinline_for_stack
+void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group)
 {
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -4215,14 +4227,9 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        ext4_get_group_no_and_offset(sb, goal, &group, &block);
        /* set up allocation goals */
+        memset(ac, 0, sizeof(struct ext4_allocation_context));
        ac->ac_b_ex.fe_logical = ar->logical;
-        ac->ac_b_ex.fe_group = 0;
-        ac->ac_b_ex.fe_start = 0;
-        ac->ac_b_ex.fe_len = 0;
        ac->ac_status = AC_STATUS_CONTINUE;
-        ac->ac_groups_scanned = 0;
-        ac->ac_ex_scanned = 0;
-        ac->ac_found = 0;
        ac->ac_sb = sb;
        ac->ac_inode = ar->inode;
        ac->ac_o_ex.fe_logical = ar->logical;
@@ -4233,15 +4240,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        ac->ac_g_ex.fe_group = group;
        ac->ac_g_ex.fe_start = block;
        ac->ac_g_ex.fe_len = len;
-        ac->ac_f_ex.fe_len = 0;
        ac->ac_flags = ar->flags;
-        ac->ac_2order = 0;
-        ac->ac_criteria = 0;
-        ac->ac_pa = NULL;
-        ac->ac_bitmap_page = NULL;
-        ac->ac_buddy_page = NULL;
-        ac->alloc_semp = NULL;
-        ac->ac_lg = NULL;
        /* we have to define context: we'll we work with a file or
         * locality group. this is a policy, actually */
@@ -4509,10 +4508,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        }
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac) {
+        if (!ac) {
-                ac->ac_sb = sb;
-                ac->ac_inode = ar->inode;
-        } else {
                ar->len = 0;
                *errp = -ENOMEM;
                goto out1;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 03ebb439ace0..7ebae9a4ecc0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 {
        struct gfs2_inode *ip = GFS2_I(mapping->host);
        struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
        int alloc_required;
        int error = 0;
@@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        error = gfs2_glock_nq(&ip->i_gh);
        if (unlikely(error))
                goto out_uninit;
+        if (&ip->i_inode == sdp->sd_rindex) {
+                error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+                                           GL_NOCACHE, &m_ip->i_gh);
+                if (unlikely(error)) {
+                        gfs2_glock_dq(&ip->i_gh);
+                        goto out_uninit;
+                }
+        }
        error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
        if (error)
@@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                rblocks += data_blocks ? data_blocks : 1;
        if (ind_blocks || data_blocks)
                rblocks += RES_STATFS + RES_QUOTA;
+        if (&ip->i_inode == sdp->sd_rindex)
+                rblocks += 2 * RES_STATFS;
        error = gfs2_trans_begin(sdp, rblocks,
                                 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -712,6 +723,10 @@ out_alloc_put:
                gfs2_alloc_put(ip);
        }
 out_unlock:
+        if (&ip->i_inode == sdp->sd_rindex) {
+                gfs2_glock_dq(&m_ip->i_gh);
+                gfs2_holder_uninit(&m_ip->i_gh);
+        }
        gfs2_glock_dq(&ip->i_gh);
 out_uninit:
        gfs2_holder_uninit(&ip->i_gh);
@@ -725,14 +740,21 @@ out_uninit:
 static void adjust_fs_space(struct inode *inode)
 {
        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        struct buffer_head *m_bh, *l_bh;
        u64 fs_total, new_free;
        /* Total up the file system space, according to the latest rindex. */
        fs_total = gfs2_ri_total(sdp);
+        if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
+                return;
        spin_lock(&sdp->sd_statfs_spin);
+        gfs2_statfs_change_in(m_sc, m_bh->b_data +
+                              sizeof(struct gfs2_dinode));
        if (fs_total > (m_sc->sc_total + l_sc->sc_total))
                new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
        else
@@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode)
        fs_warn(sdp, "File system extended by %llu blocks.\n",
                (unsigned long long)new_free);
        gfs2_statfs_change(sdp, new_free, new_free, 0);
+        if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
+                goto out;
+        update_statfs(sdp, m_bh, l_bh);
+        brelse(l_bh);
+out:
+        brelse(m_bh);
 }
 /**
@@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        u64 to = pos + copied;
        void *kaddr;
        unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
        brelse(dibh);
        gfs2_trans_end(sdp);
+        if (inode == sdp->sd_rindex) {
+                gfs2_glock_dq(&m_ip->i_gh);
+                gfs2_holder_uninit(&m_ip->i_gh);
+        }
        gfs2_glock_dq(&ip->i_gh);
        gfs2_holder_uninit(&ip->i_gh);
        return copied;
@@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        struct inode *inode = page->mapping->host;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        struct buffer_head *dibh;
        struct gfs2_alloc *al = ip->i_alloc;
        unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
@@ -865,6 +900,10 @@ failed:
                gfs2_quota_unlock(ip);
                gfs2_alloc_put(ip);
        }
+        if (inode == sdp->sd_rindex) {
+                gfs2_glock_dq(&m_ip->i_gh);
+                gfs2_holder_uninit(&m_ip->i_gh);
+        }
        gfs2_glock_dq(&ip->i_gh);
        gfs2_holder_uninit(&ip->i_gh);
        return ret;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 297421c0427a..8b674b1f3a55 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
+struct workqueue_struct *gfs2_delete_workqueue;
 static LIST_HEAD(lru_list);
 static atomic_t lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(lru_lock);
@@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl)
 *
 */
-static void gfs2_glock_hold(struct gfs2_glock *gl)
+void gfs2_glock_hold(struct gfs2_glock *gl)
 {
        GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
        atomic_inc(&gl->gl_ref);
 }
 /**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+static int demote_ok(const struct gfs2_glock *gl)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        if (gl->gl_state == LM_ST_UNLOCKED)
+                return 0;
+        if (!list_empty(&gl->gl_holders))
+                return 0;
+        if (glops->go_demote_ok)
+                return glops->go_demote_ok(gl);
+        return 1;
+}
+/**
 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
 * @gl: the glock
 *
@@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 {
+        int may_reclaim;
+        may_reclaim = (demote_ok(gl) &&
+                       (atomic_read(&gl->gl_ref) == 1 ||
+                        (gl->gl_name.ln_type == LM_TYPE_INODE &&
+                         atomic_read(&gl->gl_ref) <= 2)));
        spin_lock(&lru_lock);
-        if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+        if (list_empty(&gl->gl_lru) && may_reclaim) {
                list_add_tail(&gl->gl_lru, &lru_list);
                atomic_inc(&lru_count);
        }
@@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 }
 /**
+ * gfs2_glock_put_nolock() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ * This function should only be used if the caller has its own reference
+ * to the glock, in addition to the one it is dropping.
+ */
+void gfs2_glock_put_nolock(struct gfs2_glock *gl)
+{
+        if (atomic_dec_and_test(&gl->gl_ref))
+                GLOCK_BUG_ON(gl, 1);
+        gfs2_glock_schedule_for_reclaim(gl);
+}
+/**
 * gfs2_glock_put() - Decrement reference count on glock
 * @gl: The glock to put
 *
@@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl)
                rv = 1;
                goto out;
        }
-        /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
+        spin_lock(&gl->gl_spin);
-        if (atomic_read(&gl->gl_ref) == 2)
+        gfs2_glock_schedule_for_reclaim(gl);
-                gfs2_glock_schedule_for_reclaim(gl);
+        spin_unlock(&gl->gl_spin);
        write_unlock(gl_lock_addr(gl->gl_hash));
 out:
        return rv;
@@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
                if (held2)
                        gfs2_glock_hold(gl);
                else
-                        gfs2_glock_put(gl);
+                        gfs2_glock_put_nolock(gl);
        }
        gl->gl_state = new_state;
@@ -633,12 +674,35 @@ out:
 out_sched:
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-                gfs2_glock_put(gl);
+                gfs2_glock_put_nolock(gl);
 out_unlock:
        clear_bit(GLF_LOCK, &gl->gl_flags);
        goto out;
 }
+static void delete_work_func(struct work_struct *work)
+{
+        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_inode *ip = NULL;
+        struct inode *inode;
+        u64 no_addr = 0;
+        spin_lock(&gl->gl_spin);
+        ip = (struct gfs2_inode *)gl->gl_object;
+        if (ip)
+                no_addr = ip->i_no_addr;
+        spin_unlock(&gl->gl_spin);
+        if (ip) {
+                inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
+                if (inode) {
+                        d_prune_aliases(inode);
+                        iput(inode);
+                }
+        }
+        gfs2_glock_put(gl);
+}
 static void glock_work_func(struct work_struct *work)
 {
        unsigned long delay = 0;
@@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        gl->gl_sbd = sdp;
        gl->gl_aspace = NULL;
        INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
+        INIT_WORK(&gl->gl_delete, delete_work_func);
        /* If this glock protects actual on-disk data or metadata blocks,
           create a VFS inode to manage the pages/buffers holding them. */
@@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
                        gl->gl_demote_state != state) {
                gl->gl_demote_state = LM_ST_UNLOCKED;
        }
+        if (gl->gl_ops->go_callback)
+                gl->gl_ops->go_callback(gl);
        trace_gfs2_demote_rq(gl);
 }
@@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
                gfs2_glock_put(gl);
 }
-/**
- * demote_ok - Check to see if it's ok to unlock a glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-static int demote_ok(const struct gfs2_glock *gl)
-{
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        if (gl->gl_state == LM_ST_UNLOCKED)
-                return 0;
-        if (!list_empty(&gl->gl_holders))
-                return 0;
-        if (glops->go_demote_ok)
-                return glops->go_demote_ok(gl);
-        return 1;
-}
 static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 {
        struct gfs2_glock *gl;
        int may_demote;
        int nr_skipped = 0;
-        int got_ref = 0;
        LIST_HEAD(skipped);
        if (nr == 0)
@@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
                list_del_init(&gl->gl_lru);
                atomic_dec(&lru_count);
+                /* Check if glock is about to be freed */
+                if (atomic_read(&gl->gl_ref) == 0)
+                        continue;
                /* Test for being demotable */
                if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
                        gfs2_glock_hold(gl);
-                        got_ref = 1;
                        spin_unlock(&lru_lock);
                        spin_lock(&gl->gl_spin);
                        may_demote = demote_ok(gl);
-                        spin_unlock(&gl->gl_spin);
-                        clear_bit(GLF_LOCK, &gl->gl_flags);
                        if (may_demote) {
                                handle_callback(gl, LM_ST_UNLOCKED, 0);
                                nr--;
-                                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-                                        gfs2_glock_put(gl);
-                                got_ref = 0;
                        }
+                        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                                gfs2_glock_put_nolock(gl);
+                        spin_unlock(&gl->gl_spin);
+                        clear_bit(GLF_LOCK, &gl->gl_flags);
                        spin_lock(&lru_lock);
-                        if (may_demote)
+                        continue;
-                                continue;
-                }
-                if (list_empty(&gl->gl_lru) &&
-                    (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
-                        nr_skipped++;
-                        list_add(&gl->gl_lru, &skipped);
-                }
-                if (got_ref) {
-                        spin_unlock(&lru_lock);
-                        gfs2_glock_put(gl);
-                        spin_lock(&lru_lock);
-                        got_ref = 0;
                }
+                nr_skipped++;
+                list_add(&gl->gl_lru, &skipped);
        }
        list_splice(&skipped, &lru_list);
        atomic_add(nr_skipped, &lru_count);
@@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void)
        glock_workqueue = create_workqueue("glock_workqueue");
        if (IS_ERR(glock_workqueue))
                return PTR_ERR(glock_workqueue);
+        gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+        if (IS_ERR(gfs2_delete_workqueue)) {
+                destroy_workqueue(glock_workqueue);
+                return PTR_ERR(gfs2_delete_workqueue);
+        }
        register_shrinker(&glock_shrinker);
@@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void)
 {
        unregister_shrinker(&glock_shrinker);
        destroy_workqueue(glock_workqueue);
+        destroy_workqueue(gfs2_delete_workqueue);
 }
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a602a28f6f08..c609894ec0d0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -143,6 +143,7 @@ struct lm_lockops {
 #define GLR_TRYFAILED           13
+extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
        struct gfs2_holder *gh;
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 int gfs2_glock_get(struct gfs2_sbd *sdp,
                   u64 number, const struct gfs2_glock_operations *glops,
                   int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+void gfs2_glock_put_nolock(struct gfs2_glock *gl);
 int gfs2_glock_put(struct gfs2_glock *gl);
 void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
                      struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d5e4ab155ca0..6985eef06c39 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
        if (gl->gl_state != LM_ST_UNLOCKED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+                flush_workqueue(gfs2_delete_workqueue);
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
@@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
        return 0;
 }
+/**
+ * iopen_go_callback - queue the work that prunes the inode's dcache entries
+ * @gl: the iopen glock receiving the callback
+ *
+ * Called with gl_spin held.  Work is queued only when the glock is held
+ * shared, the pending demote target is unlocked, and the attached inode
+ * has GIF_USER set.
+ */
+static void iopen_go_callback(struct gfs2_glock *gl)
+{
+        struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
+        /* not a shared -> unlocked demote: nothing to do */
+        if (gl->gl_demote_state != LM_ST_UNLOCKED ||
+            gl->gl_state != LM_ST_SHARED)
+                return;
+        if (!ip || !test_bit(GIF_USER, &ip->i_flags))
+                return;
+        /* reference for the queued work; given back if nothing was queued */
+        gfs2_glock_hold(gl);
+        if (!queue_work(gfs2_delete_workqueue, &gl->gl_delete))
+                gfs2_glock_put_nolock(gl);
+}
 const struct gfs2_glock_operations gfs2_meta_glops = {
        .go_type = LM_TYPE_META,
 };
@@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
 const struct gfs2_glock_operations gfs2_iopen_glops = {
        .go_type = LM_TYPE_IOPEN,
+        .go_callback = iopen_go_callback,
 };
 const struct gfs2_glock_operations gfs2_flock_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 225347fbff3c..61801ada36f0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -159,6 +159,7 @@ struct gfs2_glock_operations {
        int (*go_lock) (struct gfs2_holder *gh);
        void (*go_unlock) (struct gfs2_holder *gh);
        int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+        void (*go_callback) (struct gfs2_glock *gl);
        const int go_type;
        const unsigned long go_min_hold_time;
 };
@@ -228,6 +229,7 @@ struct gfs2_glock {
        struct list_head gl_ail_list;
        atomic_t gl_ail_count;
        struct delayed_work gl_work;
+        struct work_struct gl_delete;
 };
 #define GFS2_MIN_LVB_SIZE 32    /* Min size of LVB that gfs2 supports */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index daa4ae341a29..fba795798d3a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
        }
        tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
-        if (count[1] + count[2] != tmp) {
+        if (count[1] != tmp) {
                if (gfs2_consist_rgrpd(rgd))
                        fs_err(sdp, "used data mismatch:  %u != %u\n",
                               count[1], tmp);
                return;
        }
-        if (count[3] != rgd->rd_dinodes) {
+        if (count[2] + count[3] != rgd->rd_dinodes) {
                if (gfs2_consist_rgrpd(rgd))
                        fs_err(sdp, "used metadata mismatch:  %u != %u\n",
-                               count[3], rgd->rd_dinodes);
+                               count[2] + count[3], rgd->rd_dinodes);
                return;
        }
-        if (count[2] > count[3]) {
-                if (gfs2_consist_rgrpd(rgd))
-                        fs_err(sdp, "unlinked inodes > inodes:  %u\n",
-                               count[2]);
-                return;
-        }
 }
 static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
@@ -961,7 +953,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 * Returns: The inode, if one has been found
 */
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
+static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+                                     u64 skip)
 {
        struct inode *inode;
        u32 goal = 0, block;
@@ -985,6 +978,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
                goal++;
                if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
                        continue;
+                if (no_addr == skip)
+                        continue;
                *last_unlinked = no_addr;
                inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
                                          no_addr, -1, 1);
@@ -1104,7 +1099,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
                        if (try_rgrp_fit(rgd, al))
                                goto out;
                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                                inode = try_rgrp_unlink(rgd, last_unlinked);
+                                inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
@@ -1138,7 +1133,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
                        if (try_rgrp_fit(rgd, al))
                                goto out;
                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                                inode = try_rgrp_unlink(rgd, last_unlinked);
+                                inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0a6801336470..f522bb017973 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -353,7 +353,7 @@ fail:
        return error;
 }
-static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
        const struct gfs2_statfs_change *str = buf;
@@ -441,6 +441,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
        brelse(l_bh);
 }
+/*
+ * update_statfs - fold the local statfs deltas into the master statfs
+ * @sdp: the filesystem
+ * @m_bh: buffer for the master statfs inode (sd_statfs_inode)
+ * @l_bh: buffer for the local statfs change inode (sd_sc_inode)
+ *
+ * Adds sd_statfs_local to sd_statfs_master under sd_statfs_spin, zeroes
+ * both the in-core local counters and their copy in @l_bh, then writes
+ * the new master totals into @m_bh.  In both buffers the statfs data sits
+ * directly after the struct gfs2_dinode header.  Both buffers are added to
+ * the current transaction, so this presumably has to be called with a
+ * transaction open (gfs2_statfs_sync() ends one right after calling it).
+ */
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+                   struct buffer_head *l_bh)
+{
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+        spin_lock(&sdp->sd_statfs_spin);
+        m_sc->sc_total += l_sc->sc_total;
+        m_sc->sc_free += l_sc->sc_free;
+        m_sc->sc_dinodes += l_sc->sc_dinodes;
+        /* l_sc is a struct gfs2_statfs_change_host: size the clear from the
+         * pointer rather than from the unrelated struct gfs2_statfs_change */
+        memset(l_sc, 0, sizeof(*l_sc));
+        /* the copy kept in the buffer is a struct gfs2_statfs_change */
+        memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+               0, sizeof(struct gfs2_statfs_change));
+        spin_unlock(&sdp->sd_statfs_spin);
+        gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+        gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+}
 int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 {
        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -477,19 +500,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
        if (error)
                goto out_bh2;
-        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+        update_statfs(sdp, m_bh, l_bh);
-        spin_lock(&sdp->sd_statfs_spin);
-        m_sc->sc_total += l_sc->sc_total;
-        m_sc->sc_free += l_sc->sc_free;
-        m_sc->sc_dinodes += l_sc->sc_dinodes;
-        memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
-        memset(l_bh->b_data + sizeof(struct gfs2_dinode),
-               0, sizeof(struct gfs2_statfs_change));
-        spin_unlock(&sdp->sd_statfs_spin);
-        gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
-        gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
        gfs2_trans_end(sdp);
@@ -680,6 +691,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
        struct gfs2_holder t_gh;
        int error;
+        flush_workqueue(gfs2_delete_workqueue);
        gfs2_quota_sync(sdp);
        gfs2_statfs_sync(sdp);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b56413e3e40d..22e0417ed996 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -40,6 +40,10 @@ extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
                               s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+                                  const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+                          struct buffer_head *l_bh);
 extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 98d6ef1c1dc0..148d55c14171 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -1,12 +1,11 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gfs2
 #if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_GFS2_H
 #include <linux/tracepoint.h>
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM gfs2
-#define TRACE_INCLUDE_FILE trace_gfs2
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/dlmconstants.h>
@@ -403,5 +402,6 @@ TRACE_EVENT(gfs2_block_alloc,
 /* This part must be outside protection */
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_gfs2
 #include <trace/define_trace.h>
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..f96f85092d1c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
+        journal_t *journal = transaction->t_journal;
        /*
         * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+        /* keep subsequent assertions sane */
+        new_bh->b_state = 0;
+        init_buffer(new_bh, NULL, NULL);
+        atomic_set(&new_bh->b_count, 1);
+        new_jh = journal_add_journal_head(new_bh);      /* This sleeps */
        /*
         * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
                kunmap_atomic(mapped_data, KM_USER0);
        }
-        /* keep subsequent assertions sane */
-        new_bh->b_state = 0;
-        init_buffer(new_bh, NULL, NULL);
-        atomic_set(&new_bh->b_count, 1);
-        jbd_unlock_bh_state(bh_in);
-        new_jh = journal_add_journal_head(new_bh);      /* This sleeps */
        set_bh_page(new_bh, new_page, new_offset);
        new_jh->b_transaction = NULL;
        new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
         * copying is moved to the transaction's shadow queue.
         */
        JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-        journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_lock(&journal->j_list_lock);
+        __journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_unlock(&journal->j_list_lock);
+        jbd_unlock_bh_state(bh_in);
        JBUFFER_TRACE(new_jh, "file as BJ_IO");
        journal_file_buffer(new_jh, transaction, BJ_IO);
@@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal)
        first = be32_to_cpu(sb->s_first);
        last = be32_to_cpu(sb->s_maxlen);
+        if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
+                printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+                       first, last);
+                journal_fail_superblock(journal);
+                return -EINVAL;
+        }
        journal->j_first = first;
        journal->j_last = last;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73242ba7c7b1..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
        wake_up(&journal->j_wait_transaction_locked);
 }
+/*
+ * warn_dirty_buffer - log a warning about a metadata buffer found dirty
+ * @bh: the buffer head; its device name and block number are printed
+ *
+ * Only emits a KERN_WARNING message.  The buffer's state bits are left
+ * untouched, so callers clear or move the dirty bit themselves.
+ */
-/*
+static void warn_dirty_buffer(struct buffer_head *bh)
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-        int jlist;
+        char b[BDEVNAME_SIZE];
-        /* If this buffer is one which might reasonably be dirty
-         * --- ie. data, or not part of this journal --- then
-         * we're OK to leave it alone, but otherwise we need to
-         * move the dirty bit to the journal's own internal
-         * JBDDirty bit. */
-        jlist = jh->b_jlist;
-        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+        printk(KERN_WARNING
-            jlist == BJ_Shadow || jlist == BJ_Forget) {
+               "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
-                struct buffer_head *bh = jh2bh(jh);
+               "There's a risk of filesystem corruption in case of system "
+               "crash.\n",
-                if (test_clear_buffer_dirty(bh))
+               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
-                        set_buffer_jbddirty(bh);
-        }
 }
 /*
@@ -583,14 +564,16 @@ repeat:
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
+                        warn_dirty_buffer(bh);
                }
                /*
                 * In any case we need to clean the dirty flag and we must
                 * do it under the buffer lock to be sure we don't race
                 * with running write-out.
                 */
-                JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                JBUFFER_TRACE(jh, "Journalling dirty buffer");
-                jbd_unexpected_dirty_buffer(jh);
+                clear_buffer_dirty(bh);
+                set_buffer_jbddirty(bh);
        }
        unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
        if (jh->b_transaction == NULL) {
+                /*
+                 * Previous journal_forget() could have left the buffer
+                 * with jbddirty bit set because it was being committed. When
+                 * the commit finished, we've filed the buffer for
+                 * checkpointing and marked it dirty. Now we are reallocating
+                 * the buffer so the transaction freeing it must have
+                 * committed and so it's safe to clear the dirty bit.
+                 */
+                clear_buffer_dirty(jh2bh(jh));
                jh->b_transaction = transaction;
                /* first access by this transaction */
@@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+                /*
+                 * We don't want to write the buffer anymore, clear the
+                 * bit so that we don't confuse checks in
+                 * __journal_file_buffer
+                 */
+                clear_buffer_dirty(bh);
                __journal_file_buffer(jh, transaction, BJ_Forget);
-                clear_buffer_jbddirty(bh);
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;
-        /* The following list of buffer states needs to be consistent
-         * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-         * state. */
        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
            jlist == BJ_Shadow || jlist == BJ_Forget) {
+                /*
+                 * For metadata buffers, we track dirty bit in buffer_jbddirty
+                 * instead of buffer_dirty. We should not see a dirty bit set
+                 * here because we clear it in do_get_write_access but e.g.
+                 * tune2fs can modify the sb and set the dirty bit at any time
+                 * so we try to gracefully handle that.
+                 */
+                if (buffer_dirty(bh))
+                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        was_dirty = 1;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 18bfd5dab642..e378cb383979 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -297,6 +297,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
        struct jbd2_buffer_trigger_type *triggers;
+        journal_t *journal = transaction->t_journal;
        /*
         * The buffer really shouldn't be locked: only the current committing
@@ -310,6 +311,11 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+        /* keep subsequent assertions sane */
+        new_bh->b_state = 0;
+        init_buffer(new_bh, NULL, NULL);
+        atomic_set(&new_bh->b_count, 1);
+        new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
        /*
         * If a new transaction has already done a buffer copy-out, then
@@ -388,14 +394,6 @@ repeat:
                kunmap_atomic(mapped_data, KM_USER0);
        }
-        /* keep subsequent assertions sane */
-        new_bh->b_state = 0;
-        init_buffer(new_bh, NULL, NULL);
-        atomic_set(&new_bh->b_count, 1);
-        jbd_unlock_bh_state(bh_in);
-        new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
        set_bh_page(new_bh, new_page, new_offset);
        new_jh->b_transaction = NULL;
        new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -412,7 +410,11 @@ repeat:
         * copying is moved to the transaction's shadow queue.
         */
        JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-        jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_lock(&journal->j_list_lock);
+        __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+        spin_unlock(&journal->j_list_lock);
+        jbd_unlock_bh_state(bh_in);
        JBUFFER_TRACE(new_jh, "file as BJ_IO");
        jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
@@ -2410,6 +2412,7 @@ const char *jbd2_dev_to_name(dev_t device)
        int     i = hash_32(device, CACHE_SIZE_BITS);
        char    *ret;
        struct block_device *bd;
+        static struct devname_cache *new_dev;
        rcu_read_lock();
        if (devcache[i] && devcache[i]->device == device) {
@@ -2419,20 +2422,20 @@ const char *jbd2_dev_to_name(dev_t device)
        }
        rcu_read_unlock();
+        new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
+        if (!new_dev)
+                return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
        spin_lock(&devname_cache_lock);
        if (devcache[i]) {
                if (devcache[i]->device == device) {
+                        kfree(new_dev);
                        ret = devcache[i]->devname;
                        spin_unlock(&devname_cache_lock);
                        return ret;
                }
                call_rcu(&devcache[i]->rcu, free_devcache);
        }
-        devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
+        devcache[i] = new_dev;
-        if (!devcache[i]) {
-                spin_unlock(&devname_cache_lock);
-                return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
-        }
        devcache[i]->device = device;
        bd = bdget(device);
        if (bd) {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 494501edba6b..6213ac728f30 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal)
        wake_up(&journal->j_wait_transaction_locked);
 }
+/*
+ * warn_dirty_buffer - log a warning about a metadata buffer found dirty
+ * @bh: the buffer head; its device name and block number are printed
+ *
+ * Only emits a KERN_WARNING message.  The buffer's state bits are left
+ * untouched, so callers clear or move the dirty bit themselves.  The
+ * message carries a "JBD2:" prefix so it can be told apart from the
+ * identical warning printed by fs/jbd.
+ */
-/*
+static void warn_dirty_buffer(struct buffer_head *bh)
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-        int jlist;
+        char b[BDEVNAME_SIZE];
-        /* If this buffer is one which might reasonably be dirty
-         * --- ie. data, or not part of this journal --- then
-         * we're OK to leave it alone, but otherwise we need to
-         * move the dirty bit to the journal's own internal
-         * JBDDirty bit. */
-        jlist = jh->b_jlist;
-        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+        printk(KERN_WARNING
-            jlist == BJ_Shadow || jlist == BJ_Forget) {
+               "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
-                struct buffer_head *bh = jh2bh(jh);
+               "There's a risk of filesystem corruption in case of system "
+               "crash.\n",
-                if (test_clear_buffer_dirty(bh))
+               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
-                        set_buffer_jbddirty(bh);
-        }
 }
 /*
@@ -593,14 +574,16 @@ repeat:
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
+                        warn_dirty_buffer(bh);
                }
                /*
                 * In any case we need to clean the dirty flag and we must
                 * do it under the buffer lock to be sure we don't race
                 * with running write-out.
                 */
-                JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                JBUFFER_TRACE(jh, "Journalling dirty buffer");
-                jbd_unexpected_dirty_buffer(jh);
+                clear_buffer_dirty(bh);
+                set_buffer_jbddirty(bh);
        }
        unlock_buffer(bh);
@@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
        if (jh->b_transaction == NULL) {
+                /*
+                 * Previous jbd2_journal_forget() could have left the buffer
+                 * with jbddirty bit set because it was being committed. When
+                 * the commit finished, we've filed the buffer for
+                 * checkpointing and marked it dirty. Now we are reallocating
+                 * the buffer so the transaction freeing it must have
+                 * committed and so it's safe to clear the dirty bit.
+                 */
+                clear_buffer_dirty(jh2bh(jh));
                jh->b_transaction = transaction;
                /* first access by this transaction */
@@ -1644,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+                /*
+                 * We don't want to write the buffer anymore, clear the
+                 * bit so that we don't confuse checks in
+                 * __jbd2_journal_file_buffer
+                 */
+                clear_buffer_dirty(bh);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
-                clear_buffer_jbddirty(bh);
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
@@ -1896,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;
-        /* The following list of buffer states needs to be consistent
-         * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-         * state. */
        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
            jlist == BJ_Shadow || jlist == BJ_Forget) {
+                /*
+                 * For metadata buffers, we track dirty bit in buffer_jbddirty
+                 * instead of buffer_dirty. We should not see a dirty bit set
+                 * here because we clear it in do_get_write_access but e.g.
+                 * tune2fs can modify the sb and set the dirty bit at any time
+                 * so we try to gracefully handle that.
+                 */
+                if (buffer_dirty(bh))
+                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        was_dirty = 1;
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 91fa3ad6e8c2..a29c7c3e3fb8 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
                acl = posix_acl_from_xattr(value, size);
        }
        kfree(value);
-        if (!IS_ERR(acl)) {
+        if (!IS_ERR(acl))
                set_cached_acl(inode, type, acl);
-                posix_acl_release(acl);
-        }
        return acl;
 }
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c2d061675d80..8d25ccb2d51d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1242,20 +1242,6 @@ error:
        return error;
 }
-/*
- * Initialize a session.
- * Note: save the mount rsize and wsize for create_server negotiation.
- */
-static void nfs4_init_session(struct nfs_client *clp,
-                              unsigned int wsize, unsigned int rsize)
-{
-#if defined(CONFIG_NFS_V4_1)
-        if (nfs4_has_session(clp)) {
-                clp->cl_session->fc_attrs.max_rqst_sz = wsize;
-                clp->cl_session->fc_attrs.max_resp_sz = rsize;
-        }
-#endif /* CONFIG_NFS_V4_1 */
-}
 /*
 * Session has been established, and the client marked ready.
@@ -1350,7 +1336,9 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
        BUG_ON(!server->nfs_client->rpc_ops);
        BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-        nfs4_init_session(server->nfs_client, server->wsize, server->rsize);
+        error = nfs4_init_session(server);
+        if (error < 0)
+                goto error;
        /* Probe the root fh to retrieve its FSID */
        error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 38d42c29fb92..32062c33c859 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                                res = NULL;
                                goto out;
                        /* This turned out not to be a regular file */
-                        case -EISDIR:
                        case -ENOTDIR:
                                goto no_open;
                        case -ELOOP:
                                if (!(nd->intent.open.flags & O_NOFOLLOW))
                                        goto no_open;
+                        /* case -EISDIR: */
                        /* case -EINVAL: */
                        default:
                                goto out;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 61bc3a32e1e2..6ea07a3c75d4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -220,6 +220,7 @@ extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern int nfs4_proc_create_session(struct nfs_client *, int reset);
 extern int nfs4_proc_destroy_session(struct nfs4_session *);
+extern int nfs4_init_session(struct nfs_server *server);
 #else /* CONFIG_NFS_v4_1 */
 static inline int nfs4_setup_sequence(struct nfs_client *clp,
                struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
@@ -227,6 +228,11 @@ static inline int nfs4_setup_sequence(struct nfs_client *clp,
 {
        return 0;
 }
+static inline int nfs4_init_session(struct nfs_server *server)
+{
+        return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ff0c080db59b..6917311f201c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2040,15 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
                .rpc_argp = &args,
                .rpc_resp = &res,
        };
-        int status;
        nfs_fattr_init(info->fattr);
-        status = nfs4_recover_expired_lease(server);
+        return nfs4_call_sync(server, &msg, &args, &res, 0);
-        if (!status)
-                status = nfs4_check_client_ready(server->nfs_client);
-        if (!status)
-                status = nfs4_call_sync(server, &msg, &args, &res, 0);
-        return status;
 }
 static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -4099,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
        if (request->fl_start < 0 || request->fl_end < 0)
                return -EINVAL;
-        if (IS_GETLK(cmd))
+        if (IS_GETLK(cmd)) {
-                return nfs4_proc_getlk(state, F_GETLK, request);
+                if (state != NULL)
+                        return nfs4_proc_getlk(state, F_GETLK, request);
+                return 0;
+        }
        if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
                return -EINVAL;
-        if (request->fl_type == F_UNLCK)
+        if (request->fl_type == F_UNLCK) {
-                return nfs4_proc_unlck(state, cmd, request);
+                if (state != NULL)
+                        return nfs4_proc_unlck(state, cmd, request);
+                return 0;
+        }
+        if (state == NULL)
+                return -ENOLCK;
        do {
                status = nfs4_proc_setlk(state, cmd, request);
                if ((status != -EAGAIN) || IS_SETLK(cmd))
@@ -4793,6 +4795,22 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
        return status;
 }
+int nfs4_init_session(struct nfs_server *server)
+{
+        struct nfs_client *clp = server->nfs_client;
+        int ret;
+        if (!nfs4_has_session(clp))
+                return 0;
+        clp->cl_session->fc_attrs.max_rqst_sz = server->wsize;
+        clp->cl_session->fc_attrs.max_resp_sz = server->rsize;
+        ret = nfs4_recover_expired_lease(server);
+        if (!ret)
+                ret = nfs4_check_client_ready(clp);
+        return ret;
+}
 /*
 * Renew the cl_session lease.
 */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index b73c5a728655..65ca8c18476f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -553,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
        INIT_LIST_HEAD(&lsp->ls_sequence.list);
        lsp->ls_seqid.sequence = &lsp->ls_sequence;
        atomic_set(&lsp->ls_count, 1);
+        lsp->ls_state = state;
        lsp->ls_owner = fl_owner;
        spin_lock(&clp->cl_lock);
        nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
@@ -587,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
                if (lsp != NULL)
                        break;
                if (new != NULL) {
-                        new->ls_state = state;
                        list_add(&new->ls_locks, &state->lock_states);
                        set_bit(LK_STATE_IN_USE, &state->flags);
                        lsp = new;
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
new file mode 100644
index 000000000000..72da095d4009
--- /dev/null
+++ b/fs/nilfs2/Kconfig
@@ -0,0 +1,25 @@
+config NILFS2_FS
+        tristate "NILFS2 file system support (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        select CRC32
+        help
+          NILFS2 is a log-structured file system (LFS) supporting continuous
+          snapshotting.  In addition to versioning capability of the entire
+          file system, users can even restore files mistakenly overwritten or
+          destroyed just a few seconds ago.  Since this file system can keep
+          consistency like conventional LFS, it achieves quick recovery after
+          system crashes.
+          NILFS2 creates a number of checkpoints every few seconds or per
+          synchronous write basis (unless there is no change).  Users can
+          select significant versions among continuously created checkpoints,
+          and can change them into snapshots which will be preserved for long
+          periods until they are changed back to checkpoints.  Each
+          snapshot is mountable as a read-only file system concurrently with
+          its writable mount, and this feature is convenient for online backup.
+          Some features including atime, extended attributes, and POSIX ACLs,
+          are not supported yet.
+          To compile this file system support as a module, choose M here: the
+          module will be called nilfs2.  If unsure, say N.
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 3d3ddb3f5177..2dfd47714ae5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -412,8 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
                return 0; /* Do not request flush for shadow page cache */
        if (!sb) {
                writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
-                if (!writer)
+                if (!writer) {
+                        nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
                        return -EROFS;
+                }
                sb = writer->s_super;
        }
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 8b5e4778cf28..51ff3d0a4ee2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1859,12 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err)
        if (!page)
                return;
-        if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page))
+        if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
                /*
                 * For b-tree node pages, this function may be called twice
                 * or more because they might be split in a segment.
                 */
+                if (PageDirty(page)) {
+                        /*
+                         * For pages holding split b-tree node buffers, dirty
+                         * flag on the buffers may be cleared discretely.
+                         * In that case, the page is once redirtied for
+                         * remaining buffers, and it must be cancelled if
+                         * all the buffers get cleaned later.
+                         */
+                        lock_page(page);
+                        if (nilfs_page_buffers_clean(page))
+                                __nilfs_clear_page_dirty(page);
+                        unlock_page(page);
+                }
                return;
+        }
        __nilfs_end_page_io(page, err);
 }
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 31dac7e3b0f1..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,15 +1,5 @@
 config FSNOTIFY
-        bool "Filesystem notification backend"
+        def_bool n
-        default y
-        ---help---
-           fsnotify is a backend for filesystem notification.  fsnotify does
-           not provide any userspace interface but does provide the basis
-           needed for other notification schemes such as dnotify, inotify,
-           and fanotify.
-           Say Y here to enable fsnotify suport.
-           If unsure, say Y.
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 904ff8d5405a..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,6 +1,6 @@
 config DNOTIFY
        bool "Dnotify support"
-        depends on FSNOTIFY
+        select FSNOTIFY
        default y
        help
          Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ec2f7bd76818..037e878e03fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
                        if (!group->ops->should_send_event(group, to_tell, mask))
                                continue;
                        if (!event) {
-                                event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
+                                event = fsnotify_create_event(to_tell, mask, data,
+                                                              data_is, file_name, cookie,
+                                                              GFP_KERNEL);
                                /* shit, we OOM'd and now we can't tell, maybe
                                 * someday someone else will want to do something
                                 * here */
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 5356884289a1..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,7 +15,7 @@ config INOTIFY
 config INOTIFY_USER
        bool "Inotify support for userspace"
-        depends on FSNOTIFY
+        select FSNOTIFY
        default y
        ---help---
          Say Y here to enable inotify support for userspace, including the
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff27a2965844..f30d9bbc2e1b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly;
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 struct kmem_cache *event_priv_cachep __read_mostly;
-static struct fsnotify_event *inotify_ignored_event;
 /*
 * When inotify registers a new group it increments this and uses that
@@ -365,6 +364,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
        return error;
 }
+static void inotify_remove_from_idr(struct fsnotify_group *group,
+                                    struct inotify_inode_mark_entry *ientry)
+{
+        struct idr *idr;
+        spin_lock(&group->inotify_data.idr_lock);
+        idr = &group->inotify_data.idr;
+        idr_remove(idr, ientry->wd);
+        spin_unlock(&group->inotify_data.idr_lock);
+        ientry->wd = -1;
+}
 /*
 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
 * internal reference help on the mark because it is in the idr.
@@ -373,13 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
                                    struct fsnotify_group *group)
 {
        struct inotify_inode_mark_entry *ientry;
+        struct fsnotify_event *ignored_event;
        struct inotify_event_private_data *event_priv;
        struct fsnotify_event_private_data *fsn_event_priv;
-        struct idr *idr;
+        ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
+                                              FSNOTIFY_EVENT_NONE, NULL, 0,
+                                              GFP_NOFS);
+        if (!ignored_event)
+                return;
        ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
-        event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+        event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
        if (unlikely(!event_priv))
                goto skip_send_ignore;
@@ -388,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
        fsn_event_priv->group = group;
        event_priv->wd = ientry->wd;
-        fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
+        fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
        /* did the private data get added? */
        if (list_empty(&fsn_event_priv->event_list))
@@ -396,14 +412,16 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 skip_send_ignore:
+        /* matches the reference taken when the event was created */
+        fsnotify_put_event(ignored_event);
        /* remove this entry from the idr */
-        spin_lock(&group->inotify_data.idr_lock);
+        inotify_remove_from_idr(group, ientry);
-        idr = &group->inotify_data.idr;
-        idr_remove(idr, ientry->wd);
-        spin_unlock(&group->inotify_data.idr_lock);
        /* removed from idr, drop that reference */
        fsnotify_put_mark(entry);
+        atomic_dec(&group->inotify_data.user->inotify_watches);
 }
 /* ding dong the mark is dead */
@@ -418,6 +436,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
        struct fsnotify_mark_entry *entry = NULL;
        struct inotify_inode_mark_entry *ientry;
+        struct inotify_inode_mark_entry *tmp_ientry;
        int ret = 0;
        int add = (arg & IN_MASK_ADD);
        __u32 mask;
@@ -428,54 +447,66 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
        if (unlikely(!mask))
                return -EINVAL;
-        ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+        tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-        if (unlikely(!ientry))
+        if (unlikely(!tmp_ientry))
                return -ENOMEM;
        /* we set the mask at the end after attaching it */
-        fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
+        fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
-        ientry->wd = 0;
+        tmp_ientry->wd = -1;
 find_entry:
        spin_lock(&inode->i_lock);
        entry = fsnotify_find_mark_entry(group, inode);
        spin_unlock(&inode->i_lock);
        if (entry) {
-                kmem_cache_free(inotify_inode_mark_cachep, ientry);
                ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
        } else {
-                if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
+                ret = -ENOSPC;
-                        ret = -ENOSPC;
+                if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
-                        goto out_err;
-                }
-                ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
-                if (ret == -EEXIST)
-                        goto find_entry;
-                else if (ret)
                        goto out_err;
-                entry = &ientry->fsn_entry;
 retry:
                ret = -ENOMEM;
                if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
                        goto out_err;
                spin_lock(&group->inotify_data.idr_lock);
-                /* if entry is added to the idr we keep the reference obtained
+                ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
-                 * through fsnotify_mark_add.  remember to drop this reference
+                                        group->inotify_data.last_wd,
-                 * when entry is removed from idr */
+                                        &tmp_ientry->wd);
-                ret = idr_get_new_above(&group->inotify_data.idr, entry,
-                                        ++group->inotify_data.last_wd,
-                                        &ientry->wd);
                spin_unlock(&group->inotify_data.idr_lock);
                if (ret) {
                        if (ret == -EAGAIN)
                                goto retry;
                        goto out_err;
                }
+                ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+                if (ret) {
+                        inotify_remove_from_idr(group, tmp_ientry);
+                        if (ret == -EEXIST)
+                                goto find_entry;
+                        goto out_err;
+                }
+                /* tmp_ientry has been added to the inode, so we are all set up.
+                 * now we just need to make sure tmp_ientry doesn't get freed and
+                 * we need to set up entry and ientry so the generic code can
+                 * do its thing. */
+                ientry = tmp_ientry;
+                entry = &ientry->fsn_entry;
+                tmp_ientry = NULL;
                atomic_inc(&group->inotify_data.user->inotify_watches);
+                /* update the idr hint */
+                group->inotify_data.last_wd = ientry->wd;
+                /* we put the mark on the idr, take a reference */
+                fsnotify_get_mark(entry);
        }
+        ret = ientry->wd;
        spin_lock(&entry->lock);
        old_mask = entry->mask;
@@ -506,14 +537,19 @@ retry:
                        fsnotify_recalc_group_mask(group);
        }
-        return ientry->wd;
+        /* this either matches fsnotify_find_mark_entry, or init_mark_entry
+         * depending on which path we took... */
+        fsnotify_put_mark(entry);
 out_err:
-        /* see this isn't supposed to happen, just kill the watch */
+        /* could be an error, could be that we found an existing mark */
-        if (entry) {
+        if (tmp_ientry) {
-                fsnotify_destroy_mark_by_entry(entry);
+                /* on the idr but didn't make it on the inode */
-                fsnotify_put_mark(entry);
+                if (tmp_ientry->wd != -1)
+                        inotify_remove_from_idr(group, tmp_ientry);
+                kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
        }
        return ret;
 }
@@ -721,9 +757,6 @@ static int __init inotify_user_setup(void)
        inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
        event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
-        inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
-        if (!inotify_ignored_event)
-                panic("unable to allocate the inotify ignored event\n");
        inotify_max_queued_events = 16384;
        inotify_max_user_instances = 128;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 959b73e756fd..521368574e97 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -136,18 +136,24 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 {
        if ((old->mask == new->mask) &&
            (old->to_tell == new->to_tell) &&
-            (old->data_type == new->data_type)) {
+            (old->data_type == new->data_type) &&
+            (old->name_len == new->name_len)) {
                switch (old->data_type) {
                case (FSNOTIFY_EVENT_INODE):
-                        if (old->inode == new->inode)
+                        /* remember, after old was put on the wait_q we aren't
+                         * allowed to look at the inode any more, only thing
+                         * left to check was if the file_name is the same */
+                        if (old->name_len &&
+                            !strcmp(old->file_name, new->file_name))
                                return true;
                        break;
                case (FSNOTIFY_EVENT_PATH):
                        if ((old->path.mnt == new->path.mnt) &&
                            (old->path.dentry == new->path.dentry))
                                return true;
+                        break;
                case (FSNOTIFY_EVENT_NONE):
-                        return true;
+                        return false;
                };
        }
        return false;
@@ -339,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event)
 * @name the filename, if available
 */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-                                             int data_type, const char *name, u32 cookie)
+                                             int data_type, const char *name, u32 cookie,
+                                             gfp_t gfp)
 {
        struct fsnotify_event *event;
-        event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+        event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
        if (!event)
                return NULL;
        initialize_event(event);
        if (name) {
-                event->file_name = kstrdup(name, GFP_KERNEL);
+                event->file_name = kstrdup(name, gfp);
                if (!event->file_name) {
                        kmem_cache_free(fsnotify_event_cachep, event);
                        return NULL;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 1a9c7878f864..ea4e6cb29e13 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -436,7 +436,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        rcu_assign_pointer(ptbl->part[partno], p);
        /* suppress uevent if the disk supresses it */
-        if (!dev_get_uevent_suppress(pdev))
+        if (!dev_get_uevent_suppress(ddev))
                kobject_uevent(&pdev->kobj, KOBJ_ADD);
        return p;
diff --git a/fs/pipe.c b/fs/pipe.c
index f7dd21ad85a6..52c415114838 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
        } else {
-                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
+                pipe_lock_nested(pipe2, I_MUTEX_PARENT);
-                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
+                pipe_lock_nested(pipe1, I_MUTEX_CHILD);
        }
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 70f36c043d62..38f7bd559f35 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2043,7 +2043,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
                invalidate_bdev(sb->s_bdev);
        }
        mutex_lock(&dqopt->dqonoff_mutex);
-        mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
        if (sb_has_quota_loaded(sb, type)) {
                error = -EBUSY;
                goto out_lock;
@@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
                 * possible) Also nobody should write to the file - we use
                 * special IO operations which ignore the immutable bit. */
                down_write(&dqopt->dqptr_sem);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
                oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
                                             S_NOQUOTA);
                inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+                mutex_unlock(&inode->i_mutex);
                up_write(&dqopt->dqptr_sem);
                sb->dq_op->drop(inode);
        }
@@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
                goto out_file_init;
        }
        mutex_unlock(&dqopt->dqio_mutex);
-        mutex_unlock(&inode->i_mutex);
        spin_lock(&dq_state_lock);
        dqopt->flags |= dquot_state_flag(flags, type);
        spin_unlock(&dq_state_lock);
@@ -2096,13 +2096,14 @@ out_file_init:
 out_lock:
        if (oldflags != -1) {
                down_write(&dqopt->dqptr_sem);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
                /* Set the flags back (in the case of accidental quotaon()
                 * on a wrong file we don't want to mess up the flags) */
                inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
                inode->i_flags |= oldflags;
+                mutex_unlock(&inode->i_mutex);
                up_write(&dqopt->dqptr_sem);
        }
-        mutex_unlock(&inode->i_mutex);
        mutex_unlock(&dqopt->dqonoff_mutex);
 out_fmt:
        put_quota_format(fmt);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ebb2c417912c..11f0c06316de 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -20,6 +20,7 @@
 #include <linux/ramfs.h>
 #include <linux/pagevec.h>
 #include <linux/mman.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d88d0fac9fa5..14f2d71ea3ce 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -939,8 +939,10 @@ again:
        /* Remove from old parent's list and insert into new parent's list. */
        sysfs_unlink_sibling(sd);
        sysfs_get(new_parent_sd);
+        drop_nlink(old_parent->d_inode);
        sysfs_put(sd->s_parent);
        sd->s_parent = new_parent_sd;
+        inc_nlink(new_parent->d_inode);
        sysfs_link_sibling(sd);
 out_unlock:
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6832135159b6..9d1b8c2e6c45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
        struct udf_inode_info *vati;
        uint32_t pos;
        struct virtualAllocationTable20 *vat20;
+        sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
        /* VAT file entry is in the last recorded block */
        ino.partitionReferenceNum = type1_index;
        ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
        sbi->s_vat_inode = udf_iget(sb, &ino);
+        if (!sbi->s_vat_inode &&
+            sbi->s_last_block != blocks - 1) {
+                printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
+                       " last recorded block (%lu), retrying with the last "
+                       "block of the device (%lu).\n",
+                       (unsigned long)sbi->s_last_block,
+                       (unsigned long)blocks - 1);
+                ino.partitionReferenceNum = type1_index;
+                ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
+                sbi->s_vat_inode = udf_iget(sb, &ino);
+        }
        if (!sbi->s_vat_inode)
                return 1;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7ec89fc05b2b..aecf2519db76 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1268,6 +1268,14 @@ xfs_vm_writepage(
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+        /*
+         *  VM calculation for nr_to_write seems off.  Bump it way
+         *  up, this gets simple streaming writes zippy again.
+         *  To be reviewed again after Jens' writeback changes.
+         */
+        wbc->nr_to_write *= 4;
        /*
         * Convert delayed allocate, unwritten or unmapped space
         * to real space and flush out to disk.
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 58973bb46038..8070b34cc287 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -680,8 +680,8 @@ xfs_vn_fiemap(
        else
                bm.bmv_length = BTOBB(length);
-        /* our formatter will tell xfs_getbmap when to stop. */
+        /* We add one because in getbmap world count includes the header */
-        bm.bmv_count = MAXEXTNUM;
+        bm.bmv_count = fieinfo->fi_extents_max + 1;
        bm.bmv_iflags = BMV_IF_PREALLOC;
        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
                bm.bmv_iflags |= BMV_IF_ATTRFORK;
author	James Morris <jmorris@namei.org>	2009-08-05 18:55:03 -0400
committer	James Morris <jmorris@namei.org>	2009-08-05 18:55:03 -0400
commit	012a5299a29672039f42944a37984558393ef769 (patch)
tree	de0815c67cf4156c32c8b552cd7448387cc391b0 /fs
parent	da34d4248bd2013ee64ce51e63ec0ebd1f32b46c (diff)
parent	90bc1a658a53f8832ee799685703977a450e5af9 (diff)