Diffstat (limited to 'fs')
138 files changed, 13321 insertions, 5773 deletions
| @@ -26,10 +26,9 @@ | |||
| 26 | #include <linux/mempool.h> | 26 | #include <linux/mempool.h> |
| 27 | #include <linux/workqueue.h> | 27 | #include <linux/workqueue.h> |
| 28 | #include <linux/blktrace_api.h> | 28 | #include <linux/blktrace_api.h> |
| 29 | #include <trace/block.h> | ||
| 30 | #include <scsi/sg.h> /* for struct sg_iovec */ | 29 | #include <scsi/sg.h> /* for struct sg_iovec */ |
| 31 | 30 | ||
| 32 | DEFINE_TRACE(block_split); | 31 | #include <trace/events/block.h> |
| 33 | 32 | ||
| 34 | /* | 33 | /* |
| 35 | * Test patch to inline a certain number of bi_io_vec's inside the bio | 34 | * Test patch to inline a certain number of bi_io_vec's inside the bio |
| @@ -499,11 +498,11 @@ int bio_get_nr_vecs(struct block_device *bdev) | |||
| 499 | struct request_queue *q = bdev_get_queue(bdev); | 498 | struct request_queue *q = bdev_get_queue(bdev); |
| 500 | int nr_pages; | 499 | int nr_pages; |
| 501 | 500 | ||
| 502 | nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; | 501 | nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 503 | if (nr_pages > q->max_phys_segments) | 502 | if (nr_pages > queue_max_phys_segments(q)) |
| 504 | nr_pages = q->max_phys_segments; | 503 | nr_pages = queue_max_phys_segments(q); |
| 505 | if (nr_pages > q->max_hw_segments) | 504 | if (nr_pages > queue_max_hw_segments(q)) |
| 506 | nr_pages = q->max_hw_segments; | 505 | nr_pages = queue_max_hw_segments(q); |
| 507 | 506 | ||
| 508 | return nr_pages; | 507 | return nr_pages; |
| 509 | } | 508 | } |
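For a sense of the numbers (an illustrative case, not taken from the patch): with queue_max_sectors(q) returning 1024 sectors (512 KiB) and 4 KiB pages, ((1024 << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT rounds up to 128 pages, which the two checks that follow then clamp to the physical and hardware segment limits.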
| @@ -562,8 +561,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page | |||
| 562 | * make this too complex. | 561 | * make this too complex. |
| 563 | */ | 562 | */ |
| 564 | 563 | ||
| 565 | while (bio->bi_phys_segments >= q->max_phys_segments | 564 | while (bio->bi_phys_segments >= queue_max_phys_segments(q) |
| 566 | || bio->bi_phys_segments >= q->max_hw_segments) { | 565 | || bio->bi_phys_segments >= queue_max_hw_segments(q)) { |
| 567 | 566 | ||
| 568 | if (retried_segments) | 567 | if (retried_segments) |
| 569 | return 0; | 568 | return 0; |
| @@ -634,7 +633,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page | |||
| 634 | int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, | 633 | int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, |
| 635 | unsigned int len, unsigned int offset) | 634 | unsigned int len, unsigned int offset) |
| 636 | { | 635 | { |
| 637 | return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); | 636 | return __bio_add_page(q, bio, page, len, offset, |
| 637 | queue_max_hw_sectors(q)); | ||
| 638 | } | 638 | } |
| 639 | 639 | ||
| 640 | /** | 640 | /** |
| @@ -654,7 +654,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, | |||
| 654 | unsigned int offset) | 654 | unsigned int offset) |
| 655 | { | 655 | { |
| 656 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | 656 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); |
| 657 | return __bio_add_page(q, bio, page, len, offset, q->max_sectors); | 657 | return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); |
| 658 | } | 658 | } |
| 659 | 659 | ||
| 660 | struct bio_map_data { | 660 | struct bio_map_data { |
| @@ -721,7 +721,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, | |||
| 721 | 721 | ||
| 722 | while (bv_len && iov_idx < iov_count) { | 722 | while (bv_len && iov_idx < iov_count) { |
| 723 | unsigned int bytes; | 723 | unsigned int bytes; |
| 724 | char *iov_addr; | 724 | char __user *iov_addr; |
| 725 | 725 | ||
| 726 | bytes = min_t(unsigned int, | 726 | bytes = min_t(unsigned int, |
| 727 | iov[iov_idx].iov_len - iov_off, bv_len); | 727 | iov[iov_idx].iov_len - iov_off, bv_len); |
| @@ -1201,7 +1201,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) | |||
| 1201 | char *addr = page_address(bvec->bv_page); | 1201 | char *addr = page_address(bvec->bv_page); |
| 1202 | int len = bmd->iovecs[i].bv_len; | 1202 | int len = bmd->iovecs[i].bv_len; |
| 1203 | 1203 | ||
| 1204 | if (read && !err) | 1204 | if (read) |
| 1205 | memcpy(p, addr, len); | 1205 | memcpy(p, addr, len); |
| 1206 | 1206 | ||
| 1207 | __free_page(bvec->bv_page); | 1207 | __free_page(bvec->bv_page); |
| @@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) | |||
| 1490 | sector_t bio_sector_offset(struct bio *bio, unsigned short index, | 1490 | sector_t bio_sector_offset(struct bio *bio, unsigned short index, |
| 1491 | unsigned int offset) | 1491 | unsigned int offset) |
| 1492 | { | 1492 | { |
| 1493 | unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); | 1493 | unsigned int sector_sz; |
| 1494 | struct bio_vec *bv; | 1494 | struct bio_vec *bv; |
| 1495 | sector_t sectors; | 1495 | sector_t sectors; |
| 1496 | int i; | 1496 | int i; |
| 1497 | 1497 | ||
| 1498 | sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); | ||
| 1498 | sectors = 0; | 1499 | sectors = 0; |
| 1499 | 1500 | ||
| 1500 | if (index >= bio->bi_idx) | 1501 | if (index >= bio->bi_idx) |
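The fs/bio.c changes above all follow one pattern: open-coded reads of the request-queue limits (q->max_sectors, q->max_phys_segments, q->max_hw_segments, and the old queue_hardsect_size()) become accessor calls. A minimal, self-contained sketch of that accessor pattern, assuming the limits are gathered into a queue_limits structure as the new helper names suggest (the field names and the 512-byte fallback are assumptions for illustration, not copied from the kernel headers):

```c
/* Sketch of the accessor pattern assumed by the converted call sites above. */
struct queue_limits {
	unsigned int	max_sectors;
	unsigned int	max_hw_sectors;
	unsigned short	max_phys_segments;
	unsigned short	max_hw_segments;
	unsigned short	logical_block_size;
};

struct request_queue {
	struct queue_limits limits;
	/* ... the real structure carries far more state ... */
};

static inline unsigned int queue_max_sectors(struct request_queue *q)
{
	return q->limits.max_sectors;
}

static inline unsigned short queue_max_phys_segments(struct request_queue *q)
{
	return q->limits.max_phys_segments;
}

static inline unsigned short queue_max_hw_segments(struct request_queue *q)
{
	return q->limits.max_hw_segments;
}

static inline unsigned short queue_logical_block_size(struct request_queue *q)
{
	/* Assumed fallback: a device that never set a size behaves as 512-byte. */
	return q->limits.logical_block_size ? q->limits.logical_block_size : 512;
}
```

Routing every reader through helpers like these is what allows the limits to live in one shared structure without touching each filesystem call site again, which appears to be the point of this sweep.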
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc18dd17..931f6b8c4b2f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/uio.h> | 25 | #include <linux/uio.h> |
| 26 | #include <linux/namei.h> | 26 | #include <linux/namei.h> |
| 27 | #include <linux/log2.h> | 27 | #include <linux/log2.h> |
| 28 | #include <linux/kmemleak.h> | ||
| 28 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
| 29 | #include "internal.h" | 30 | #include "internal.h" |
| 30 | 31 | ||
| @@ -76,7 +77,7 @@ int set_blocksize(struct block_device *bdev, int size) | |||
| 76 | return -EINVAL; | 77 | return -EINVAL; |
| 77 | 78 | ||
| 78 | /* Size cannot be smaller than the size supported by the device */ | 79 | /* Size cannot be smaller than the size supported by the device */ |
| 79 | if (size < bdev_hardsect_size(bdev)) | 80 | if (size < bdev_logical_block_size(bdev)) |
| 80 | return -EINVAL; | 81 | return -EINVAL; |
| 81 | 82 | ||
| 82 | /* Don't change the size if it is same as current */ | 83 | /* Don't change the size if it is same as current */ |
| @@ -106,7 +107,7 @@ EXPORT_SYMBOL(sb_set_blocksize); | |||
| 106 | 107 | ||
| 107 | int sb_min_blocksize(struct super_block *sb, int size) | 108 | int sb_min_blocksize(struct super_block *sb, int size) |
| 108 | { | 109 | { |
| 109 | int minsize = bdev_hardsect_size(sb->s_bdev); | 110 | int minsize = bdev_logical_block_size(sb->s_bdev); |
| 110 | if (size < minsize) | 111 | if (size < minsize) |
| 111 | size = minsize; | 112 | size = minsize; |
| 112 | return sb_set_blocksize(sb, size); | 113 | return sb_set_blocksize(sb, size); |
| @@ -492,6 +493,11 @@ void __init bdev_cache_init(void) | |||
| 492 | bd_mnt = kern_mount(&bd_type); | 493 | bd_mnt = kern_mount(&bd_type); |
| 493 | if (IS_ERR(bd_mnt)) | 494 | if (IS_ERR(bd_mnt)) |
| 494 | panic("Cannot create bdev pseudo-fs"); | 495 | panic("Cannot create bdev pseudo-fs"); |
| 496 | /* | ||
| 497 | * This vfsmount structure is only used to obtain the | ||
| 498 | * blockdev_superblock, so tell kmemleak not to report it. | ||
| 499 | */ | ||
| 500 | kmemleak_not_leak(bd_mnt); | ||
| 495 | blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ | 501 | blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ |
| 496 | } | 502 | } |
| 497 | 503 | ||
| @@ -1111,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change); | |||
| 1111 | 1117 | ||
| 1112 | void bd_set_size(struct block_device *bdev, loff_t size) | 1118 | void bd_set_size(struct block_device *bdev, loff_t size) |
| 1113 | { | 1119 | { |
| 1114 | unsigned bsize = bdev_hardsect_size(bdev); | 1120 | unsigned bsize = bdev_logical_block_size(bdev); |
| 1115 | 1121 | ||
| 1116 | bdev->bd_inode->i_size = size; | 1122 | bdev->bd_inode->i_size = size; |
| 1117 | while (bsize < PAGE_CACHE_SIZE) { | 1123 | while (bsize < PAGE_CACHE_SIZE) { |
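Similarly, fs/block_dev.c swaps bdev_hardsect_size() for bdev_logical_block_size() at each call site; the question being asked ("what is the smallest addressable unit of this device?") does not change, only the name, and set_blocksize() still rejects any software block size below it. Continuing the sketch above, the block_device-level helper plausibly just forwards to the queue-level accessor (the bd_queue field below is a simplification for the sketch, not the real layout):

```c
/* Continues the sketch above; structure layout simplified for illustration. */
struct block_device {
	struct request_queue *bd_queue;
};

static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
{
	return bdev->bd_queue;
}

static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
{
	return queue_logical_block_size(bdev_get_queue(bdev));
}
```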
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 94212844a9bc..a35eb36b32fd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
| @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ | |||
| 6 | transaction.o inode.o file.o tree-defrag.o \ | 6 | transaction.o inode.o file.o tree-defrag.o \ |
| 7 | extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ | 7 | extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ |
| 8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ | 8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ |
| 9 | ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ | 9 | export.o tree-log.o acl.o free-space-cache.o zlib.o \ |
| 10 | compression.o delayed-ref.o | 10 | compression.o delayed-ref.o relocation.o |
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index cbba000dccbe..603972576f0f 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
| @@ -351,9 +351,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) | |||
| 351 | return 0; | 351 | return 0; |
| 352 | } | 352 | } |
| 353 | 353 | ||
| 354 | int btrfs_check_acl(struct inode *inode, int mask) | ||
| 355 | { | ||
| 356 | return 0; | ||
| 357 | } | ||
| 358 | |||
| 359 | #endif /* CONFIG_FS_POSIX_ACL */ | 354 | #endif /* CONFIG_FS_POSIX_ACL */ |
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 502c3d61de62..7f88628a1a72 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
| @@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) | |||
| 294 | INIT_LIST_HEAD(&worker->worker_list); | 294 | INIT_LIST_HEAD(&worker->worker_list); |
| 295 | spin_lock_init(&worker->lock); | 295 | spin_lock_init(&worker->lock); |
| 296 | atomic_set(&worker->num_pending, 0); | 296 | atomic_set(&worker->num_pending, 0); |
| 297 | worker->workers = workers; | ||
| 297 | worker->task = kthread_run(worker_loop, worker, | 298 | worker->task = kthread_run(worker_loop, worker, |
| 298 | "btrfs-%s-%d", workers->name, | 299 | "btrfs-%s-%d", workers->name, |
| 299 | workers->num_workers + i); | 300 | workers->num_workers + i); |
| 300 | worker->workers = workers; | ||
| 301 | if (IS_ERR(worker->task)) { | 301 | if (IS_ERR(worker->task)) { |
| 302 | kfree(worker); | 302 | kfree(worker); |
| 303 | ret = PTR_ERR(worker->task); | 303 | ret = PTR_ERR(worker->task); |
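The async-thread.c hunk is an ordering fix rather than a behaviour change: worker->workers is now assigned before kthread_run() creates the thread, because the new thread may start running worker_loop() immediately and read that field before the old code got around to setting it. A generic, self-contained illustration of the rule (the names are invented for the example; this is not the btrfs code):

```c
#include <linux/kthread.h>
#include <linux/err.h>

/*
 * Illustrative only. The point: the thread created by kthread_run() can
 * run right away, so every field it reads must be initialised before the
 * call, exactly as the hunk above now does for worker->workers.
 */
struct demo_worker {
	struct task_struct *task;
	void *shared;			/* read by the thread function */
};

static int demo_thread_fn(void *arg)
{
	struct demo_worker *w = arg;

	/* Safe only because ->shared was assigned before kthread_run(). */
	(void)w->shared;
	return 0;
}

static int demo_start_worker(struct demo_worker *w, void *shared)
{
	w->shared = shared;		/* initialise before the thread exists */
	w->task = kthread_run(demo_thread_fn, w, "demo-worker");
	if (IS_ERR(w->task))
		return PTR_ERR(w->task);
	return 0;
}
```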
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b30986f00b9d..acb4f3517582 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
| @@ -72,6 +72,9 @@ struct btrfs_inode { | |||
| 72 | */ | 72 | */ |
| 73 | struct list_head ordered_operations; | 73 | struct list_head ordered_operations; |
| 74 | 74 | ||
| 75 | /* node for the red-black tree that links inodes in subvolume root */ | ||
| 76 | struct rb_node rb_node; | ||
| 77 | |||
| 75 | /* the space_info for where this inode's data allocations are done */ | 78 | /* the space_info for where this inode's data allocations are done */ |
| 76 | struct btrfs_space_info *space_info; | 79 | struct btrfs_space_info *space_info; |
| 77 | 80 | ||
| @@ -154,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) | |||
| 154 | BTRFS_I(inode)->disk_i_size = size; | 157 | BTRFS_I(inode)->disk_i_size = size; |
| 155 | } | 158 | } |
| 156 | 159 | ||
| 157 | |||
| 158 | #endif | 160 | #endif |
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ab07627084f1..de1e2fd32080 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
| @@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, | |||
| 123 | u32 csum; | 123 | u32 csum; |
| 124 | u32 *cb_sum = &cb->sums; | 124 | u32 *cb_sum = &cb->sums; |
| 125 | 125 | ||
| 126 | if (btrfs_test_flag(inode, NODATASUM)) | 126 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) |
| 127 | return 0; | 127 | return 0; |
| 128 | 128 | ||
| 129 | for (i = 0; i < cb->nr_pages; i++) { | 129 | for (i = 0; i < cb->nr_pages; i++) { |
| @@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
| 670 | */ | 670 | */ |
| 671 | atomic_inc(&cb->pending_bios); | 671 | atomic_inc(&cb->pending_bios); |
| 672 | 672 | ||
| 673 | if (!btrfs_test_flag(inode, NODATASUM)) { | 673 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { |
| 674 | btrfs_lookup_bio_sums(root, inode, comp_bio, | 674 | btrfs_lookup_bio_sums(root, inode, comp_bio, |
| 675 | sums); | 675 | sums); |
| 676 | } | 676 | } |
| @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
| 697 | ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); | 697 | ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); |
| 698 | BUG_ON(ret); | 698 | BUG_ON(ret); |
| 699 | 699 | ||
| 700 | if (!btrfs_test_flag(inode, NODATASUM)) | 700 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) |
| 701 | btrfs_lookup_bio_sums(root, inode, comp_bio, sums); | 701 | btrfs_lookup_bio_sums(root, inode, comp_bio, sums); |
| 702 | 702 | ||
| 703 | ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); | 703 | ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); |
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
deleted file mode 100644
index 6e1b3de36700..000000000000
--- a/fs/btrfs/crc32c.h
+++ /dev/null
| @@ -1,29 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public | ||
| 6 | * License v2 as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, | ||
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | * | ||
| 13 | * You should have received a copy of the GNU General Public | ||
| 14 | * License along with this program; if not, write to the | ||
| 15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 16 | * Boston, MA 021110-1307, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #ifndef __BTRFS_CRC32C__ | ||
| 20 | #define __BTRFS_CRC32C__ | ||
| 21 | #include <linux/crc32c.h> | ||
| 22 | |||
| 23 | /* | ||
| 24 | * this file used to do more for selecting the HW version of crc32c, | ||
| 25 | * perhaps it will one day again soon. | ||
| 26 | */ | ||
| 27 | #define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) | ||
| 28 | #endif | ||
| 29 | |||
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fedf8b9f03a2..60a45f3a4e91 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
| @@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
| 197 | u32 nritems; | 197 | u32 nritems; |
| 198 | int ret = 0; | 198 | int ret = 0; |
| 199 | int level; | 199 | int level; |
| 200 | struct btrfs_root *new_root; | 200 | struct btrfs_disk_key disk_key; |
| 201 | |||
| 202 | new_root = kmalloc(sizeof(*new_root), GFP_NOFS); | ||
| 203 | if (!new_root) | ||
| 204 | return -ENOMEM; | ||
| 205 | |||
| 206 | memcpy(new_root, root, sizeof(*new_root)); | ||
| 207 | new_root->root_key.objectid = new_root_objectid; | ||
| 208 | 201 | ||
| 209 | WARN_ON(root->ref_cows && trans->transid != | 202 | WARN_ON(root->ref_cows && trans->transid != |
| 210 | root->fs_info->running_transaction->transid); | 203 | root->fs_info->running_transaction->transid); |
| @@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
| 212 | 205 | ||
| 213 | level = btrfs_header_level(buf); | 206 | level = btrfs_header_level(buf); |
| 214 | nritems = btrfs_header_nritems(buf); | 207 | nritems = btrfs_header_nritems(buf); |
| 208 | if (level == 0) | ||
| 209 | btrfs_item_key(buf, &disk_key, 0); | ||
| 210 | else | ||
| 211 | btrfs_node_key(buf, &disk_key, 0); | ||
| 215 | 212 | ||
| 216 | cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, | 213 | cow = btrfs_alloc_free_block(trans, root, buf->len, 0, |
| 217 | new_root_objectid, trans->transid, | 214 | new_root_objectid, &disk_key, level, |
| 218 | level, buf->start, 0); | 215 | buf->start, 0); |
| 219 | if (IS_ERR(cow)) { | 216 | if (IS_ERR(cow)) |
| 220 | kfree(new_root); | ||
| 221 | return PTR_ERR(cow); | 217 | return PTR_ERR(cow); |
| 222 | } | ||
| 223 | 218 | ||
| 224 | copy_extent_buffer(cow, buf, 0, 0, cow->len); | 219 | copy_extent_buffer(cow, buf, 0, 0, cow->len); |
| 225 | btrfs_set_header_bytenr(cow, cow->start); | 220 | btrfs_set_header_bytenr(cow, cow->start); |
| 226 | btrfs_set_header_generation(cow, trans->transid); | 221 | btrfs_set_header_generation(cow, trans->transid); |
| 227 | btrfs_set_header_owner(cow, new_root_objectid); | 222 | btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); |
| 228 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); | 223 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | |
| 224 | BTRFS_HEADER_FLAG_RELOC); | ||
| 225 | if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
| 226 | btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); | ||
| 227 | else | ||
| 228 | btrfs_set_header_owner(cow, new_root_objectid); | ||
| 229 | 229 | ||
| 230 | write_extent_buffer(cow, root->fs_info->fsid, | 230 | write_extent_buffer(cow, root->fs_info->fsid, |
| 231 | (unsigned long)btrfs_header_fsid(cow), | 231 | (unsigned long)btrfs_header_fsid(cow), |
| 232 | BTRFS_FSID_SIZE); | 232 | BTRFS_FSID_SIZE); |
| 233 | 233 | ||
| 234 | WARN_ON(btrfs_header_generation(buf) > trans->transid); | 234 | WARN_ON(btrfs_header_generation(buf) > trans->transid); |
| 235 | ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); | 235 | if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) |
| 236 | kfree(new_root); | 236 | ret = btrfs_inc_ref(trans, root, cow, 1); |
| 237 | else | ||
| 238 | ret = btrfs_inc_ref(trans, root, cow, 0); | ||
| 237 | 239 | ||
| 238 | if (ret) | 240 | if (ret) |
| 239 | return ret; | 241 | return ret; |
| @@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
| 244 | } | 246 | } |
| 245 | 247 | ||
| 246 | /* | 248 | /* |
| 249 | * check if the tree block can be shared by multiple trees | ||
| 250 | */ | ||
| 251 | int btrfs_block_can_be_shared(struct btrfs_root *root, | ||
| 252 | struct extent_buffer *buf) | ||
| 253 | { | ||
| 254 | /* | ||
| 255 | * Tree blocks not in refernece counted trees and tree roots | ||
| 256 | * are never shared. If a block was allocated after the last | ||
| 257 | * snapshot and the block was not allocated by tree relocation, | ||
| 258 | * we know the block is not shared. | ||
| 259 | */ | ||
| 260 | if (root->ref_cows && | ||
| 261 | buf != root->node && buf != root->commit_root && | ||
| 262 | (btrfs_header_generation(buf) <= | ||
| 263 | btrfs_root_last_snapshot(&root->root_item) || | ||
| 264 | btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) | ||
| 265 | return 1; | ||
| 266 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 267 | if (root->ref_cows && | ||
| 268 | btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) | ||
| 269 | return 1; | ||
| 270 | #endif | ||
| 271 | return 0; | ||
| 272 | } | ||
| 273 | |||
| 274 | static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, | ||
| 275 | struct btrfs_root *root, | ||
| 276 | struct extent_buffer *buf, | ||
| 277 | struct extent_buffer *cow) | ||
| 278 | { | ||
| 279 | u64 refs; | ||
| 280 | u64 owner; | ||
| 281 | u64 flags; | ||
| 282 | u64 new_flags = 0; | ||
| 283 | int ret; | ||
| 284 | |||
| 285 | /* | ||
| 286 | * Backrefs update rules: | ||
| 287 | * | ||
| 288 | * Always use full backrefs for extent pointers in tree block | ||
| 289 | * allocated by tree relocation. | ||
| 290 | * | ||
| 291 | * If a shared tree block is no longer referenced by its owner | ||
| 292 | * tree (btrfs_header_owner(buf) == root->root_key.objectid), | ||
| 293 | * use full backrefs for extent pointers in tree block. | ||
| 294 | * | ||
| 295 | * If a tree block is been relocating | ||
| 296 | * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), | ||
| 297 | * use full backrefs for extent pointers in tree block. | ||
| 298 | * The reason for this is some operations (such as drop tree) | ||
| 299 | * are only allowed for blocks use full backrefs. | ||
| 300 | */ | ||
| 301 | |||
| 302 | if (btrfs_block_can_be_shared(root, buf)) { | ||
| 303 | ret = btrfs_lookup_extent_info(trans, root, buf->start, | ||
| 304 | buf->len, &refs, &flags); | ||
| 305 | BUG_ON(ret); | ||
| 306 | BUG_ON(refs == 0); | ||
| 307 | } else { | ||
| 308 | refs = 1; | ||
| 309 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || | ||
| 310 | btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) | ||
| 311 | flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
| 312 | else | ||
| 313 | flags = 0; | ||
| 314 | } | ||
| 315 | |||
| 316 | owner = btrfs_header_owner(buf); | ||
| 317 | BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && | ||
| 318 | !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
| 319 | |||
| 320 | if (refs > 1) { | ||
| 321 | if ((owner == root->root_key.objectid || | ||
| 322 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && | ||
| 323 | !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { | ||
| 324 | ret = btrfs_inc_ref(trans, root, buf, 1); | ||
| 325 | BUG_ON(ret); | ||
| 326 | |||
| 327 | if (root->root_key.objectid == | ||
| 328 | BTRFS_TREE_RELOC_OBJECTID) { | ||
| 329 | ret = btrfs_dec_ref(trans, root, buf, 0); | ||
| 330 | BUG_ON(ret); | ||
| 331 | ret = btrfs_inc_ref(trans, root, cow, 1); | ||
| 332 | BUG_ON(ret); | ||
| 333 | } | ||
| 334 | new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
| 335 | } else { | ||
| 336 | |||
| 337 | if (root->root_key.objectid == | ||
| 338 | BTRFS_TREE_RELOC_OBJECTID) | ||
| 339 | ret = btrfs_inc_ref(trans, root, cow, 1); | ||
| 340 | else | ||
| 341 | ret = btrfs_inc_ref(trans, root, cow, 0); | ||
| 342 | BUG_ON(ret); | ||
| 343 | } | ||
| 344 | if (new_flags != 0) { | ||
| 345 | ret = btrfs_set_disk_extent_flags(trans, root, | ||
| 346 | buf->start, | ||
| 347 | buf->len, | ||
| 348 | new_flags, 0); | ||
| 349 | BUG_ON(ret); | ||
| 350 | } | ||
| 351 | } else { | ||
| 352 | if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { | ||
| 353 | if (root->root_key.objectid == | ||
| 354 | BTRFS_TREE_RELOC_OBJECTID) | ||
| 355 | ret = btrfs_inc_ref(trans, root, cow, 1); | ||
| 356 | else | ||
| 357 | ret = btrfs_inc_ref(trans, root, cow, 0); | ||
| 358 | BUG_ON(ret); | ||
| 359 | ret = btrfs_dec_ref(trans, root, buf, 1); | ||
| 360 | BUG_ON(ret); | ||
| 361 | } | ||
| 362 | clean_tree_block(trans, root, buf); | ||
| 363 | } | ||
| 364 | return 0; | ||
| 365 | } | ||
| 366 | |||
| 367 | /* | ||
| 247 | * does the dirty work in cow of a single block. The parent block (if | 368 | * does the dirty work in cow of a single block. The parent block (if |
| 248 | * supplied) is updated to point to the new cow copy. The new buffer is marked | 369 | * supplied) is updated to point to the new cow copy. The new buffer is marked |
| 249 | * dirty and returned locked. If you modify the block it needs to be marked | 370 | * dirty and returned locked. If you modify the block it needs to be marked |
| @@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
| 262 | struct extent_buffer **cow_ret, | 383 | struct extent_buffer **cow_ret, |
| 263 | u64 search_start, u64 empty_size) | 384 | u64 search_start, u64 empty_size) |
| 264 | { | 385 | { |
| 265 | u64 parent_start; | 386 | struct btrfs_disk_key disk_key; |
| 266 | struct extent_buffer *cow; | 387 | struct extent_buffer *cow; |
| 267 | u32 nritems; | ||
| 268 | int ret = 0; | ||
| 269 | int level; | 388 | int level; |
| 270 | int unlock_orig = 0; | 389 | int unlock_orig = 0; |
| 390 | u64 parent_start; | ||
| 271 | 391 | ||
| 272 | if (*cow_ret == buf) | 392 | if (*cow_ret == buf) |
| 273 | unlock_orig = 1; | 393 | unlock_orig = 1; |
| 274 | 394 | ||
| 275 | btrfs_assert_tree_locked(buf); | 395 | btrfs_assert_tree_locked(buf); |
| 276 | 396 | ||
| 277 | if (parent) | ||
| 278 | parent_start = parent->start; | ||
| 279 | else | ||
| 280 | parent_start = 0; | ||
| 281 | |||
| 282 | WARN_ON(root->ref_cows && trans->transid != | 397 | WARN_ON(root->ref_cows && trans->transid != |
| 283 | root->fs_info->running_transaction->transid); | 398 | root->fs_info->running_transaction->transid); |
| 284 | WARN_ON(root->ref_cows && trans->transid != root->last_trans); | 399 | WARN_ON(root->ref_cows && trans->transid != root->last_trans); |
| 285 | 400 | ||
| 286 | level = btrfs_header_level(buf); | 401 | level = btrfs_header_level(buf); |
| 287 | nritems = btrfs_header_nritems(buf); | ||
| 288 | 402 | ||
| 289 | cow = btrfs_alloc_free_block(trans, root, buf->len, | 403 | if (level == 0) |
| 290 | parent_start, root->root_key.objectid, | 404 | btrfs_item_key(buf, &disk_key, 0); |
| 291 | trans->transid, level, | 405 | else |
| 292 | search_start, empty_size); | 406 | btrfs_node_key(buf, &disk_key, 0); |
| 407 | |||
| 408 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
| 409 | if (parent) | ||
| 410 | parent_start = parent->start; | ||
| 411 | else | ||
| 412 | parent_start = 0; | ||
| 413 | } else | ||
| 414 | parent_start = 0; | ||
| 415 | |||
| 416 | cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, | ||
| 417 | root->root_key.objectid, &disk_key, | ||
| 418 | level, search_start, empty_size); | ||
| 293 | if (IS_ERR(cow)) | 419 | if (IS_ERR(cow)) |
| 294 | return PTR_ERR(cow); | 420 | return PTR_ERR(cow); |
| 295 | 421 | ||
| @@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
| 298 | copy_extent_buffer(cow, buf, 0, 0, cow->len); | 424 | copy_extent_buffer(cow, buf, 0, 0, cow->len); |
| 299 | btrfs_set_header_bytenr(cow, cow->start); | 425 | btrfs_set_header_bytenr(cow, cow->start); |
| 300 | btrfs_set_header_generation(cow, trans->transid); | 426 | btrfs_set_header_generation(cow, trans->transid); |
| 301 | btrfs_set_header_owner(cow, root->root_key.objectid); | 427 | btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); |
| 302 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); | 428 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | |
| 429 | BTRFS_HEADER_FLAG_RELOC); | ||
| 430 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
| 431 | btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); | ||
| 432 | else | ||
| 433 | btrfs_set_header_owner(cow, root->root_key.objectid); | ||
| 303 | 434 | ||
| 304 | write_extent_buffer(cow, root->fs_info->fsid, | 435 | write_extent_buffer(cow, root->fs_info->fsid, |
| 305 | (unsigned long)btrfs_header_fsid(cow), | 436 | (unsigned long)btrfs_header_fsid(cow), |
| 306 | BTRFS_FSID_SIZE); | 437 | BTRFS_FSID_SIZE); |
| 307 | 438 | ||
| 308 | WARN_ON(btrfs_header_generation(buf) > trans->transid); | 439 | update_ref_for_cow(trans, root, buf, cow); |
| 309 | if (btrfs_header_generation(buf) != trans->transid) { | ||
| 310 | u32 nr_extents; | ||
| 311 | ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); | ||
| 312 | if (ret) | ||
| 313 | return ret; | ||
| 314 | |||
| 315 | ret = btrfs_cache_ref(trans, root, buf, nr_extents); | ||
| 316 | WARN_ON(ret); | ||
| 317 | } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { | ||
| 318 | /* | ||
| 319 | * There are only two places that can drop reference to | ||
| 320 | * tree blocks owned by living reloc trees, one is here, | ||
| 321 | * the other place is btrfs_drop_subtree. In both places, | ||
| 322 | * we check reference count while tree block is locked. | ||
| 323 | * Furthermore, if reference count is one, it won't get | ||
| 324 | * increased by someone else. | ||
| 325 | */ | ||
| 326 | u32 refs; | ||
| 327 | ret = btrfs_lookup_extent_ref(trans, root, buf->start, | ||
| 328 | buf->len, &refs); | ||
| 329 | BUG_ON(ret); | ||
| 330 | if (refs == 1) { | ||
| 331 | ret = btrfs_update_ref(trans, root, buf, cow, | ||
| 332 | 0, nritems); | ||
| 333 | clean_tree_block(trans, root, buf); | ||
| 334 | } else { | ||
| 335 | ret = btrfs_inc_ref(trans, root, buf, cow, NULL); | ||
| 336 | } | ||
| 337 | BUG_ON(ret); | ||
| 338 | } else { | ||
| 339 | ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); | ||
| 340 | if (ret) | ||
| 341 | return ret; | ||
| 342 | clean_tree_block(trans, root, buf); | ||
| 343 | } | ||
| 344 | |||
| 345 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
| 346 | ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); | ||
| 347 | WARN_ON(ret); | ||
| 348 | } | ||
| 349 | 440 | ||
| 350 | if (buf == root->node) { | 441 | if (buf == root->node) { |
| 351 | WARN_ON(parent && parent != buf); | 442 | WARN_ON(parent && parent != buf); |
| 443 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || | ||
| 444 | btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) | ||
| 445 | parent_start = buf->start; | ||
| 446 | else | ||
| 447 | parent_start = 0; | ||
| 352 | 448 | ||
| 353 | spin_lock(&root->node_lock); | 449 | spin_lock(&root->node_lock); |
| 354 | root->node = cow; | 450 | root->node = cow; |
| 355 | extent_buffer_get(cow); | 451 | extent_buffer_get(cow); |
| 356 | spin_unlock(&root->node_lock); | 452 | spin_unlock(&root->node_lock); |
| 357 | 453 | ||
| 358 | if (buf != root->commit_root) { | 454 | btrfs_free_extent(trans, root, buf->start, buf->len, |
| 359 | btrfs_free_extent(trans, root, buf->start, | 455 | parent_start, root->root_key.objectid, |
| 360 | buf->len, buf->start, | 456 | level, 0); |
| 361 | root->root_key.objectid, | ||
| 362 | btrfs_header_generation(buf), | ||
| 363 | level, 1); | ||
| 364 | } | ||
| 365 | free_extent_buffer(buf); | 457 | free_extent_buffer(buf); |
| 366 | add_root_to_dirty_list(root); | 458 | add_root_to_dirty_list(root); |
| 367 | } else { | 459 | } else { |
| 460 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
| 461 | parent_start = parent->start; | ||
| 462 | else | ||
| 463 | parent_start = 0; | ||
| 464 | |||
| 465 | WARN_ON(trans->transid != btrfs_header_generation(parent)); | ||
| 368 | btrfs_set_node_blockptr(parent, parent_slot, | 466 | btrfs_set_node_blockptr(parent, parent_slot, |
| 369 | cow->start); | 467 | cow->start); |
| 370 | WARN_ON(trans->transid == 0); | ||
| 371 | btrfs_set_node_ptr_generation(parent, parent_slot, | 468 | btrfs_set_node_ptr_generation(parent, parent_slot, |
| 372 | trans->transid); | 469 | trans->transid); |
| 373 | btrfs_mark_buffer_dirty(parent); | 470 | btrfs_mark_buffer_dirty(parent); |
| 374 | WARN_ON(btrfs_header_generation(parent) != trans->transid); | ||
| 375 | btrfs_free_extent(trans, root, buf->start, buf->len, | 471 | btrfs_free_extent(trans, root, buf->start, buf->len, |
| 376 | parent_start, btrfs_header_owner(parent), | 472 | parent_start, root->root_key.objectid, |
| 377 | btrfs_header_generation(parent), level, 1); | 473 | level, 0); |
| 378 | } | 474 | } |
| 379 | if (unlock_orig) | 475 | if (unlock_orig) |
| 380 | btrfs_tree_unlock(buf); | 476 | btrfs_tree_unlock(buf); |
| @@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
| 384 | return 0; | 480 | return 0; |
| 385 | } | 481 | } |
| 386 | 482 | ||
| 483 | static inline int should_cow_block(struct btrfs_trans_handle *trans, | ||
| 484 | struct btrfs_root *root, | ||
| 485 | struct extent_buffer *buf) | ||
| 486 | { | ||
| 487 | if (btrfs_header_generation(buf) == trans->transid && | ||
| 488 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && | ||
| 489 | !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && | ||
| 490 | btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) | ||
| 491 | return 0; | ||
| 492 | return 1; | ||
| 493 | } | ||
| 494 | |||
| 387 | /* | 495 | /* |
| 388 | * cows a single block, see __btrfs_cow_block for the real work. | 496 | * cows a single block, see __btrfs_cow_block for the real work. |
| 389 | * This version of it has extra checks so that a block isn't cow'd more than | 497 | * This version of it has extra checks so that a block isn't cow'd more than |
| @@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
| 411 | WARN_ON(1); | 519 | WARN_ON(1); |
| 412 | } | 520 | } |
| 413 | 521 | ||
| 414 | if (btrfs_header_generation(buf) == trans->transid && | 522 | if (!should_cow_block(trans, root, buf)) { |
| 415 | btrfs_header_owner(buf) == root->root_key.objectid && | ||
| 416 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { | ||
| 417 | *cow_ret = buf; | 523 | *cow_ret = buf; |
| 418 | return 0; | 524 | return 0; |
| 419 | } | 525 | } |
| @@ -469,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) | |||
| 469 | /* | 575 | /* |
| 470 | * same as comp_keys only with two btrfs_key's | 576 | * same as comp_keys only with two btrfs_key's |
| 471 | */ | 577 | */ |
| 472 | static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) | 578 | int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) |
| 473 | { | 579 | { |
| 474 | if (k1->objectid > k2->objectid) | 580 | if (k1->objectid > k2->objectid) |
| 475 | return 1; | 581 | return 1; |
| @@ -845,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, | |||
| 845 | return -1; | 951 | return -1; |
| 846 | } | 952 | } |
| 847 | 953 | ||
| 954 | int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, | ||
| 955 | int level, int *slot) | ||
| 956 | { | ||
| 957 | return bin_search(eb, key, level, slot); | ||
| 958 | } | ||
| 959 | |||
| 848 | /* given a node and slot number, this reads the blocks it points to. The | 960 | /* given a node and slot number, this reads the blocks it points to. The |
| 849 | * extent buffer is returned with a reference taken (but unlocked). | 961 | * extent buffer is returned with a reference taken (but unlocked). |
| 850 | * NULL is returned on error. | 962 | * NULL is returned on error. |
| @@ -921,13 +1033,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
| 921 | root->node = child; | 1033 | root->node = child; |
| 922 | spin_unlock(&root->node_lock); | 1034 | spin_unlock(&root->node_lock); |
| 923 | 1035 | ||
| 924 | ret = btrfs_update_extent_ref(trans, root, child->start, | ||
| 925 | child->len, | ||
| 926 | mid->start, child->start, | ||
| 927 | root->root_key.objectid, | ||
| 928 | trans->transid, level - 1); | ||
| 929 | BUG_ON(ret); | ||
| 930 | |||
| 931 | add_root_to_dirty_list(root); | 1036 | add_root_to_dirty_list(root); |
| 932 | btrfs_tree_unlock(child); | 1037 | btrfs_tree_unlock(child); |
| 933 | 1038 | ||
| @@ -938,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
| 938 | /* once for the path */ | 1043 | /* once for the path */ |
| 939 | free_extent_buffer(mid); | 1044 | free_extent_buffer(mid); |
| 940 | ret = btrfs_free_extent(trans, root, mid->start, mid->len, | 1045 | ret = btrfs_free_extent(trans, root, mid->start, mid->len, |
| 941 | mid->start, root->root_key.objectid, | 1046 | 0, root->root_key.objectid, level, 1); |
| 942 | btrfs_header_generation(mid), | ||
| 943 | level, 1); | ||
| 944 | /* once for the root ptr */ | 1047 | /* once for the root ptr */ |
| 945 | free_extent_buffer(mid); | 1048 | free_extent_buffer(mid); |
| 946 | return ret; | 1049 | return ret; |
| @@ -949,8 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
| 949 | BTRFS_NODEPTRS_PER_BLOCK(root) / 4) | 1052 | BTRFS_NODEPTRS_PER_BLOCK(root) / 4) |
| 950 | return 0; | 1053 | return 0; |
| 951 | 1054 | ||
| 952 | if (trans->transaction->delayed_refs.flushing && | 1055 | if (btrfs_header_nritems(mid) > 2) |
| 953 | btrfs_header_nritems(mid) > 2) | ||
| 954 | return 0; | 1056 | return 0; |
| 955 | 1057 | ||
| 956 | if (btrfs_header_nritems(mid) < 2) | 1058 | if (btrfs_header_nritems(mid) < 2) |
| @@ -998,7 +1100,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
| 998 | ret = wret; | 1100 | ret = wret; |
| 999 | if (btrfs_header_nritems(right) == 0) { | 1101 | if (btrfs_header_nritems(right) == 0) { |
| 1000 | u64 bytenr = right->start; | 1102 | u64 bytenr = right->start; |
| 1001 | u64 generation = btrfs_header_generation(parent); | ||
| 1002 | u32 blocksize = right->len; | 1103 | u32 blocksize = right->len; |
| 1003 | 1104 | ||
| 1004 | clean_tree_block(trans, root, right); | 1105 | clean_tree_block(trans, root, right); |
| @@ -1010,9 +1111,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
| 1010 | if (wret) | 1111 | if (wret) |
| 1011 | ret = wret; | 1112 | ret = wret; |
| 1012 | wret = btrfs_free_extent(trans, root, bytenr, | 1113 | wret = btrfs_free_extent(trans, root, bytenr, |
| 1013 | blocksize, parent->start, | 1114 | blocksize, 0, |
| 1014 | btrfs_header_owner(parent), | 1115 | root->root_key.objectid, |
| 1015 | generation, level, 1); | 1116 | level, 0); |
| 1016 | if (wret) | 1117 | if (wret) |
| 1017 | ret = wret; | 1118 | ret = wret; |
| 1018 | } else { | 1119 | } else { |
| @@ -1047,7 +1148,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
| 1047 | } | 1148 | } |
| 1048 | if (btrfs_header_nritems(mid) == 0) { | 1149 | if (btrfs_header_nritems(mid) == 0) { |
| 1049 | /* we've managed to empty the middle node, drop it */ | 1150 | /* we've managed to empty the middle node, drop it */ |
| 1050 | u64 root_gen = btrfs_header_generation(parent); | ||
| 1051 | u64 bytenr = mid->start; | 1151 | u64 bytenr = mid->start; |
| 1052 | u32 blocksize = mid->len; | 1152 | u32 blocksize = mid->len; |
| 1053 | 1153 | ||
| @@ -1059,9 +1159,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
| 1059 | if (wret) | 1159 | if (wret) |
| 1060 | ret = wret; | 1160 | ret = wret; |
| 1061 | wret = btrfs_free_extent(trans, root, bytenr, blocksize, | 1161 | wret = btrfs_free_extent(trans, root, bytenr, blocksize, |
| 1062 | parent->start, | 1162 | 0, root->root_key.objectid, |
| 1063 | btrfs_header_owner(parent), | 1163 | level, 0); |
| 1064 | root_gen, level, 1); | ||
| 1065 | if (wret) | 1164 | if (wret) |
| 1066 | ret = wret; | 1165 | ret = wret; |
| 1067 | } else { | 1166 | } else { |
| @@ -1437,7 +1536,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) | |||
| 1437 | { | 1536 | { |
| 1438 | int i; | 1537 | int i; |
| 1439 | 1538 | ||
| 1440 | if (path->keep_locks || path->lowest_level) | 1539 | if (path->keep_locks) |
| 1441 | return; | 1540 | return; |
| 1442 | 1541 | ||
| 1443 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { | 1542 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { |
| @@ -1552,7 +1651,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, | |||
| 1552 | } | 1651 | } |
| 1553 | b = p->nodes[level]; | 1652 | b = p->nodes[level]; |
| 1554 | } else if (ins_len < 0 && btrfs_header_nritems(b) < | 1653 | } else if (ins_len < 0 && btrfs_header_nritems(b) < |
| 1555 | BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { | 1654 | BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { |
| 1556 | int sret; | 1655 | int sret; |
| 1557 | 1656 | ||
| 1558 | sret = reada_for_balance(root, p, level); | 1657 | sret = reada_for_balance(root, p, level); |
| @@ -1614,10 +1713,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
| 1614 | lowest_unlock = 2; | 1713 | lowest_unlock = 2; |
| 1615 | 1714 | ||
| 1616 | again: | 1715 | again: |
| 1617 | if (p->skip_locking) | 1716 | if (p->search_commit_root) { |
| 1618 | b = btrfs_root_node(root); | 1717 | b = root->commit_root; |
| 1619 | else | 1718 | extent_buffer_get(b); |
| 1620 | b = btrfs_lock_root_node(root); | 1719 | if (!p->skip_locking) |
| 1720 | btrfs_tree_lock(b); | ||
| 1721 | } else { | ||
| 1722 | if (p->skip_locking) | ||
| 1723 | b = btrfs_root_node(root); | ||
| 1724 | else | ||
| 1725 | b = btrfs_lock_root_node(root); | ||
| 1726 | } | ||
| 1621 | 1727 | ||
| 1622 | while (b) { | 1728 | while (b) { |
| 1623 | level = btrfs_header_level(b); | 1729 | level = btrfs_header_level(b); |
| @@ -1638,11 +1744,9 @@ again: | |||
| 1638 | * then we don't want to set the path blocking, | 1744 | * then we don't want to set the path blocking, |
| 1639 | * so we test it here | 1745 | * so we test it here |
| 1640 | */ | 1746 | */ |
| 1641 | if (btrfs_header_generation(b) == trans->transid && | 1747 | if (!should_cow_block(trans, root, b)) |
| 1642 | btrfs_header_owner(b) == root->root_key.objectid && | ||
| 1643 | !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { | ||
| 1644 | goto cow_done; | 1748 | goto cow_done; |
| 1645 | } | 1749 | |
| 1646 | btrfs_set_path_blocking(p); | 1750 | btrfs_set_path_blocking(p); |
| 1647 | 1751 | ||
| 1648 | wret = btrfs_cow_block(trans, root, b, | 1752 | wret = btrfs_cow_block(trans, root, b, |
| @@ -1764,138 +1868,6 @@ done: | |||
| 1764 | return ret; | 1868 | return ret; |
| 1765 | } | 1869 | } |
| 1766 | 1870 | ||
| 1767 | int btrfs_merge_path(struct btrfs_trans_handle *trans, | ||
| 1768 | struct btrfs_root *root, | ||
| 1769 | struct btrfs_key *node_keys, | ||
| 1770 | u64 *nodes, int lowest_level) | ||
| 1771 | { | ||
| 1772 | struct extent_buffer *eb; | ||
| 1773 | struct extent_buffer *parent; | ||
| 1774 | struct btrfs_key key; | ||
| 1775 | u64 bytenr; | ||
| 1776 | u64 generation; | ||
| 1777 | u32 blocksize; | ||
| 1778 | int level; | ||
| 1779 | int slot; | ||
| 1780 | int key_match; | ||
| 1781 | int ret; | ||
| 1782 | |||
| 1783 | eb = btrfs_lock_root_node(root); | ||
| 1784 | ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); | ||
| 1785 | BUG_ON(ret); | ||
| 1786 | |||
| 1787 | btrfs_set_lock_blocking(eb); | ||
| 1788 | |||
| 1789 | parent = eb; | ||
| 1790 | while (1) { | ||
| 1791 | level = btrfs_header_level(parent); | ||
| 1792 | if (level == 0 || level <= lowest_level) | ||
| 1793 | break; | ||
| 1794 | |||
| 1795 | ret = bin_search(parent, &node_keys[lowest_level], level, | ||
| 1796 | &slot); | ||
| 1797 | if (ret && slot > 0) | ||
| 1798 | slot--; | ||
| 1799 | |||
| 1800 | bytenr = btrfs_node_blockptr(parent, slot); | ||
| 1801 | if (nodes[level - 1] == bytenr) | ||
| 1802 | break; | ||
| 1803 | |||
| 1804 | blocksize = btrfs_level_size(root, level - 1); | ||
| 1805 | generation = btrfs_node_ptr_generation(parent, slot); | ||
| 1806 | btrfs_node_key_to_cpu(eb, &key, slot); | ||
| 1807 | key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); | ||
| 1808 | |||
| 1809 | if (generation == trans->transid) { | ||
| 1810 | eb = read_tree_block(root, bytenr, blocksize, | ||
| 1811 | generation); | ||
| 1812 | btrfs_tree_lock(eb); | ||
| 1813 | btrfs_set_lock_blocking(eb); | ||
| 1814 | } | ||
| 1815 | |||
| 1816 | /* | ||
| 1817 | * if node keys match and node pointer hasn't been modified | ||
| 1818 | * in the running transaction, we can merge the path. for | ||
| 1819 | * blocks owened by reloc trees, the node pointer check is | ||
| 1820 | * skipped, this is because these blocks are fully controlled | ||
| 1821 | * by the space balance code, no one else can modify them. | ||
| 1822 | */ | ||
| 1823 | if (!nodes[level - 1] || !key_match || | ||
| 1824 | (generation == trans->transid && | ||
| 1825 | btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { | ||
| 1826 | if (level == 1 || level == lowest_level + 1) { | ||
| 1827 | if (generation == trans->transid) { | ||
| 1828 | btrfs_tree_unlock(eb); | ||
| 1829 | free_extent_buffer(eb); | ||
| 1830 | } | ||
| 1831 | break; | ||
| 1832 | } | ||
| 1833 | |||
| 1834 | if (generation != trans->transid) { | ||
| 1835 | eb = read_tree_block(root, bytenr, blocksize, | ||
| 1836 | generation); | ||
| 1837 | btrfs_tree_lock(eb); | ||
| 1838 | btrfs_set_lock_blocking(eb); | ||
| 1839 | } | ||
| 1840 | |||
| 1841 | ret = btrfs_cow_block(trans, root, eb, parent, slot, | ||
| 1842 | &eb); | ||
| 1843 | BUG_ON(ret); | ||
| 1844 | |||
| 1845 | if (root->root_key.objectid == | ||
| 1846 | BTRFS_TREE_RELOC_OBJECTID) { | ||
| 1847 | if (!nodes[level - 1]) { | ||
| 1848 | nodes[level - 1] = eb->start; | ||
| 1849 | memcpy(&node_keys[level - 1], &key, | ||
| 1850 | sizeof(node_keys[0])); | ||
| 1851 | } else { | ||
| 1852 | WARN_ON(1); | ||
| 1853 | } | ||
| 1854 | } | ||
| 1855 | |||
| 1856 | btrfs_tree_unlock(parent); | ||
| 1857 | free_extent_buffer(parent); | ||
| 1858 | parent = eb; | ||
| 1859 | continue; | ||
| 1860 | } | ||
| 1861 | |||
| 1862 | btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); | ||
| 1863 | btrfs_set_node_ptr_generation(parent, slot, trans->transid); | ||
| 1864 | btrfs_mark_buffer_dirty(parent); | ||
| 1865 | |||
| 1866 | ret = btrfs_inc_extent_ref(trans, root, | ||
| 1867 | nodes[level - 1], | ||
| 1868 | blocksize, parent->start, | ||
| 1869 | btrfs_header_owner(parent), | ||
| 1870 | btrfs_header_generation(parent), | ||
| 1871 | level - 1); | ||
| 1872 | BUG_ON(ret); | ||
| 1873 | |||
| 1874 | /* | ||
| 1875 | * If the block was created in the running transaction, | ||
| 1876 | * it's possible this is the last reference to it, so we | ||
| 1877 | * should drop the subtree. | ||
| 1878 | */ | ||
| 1879 | if (generation == trans->transid) { | ||
| 1880 | ret = btrfs_drop_subtree(trans, root, eb, parent); | ||
| 1881 | BUG_ON(ret); | ||
| 1882 | btrfs_tree_unlock(eb); | ||
| 1883 | free_extent_buffer(eb); | ||
| 1884 | } else { | ||
| 1885 | ret = btrfs_free_extent(trans, root, bytenr, | ||
| 1886 | blocksize, parent->start, | ||
| 1887 | btrfs_header_owner(parent), | ||
| 1888 | btrfs_header_generation(parent), | ||
| 1889 | level - 1, 1); | ||
| 1890 | BUG_ON(ret); | ||
| 1891 | } | ||
| 1892 | break; | ||
| 1893 | } | ||
| 1894 | btrfs_tree_unlock(parent); | ||
| 1895 | free_extent_buffer(parent); | ||
| 1896 | return 0; | ||
| 1897 | } | ||
| 1898 | |||
| 1899 | /* | 1871 | /* |
| 1900 | * adjust the pointers going up the tree, starting at level | 1872 | * adjust the pointers going up the tree, starting at level |
| 1901 | * making sure the right key of each node is points to 'key'. | 1873 | * making sure the right key of each node is points to 'key'. |
| @@ -2021,9 +1993,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, | |||
| 2021 | btrfs_mark_buffer_dirty(src); | 1993 | btrfs_mark_buffer_dirty(src); |
| 2022 | btrfs_mark_buffer_dirty(dst); | 1994 | btrfs_mark_buffer_dirty(dst); |
| 2023 | 1995 | ||
| 2024 | ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); | ||
| 2025 | BUG_ON(ret); | ||
| 2026 | |||
| 2027 | return ret; | 1996 | return ret; |
| 2028 | } | 1997 | } |
| 2029 | 1998 | ||
| @@ -2083,9 +2052,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans, | |||
| 2083 | btrfs_mark_buffer_dirty(src); | 2052 | btrfs_mark_buffer_dirty(src); |
| 2084 | btrfs_mark_buffer_dirty(dst); | 2053 | btrfs_mark_buffer_dirty(dst); |
| 2085 | 2054 | ||
| 2086 | ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); | ||
| 2087 | BUG_ON(ret); | ||
| 2088 | |||
| 2089 | return ret; | 2055 | return ret; |
| 2090 | } | 2056 | } |
| 2091 | 2057 | ||
| @@ -2105,7 +2071,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
| 2105 | struct extent_buffer *c; | 2071 | struct extent_buffer *c; |
| 2106 | struct extent_buffer *old; | 2072 | struct extent_buffer *old; |
| 2107 | struct btrfs_disk_key lower_key; | 2073 | struct btrfs_disk_key lower_key; |
| 2108 | int ret; | ||
| 2109 | 2074 | ||
| 2110 | BUG_ON(path->nodes[level]); | 2075 | BUG_ON(path->nodes[level]); |
| 2111 | BUG_ON(path->nodes[level-1] != root->node); | 2076 | BUG_ON(path->nodes[level-1] != root->node); |
| @@ -2117,16 +2082,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
| 2117 | btrfs_node_key(lower, &lower_key, 0); | 2082 | btrfs_node_key(lower, &lower_key, 0); |
| 2118 | 2083 | ||
| 2119 | c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, | 2084 | c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, |
| 2120 | root->root_key.objectid, trans->transid, | 2085 | root->root_key.objectid, &lower_key, |
| 2121 | level, root->node->start, 0); | 2086 | level, root->node->start, 0); |
| 2122 | if (IS_ERR(c)) | 2087 | if (IS_ERR(c)) |
| 2123 | return PTR_ERR(c); | 2088 | return PTR_ERR(c); |
| 2124 | 2089 | ||
| 2125 | memset_extent_buffer(c, 0, 0, root->nodesize); | 2090 | memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); |
| 2126 | btrfs_set_header_nritems(c, 1); | 2091 | btrfs_set_header_nritems(c, 1); |
| 2127 | btrfs_set_header_level(c, level); | 2092 | btrfs_set_header_level(c, level); |
| 2128 | btrfs_set_header_bytenr(c, c->start); | 2093 | btrfs_set_header_bytenr(c, c->start); |
| 2129 | btrfs_set_header_generation(c, trans->transid); | 2094 | btrfs_set_header_generation(c, trans->transid); |
| 2095 | btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); | ||
| 2130 | btrfs_set_header_owner(c, root->root_key.objectid); | 2096 | btrfs_set_header_owner(c, root->root_key.objectid); |
| 2131 | 2097 | ||
| 2132 | write_extent_buffer(c, root->fs_info->fsid, | 2098 | write_extent_buffer(c, root->fs_info->fsid, |
| @@ -2151,12 +2117,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
| 2151 | root->node = c; | 2117 | root->node = c; |
| 2152 | spin_unlock(&root->node_lock); | 2118 | spin_unlock(&root->node_lock); |
| 2153 | 2119 | ||
| 2154 | ret = btrfs_update_extent_ref(trans, root, lower->start, | ||
| 2155 | lower->len, lower->start, c->start, | ||
| 2156 | root->root_key.objectid, | ||
| 2157 | trans->transid, level - 1); | ||
| 2158 | BUG_ON(ret); | ||
| 2159 | |||
| 2160 | /* the super has an extra ref to root->node */ | 2120 | /* the super has an extra ref to root->node */ |
| 2161 | free_extent_buffer(old); | 2121 | free_extent_buffer(old); |
| 2162 | 2122 | ||
| @@ -2233,7 +2193,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
| 2233 | ret = insert_new_root(trans, root, path, level + 1); | 2193 | ret = insert_new_root(trans, root, path, level + 1); |
| 2234 | if (ret) | 2194 | if (ret) |
| 2235 | return ret; | 2195 | return ret; |
| 2236 | } else if (!trans->transaction->delayed_refs.flushing) { | 2196 | } else { |
| 2237 | ret = push_nodes_for_insert(trans, root, path, level); | 2197 | ret = push_nodes_for_insert(trans, root, path, level); |
| 2238 | c = path->nodes[level]; | 2198 | c = path->nodes[level]; |
| 2239 | if (!ret && btrfs_header_nritems(c) < | 2199 | if (!ret && btrfs_header_nritems(c) < |
| @@ -2244,20 +2204,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
| 2244 | } | 2204 | } |
| 2245 | 2205 | ||
| 2246 | c_nritems = btrfs_header_nritems(c); | 2206 | c_nritems = btrfs_header_nritems(c); |
| 2207 | mid = (c_nritems + 1) / 2; | ||
| 2208 | btrfs_node_key(c, &disk_key, mid); | ||
| 2247 | 2209 | ||
| 2248 | split = btrfs_alloc_free_block(trans, root, root->nodesize, | 2210 | split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, |
| 2249 | path->nodes[level + 1]->start, | ||
| 2250 | root->root_key.objectid, | 2211 | root->root_key.objectid, |
| 2251 | trans->transid, level, c->start, 0); | 2212 | &disk_key, level, c->start, 0); |
| 2252 | if (IS_ERR(split)) | 2213 | if (IS_ERR(split)) |
| 2253 | return PTR_ERR(split); | 2214 | return PTR_ERR(split); |
| 2254 | 2215 | ||
| 2255 | btrfs_set_header_flags(split, btrfs_header_flags(c)); | 2216 | memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); |
| 2256 | btrfs_set_header_level(split, btrfs_header_level(c)); | 2217 | btrfs_set_header_level(split, btrfs_header_level(c)); |
| 2257 | btrfs_set_header_bytenr(split, split->start); | 2218 | btrfs_set_header_bytenr(split, split->start); |
| 2258 | btrfs_set_header_generation(split, trans->transid); | 2219 | btrfs_set_header_generation(split, trans->transid); |
| 2220 | btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); | ||
| 2259 | btrfs_set_header_owner(split, root->root_key.objectid); | 2221 | btrfs_set_header_owner(split, root->root_key.objectid); |
| 2260 | btrfs_set_header_flags(split, 0); | ||
| 2261 | write_extent_buffer(split, root->fs_info->fsid, | 2222 | write_extent_buffer(split, root->fs_info->fsid, |
| 2262 | (unsigned long)btrfs_header_fsid(split), | 2223 | (unsigned long)btrfs_header_fsid(split), |
| 2263 | BTRFS_FSID_SIZE); | 2224 | BTRFS_FSID_SIZE); |
| @@ -2265,7 +2226,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
| 2265 | (unsigned long)btrfs_header_chunk_tree_uuid(split), | 2226 | (unsigned long)btrfs_header_chunk_tree_uuid(split), |
| 2266 | BTRFS_UUID_SIZE); | 2227 | BTRFS_UUID_SIZE); |
| 2267 | 2228 | ||
| 2268 | mid = (c_nritems + 1) / 2; | ||
| 2269 | 2229 | ||
| 2270 | copy_extent_buffer(split, c, | 2230 | copy_extent_buffer(split, c, |
| 2271 | btrfs_node_key_ptr_offset(0), | 2231 | btrfs_node_key_ptr_offset(0), |
| @@ -2278,16 +2238,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
| 2278 | btrfs_mark_buffer_dirty(c); | 2238 | btrfs_mark_buffer_dirty(c); |
| 2279 | btrfs_mark_buffer_dirty(split); | 2239 | btrfs_mark_buffer_dirty(split); |
| 2280 | 2240 | ||
| 2281 | btrfs_node_key(split, &disk_key, 0); | ||
| 2282 | wret = insert_ptr(trans, root, path, &disk_key, split->start, | 2241 | wret = insert_ptr(trans, root, path, &disk_key, split->start, |
| 2283 | path->slots[level + 1] + 1, | 2242 | path->slots[level + 1] + 1, |
| 2284 | level + 1); | 2243 | level + 1); |
| 2285 | if (wret) | 2244 | if (wret) |
| 2286 | ret = wret; | 2245 | ret = wret; |
| 2287 | 2246 | ||
| 2288 | ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); | ||
| 2289 | BUG_ON(ret); | ||
| 2290 | |||
| 2291 | if (path->slots[level] >= mid) { | 2247 | if (path->slots[level] >= mid) { |
| 2292 | path->slots[level] -= mid; | 2248 | path->slots[level] -= mid; |
| 2293 | btrfs_tree_unlock(c); | 2249 | btrfs_tree_unlock(c); |
| @@ -2360,7 +2316,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, | |||
| 2360 | u32 right_nritems; | 2316 | u32 right_nritems; |
| 2361 | u32 data_end; | 2317 | u32 data_end; |
| 2362 | u32 this_item_size; | 2318 | u32 this_item_size; |
| 2363 | int ret; | ||
| 2364 | 2319 | ||
| 2365 | if (empty) | 2320 | if (empty) |
| 2366 | nr = 0; | 2321 | nr = 0; |
| @@ -2473,9 +2428,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, | |||
| 2473 | btrfs_mark_buffer_dirty(left); | 2428 | btrfs_mark_buffer_dirty(left); |
| 2474 | btrfs_mark_buffer_dirty(right); | 2429 | btrfs_mark_buffer_dirty(right); |
| 2475 | 2430 | ||
| 2476 | ret = btrfs_update_ref(trans, root, left, right, 0, push_items); | ||
| 2477 | BUG_ON(ret); | ||
| 2478 | |||
| 2479 | btrfs_item_key(right, &disk_key, 0); | 2431 | btrfs_item_key(right, &disk_key, 0); |
| 2480 | btrfs_set_node_key(upper, &disk_key, slot + 1); | 2432 | btrfs_set_node_key(upper, &disk_key, slot + 1); |
| 2481 | btrfs_mark_buffer_dirty(upper); | 2433 | btrfs_mark_buffer_dirty(upper); |
| @@ -2720,10 +2672,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, | |||
| 2720 | if (right_nritems) | 2672 | if (right_nritems) |
| 2721 | btrfs_mark_buffer_dirty(right); | 2673 | btrfs_mark_buffer_dirty(right); |
| 2722 | 2674 | ||
| 2723 | ret = btrfs_update_ref(trans, root, right, left, | ||
| 2724 | old_left_nritems, push_items); | ||
| 2725 | BUG_ON(ret); | ||
| 2726 | |||
| 2727 | btrfs_item_key(right, &disk_key, 0); | 2675 | btrfs_item_key(right, &disk_key, 0); |
| 2728 | wret = fixup_low_keys(trans, root, path, &disk_key, 1); | 2676 | wret = fixup_low_keys(trans, root, path, &disk_key, 1); |
| 2729 | if (wret) | 2677 | if (wret) |
| @@ -2880,9 +2828,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, | |||
| 2880 | btrfs_mark_buffer_dirty(l); | 2828 | btrfs_mark_buffer_dirty(l); |
| 2881 | BUG_ON(path->slots[0] != slot); | 2829 | BUG_ON(path->slots[0] != slot); |
| 2882 | 2830 | ||
| 2883 | ret = btrfs_update_ref(trans, root, l, right, 0, nritems); | ||
| 2884 | BUG_ON(ret); | ||
| 2885 | |||
| 2886 | if (mid <= slot) { | 2831 | if (mid <= slot) { |
| 2887 | btrfs_tree_unlock(path->nodes[0]); | 2832 | btrfs_tree_unlock(path->nodes[0]); |
| 2888 | free_extent_buffer(path->nodes[0]); | 2833 | free_extent_buffer(path->nodes[0]); |
| @@ -2911,6 +2856,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, | |||
| 2911 | struct btrfs_path *path, int data_size, | 2856 | struct btrfs_path *path, int data_size, |
| 2912 | int extend) | 2857 | int extend) |
| 2913 | { | 2858 | { |
| 2859 | struct btrfs_disk_key disk_key; | ||
| 2914 | struct extent_buffer *l; | 2860 | struct extent_buffer *l; |
| 2915 | u32 nritems; | 2861 | u32 nritems; |
| 2916 | int mid; | 2862 | int mid; |
| @@ -2918,12 +2864,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, | |||
| 2918 | struct extent_buffer *right; | 2864 | struct extent_buffer *right; |
| 2919 | int ret = 0; | 2865 | int ret = 0; |
| 2920 | int wret; | 2866 | int wret; |
| 2921 | int double_split; | 2867 | int split; |
| 2922 | int num_doubles = 0; | 2868 | int num_doubles = 0; |
| 2923 | 2869 | ||
| 2924 | /* first try to make some room by pushing left and right */ | 2870 | /* first try to make some room by pushing left and right */ |
| 2925 | if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && | 2871 | if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { |
| 2926 | !trans->transaction->delayed_refs.flushing) { | ||
| 2927 | wret = push_leaf_right(trans, root, path, data_size, 0); | 2872 | wret = push_leaf_right(trans, root, path, data_size, 0); |
| 2928 | if (wret < 0) | 2873 | if (wret < 0) |
| 2929 | return wret; | 2874 | return wret; |
| @@ -2945,16 +2890,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, | |||
| 2945 | return ret; | 2890 | return ret; |
| 2946 | } | 2891 | } |
| 2947 | again: | 2892 | again: |
| 2948 | double_split = 0; | 2893 | split = 1; |
| 2949 | l = path->nodes[0]; | 2894 | l = path->nodes[0]; |
| 2950 | slot = path->slots[0]; | 2895 | slot = path->slots[0]; |
| 2951 | nritems = btrfs_header_nritems(l); | 2896 | nritems = btrfs_header_nritems(l); |
| 2952 | mid = (nritems + 1) / 2; | 2897 | mid = (nritems + 1) / 2; |
| 2953 | 2898 | ||
| 2954 | right = btrfs_alloc_free_block(trans, root, root->leafsize, | 2899 | if (mid <= slot) { |
| 2955 | path->nodes[1]->start, | 2900 | if (nritems == 1 || |
| 2901 | leaf_space_used(l, mid, nritems - mid) + data_size > | ||
| 2902 | BTRFS_LEAF_DATA_SIZE(root)) { | ||
| 2903 | if (slot >= nritems) { | ||
| 2904 | split = 0; | ||
| 2905 | } else { | ||
| 2906 | mid = slot; | ||
| 2907 | if (mid != nritems && | ||
| 2908 | leaf_space_used(l, mid, nritems - mid) + | ||
| 2909 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
| 2910 | split = 2; | ||
| 2911 | } | ||
| 2912 | } | ||
| 2913 | } | ||
| 2914 | } else { | ||
| 2915 | if (leaf_space_used(l, 0, mid) + data_size > | ||
| 2916 | BTRFS_LEAF_DATA_SIZE(root)) { | ||
| 2917 | if (!extend && data_size && slot == 0) { | ||
| 2918 | split = 0; | ||
| 2919 | } else if ((extend || !data_size) && slot == 0) { | ||
| 2920 | mid = 1; | ||
| 2921 | } else { | ||
| 2922 | mid = slot; | ||
| 2923 | if (mid != nritems && | ||
| 2924 | leaf_space_used(l, mid, nritems - mid) + | ||
| 2925 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
| 2926 | split = 2; | ||
| 2927 | } | ||
| 2928 | } | ||
| 2929 | } | ||
| 2930 | } | ||
| 2931 | |||
| 2932 | if (split == 0) | ||
| 2933 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | ||
| 2934 | else | ||
| 2935 | btrfs_item_key(l, &disk_key, mid); | ||
| 2936 | |||
| 2937 | right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | ||
| 2956 | root->root_key.objectid, | 2938 | root->root_key.objectid, |
| 2957 | trans->transid, 0, l->start, 0); | 2939 | &disk_key, 0, l->start, 0); |
| 2958 | if (IS_ERR(right)) { | 2940 | if (IS_ERR(right)) { |
| 2959 | BUG_ON(1); | 2941 | BUG_ON(1); |
| 2960 | return PTR_ERR(right); | 2942 | return PTR_ERR(right); |
| @@ -2963,6 +2945,7 @@ again: | |||
| 2963 | memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); | 2945 | memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); |
| 2964 | btrfs_set_header_bytenr(right, right->start); | 2946 | btrfs_set_header_bytenr(right, right->start); |
| 2965 | btrfs_set_header_generation(right, trans->transid); | 2947 | btrfs_set_header_generation(right, trans->transid); |
| 2948 | btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); | ||
| 2966 | btrfs_set_header_owner(right, root->root_key.objectid); | 2949 | btrfs_set_header_owner(right, root->root_key.objectid); |
| 2967 | btrfs_set_header_level(right, 0); | 2950 | btrfs_set_header_level(right, 0); |
| 2968 | write_extent_buffer(right, root->fs_info->fsid, | 2951 | write_extent_buffer(right, root->fs_info->fsid, |
| @@ -2973,79 +2956,47 @@ again: | |||
| 2973 | (unsigned long)btrfs_header_chunk_tree_uuid(right), | 2956 | (unsigned long)btrfs_header_chunk_tree_uuid(right), |
| 2974 | BTRFS_UUID_SIZE); | 2957 | BTRFS_UUID_SIZE); |
| 2975 | 2958 | ||
| 2976 | if (mid <= slot) { | 2959 | if (split == 0) { |
| 2977 | if (nritems == 1 || | 2960 | if (mid <= slot) { |
| 2978 | leaf_space_used(l, mid, nritems - mid) + data_size > | 2961 | btrfs_set_header_nritems(right, 0); |
| 2979 | BTRFS_LEAF_DATA_SIZE(root)) { | 2962 | wret = insert_ptr(trans, root, path, |
| 2980 | if (slot >= nritems) { | 2963 | &disk_key, right->start, |
| 2981 | struct btrfs_disk_key disk_key; | 2964 | path->slots[1] + 1, 1); |
| 2982 | 2965 | if (wret) | |
| 2983 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | 2966 | ret = wret; |
| 2984 | btrfs_set_header_nritems(right, 0); | ||
| 2985 | wret = insert_ptr(trans, root, path, | ||
| 2986 | &disk_key, right->start, | ||
| 2987 | path->slots[1] + 1, 1); | ||
| 2988 | if (wret) | ||
| 2989 | ret = wret; | ||
| 2990 | 2967 | ||
| 2991 | btrfs_tree_unlock(path->nodes[0]); | 2968 | btrfs_tree_unlock(path->nodes[0]); |
| 2992 | free_extent_buffer(path->nodes[0]); | 2969 | free_extent_buffer(path->nodes[0]); |
| 2993 | path->nodes[0] = right; | 2970 | path->nodes[0] = right; |
| 2994 | path->slots[0] = 0; | 2971 | path->slots[0] = 0; |
| 2995 | path->slots[1] += 1; | 2972 | path->slots[1] += 1; |
| 2996 | btrfs_mark_buffer_dirty(right); | 2973 | } else { |
| 2997 | return ret; | 2974 | btrfs_set_header_nritems(right, 0); |
| 2998 | } | 2975 | wret = insert_ptr(trans, root, path, |
| 2999 | mid = slot; | 2976 | &disk_key, |
| 3000 | if (mid != nritems && | 2977 | right->start, |
| 3001 | leaf_space_used(l, mid, nritems - mid) + | 2978 | path->slots[1], 1); |
| 3002 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | 2979 | if (wret) |
| 3003 | double_split = 1; | 2980 | ret = wret; |
| 3004 | } | 2981 | btrfs_tree_unlock(path->nodes[0]); |
| 3005 | } | 2982 | free_extent_buffer(path->nodes[0]); |
| 3006 | } else { | 2983 | path->nodes[0] = right; |
| 3007 | if (leaf_space_used(l, 0, mid) + data_size > | 2984 | path->slots[0] = 0; |
| 3008 | BTRFS_LEAF_DATA_SIZE(root)) { | 2985 | if (path->slots[1] == 0) { |
| 3009 | if (!extend && data_size && slot == 0) { | 2986 | wret = fixup_low_keys(trans, root, |
| 3010 | struct btrfs_disk_key disk_key; | 2987 | path, &disk_key, 1); |
| 3011 | |||
| 3012 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | ||
| 3013 | btrfs_set_header_nritems(right, 0); | ||
| 3014 | wret = insert_ptr(trans, root, path, | ||
| 3015 | &disk_key, | ||
| 3016 | right->start, | ||
| 3017 | path->slots[1], 1); | ||
| 3018 | if (wret) | 2988 | if (wret) |
| 3019 | ret = wret; | 2989 | ret = wret; |
| 3020 | btrfs_tree_unlock(path->nodes[0]); | ||
| 3021 | free_extent_buffer(path->nodes[0]); | ||
| 3022 | path->nodes[0] = right; | ||
| 3023 | path->slots[0] = 0; | ||
| 3024 | if (path->slots[1] == 0) { | ||
| 3025 | wret = fixup_low_keys(trans, root, | ||
| 3026 | path, &disk_key, 1); | ||
| 3027 | if (wret) | ||
| 3028 | ret = wret; | ||
| 3029 | } | ||
| 3030 | btrfs_mark_buffer_dirty(right); | ||
| 3031 | return ret; | ||
| 3032 | } else if ((extend || !data_size) && slot == 0) { | ||
| 3033 | mid = 1; | ||
| 3034 | } else { | ||
| 3035 | mid = slot; | ||
| 3036 | if (mid != nritems && | ||
| 3037 | leaf_space_used(l, mid, nritems - mid) + | ||
| 3038 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
| 3039 | double_split = 1; | ||
| 3040 | } | ||
| 3041 | } | 2990 | } |
| 3042 | } | 2991 | } |
| 2992 | btrfs_mark_buffer_dirty(right); | ||
| 2993 | return ret; | ||
| 3043 | } | 2994 | } |
| 3044 | 2995 | ||
| 3045 | ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); | 2996 | ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); |
| 3046 | BUG_ON(ret); | 2997 | BUG_ON(ret); |
| 3047 | 2998 | ||
| 3048 | if (double_split) { | 2999 | if (split == 2) { |
| 3049 | BUG_ON(num_doubles != 0); | 3000 | BUG_ON(num_doubles != 0); |
| 3050 | num_doubles++; | 3001 | num_doubles++; |
| 3051 | goto again; | 3002 | goto again; |
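The rewrite above replaces the old double_split flag with a three-valued split variable. A rough summary of what each value means, inferred from the hunk rather than stated anywhere in the patch:

    /*
     * split == 0: the new item cannot share a leaf with the existing items;
     *             an empty right leaf is linked in, the path is pointed at
     *             it, and copy_for_split() is skipped entirely.
     * split == 1: the common case; items from slot 'mid' onward are moved
     *             into the new right leaf by copy_for_split().
     * split == 2: one split is not enough to make room, so after
     *             copy_for_split() the code jumps to 'again' and splits a
     *             second time (the old double_split behaviour).
     */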
| @@ -3447,7 +3398,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, | |||
| 3447 | /* figure out how many keys we can insert in here */ | 3398 | /* figure out how many keys we can insert in here */ |
| 3448 | total_data = data_size[0]; | 3399 | total_data = data_size[0]; |
| 3449 | for (i = 1; i < nr; i++) { | 3400 | for (i = 1; i < nr; i++) { |
| 3450 | if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) | 3401 | if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) |
| 3451 | break; | 3402 | break; |
| 3452 | total_data += data_size[i]; | 3403 | total_data += data_size[i]; |
| 3453 | } | 3404 | } |
| @@ -3745,9 +3696,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
| 3745 | 3696 | ||
| 3746 | /* | 3697 | /* |
| 3747 | * a helper function to delete the leaf pointed to by path->slots[1] and | 3698 | * a helper function to delete the leaf pointed to by path->slots[1] and |
| 3748 | * path->nodes[1]. bytenr is the node block pointer, but since the callers | 3699 | * path->nodes[1]. |
| 3749 | * already know it, it is faster to have them pass it down than to | ||
| 3750 | * read it out of the node again. | ||
| 3751 | * | 3700 | * |
| 3752 | * This deletes the pointer in path->nodes[1] and frees the leaf | 3701 | * This deletes the pointer in path->nodes[1] and frees the leaf |
| 3753 | * block extent. zero is returned if it all worked out, < 0 otherwise. | 3702 | * block extent. zero is returned if it all worked out, < 0 otherwise. |
| @@ -3755,15 +3704,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
| 3755 | * The path must have already been setup for deleting the leaf, including | 3704 | * The path must have already been setup for deleting the leaf, including |
| 3756 | * all the proper balancing. path->nodes[1] must be locked. | 3705 | * all the proper balancing. path->nodes[1] must be locked. |
| 3757 | */ | 3706 | */ |
| 3758 | noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, | 3707 | static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, |
| 3759 | struct btrfs_root *root, | 3708 | struct btrfs_root *root, |
| 3760 | struct btrfs_path *path, u64 bytenr) | 3709 | struct btrfs_path *path, |
| 3710 | struct extent_buffer *leaf) | ||
| 3761 | { | 3711 | { |
| 3762 | int ret; | 3712 | int ret; |
| 3763 | u64 root_gen = btrfs_header_generation(path->nodes[1]); | ||
| 3764 | u64 parent_start = path->nodes[1]->start; | ||
| 3765 | u64 parent_owner = btrfs_header_owner(path->nodes[1]); | ||
| 3766 | 3713 | ||
| 3714 | WARN_ON(btrfs_header_generation(leaf) != trans->transid); | ||
| 3767 | ret = del_ptr(trans, root, path, 1, path->slots[1]); | 3715 | ret = del_ptr(trans, root, path, 1, path->slots[1]); |
| 3768 | if (ret) | 3716 | if (ret) |
| 3769 | return ret; | 3717 | return ret; |
| @@ -3774,10 +3722,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, | |||
| 3774 | */ | 3722 | */ |
| 3775 | btrfs_unlock_up_safe(path, 0); | 3723 | btrfs_unlock_up_safe(path, 0); |
| 3776 | 3724 | ||
| 3777 | ret = btrfs_free_extent(trans, root, bytenr, | 3725 | ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, |
| 3778 | btrfs_level_size(root, 0), | 3726 | 0, root->root_key.objectid, 0, 0); |
| 3779 | parent_start, parent_owner, | ||
| 3780 | root_gen, 0, 1); | ||
| 3781 | return ret; | 3727 | return ret; |
| 3782 | } | 3728 | } |
| 3783 | /* | 3729 | /* |
| @@ -3845,7 +3791,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
| 3845 | if (leaf == root->node) { | 3791 | if (leaf == root->node) { |
| 3846 | btrfs_set_header_level(leaf, 0); | 3792 | btrfs_set_header_level(leaf, 0); |
| 3847 | } else { | 3793 | } else { |
| 3848 | ret = btrfs_del_leaf(trans, root, path, leaf->start); | 3794 | ret = btrfs_del_leaf(trans, root, path, leaf); |
| 3849 | BUG_ON(ret); | 3795 | BUG_ON(ret); |
| 3850 | } | 3796 | } |
| 3851 | } else { | 3797 | } else { |
| @@ -3861,8 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
| 3861 | } | 3807 | } |
| 3862 | 3808 | ||
| 3863 | /* delete the leaf if it is mostly empty */ | 3809 | /* delete the leaf if it is mostly empty */ |
| 3864 | if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && | 3810 | if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { |
| 3865 | !trans->transaction->delayed_refs.flushing) { | ||
| 3866 | /* push_leaf_left fixes the path. | 3811 | /* push_leaf_left fixes the path. |
| 3867 | * make sure the path still points to our leaf | 3812 | * make sure the path still points to our leaf |
| 3868 | * for possible call to del_ptr below | 3813 | * for possible call to del_ptr below |
| @@ -3884,8 +3829,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
| 3884 | 3829 | ||
| 3885 | if (btrfs_header_nritems(leaf) == 0) { | 3830 | if (btrfs_header_nritems(leaf) == 0) { |
| 3886 | path->slots[1] = slot; | 3831 | path->slots[1] = slot; |
| 3887 | ret = btrfs_del_leaf(trans, root, path, | 3832 | ret = btrfs_del_leaf(trans, root, path, leaf); |
| 3888 | leaf->start); | ||
| 3889 | BUG_ON(ret); | 3833 | BUG_ON(ret); |
| 3890 | free_extent_buffer(leaf); | 3834 | free_extent_buffer(leaf); |
| 3891 | } else { | 3835 | } else { |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4414a5d9983a..03441a99ea38 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
| @@ -45,6 +45,8 @@ struct btrfs_ordered_sum; | |||
| 45 | 45 | ||
| 46 | #define BTRFS_MAX_LEVEL 8 | 46 | #define BTRFS_MAX_LEVEL 8 |
| 47 | 47 | ||
| 48 | #define BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 49 | |||
| 48 | /* | 50 | /* |
| 49 | * files bigger than this get some pre-flushing when they are added | 51 | * files bigger than this get some pre-flushing when they are added |
| 50 | * to the ordered operations list. That way we limit the total | 52 | * to the ordered operations list. That way we limit the total |
| @@ -267,7 +269,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) | |||
| 267 | } | 269 | } |
| 268 | 270 | ||
| 269 | #define BTRFS_FSID_SIZE 16 | 271 | #define BTRFS_FSID_SIZE 16 |
| 270 | #define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) | 272 | #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) |
| 273 | #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) | ||
| 274 | #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) | ||
| 275 | #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) | ||
| 276 | |||
| 277 | #define BTRFS_BACKREF_REV_MAX 256 | ||
| 278 | #define BTRFS_BACKREF_REV_SHIFT 56 | ||
| 279 | #define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ | ||
| 280 | BTRFS_BACKREF_REV_SHIFT) | ||
| 281 | |||
| 282 | #define BTRFS_OLD_BACKREF_REV 0 | ||
| 283 | #define BTRFS_MIXED_BACKREF_REV 1 | ||
| 271 | 284 | ||
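The backref revision occupies the top byte of the 64-bit header flags word. A small illustrative helper showing the encoding; pack_backref_rev() is not part of the patch, the real accessors appear later in this header:

    static inline u64 pack_backref_rev(u64 flags, int rev)
    {
    	/* clear bits 56..63, then store the revision there */
    	flags &= ~BTRFS_BACKREF_REV_MASK;
    	return flags | ((u64)rev << BTRFS_BACKREF_REV_SHIFT);
    }
    /* e.g. pack_backref_rev(0, BTRFS_MIXED_BACKREF_REV) == 1ULL << 56 */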
| 272 | /* | 285 | /* |
| 273 | * every tree block (leaf or node) starts with this header. | 286 | * every tree block (leaf or node) starts with this header. |
| @@ -296,7 +309,6 @@ struct btrfs_header { | |||
| 296 | sizeof(struct btrfs_item) - \ | 309 | sizeof(struct btrfs_item) - \ |
| 297 | sizeof(struct btrfs_file_extent_item)) | 310 | sizeof(struct btrfs_file_extent_item)) |
| 298 | 311 | ||
| 299 | #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) | ||
| 300 | 312 | ||
| 301 | /* | 313 | /* |
| 302 | * this is a very generous portion of the super block, giving us | 314 | * this is a very generous portion of the super block, giving us |
| @@ -355,9 +367,12 @@ struct btrfs_super_block { | |||
| 355 | * Compat flags that we support. If any incompat flags are set other than the | 367 | * Compat flags that we support. If any incompat flags are set other than the |
| 356 | * ones specified below then we will fail to mount | 368 | * ones specified below then we will fail to mount |
| 357 | */ | 369 | */ |
| 358 | #define BTRFS_FEATURE_COMPAT_SUPP 0x0 | 370 | #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) |
| 359 | #define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 | 371 | |
| 360 | #define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 | 372 | #define BTRFS_FEATURE_COMPAT_SUPP 0ULL |
| 373 | #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL | ||
| 374 | #define BTRFS_FEATURE_INCOMPAT_SUPP \ | ||
| 375 | BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | ||
| 361 | 376 | ||
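With MIXED_BACKREF listed in the supported incompat mask, a mount-time check along the following lines refuses filesystems that use features this code does not understand. This is only a sketch: disk_super, err and the btrfs_super_incompat_flags() accessor are assumed from the superblock open path, which is not part of this diff:

    u64 features = btrfs_super_incompat_flags(disk_super) &
    	       ~BTRFS_FEATURE_INCOMPAT_SUPP;
    if (features) {
    	printk(KERN_ERR "btrfs: unsupported optional features (%llx)\n",
    	       (unsigned long long)features);
    	err = -EINVAL;	/* fail the mount */
    }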
| 362 | /* | 377 | /* |
| 363 | * A leaf is full of items. offset and size tell us where to find | 378 | * A leaf is full of items. offset and size tell us where to find |
| @@ -421,23 +436,65 @@ struct btrfs_path { | |||
| 421 | unsigned int keep_locks:1; | 436 | unsigned int keep_locks:1; |
| 422 | unsigned int skip_locking:1; | 437 | unsigned int skip_locking:1; |
| 423 | unsigned int leave_spinning:1; | 438 | unsigned int leave_spinning:1; |
| 439 | unsigned int search_commit_root:1; | ||
| 424 | }; | 440 | }; |
| 425 | 441 | ||
| 426 | /* | 442 | /* |
| 427 | * items in the extent btree are used to record the objectid of the | 443 | * items in the extent btree are used to record the objectid of the |
| 428 | * owner of the block and the number of references | 444 | * owner of the block and the number of references |
| 429 | */ | 445 | */ |
| 446 | |||
| 430 | struct btrfs_extent_item { | 447 | struct btrfs_extent_item { |
| 448 | __le64 refs; | ||
| 449 | __le64 generation; | ||
| 450 | __le64 flags; | ||
| 451 | } __attribute__ ((__packed__)); | ||
| 452 | |||
| 453 | struct btrfs_extent_item_v0 { | ||
| 431 | __le32 refs; | 454 | __le32 refs; |
| 432 | } __attribute__ ((__packed__)); | 455 | } __attribute__ ((__packed__)); |
| 433 | 456 | ||
| 434 | struct btrfs_extent_ref { | 457 | #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ |
| 458 | sizeof(struct btrfs_item)) | ||
| 459 | |||
| 460 | #define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) | ||
| 461 | #define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) | ||
| 462 | |||
| 463 | /* following flags only apply to tree blocks */ | ||
| 464 | |||
| 465 | /* use full backrefs for extent pointers in the block */ | ||
| 466 | #define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) | ||
| 467 | |||
| 468 | struct btrfs_tree_block_info { | ||
| 469 | struct btrfs_disk_key key; | ||
| 470 | u8 level; | ||
| 471 | } __attribute__ ((__packed__)); | ||
| 472 | |||
| 473 | struct btrfs_extent_data_ref { | ||
| 474 | __le64 root; | ||
| 475 | __le64 objectid; | ||
| 476 | __le64 offset; | ||
| 477 | __le32 count; | ||
| 478 | } __attribute__ ((__packed__)); | ||
| 479 | |||
| 480 | struct btrfs_shared_data_ref { | ||
| 481 | __le32 count; | ||
| 482 | } __attribute__ ((__packed__)); | ||
| 483 | |||
| 484 | struct btrfs_extent_inline_ref { | ||
| 485 | u8 type; | ||
| 486 | u64 offset; | ||
| 487 | } __attribute__ ((__packed__)); | ||
| 488 | |||
| 489 | /* old style backrefs item */ | ||
| 490 | struct btrfs_extent_ref_v0 { | ||
| 435 | __le64 root; | 491 | __le64 root; |
| 436 | __le64 generation; | 492 | __le64 generation; |
| 437 | __le64 objectid; | 493 | __le64 objectid; |
| 438 | __le32 num_refs; | 494 | __le32 count; |
| 439 | } __attribute__ ((__packed__)); | 495 | } __attribute__ ((__packed__)); |
| 440 | 496 | ||
| 497 | |||
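Taken together, the structures above describe the new mixed-backref extent item format. A rough sketch of how a single EXTENT_ITEM is laid out on disk (a summary, not a literal struct from the patch):

    /*
     *   struct btrfs_extent_item        refs, generation, flags
     *   [struct btrfs_tree_block_info   first key + level, tree blocks only]
     *   inline backrefs                 zero or more btrfs_extent_inline_ref
     *                                   entries, each a type byte followed by
     *                                   a type-dependent payload
     *
     * references that no longer fit inline spill into separate keyed backref
     * items stored right after the extent item in the extent tree.
     */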
| 441 | /* dev extents record free space on individual devices. The owner | 498 | /* dev extents record free space on individual devices. The owner |
| 442 | * field points back to the chunk allocation mapping tree that allocated | 499 | * field points back to the chunk allocation mapping tree that allocated |
| 443 | * the extent. The chunk tree uuid field is a way to double check the owner | 500 | * the extent. The chunk tree uuid field is a way to double check the owner |
| @@ -695,12 +752,7 @@ struct btrfs_block_group_cache { | |||
| 695 | struct list_head cluster_list; | 752 | struct list_head cluster_list; |
| 696 | }; | 753 | }; |
| 697 | 754 | ||
| 698 | struct btrfs_leaf_ref_tree { | 755 | struct reloc_control; |
| 699 | struct rb_root root; | ||
| 700 | struct list_head list; | ||
| 701 | spinlock_t lock; | ||
| 702 | }; | ||
| 703 | |||
| 704 | struct btrfs_device; | 756 | struct btrfs_device; |
| 705 | struct btrfs_fs_devices; | 757 | struct btrfs_fs_devices; |
| 706 | struct btrfs_fs_info { | 758 | struct btrfs_fs_info { |
| @@ -831,18 +883,11 @@ struct btrfs_fs_info { | |||
| 831 | struct task_struct *cleaner_kthread; | 883 | struct task_struct *cleaner_kthread; |
| 832 | int thread_pool_size; | 884 | int thread_pool_size; |
| 833 | 885 | ||
| 834 | /* tree relocation relocated fields */ | ||
| 835 | struct list_head dead_reloc_roots; | ||
| 836 | struct btrfs_leaf_ref_tree reloc_ref_tree; | ||
| 837 | struct btrfs_leaf_ref_tree shared_ref_tree; | ||
| 838 | |||
| 839 | struct kobject super_kobj; | 886 | struct kobject super_kobj; |
| 840 | struct completion kobj_unregister; | 887 | struct completion kobj_unregister; |
| 841 | int do_barriers; | 888 | int do_barriers; |
| 842 | int closing; | 889 | int closing; |
| 843 | int log_root_recovering; | 890 | int log_root_recovering; |
| 844 | atomic_t throttles; | ||
| 845 | atomic_t throttle_gen; | ||
| 846 | 891 | ||
| 847 | u64 total_pinned; | 892 | u64 total_pinned; |
| 848 | 893 | ||
| @@ -861,6 +906,8 @@ struct btrfs_fs_info { | |||
| 861 | */ | 906 | */ |
| 862 | struct list_head space_info; | 907 | struct list_head space_info; |
| 863 | 908 | ||
| 909 | struct reloc_control *reloc_ctl; | ||
| 910 | |||
| 864 | spinlock_t delalloc_lock; | 911 | spinlock_t delalloc_lock; |
| 865 | spinlock_t new_trans_lock; | 912 | spinlock_t new_trans_lock; |
| 866 | u64 delalloc_bytes; | 913 | u64 delalloc_bytes; |
| @@ -891,7 +938,6 @@ struct btrfs_fs_info { | |||
| 891 | * in ram representation of the tree. extent_root is used for all allocations | 938 | * in ram representation of the tree. extent_root is used for all allocations |
| 892 | * and for the extent tree extent_root root. | 939 | * and for the extent tree extent_root root. |
| 893 | */ | 940 | */ |
| 894 | struct btrfs_dirty_root; | ||
| 895 | struct btrfs_root { | 941 | struct btrfs_root { |
| 896 | struct extent_buffer *node; | 942 | struct extent_buffer *node; |
| 897 | 943 | ||
| @@ -899,9 +945,6 @@ struct btrfs_root { | |||
| 899 | spinlock_t node_lock; | 945 | spinlock_t node_lock; |
| 900 | 946 | ||
| 901 | struct extent_buffer *commit_root; | 947 | struct extent_buffer *commit_root; |
| 902 | struct btrfs_leaf_ref_tree *ref_tree; | ||
| 903 | struct btrfs_leaf_ref_tree ref_tree_struct; | ||
| 904 | struct btrfs_dirty_root *dirty_root; | ||
| 905 | struct btrfs_root *log_root; | 948 | struct btrfs_root *log_root; |
| 906 | struct btrfs_root *reloc_root; | 949 | struct btrfs_root *reloc_root; |
| 907 | 950 | ||
| @@ -952,10 +995,15 @@ struct btrfs_root { | |||
| 952 | /* the dirty list is only used by non-reference counted roots */ | 995 | /* the dirty list is only used by non-reference counted roots */ |
| 953 | struct list_head dirty_list; | 996 | struct list_head dirty_list; |
| 954 | 997 | ||
| 998 | struct list_head root_list; | ||
| 999 | |||
| 955 | spinlock_t list_lock; | 1000 | spinlock_t list_lock; |
| 956 | struct list_head dead_list; | ||
| 957 | struct list_head orphan_list; | 1001 | struct list_head orphan_list; |
| 958 | 1002 | ||
| 1003 | spinlock_t inode_lock; | ||
| 1004 | /* red-black tree that keeps track of in-memory inodes */ | ||
| 1005 | struct rb_root inode_tree; | ||
| 1006 | |||
| 959 | /* | 1007 | /* |
| 960 | * right now this just gets used so that a root has its own devid | 1008 | * right now this just gets used so that a root has its own devid |
| 961 | * for stat. It may be used for more later | 1009 | * for stat. It may be used for more later |
| @@ -1017,7 +1065,16 @@ struct btrfs_root { | |||
| 1017 | * are used, and how many references there are to each block | 1065 | * are used, and how many references there are to each block |
| 1018 | */ | 1066 | */ |
| 1019 | #define BTRFS_EXTENT_ITEM_KEY 168 | 1067 | #define BTRFS_EXTENT_ITEM_KEY 168 |
| 1020 | #define BTRFS_EXTENT_REF_KEY 180 | 1068 | |
| 1069 | #define BTRFS_TREE_BLOCK_REF_KEY 176 | ||
| 1070 | |||
| 1071 | #define BTRFS_EXTENT_DATA_REF_KEY 178 | ||
| 1072 | |||
| 1073 | #define BTRFS_EXTENT_REF_V0_KEY 180 | ||
| 1074 | |||
| 1075 | #define BTRFS_SHARED_BLOCK_REF_KEY 182 | ||
| 1076 | |||
| 1077 | #define BTRFS_SHARED_DATA_REF_KEY 184 | ||
| 1021 | 1078 | ||
| 1022 | /* | 1079 | /* |
| 1023 | * block groups give us hints into the extent allocation trees. Which | 1080 | * block groups give us hints into the extent allocation trees. Which |
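Because btrfs keys sort by (objectid, type, offset) and the new backref key types are numerically larger than BTRFS_EXTENT_ITEM_KEY, the keyed backref items defined above line up directly behind the extent item for the same bytenr. An illustrative ordering for one tree block at bytenr B:

    /*
     *   (B, BTRFS_EXTENT_ITEM_KEY      = 168, num_bytes)
     *   (B, BTRFS_TREE_BLOCK_REF_KEY   = 176, referencing root objectid)
     *   (B, BTRFS_SHARED_BLOCK_REF_KEY = 182, parent block bytenr)
     *
     * data extents use EXTENT_DATA_REF_KEY / SHARED_DATA_REF_KEY instead.
     */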
| @@ -1043,6 +1100,8 @@ struct btrfs_root { | |||
| 1043 | #define BTRFS_MOUNT_COMPRESS (1 << 5) | 1100 | #define BTRFS_MOUNT_COMPRESS (1 << 5) |
| 1044 | #define BTRFS_MOUNT_NOTREELOG (1 << 6) | 1101 | #define BTRFS_MOUNT_NOTREELOG (1 << 6) |
| 1045 | #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) | 1102 | #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) |
| 1103 | #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) | ||
| 1104 | #define BTRFS_MOUNT_NOSSD (1 << 9) | ||
| 1046 | 1105 | ||
| 1047 | #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) | 1106 | #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) |
| 1048 | #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) | 1107 | #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) |
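The new SSD_SPREAD and NOSSD bits plug into the existing option macros. For illustration only; info is an assumed btrfs_fs_info pointer and the actual option parsing lives in super.c, outside this diff:

    btrfs_set_opt(info->mount_opt, SSD_SPREAD);	/* mount -o ssd_spread */
    btrfs_clear_opt(info->mount_opt, NOSSD);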
| @@ -1056,12 +1115,14 @@ struct btrfs_root { | |||
| 1056 | #define BTRFS_INODE_READONLY (1 << 2) | 1115 | #define BTRFS_INODE_READONLY (1 << 2) |
| 1057 | #define BTRFS_INODE_NOCOMPRESS (1 << 3) | 1116 | #define BTRFS_INODE_NOCOMPRESS (1 << 3) |
| 1058 | #define BTRFS_INODE_PREALLOC (1 << 4) | 1117 | #define BTRFS_INODE_PREALLOC (1 << 4) |
| 1059 | #define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ | 1118 | #define BTRFS_INODE_SYNC (1 << 5) |
| 1060 | ~BTRFS_INODE_##flag) | 1119 | #define BTRFS_INODE_IMMUTABLE (1 << 6) |
| 1061 | #define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ | 1120 | #define BTRFS_INODE_APPEND (1 << 7) |
| 1062 | BTRFS_INODE_##flag) | 1121 | #define BTRFS_INODE_NODUMP (1 << 8) |
| 1063 | #define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ | 1122 | #define BTRFS_INODE_NOATIME (1 << 9) |
| 1064 | BTRFS_INODE_##flag) | 1123 | #define BTRFS_INODE_DIRSYNC (1 << 10) |
| 1124 | |||
| 1125 | |||
| 1065 | /* some macros to generate set/get funcs for the struct fields. This | 1126 | /* some macros to generate set/get funcs for the struct fields. This |
| 1066 | * assumes there is a lefoo_to_cpu for every type, so lets make a simple | 1127 | * assumes there is a lefoo_to_cpu for every type, so lets make a simple |
| 1067 | * one for u8: | 1128 | * one for u8: |
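With the btrfs_set_flag()/btrfs_clear_flag()/btrfs_test_flag() wrappers removed, callers manipulate BTRFS_I(inode)->flags directly using the bits above. A minimal sketch of the replacement pattern; example_flags() is illustrative and not part of the patch:

    static void example_flags(struct inode *inode)
    {
    	BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;	/* was btrfs_set_flag()   */
    	if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)	/* was btrfs_test_flag()  */
    		BTRFS_I(inode)->flags &= ~BTRFS_INODE_PREALLOC;	/* was btrfs_clear_flag() */
    }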
| @@ -1317,24 +1378,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) | |||
| 1317 | return (u8 *)((unsigned long)dev + ptr); | 1378 | return (u8 *)((unsigned long)dev + ptr); |
| 1318 | } | 1379 | } |
| 1319 | 1380 | ||
| 1320 | /* struct btrfs_extent_ref */ | 1381 | BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); |
| 1321 | BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); | 1382 | BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, |
| 1322 | BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); | 1383 | generation, 64); |
| 1323 | BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); | 1384 | BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); |
| 1324 | BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); | ||
| 1325 | 1385 | ||
| 1326 | BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); | 1386 | BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); |
| 1327 | BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, | 1387 | |
| 1328 | generation, 64); | 1388 | |
| 1329 | BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, | 1389 | BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); |
| 1330 | objectid, 64); | 1390 | |
| 1331 | BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, | 1391 | static inline void btrfs_tree_block_key(struct extent_buffer *eb, |
| 1332 | num_refs, 32); | 1392 | struct btrfs_tree_block_info *item, |
| 1393 | struct btrfs_disk_key *key) | ||
| 1394 | { | ||
| 1395 | read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, | ||
| 1399 | struct btrfs_tree_block_info *item, | ||
| 1400 | struct btrfs_disk_key *key) | ||
| 1401 | { | ||
| 1402 | write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); | ||
| 1403 | } | ||
| 1404 | |||
| 1405 | BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, | ||
| 1406 | root, 64); | ||
| 1407 | BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, | ||
| 1408 | objectid, 64); | ||
| 1409 | BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, | ||
| 1410 | offset, 64); | ||
| 1411 | BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, | ||
| 1412 | count, 32); | ||
| 1413 | |||
| 1414 | BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, | ||
| 1415 | count, 32); | ||
| 1333 | 1416 | ||
| 1334 | /* struct btrfs_extent_item */ | 1417 | BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, |
| 1335 | BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); | 1418 | type, 8); |
| 1336 | BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, | 1419 | BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, |
| 1337 | refs, 32); | 1420 | offset, 64); |
| 1421 | |||
| 1422 | static inline u32 btrfs_extent_inline_ref_size(int type) | ||
| 1423 | { | ||
| 1424 | if (type == BTRFS_TREE_BLOCK_REF_KEY || | ||
| 1425 | type == BTRFS_SHARED_BLOCK_REF_KEY) | ||
| 1426 | return sizeof(struct btrfs_extent_inline_ref); | ||
| 1427 | if (type == BTRFS_SHARED_DATA_REF_KEY) | ||
| 1428 | return sizeof(struct btrfs_shared_data_ref) + | ||
| 1429 | sizeof(struct btrfs_extent_inline_ref); | ||
| 1430 | if (type == BTRFS_EXTENT_DATA_REF_KEY) | ||
| 1431 | return sizeof(struct btrfs_extent_data_ref) + | ||
| 1432 | offsetof(struct btrfs_extent_inline_ref, offset); | ||
| 1433 | BUG(); | ||
| 1434 | return 0; | ||
| 1435 | } | ||
| 1436 | |||
| 1437 | BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); | ||
| 1438 | BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, | ||
| 1439 | generation, 64); | ||
| 1440 | BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); | ||
| 1441 | BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); | ||
| 1338 | 1442 | ||
| 1339 | /* struct btrfs_node */ | 1443 | /* struct btrfs_node */ |
| 1340 | BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); | 1444 | BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); |
| @@ -1558,6 +1662,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) | |||
| 1558 | return (flags & flag) == flag; | 1662 | return (flags & flag) == flag; |
| 1559 | } | 1663 | } |
| 1560 | 1664 | ||
| 1665 | static inline int btrfs_header_backref_rev(struct extent_buffer *eb) | ||
| 1666 | { | ||
| 1667 | u64 flags = btrfs_header_flags(eb); | ||
| 1668 | return flags >> BTRFS_BACKREF_REV_SHIFT; | ||
| 1669 | } | ||
| 1670 | |||
| 1671 | static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, | ||
| 1672 | int rev) | ||
| 1673 | { | ||
| 1674 | u64 flags = btrfs_header_flags(eb); | ||
| 1675 | flags &= ~BTRFS_BACKREF_REV_MASK; | ||
| 1676 | flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; | ||
| 1677 | btrfs_set_header_flags(eb, flags); | ||
| 1678 | } | ||
| 1679 | |||
| 1561 | static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) | 1680 | static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) |
| 1562 | { | 1681 | { |
| 1563 | unsigned long ptr = offsetof(struct btrfs_header, fsid); | 1682 | unsigned long ptr = offsetof(struct btrfs_header, fsid); |
| @@ -1790,39 +1909,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, | |||
| 1790 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | 1909 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, |
| 1791 | struct btrfs_root *root, struct extent_buffer *leaf); | 1910 | struct btrfs_root *root, struct extent_buffer *leaf); |
| 1792 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | 1911 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, |
| 1793 | struct btrfs_root *root, u64 objectid, u64 bytenr); | 1912 | struct btrfs_root *root, |
| 1913 | u64 objectid, u64 offset, u64 bytenr); | ||
| 1794 | int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); | 1914 | int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); |
| 1795 | struct btrfs_block_group_cache *btrfs_lookup_block_group( | 1915 | struct btrfs_block_group_cache *btrfs_lookup_block_group( |
| 1796 | struct btrfs_fs_info *info, | 1916 | struct btrfs_fs_info *info, |
| 1797 | u64 bytenr); | 1917 | u64 bytenr); |
| 1918 | void btrfs_put_block_group(struct btrfs_block_group_cache *cache); | ||
| 1798 | u64 btrfs_find_block_group(struct btrfs_root *root, | 1919 | u64 btrfs_find_block_group(struct btrfs_root *root, |
| 1799 | u64 search_start, u64 search_hint, int owner); | 1920 | u64 search_start, u64 search_hint, int owner); |
| 1800 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | 1921 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, |
| 1801 | struct btrfs_root *root, | 1922 | struct btrfs_root *root, u32 blocksize, |
| 1802 | u32 blocksize, u64 parent, | 1923 | u64 parent, u64 root_objectid, |
| 1803 | u64 root_objectid, | 1924 | struct btrfs_disk_key *key, int level, |
| 1804 | u64 ref_generation, | 1925 | u64 hint, u64 empty_size); |
| 1805 | int level, | ||
| 1806 | u64 hint, | ||
| 1807 | u64 empty_size); | ||
| 1808 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, | 1926 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, |
| 1809 | struct btrfs_root *root, | 1927 | struct btrfs_root *root, |
| 1810 | u64 bytenr, u32 blocksize, | 1928 | u64 bytenr, u32 blocksize, |
| 1811 | int level); | 1929 | int level); |
| 1812 | int btrfs_alloc_extent(struct btrfs_trans_handle *trans, | 1930 | int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, |
| 1813 | struct btrfs_root *root, | 1931 | struct btrfs_root *root, |
| 1814 | u64 num_bytes, u64 parent, u64 min_bytes, | 1932 | u64 root_objectid, u64 owner, |
| 1815 | u64 root_objectid, u64 ref_generation, | 1933 | u64 offset, struct btrfs_key *ins); |
| 1816 | u64 owner, u64 empty_size, u64 hint_byte, | 1934 | int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, |
| 1817 | u64 search_end, struct btrfs_key *ins, u64 data); | 1935 | struct btrfs_root *root, |
| 1818 | int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | 1936 | u64 root_objectid, u64 owner, u64 offset, |
| 1819 | struct btrfs_root *root, u64 parent, | 1937 | struct btrfs_key *ins); |
| 1820 | u64 root_objectid, u64 ref_generation, | ||
| 1821 | u64 owner, struct btrfs_key *ins); | ||
| 1822 | int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | ||
| 1823 | struct btrfs_root *root, u64 parent, | ||
| 1824 | u64 root_objectid, u64 ref_generation, | ||
| 1825 | u64 owner, struct btrfs_key *ins); | ||
| 1826 | int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | 1938 | int btrfs_reserve_extent(struct btrfs_trans_handle *trans, |
| 1827 | struct btrfs_root *root, | 1939 | struct btrfs_root *root, |
| 1828 | u64 num_bytes, u64 min_alloc_size, | 1940 | u64 num_bytes, u64 min_alloc_size, |
| @@ -1830,18 +1942,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | |||
| 1830 | u64 search_end, struct btrfs_key *ins, | 1942 | u64 search_end, struct btrfs_key *ins, |
| 1831 | u64 data); | 1943 | u64 data); |
| 1832 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1944 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1833 | struct extent_buffer *orig_buf, struct extent_buffer *buf, | 1945 | struct extent_buffer *buf, int full_backref); |
| 1834 | u32 *nr_extents); | 1946 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1835 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1947 | struct extent_buffer *buf, int full_backref); |
| 1836 | struct extent_buffer *buf, u32 nr_extents); | 1948 | int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, |
| 1837 | int btrfs_update_ref(struct btrfs_trans_handle *trans, | 1949 | struct btrfs_root *root, |
| 1838 | struct btrfs_root *root, struct extent_buffer *orig_buf, | 1950 | u64 bytenr, u64 num_bytes, u64 flags, |
| 1839 | struct extent_buffer *buf, int start_slot, int nr); | 1951 | int is_data); |
| 1840 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | 1952 | int btrfs_free_extent(struct btrfs_trans_handle *trans, |
| 1841 | struct btrfs_root *root, | 1953 | struct btrfs_root *root, |
| 1842 | u64 bytenr, u64 num_bytes, u64 parent, | 1954 | u64 bytenr, u64 num_bytes, u64 parent, |
| 1843 | u64 root_objectid, u64 ref_generation, | 1955 | u64 root_objectid, u64 owner, u64 offset); |
| 1844 | u64 owner_objectid, int pin); | 1956 | |
| 1845 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); | 1957 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); |
| 1846 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | 1958 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, |
| 1847 | struct btrfs_root *root, | 1959 | struct btrfs_root *root, |
| @@ -1849,13 +1961,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | |||
| 1849 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 1961 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
| 1850 | struct btrfs_root *root, | 1962 | struct btrfs_root *root, |
| 1851 | u64 bytenr, u64 num_bytes, u64 parent, | 1963 | u64 bytenr, u64 num_bytes, u64 parent, |
| 1852 | u64 root_objectid, u64 ref_generation, | 1964 | u64 root_objectid, u64 owner, u64 offset); |
| 1853 | u64 owner_objectid); | 1965 | |
| 1854 | int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | ||
| 1855 | struct btrfs_root *root, u64 bytenr, u64 num_bytes, | ||
| 1856 | u64 orig_parent, u64 parent, | ||
| 1857 | u64 root_objectid, u64 ref_generation, | ||
| 1858 | u64 owner_objectid); | ||
| 1859 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | 1966 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, |
| 1860 | struct btrfs_root *root); | 1967 | struct btrfs_root *root); |
| 1861 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); | 1968 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); |
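btrfs_free_extent() and btrfs_inc_extent_ref() drop the generation and pin arguments and instead take the parent block, owning root, owner and file offset. A call sketch that simply mirrors the btrfs_del_leaf() hunk earlier in this diff:

    /* free a leaf (level 0) whose reference is keyed by the owning root;
     * parent == 0 means this is not a shared / full-backref reference */
    ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
    			0,				/* parent */
    			root->root_key.objectid,	/* root objectid */
    			0, 0);				/* owner (= level), offset */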
| @@ -1867,16 +1974,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
| 1867 | u64 size); | 1974 | u64 size); |
| 1868 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 1975 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
| 1869 | struct btrfs_root *root, u64 group_start); | 1976 | struct btrfs_root *root, u64 group_start); |
| 1870 | int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); | 1977 | int btrfs_prepare_block_group_relocation(struct btrfs_root *root, |
| 1871 | int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, | 1978 | struct btrfs_block_group_cache *group); |
| 1872 | struct btrfs_root *root); | 1979 | |
| 1873 | int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); | ||
| 1874 | int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, | ||
| 1875 | struct btrfs_root *root, | ||
| 1876 | struct extent_buffer *buf, u64 orig_start); | ||
| 1877 | int btrfs_add_dead_reloc_root(struct btrfs_root *root); | ||
| 1878 | int btrfs_cleanup_reloc_trees(struct btrfs_root *root); | ||
| 1879 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); | ||
| 1880 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); | 1980 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); |
| 1881 | void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); | 1981 | void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); |
| 1882 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info); | 1982 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info); |
| @@ -1891,13 +1991,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, | |||
| 1891 | void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, | 1991 | void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, |
| 1892 | u64 bytes); | 1992 | u64 bytes); |
| 1893 | /* ctree.c */ | 1993 | /* ctree.c */ |
| 1994 | int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, | ||
| 1995 | int level, int *slot); | ||
| 1996 | int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); | ||
| 1894 | int btrfs_previous_item(struct btrfs_root *root, | 1997 | int btrfs_previous_item(struct btrfs_root *root, |
| 1895 | struct btrfs_path *path, u64 min_objectid, | 1998 | struct btrfs_path *path, u64 min_objectid, |
| 1896 | int type); | 1999 | int type); |
| 1897 | int btrfs_merge_path(struct btrfs_trans_handle *trans, | ||
| 1898 | struct btrfs_root *root, | ||
| 1899 | struct btrfs_key *node_keys, | ||
| 1900 | u64 *nodes, int lowest_level); | ||
| 1901 | int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, | 2000 | int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, |
| 1902 | struct btrfs_root *root, struct btrfs_path *path, | 2001 | struct btrfs_root *root, struct btrfs_path *path, |
| 1903 | struct btrfs_key *new_key); | 2002 | struct btrfs_key *new_key); |
| @@ -1918,6 +2017,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
| 1918 | struct btrfs_root *root, | 2017 | struct btrfs_root *root, |
| 1919 | struct extent_buffer *buf, | 2018 | struct extent_buffer *buf, |
| 1920 | struct extent_buffer **cow_ret, u64 new_root_objectid); | 2019 | struct extent_buffer **cow_ret, u64 new_root_objectid); |
| 2020 | int btrfs_block_can_be_shared(struct btrfs_root *root, | ||
| 2021 | struct extent_buffer *buf); | ||
| 1921 | int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root | 2022 | int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root |
| 1922 | *root, struct btrfs_path *path, u32 data_size); | 2023 | *root, struct btrfs_path *path, u32 data_size); |
| 1923 | int btrfs_truncate_item(struct btrfs_trans_handle *trans, | 2024 | int btrfs_truncate_item(struct btrfs_trans_handle *trans, |
| @@ -1944,9 +2045,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level); | |||
| 1944 | 2045 | ||
| 1945 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2046 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1946 | struct btrfs_path *path, int slot, int nr); | 2047 | struct btrfs_path *path, int slot, int nr); |
| 1947 | int btrfs_del_leaf(struct btrfs_trans_handle *trans, | ||
| 1948 | struct btrfs_root *root, | ||
| 1949 | struct btrfs_path *path, u64 bytenr); | ||
| 1950 | static inline int btrfs_del_item(struct btrfs_trans_handle *trans, | 2048 | static inline int btrfs_del_item(struct btrfs_trans_handle *trans, |
| 1951 | struct btrfs_root *root, | 2049 | struct btrfs_root *root, |
| 1952 | struct btrfs_path *path) | 2050 | struct btrfs_path *path) |
| @@ -2005,8 +2103,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct | |||
| 2005 | btrfs_root_item *item, struct btrfs_key *key); | 2103 | btrfs_root_item *item, struct btrfs_key *key); |
| 2006 | int btrfs_search_root(struct btrfs_root *root, u64 search_start, | 2104 | int btrfs_search_root(struct btrfs_root *root, u64 search_start, |
| 2007 | u64 *found_objectid); | 2105 | u64 *found_objectid); |
| 2008 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, | 2106 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); |
| 2009 | struct btrfs_root *latest_root); | 2107 | int btrfs_set_root_node(struct btrfs_root_item *item, |
| 2108 | struct extent_buffer *node); | ||
| 2010 | /* dir-item.c */ | 2109 | /* dir-item.c */ |
| 2011 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, | 2110 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, |
| 2012 | struct btrfs_root *root, const char *name, | 2111 | struct btrfs_root *root, const char *name, |
| @@ -2139,7 +2238,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | |||
| 2139 | int btrfs_readpage(struct file *file, struct page *page); | 2238 | int btrfs_readpage(struct file *file, struct page *page); |
| 2140 | void btrfs_delete_inode(struct inode *inode); | 2239 | void btrfs_delete_inode(struct inode *inode); |
| 2141 | void btrfs_put_inode(struct inode *inode); | 2240 | void btrfs_put_inode(struct inode *inode); |
| 2142 | void btrfs_read_locked_inode(struct inode *inode); | ||
| 2143 | int btrfs_write_inode(struct inode *inode, int wait); | 2241 | int btrfs_write_inode(struct inode *inode, int wait); |
| 2144 | void btrfs_dirty_inode(struct inode *inode); | 2242 | void btrfs_dirty_inode(struct inode *inode); |
| 2145 | struct inode *btrfs_alloc_inode(struct super_block *sb); | 2243 | struct inode *btrfs_alloc_inode(struct super_block *sb); |
| @@ -2147,12 +2245,8 @@ void btrfs_destroy_inode(struct inode *inode); | |||
| 2147 | int btrfs_init_cachep(void); | 2245 | int btrfs_init_cachep(void); |
| 2148 | void btrfs_destroy_cachep(void); | 2246 | void btrfs_destroy_cachep(void); |
| 2149 | long btrfs_ioctl_trans_end(struct file *file); | 2247 | long btrfs_ioctl_trans_end(struct file *file); |
| 2150 | struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, | ||
| 2151 | struct btrfs_root *root, int wait); | ||
| 2152 | struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | ||
| 2153 | struct btrfs_root *root); | ||
| 2154 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | 2248 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, |
| 2155 | struct btrfs_root *root, int *is_new); | 2249 | struct btrfs_root *root); |
| 2156 | int btrfs_commit_write(struct file *file, struct page *page, | 2250 | int btrfs_commit_write(struct file *file, struct page *page, |
| 2157 | unsigned from, unsigned to); | 2251 | unsigned from, unsigned to); |
| 2158 | struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, | 2252 | struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, |
| @@ -2168,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size); | |||
| 2168 | 2262 | ||
| 2169 | /* ioctl.c */ | 2263 | /* ioctl.c */ |
| 2170 | long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); | 2264 | long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); |
| 2265 | void btrfs_update_iflags(struct inode *inode); | ||
| 2266 | void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); | ||
| 2171 | 2267 | ||
| 2172 | /* file.c */ | 2268 | /* file.c */ |
| 2173 | int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); | 2269 | int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); |
| @@ -2205,8 +2301,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); | |||
| 2205 | int btrfs_sync_fs(struct super_block *sb, int wait); | 2301 | int btrfs_sync_fs(struct super_block *sb, int wait); |
| 2206 | 2302 | ||
| 2207 | /* acl.c */ | 2303 | /* acl.c */ |
| 2304 | #ifdef CONFIG_FS_POSIX_ACL | ||
| 2208 | int btrfs_check_acl(struct inode *inode, int mask); | 2305 | int btrfs_check_acl(struct inode *inode, int mask); |
| 2306 | #else | ||
| 2307 | #define btrfs_check_acl NULL | ||
| 2308 | #endif | ||
| 2209 | int btrfs_init_acl(struct inode *inode, struct inode *dir); | 2309 | int btrfs_init_acl(struct inode *inode, struct inode *dir); |
| 2210 | int btrfs_acl_chmod(struct inode *inode); | 2310 | int btrfs_acl_chmod(struct inode *inode); |
| 2211 | 2311 | ||
| 2312 | /* relocation.c */ | ||
| 2313 | int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); | ||
| 2314 | int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, | ||
| 2315 | struct btrfs_root *root); | ||
| 2316 | int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, | ||
| 2317 | struct btrfs_root *root); | ||
| 2318 | int btrfs_recover_relocation(struct btrfs_root *root); | ||
| 2319 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); | ||
| 2212 | #endif | 2320 | #endif |
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d6c01c096a40..84e6781413b1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c | |||
| @@ -29,27 +29,87 @@ | |||
| 29 | * add extents in the middle of btrfs_search_slot, and it allows | 29 | * add extents in the middle of btrfs_search_slot, and it allows |
| 30 | * us to buffer up frequently modified backrefs in an rb tree instead | 30 | * us to buffer up frequently modified backrefs in an rb tree instead |
| 31 | * of hammering updates on the extent allocation tree. | 31 | * of hammering updates on the extent allocation tree. |
| 32 | * | ||
| 33 | * Right now this code is only used for reference counted trees, but | ||
| 34 | * the long term goal is to get rid of the similar code for delayed | ||
| 35 | * extent tree modifications. | ||
| 36 | */ | 32 | */ |
| 37 | 33 | ||
| 38 | /* | 34 | /* |
| 39 | * entries in the rb tree are ordered by the byte number of the extent | 35 | * compare two delayed tree backrefs with same bytenr and type |
| 40 | * and by the byte number of the parent block. | 36 | */ |
| 37 | static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, | ||
| 38 | struct btrfs_delayed_tree_ref *ref1) | ||
| 39 | { | ||
| 40 | if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { | ||
| 41 | if (ref1->root < ref2->root) | ||
| 42 | return -1; | ||
| 43 | if (ref1->root > ref2->root) | ||
| 44 | return 1; | ||
| 45 | } else { | ||
| 46 | if (ref1->parent < ref2->parent) | ||
| 47 | return -1; | ||
| 48 | if (ref1->parent > ref2->parent) | ||
| 49 | return 1; | ||
| 50 | } | ||
| 51 | return 0; | ||
| 52 | } | ||
| 53 | |||
| 54 | /* | ||
| 55 | * compare two delayed data backrefs with same bytenr and type | ||
| 41 | */ | 56 | */ |
| 42 | static int comp_entry(struct btrfs_delayed_ref_node *ref, | 57 | static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, |
| 43 | u64 bytenr, u64 parent) | 58 | struct btrfs_delayed_data_ref *ref1) |
| 44 | { | 59 | { |
| 45 | if (bytenr < ref->bytenr) | 60 | if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { |
| 61 | if (ref1->root < ref2->root) | ||
| 62 | return -1; | ||
| 63 | if (ref1->root > ref2->root) | ||
| 64 | return 1; | ||
| 65 | if (ref1->objectid < ref2->objectid) | ||
| 66 | return -1; | ||
| 67 | if (ref1->objectid > ref2->objectid) | ||
| 68 | return 1; | ||
| 69 | if (ref1->offset < ref2->offset) | ||
| 70 | return -1; | ||
| 71 | if (ref1->offset > ref2->offset) | ||
| 72 | return 1; | ||
| 73 | } else { | ||
| 74 | if (ref1->parent < ref2->parent) | ||
| 75 | return -1; | ||
| 76 | if (ref1->parent > ref2->parent) | ||
| 77 | return 1; | ||
| 78 | } | ||
| 79 | return 0; | ||
| 80 | } | ||
| 81 | |||
| 82 | /* | ||
| 83 | * entries in the rb tree are ordered by the byte number of the extent, | ||
| 84 | * type of the delayed backrefs and content of delayed backrefs. | ||
| 85 | */ | ||
| 86 | static int comp_entry(struct btrfs_delayed_ref_node *ref2, | ||
| 87 | struct btrfs_delayed_ref_node *ref1) | ||
| 88 | { | ||
| 89 | if (ref1->bytenr < ref2->bytenr) | ||
| 46 | return -1; | 90 | return -1; |
| 47 | if (bytenr > ref->bytenr) | 91 | if (ref1->bytenr > ref2->bytenr) |
| 48 | return 1; | 92 | return 1; |
| 49 | if (parent < ref->parent) | 93 | if (ref1->is_head && ref2->is_head) |
| 94 | return 0; | ||
| 95 | if (ref2->is_head) | ||
| 50 | return -1; | 96 | return -1; |
| 51 | if (parent > ref->parent) | 97 | if (ref1->is_head) |
| 52 | return 1; | 98 | return 1; |
| 99 | if (ref1->type < ref2->type) | ||
| 100 | return -1; | ||
| 101 | if (ref1->type > ref2->type) | ||
| 102 | return 1; | ||
| 103 | if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || | ||
| 104 | ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { | ||
| 105 | return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), | ||
| 106 | btrfs_delayed_node_to_tree_ref(ref1)); | ||
| 107 | } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || | ||
| 108 | ref1->type == BTRFS_SHARED_DATA_REF_KEY) { | ||
| 109 | return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), | ||
| 110 | btrfs_delayed_node_to_data_ref(ref1)); | ||
| 111 | } | ||
| 112 | BUG(); | ||
| 53 | return 0; | 113 | return 0; |
| 54 | } | 114 | } |
| 55 | 115 | ||
| @@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref, | |||
| 59 | * inserted. | 119 | * inserted. |
| 60 | */ | 120 | */ |
| 61 | static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, | 121 | static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, |
| 62 | u64 bytenr, u64 parent, | ||
| 63 | struct rb_node *node) | 122 | struct rb_node *node) |
| 64 | { | 123 | { |
| 65 | struct rb_node **p = &root->rb_node; | 124 | struct rb_node **p = &root->rb_node; |
| 66 | struct rb_node *parent_node = NULL; | 125 | struct rb_node *parent_node = NULL; |
| 67 | struct btrfs_delayed_ref_node *entry; | 126 | struct btrfs_delayed_ref_node *entry; |
| 127 | struct btrfs_delayed_ref_node *ins; | ||
| 68 | int cmp; | 128 | int cmp; |
| 69 | 129 | ||
| 130 | ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
| 70 | while (*p) { | 131 | while (*p) { |
| 71 | parent_node = *p; | 132 | parent_node = *p; |
| 72 | entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, | 133 | entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, |
| 73 | rb_node); | 134 | rb_node); |
| 74 | 135 | ||
| 75 | cmp = comp_entry(entry, bytenr, parent); | 136 | cmp = comp_entry(entry, ins); |
| 76 | if (cmp < 0) | 137 | if (cmp < 0) |
| 77 | p = &(*p)->rb_left; | 138 | p = &(*p)->rb_left; |
| 78 | else if (cmp > 0) | 139 | else if (cmp > 0) |
| @@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, | |||
| 81 | return entry; | 142 | return entry; |
| 82 | } | 143 | } |
| 83 | 144 | ||
| 84 | entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
| 85 | rb_link_node(node, parent_node, p); | 145 | rb_link_node(node, parent_node, p); |
| 86 | rb_insert_color(node, root); | 146 | rb_insert_color(node, root); |
| 87 | return NULL; | 147 | return NULL; |
| 88 | } | 148 | } |
| 89 | 149 | ||
| 90 | /* | 150 | /* |
| 91 | * find an entry based on (bytenr,parent). This returns the delayed | 151 | * find a head entry based on bytenr. This returns the delayed ref |
| 92 | * ref if it was able to find one, or NULL if nothing was in that spot | 152 | * head if it was able to find one, or NULL if nothing was in that spot |
| 93 | */ | 153 | */ |
| 94 | static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, | 154 | static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, |
| 95 | u64 bytenr, u64 parent, | 155 | u64 bytenr, |
| 96 | struct btrfs_delayed_ref_node **last) | 156 | struct btrfs_delayed_ref_node **last) |
| 97 | { | 157 | { |
| 98 | struct rb_node *n = root->rb_node; | 158 | struct rb_node *n = root->rb_node; |
| @@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, | |||
| 105 | if (last) | 165 | if (last) |
| 106 | *last = entry; | 166 | *last = entry; |
| 107 | 167 | ||
| 108 | cmp = comp_entry(entry, bytenr, parent); | 168 | if (bytenr < entry->bytenr) |
| 169 | cmp = -1; | ||
| 170 | else if (bytenr > entry->bytenr) | ||
| 171 | cmp = 1; | ||
| 172 | else if (!btrfs_delayed_ref_is_head(entry)) | ||
| 173 | cmp = 1; | ||
| 174 | else | ||
| 175 | cmp = 0; | ||
| 176 | |||
| 109 | if (cmp < 0) | 177 | if (cmp < 0) |
| 110 | n = n->rb_left; | 178 | n = n->rb_left; |
| 111 | else if (cmp > 0) | 179 | else if (cmp > 0) |
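The rework above leaves the rbtree ordered by bytenr, with each head node sorted after the individual refs that share its bytenr; that is why find_ref_head treats a non-head entry at the same bytenr as smaller and keeps descending right. A rough model of that invariant (the function name is made up, and same-type ties defer to comp_tree_refs/comp_data_refs shown earlier) might look like:

static int delayed_ref_order_sketch(struct btrfs_delayed_ref_node *a,
				    struct btrfs_delayed_ref_node *b)
{
	/* primary key: the extent's starting bytenr */
	if (a->bytenr != b->bytenr)
		return a->bytenr < b->bytenr ? -1 : 1;
	/* the head node sorts after the refs that belong to it */
	if (a->is_head != b->is_head)
		return a->is_head ? 1 : -1;
	/* group refs of the same kind together */
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	/* same kind: comp_tree_refs()/comp_data_refs() break the tie */
	return 0;
}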
| @@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | |||
| 154 | node = rb_first(&delayed_refs->root); | 222 | node = rb_first(&delayed_refs->root); |
| 155 | } else { | 223 | } else { |
| 156 | ref = NULL; | 224 | ref = NULL; |
| 157 | tree_search(&delayed_refs->root, start, (u64)-1, &ref); | 225 | find_ref_head(&delayed_refs->root, start, &ref); |
| 158 | if (ref) { | 226 | if (ref) { |
| 159 | struct btrfs_delayed_ref_node *tmp; | 227 | struct btrfs_delayed_ref_node *tmp; |
| 160 | 228 | ||
| @@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) | |||
| 234 | delayed_refs = &trans->transaction->delayed_refs; | 302 | delayed_refs = &trans->transaction->delayed_refs; |
| 235 | spin_lock(&delayed_refs->lock); | 303 | spin_lock(&delayed_refs->lock); |
| 236 | 304 | ||
| 237 | ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); | 305 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL); |
| 238 | if (ref) { | 306 | if (ref) { |
| 239 | prev_node = rb_prev(&ref->rb_node); | 307 | prev_node = rb_prev(&ref->rb_node); |
| 240 | if (!prev_node) | 308 | if (!prev_node) |
| @@ -250,25 +318,28 @@ out: | |||
| 250 | } | 318 | } |
| 251 | 319 | ||
| 252 | /* | 320 | /* |
| 253 | * helper function to lookup reference count | 321 | * helper function to lookup reference count and flags of extent. |
| 254 | * | 322 | * |
| 255 | * the head node for delayed ref is used to store the sum of all the | 323 | * the head node for delayed ref is used to store the sum of all the |
| 256 | * reference count modifications queued up in the rbtree. This way you | 324 | * reference count modifications queued up in the rbtree. the head |
| 257 | * can check to see what the reference count would be if all of the | 325 | * node may also store the extent flags to set. This way you can check |
| 258 | * delayed refs are processed. | 326 | * to see what the reference count and extent flags would be if all of |
| 327 | * the delayed refs are processed. | ||
| 259 | */ | 328 | */ |
| 260 | int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, | 329 | int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, |
| 261 | struct btrfs_root *root, u64 bytenr, | 330 | struct btrfs_root *root, u64 bytenr, |
| 262 | u64 num_bytes, u32 *refs) | 331 | u64 num_bytes, u64 *refs, u64 *flags) |
| 263 | { | 332 | { |
| 264 | struct btrfs_delayed_ref_node *ref; | 333 | struct btrfs_delayed_ref_node *ref; |
| 265 | struct btrfs_delayed_ref_head *head; | 334 | struct btrfs_delayed_ref_head *head; |
| 266 | struct btrfs_delayed_ref_root *delayed_refs; | 335 | struct btrfs_delayed_ref_root *delayed_refs; |
| 267 | struct btrfs_path *path; | 336 | struct btrfs_path *path; |
| 268 | struct extent_buffer *leaf; | ||
| 269 | struct btrfs_extent_item *ei; | 337 | struct btrfs_extent_item *ei; |
| 338 | struct extent_buffer *leaf; | ||
| 270 | struct btrfs_key key; | 339 | struct btrfs_key key; |
| 271 | u32 num_refs; | 340 | u32 item_size; |
| 341 | u64 num_refs; | ||
| 342 | u64 extent_flags; | ||
| 272 | int ret; | 343 | int ret; |
| 273 | 344 | ||
| 274 | path = btrfs_alloc_path(); | 345 | path = btrfs_alloc_path(); |
| @@ -287,37 +358,60 @@ again: | |||
| 287 | 358 | ||
| 288 | if (ret == 0) { | 359 | if (ret == 0) { |
| 289 | leaf = path->nodes[0]; | 360 | leaf = path->nodes[0]; |
| 290 | ei = btrfs_item_ptr(leaf, path->slots[0], | 361 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); |
| 291 | struct btrfs_extent_item); | 362 | if (item_size >= sizeof(*ei)) { |
| 292 | num_refs = btrfs_extent_refs(leaf, ei); | 363 | ei = btrfs_item_ptr(leaf, path->slots[0], |
| 364 | struct btrfs_extent_item); | ||
| 365 | num_refs = btrfs_extent_refs(leaf, ei); | ||
| 366 | extent_flags = btrfs_extent_flags(leaf, ei); | ||
| 367 | } else { | ||
| 368 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 369 | struct btrfs_extent_item_v0 *ei0; | ||
| 370 | BUG_ON(item_size != sizeof(*ei0)); | ||
| 371 | ei0 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 372 | struct btrfs_extent_item_v0); | ||
| 373 | num_refs = btrfs_extent_refs_v0(leaf, ei0); | ||
| 374 | /* FIXME: this isn't correct for data */ | ||
| 375 | extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
| 376 | #else | ||
| 377 | BUG(); | ||
| 378 | #endif | ||
| 379 | } | ||
| 380 | BUG_ON(num_refs == 0); | ||
| 293 | } else { | 381 | } else { |
| 294 | num_refs = 0; | 382 | num_refs = 0; |
| 383 | extent_flags = 0; | ||
| 295 | ret = 0; | 384 | ret = 0; |
| 296 | } | 385 | } |
| 297 | 386 | ||
| 298 | spin_lock(&delayed_refs->lock); | 387 | spin_lock(&delayed_refs->lock); |
| 299 | ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); | 388 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL); |
| 300 | if (ref) { | 389 | if (ref) { |
| 301 | head = btrfs_delayed_node_to_head(ref); | 390 | head = btrfs_delayed_node_to_head(ref); |
| 302 | if (mutex_trylock(&head->mutex)) { | 391 | if (!mutex_trylock(&head->mutex)) { |
| 303 | num_refs += ref->ref_mod; | 392 | atomic_inc(&ref->refs); |
| 304 | mutex_unlock(&head->mutex); | 393 | spin_unlock(&delayed_refs->lock); |
| 305 | *refs = num_refs; | ||
| 306 | goto out; | ||
| 307 | } | ||
| 308 | 394 | ||
| 309 | atomic_inc(&ref->refs); | 395 | btrfs_release_path(root->fs_info->extent_root, path); |
| 310 | spin_unlock(&delayed_refs->lock); | ||
| 311 | 396 | ||
| 312 | btrfs_release_path(root->fs_info->extent_root, path); | 397 | mutex_lock(&head->mutex); |
| 398 | mutex_unlock(&head->mutex); | ||
| 399 | btrfs_put_delayed_ref(ref); | ||
| 400 | goto again; | ||
| 401 | } | ||
| 402 | if (head->extent_op && head->extent_op->update_flags) | ||
| 403 | extent_flags |= head->extent_op->flags_to_set; | ||
| 404 | else | ||
| 405 | BUG_ON(num_refs == 0); | ||
| 313 | 406 | ||
| 314 | mutex_lock(&head->mutex); | 407 | num_refs += ref->ref_mod; |
| 315 | mutex_unlock(&head->mutex); | 408 | mutex_unlock(&head->mutex); |
| 316 | btrfs_put_delayed_ref(ref); | ||
| 317 | goto again; | ||
| 318 | } else { | ||
| 319 | *refs = num_refs; | ||
| 320 | } | 409 | } |
| 410 | WARN_ON(num_refs == 0); | ||
| 411 | if (refs) | ||
| 412 | *refs = num_refs; | ||
| 413 | if (flags) | ||
| 414 | *flags = extent_flags; | ||
| 321 | out: | 415 | out: |
| 322 | spin_unlock(&delayed_refs->lock); | 416 | spin_unlock(&delayed_refs->lock); |
| 323 | btrfs_free_path(path); | 417 | btrfs_free_path(path); |
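As a rough illustration of the reworked lookup (the wrapper name and arguments below are hypothetical), a caller could drive it like this; either output pointer may be passed as NULL to skip that result, matching the if (refs)/if (flags) tail above:

static int lookup_extent_sketch(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 bytenr, u64 num_bytes)
{
	u64 refs = 0;
	u64 flags = 0;
	int ret;

	ret = btrfs_lookup_extent_info(trans, root, bytenr, num_bytes,
				       &refs, &flags);
	if (ret)
		return ret;
	/*
	 * refs and flags now reflect the extent as if every queued
	 * delayed ref had already been run.
	 */
	return 0;
}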
| @@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, | |||
| 338 | struct btrfs_delayed_ref_node *existing, | 432 | struct btrfs_delayed_ref_node *existing, |
| 339 | struct btrfs_delayed_ref_node *update) | 433 | struct btrfs_delayed_ref_node *update) |
| 340 | { | 434 | { |
| 341 | struct btrfs_delayed_ref *existing_ref; | 435 | if (update->action != existing->action) { |
| 342 | struct btrfs_delayed_ref *ref; | ||
| 343 | |||
| 344 | existing_ref = btrfs_delayed_node_to_ref(existing); | ||
| 345 | ref = btrfs_delayed_node_to_ref(update); | ||
| 346 | |||
| 347 | if (ref->pin) | ||
| 348 | existing_ref->pin = 1; | ||
| 349 | |||
| 350 | if (ref->action != existing_ref->action) { | ||
| 351 | /* | 436 | /* |
| 352 | * this is effectively undoing either an add or a | 437 | * this is effectively undoing either an add or a |
| 353 | * drop. We decrement the ref_mod, and if it goes | 438 | * drop. We decrement the ref_mod, and if it goes |
| @@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, | |||
| 363 | delayed_refs->num_entries--; | 448 | delayed_refs->num_entries--; |
| 364 | if (trans->delayed_ref_updates) | 449 | if (trans->delayed_ref_updates) |
| 365 | trans->delayed_ref_updates--; | 450 | trans->delayed_ref_updates--; |
| 451 | } else { | ||
| 452 | WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || | ||
| 453 | existing->type == BTRFS_SHARED_BLOCK_REF_KEY); | ||
| 366 | } | 454 | } |
| 367 | } else { | 455 | } else { |
| 368 | if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { | 456 | WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || |
| 369 | /* if we're adding refs, make sure all the | 457 | existing->type == BTRFS_SHARED_BLOCK_REF_KEY); |
| 370 | * details match up. The extent could | ||
| 371 | * have been totally freed and reallocated | ||
| 372 | * by a different owner before the delayed | ||
| 373 | * ref entries were removed. | ||
| 374 | */ | ||
| 375 | existing_ref->owner_objectid = ref->owner_objectid; | ||
| 376 | existing_ref->generation = ref->generation; | ||
| 377 | existing_ref->root = ref->root; | ||
| 378 | existing->num_bytes = update->num_bytes; | ||
| 379 | } | ||
| 380 | /* | 458 | /* |
| 381 | * the action on the existing ref matches | 459 | * the action on the existing ref matches |
| 382 | * the action on the ref we're trying to add. | 460 | * the action on the ref we're trying to add. |
| @@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
| 401 | 479 | ||
| 402 | existing_ref = btrfs_delayed_node_to_head(existing); | 480 | existing_ref = btrfs_delayed_node_to_head(existing); |
| 403 | ref = btrfs_delayed_node_to_head(update); | 481 | ref = btrfs_delayed_node_to_head(update); |
| 482 | BUG_ON(existing_ref->is_data != ref->is_data); | ||
| 404 | 483 | ||
| 405 | if (ref->must_insert_reserved) { | 484 | if (ref->must_insert_reserved) { |
| 406 | /* if the extent was freed and then | 485 | /* if the extent was freed and then |
| @@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
| 420 | 499 | ||
| 421 | } | 500 | } |
| 422 | 501 | ||
| 502 | if (ref->extent_op) { | ||
| 503 | if (!existing_ref->extent_op) { | ||
| 504 | existing_ref->extent_op = ref->extent_op; | ||
| 505 | } else { | ||
| 506 | if (ref->extent_op->update_key) { | ||
| 507 | memcpy(&existing_ref->extent_op->key, | ||
| 508 | &ref->extent_op->key, | ||
| 509 | sizeof(ref->extent_op->key)); | ||
| 510 | existing_ref->extent_op->update_key = 1; | ||
| 511 | } | ||
| 512 | if (ref->extent_op->update_flags) { | ||
| 513 | existing_ref->extent_op->flags_to_set |= | ||
| 514 | ref->extent_op->flags_to_set; | ||
| 515 | existing_ref->extent_op->update_flags = 1; | ||
| 516 | } | ||
| 517 | kfree(ref->extent_op); | ||
| 518 | } | ||
| 519 | } | ||
| 423 | /* | 520 | /* |
| 424 | * update the reference mod on the head to reflect this new operation | 521 | * update the reference mod on the head to reflect this new operation |
| 425 | */ | 522 | */ |
| @@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
| 427 | } | 524 | } |
| 428 | 525 | ||
| 429 | /* | 526 | /* |
| 430 | * helper function to actually insert a delayed ref into the rbtree. | 527 | * helper function to actually insert a head node into the rbtree. |
| 431 | * this does all the dirty work in terms of maintaining the correct | 528 | * this does all the dirty work in terms of maintaining the correct |
| 432 | * overall modification count in the head node and properly dealing | 529 | * overall modification count. |
| 433 | * with updating existing nodes as new modifications are queued. | ||
| 434 | */ | 530 | */ |
| 435 | static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | 531 | static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, |
| 436 | struct btrfs_delayed_ref_node *ref, | 532 | struct btrfs_delayed_ref_node *ref, |
| 437 | u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, | 533 | u64 bytenr, u64 num_bytes, |
| 438 | u64 ref_generation, u64 owner_objectid, int action, | 534 | int action, int is_data) |
| 439 | int pin) | ||
| 440 | { | 535 | { |
| 441 | struct btrfs_delayed_ref_node *existing; | 536 | struct btrfs_delayed_ref_node *existing; |
| 442 | struct btrfs_delayed_ref *full_ref; | ||
| 443 | struct btrfs_delayed_ref_head *head_ref = NULL; | 537 | struct btrfs_delayed_ref_head *head_ref = NULL; |
| 444 | struct btrfs_delayed_ref_root *delayed_refs; | 538 | struct btrfs_delayed_ref_root *delayed_refs; |
| 445 | int count_mod = 1; | 539 | int count_mod = 1; |
| @@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 449 | * the head node stores the sum of all the mods, so dropping a ref | 543 | * the head node stores the sum of all the mods, so dropping a ref |
| 450 | * should drop the sum in the head node by one. | 544 | * should drop the sum in the head node by one. |
| 451 | */ | 545 | */ |
| 452 | if (parent == (u64)-1) { | 546 | if (action == BTRFS_UPDATE_DELAYED_HEAD) |
| 453 | if (action == BTRFS_DROP_DELAYED_REF) | 547 | count_mod = 0; |
| 454 | count_mod = -1; | 548 | else if (action == BTRFS_DROP_DELAYED_REF) |
| 455 | else if (action == BTRFS_UPDATE_DELAYED_HEAD) | 549 | count_mod = -1; |
| 456 | count_mod = 0; | ||
| 457 | } | ||
| 458 | 550 | ||
| 459 | /* | 551 | /* |
| 460 | * BTRFS_ADD_DELAYED_EXTENT means that we need to update | 552 | * BTRFS_ADD_DELAYED_EXTENT means that we need to update |
| @@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 467 | * Once we record must_insert_reserved, switch the action to | 559 | * Once we record must_insert_reserved, switch the action to |
| 468 | * BTRFS_ADD_DELAYED_REF because other special casing is not required. | 560 | * BTRFS_ADD_DELAYED_REF because other special casing is not required. |
| 469 | */ | 561 | */ |
| 470 | if (action == BTRFS_ADD_DELAYED_EXTENT) { | 562 | if (action == BTRFS_ADD_DELAYED_EXTENT) |
| 471 | must_insert_reserved = 1; | 563 | must_insert_reserved = 1; |
| 472 | action = BTRFS_ADD_DELAYED_REF; | 564 | else |
| 473 | } else { | ||
| 474 | must_insert_reserved = 0; | 565 | must_insert_reserved = 0; |
| 475 | } | ||
| 476 | |||
| 477 | 566 | ||
| 478 | delayed_refs = &trans->transaction->delayed_refs; | 567 | delayed_refs = &trans->transaction->delayed_refs; |
| 479 | 568 | ||
| 480 | /* first set the basic ref node struct up */ | 569 | /* first set the basic ref node struct up */ |
| 481 | atomic_set(&ref->refs, 1); | 570 | atomic_set(&ref->refs, 1); |
| 482 | ref->bytenr = bytenr; | 571 | ref->bytenr = bytenr; |
| 483 | ref->parent = parent; | 572 | ref->num_bytes = num_bytes; |
| 484 | ref->ref_mod = count_mod; | 573 | ref->ref_mod = count_mod; |
| 574 | ref->type = 0; | ||
| 575 | ref->action = 0; | ||
| 576 | ref->is_head = 1; | ||
| 485 | ref->in_tree = 1; | 577 | ref->in_tree = 1; |
| 578 | |||
| 579 | head_ref = btrfs_delayed_node_to_head(ref); | ||
| 580 | head_ref->must_insert_reserved = must_insert_reserved; | ||
| 581 | head_ref->is_data = is_data; | ||
| 582 | |||
| 583 | INIT_LIST_HEAD(&head_ref->cluster); | ||
| 584 | mutex_init(&head_ref->mutex); | ||
| 585 | |||
| 586 | existing = tree_insert(&delayed_refs->root, &ref->rb_node); | ||
| 587 | |||
| 588 | if (existing) { | ||
| 589 | update_existing_head_ref(existing, ref); | ||
| 590 | /* | ||
| 591 | * we've updated the existing ref, free the newly | ||
| 592 | * allocated ref | ||
| 593 | */ | ||
| 594 | kfree(ref); | ||
| 595 | } else { | ||
| 596 | delayed_refs->num_heads++; | ||
| 597 | delayed_refs->num_heads_ready++; | ||
| 598 | delayed_refs->num_entries++; | ||
| 599 | trans->delayed_ref_updates++; | ||
| 600 | } | ||
| 601 | return 0; | ||
| 602 | } | ||
| 603 | |||
| 604 | /* | ||
| 605 | * helper to insert a delayed tree ref into the rbtree. | ||
| 606 | */ | ||
| 607 | static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, | ||
| 608 | struct btrfs_delayed_ref_node *ref, | ||
| 609 | u64 bytenr, u64 num_bytes, u64 parent, | ||
| 610 | u64 ref_root, int level, int action) | ||
| 611 | { | ||
| 612 | struct btrfs_delayed_ref_node *existing; | ||
| 613 | struct btrfs_delayed_tree_ref *full_ref; | ||
| 614 | struct btrfs_delayed_ref_root *delayed_refs; | ||
| 615 | |||
| 616 | if (action == BTRFS_ADD_DELAYED_EXTENT) | ||
| 617 | action = BTRFS_ADD_DELAYED_REF; | ||
| 618 | |||
| 619 | delayed_refs = &trans->transaction->delayed_refs; | ||
| 620 | |||
| 621 | /* first set the basic ref node struct up */ | ||
| 622 | atomic_set(&ref->refs, 1); | ||
| 623 | ref->bytenr = bytenr; | ||
| 486 | ref->num_bytes = num_bytes; | 624 | ref->num_bytes = num_bytes; |
| 625 | ref->ref_mod = 1; | ||
| 626 | ref->action = action; | ||
| 627 | ref->is_head = 0; | ||
| 628 | ref->in_tree = 1; | ||
| 487 | 629 | ||
| 488 | if (btrfs_delayed_ref_is_head(ref)) { | 630 | full_ref = btrfs_delayed_node_to_tree_ref(ref); |
| 489 | head_ref = btrfs_delayed_node_to_head(ref); | 631 | if (parent) { |
| 490 | head_ref->must_insert_reserved = must_insert_reserved; | 632 | full_ref->parent = parent; |
| 491 | INIT_LIST_HEAD(&head_ref->cluster); | 633 | ref->type = BTRFS_SHARED_BLOCK_REF_KEY; |
| 492 | mutex_init(&head_ref->mutex); | ||
| 493 | } else { | 634 | } else { |
| 494 | full_ref = btrfs_delayed_node_to_ref(ref); | ||
| 495 | full_ref->root = ref_root; | 635 | full_ref->root = ref_root; |
| 496 | full_ref->generation = ref_generation; | 636 | ref->type = BTRFS_TREE_BLOCK_REF_KEY; |
| 497 | full_ref->owner_objectid = owner_objectid; | ||
| 498 | full_ref->pin = pin; | ||
| 499 | full_ref->action = action; | ||
| 500 | } | 637 | } |
| 638 | full_ref->level = level; | ||
| 501 | 639 | ||
| 502 | existing = tree_insert(&delayed_refs->root, bytenr, | 640 | existing = tree_insert(&delayed_refs->root, &ref->rb_node); |
| 503 | parent, &ref->rb_node); | ||
| 504 | 641 | ||
| 505 | if (existing) { | 642 | if (existing) { |
| 506 | if (btrfs_delayed_ref_is_head(ref)) | 643 | update_existing_ref(trans, delayed_refs, existing, ref); |
| 507 | update_existing_head_ref(existing, ref); | 644 | /* |
| 508 | else | 645 | * we've updated the existing ref, free the newly |
| 509 | update_existing_ref(trans, delayed_refs, existing, ref); | 646 | * allocated ref |
| 647 | */ | ||
| 648 | kfree(ref); | ||
| 649 | } else { | ||
| 650 | delayed_refs->num_entries++; | ||
| 651 | trans->delayed_ref_updates++; | ||
| 652 | } | ||
| 653 | return 0; | ||
| 654 | } | ||
| 655 | |||
| 656 | /* | ||
| 657 | * helper to insert a delayed data ref into the rbtree. | ||
| 658 | */ | ||
| 659 | static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, | ||
| 660 | struct btrfs_delayed_ref_node *ref, | ||
| 661 | u64 bytenr, u64 num_bytes, u64 parent, | ||
| 662 | u64 ref_root, u64 owner, u64 offset, | ||
| 663 | int action) | ||
| 664 | { | ||
| 665 | struct btrfs_delayed_ref_node *existing; | ||
| 666 | struct btrfs_delayed_data_ref *full_ref; | ||
| 667 | struct btrfs_delayed_ref_root *delayed_refs; | ||
| 668 | |||
| 669 | if (action == BTRFS_ADD_DELAYED_EXTENT) | ||
| 670 | action = BTRFS_ADD_DELAYED_REF; | ||
| 671 | |||
| 672 | delayed_refs = &trans->transaction->delayed_refs; | ||
| 673 | |||
| 674 | /* first set the basic ref node struct up */ | ||
| 675 | atomic_set(&ref->refs, 1); | ||
| 676 | ref->bytenr = bytenr; | ||
| 677 | ref->num_bytes = num_bytes; | ||
| 678 | ref->ref_mod = 1; | ||
| 679 | ref->action = action; | ||
| 680 | ref->is_head = 0; | ||
| 681 | ref->in_tree = 1; | ||
| 682 | |||
| 683 | full_ref = btrfs_delayed_node_to_data_ref(ref); | ||
| 684 | if (parent) { | ||
| 685 | full_ref->parent = parent; | ||
| 686 | ref->type = BTRFS_SHARED_DATA_REF_KEY; | ||
| 687 | } else { | ||
| 688 | full_ref->root = ref_root; | ||
| 689 | ref->type = BTRFS_EXTENT_DATA_REF_KEY; | ||
| 690 | } | ||
| 691 | full_ref->objectid = owner; | ||
| 692 | full_ref->offset = offset; | ||
| 510 | 693 | ||
| 694 | existing = tree_insert(&delayed_refs->root, &ref->rb_node); | ||
| 695 | |||
| 696 | if (existing) { | ||
| 697 | update_existing_ref(trans, delayed_refs, existing, ref); | ||
| 511 | /* | 698 | /* |
| 512 | * we've updated the existing ref, free the newly | 699 | * we've updated the existing ref, free the newly |
| 513 | * allocated ref | 700 | * allocated ref |
| 514 | */ | 701 | */ |
| 515 | kfree(ref); | 702 | kfree(ref); |
| 516 | } else { | 703 | } else { |
| 517 | if (btrfs_delayed_ref_is_head(ref)) { | ||
| 518 | delayed_refs->num_heads++; | ||
| 519 | delayed_refs->num_heads_ready++; | ||
| 520 | } | ||
| 521 | delayed_refs->num_entries++; | 704 | delayed_refs->num_entries++; |
| 522 | trans->delayed_ref_updates++; | 705 | trans->delayed_ref_updates++; |
| 523 | } | 706 | } |
| @@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 525 | } | 708 | } |
| 526 | 709 | ||
| 527 | /* | 710 | /* |
| 528 | * add a delayed ref to the tree. This does all of the accounting required | 711 | * add a delayed tree ref. This does all of the accounting required |
| 529 | * to make sure the delayed ref is eventually processed before this | 712 | * to make sure the delayed ref is eventually processed before this |
| 530 | * transaction commits. | 713 | * transaction commits. |
| 531 | */ | 714 | */ |
| 532 | int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | 715 | int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, |
| 533 | u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, | 716 | u64 bytenr, u64 num_bytes, u64 parent, |
| 534 | u64 ref_generation, u64 owner_objectid, int action, | 717 | u64 ref_root, int level, int action, |
| 535 | int pin) | 718 | struct btrfs_delayed_extent_op *extent_op) |
| 536 | { | 719 | { |
| 537 | struct btrfs_delayed_ref *ref; | 720 | struct btrfs_delayed_tree_ref *ref; |
| 538 | struct btrfs_delayed_ref_head *head_ref; | 721 | struct btrfs_delayed_ref_head *head_ref; |
| 539 | struct btrfs_delayed_ref_root *delayed_refs; | 722 | struct btrfs_delayed_ref_root *delayed_refs; |
| 540 | int ret; | 723 | int ret; |
| 541 | 724 | ||
| 725 | BUG_ON(extent_op && extent_op->is_data); | ||
| 542 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 726 | ref = kmalloc(sizeof(*ref), GFP_NOFS); |
| 543 | if (!ref) | 727 | if (!ref) |
| 544 | return -ENOMEM; | 728 | return -ENOMEM; |
| 545 | 729 | ||
| 730 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | ||
| 731 | if (!head_ref) { | ||
| 732 | kfree(ref); | ||
| 733 | return -ENOMEM; | ||
| 734 | } | ||
| 735 | |||
| 736 | head_ref->extent_op = extent_op; | ||
| 737 | |||
| 738 | delayed_refs = &trans->transaction->delayed_refs; | ||
| 739 | spin_lock(&delayed_refs->lock); | ||
| 740 | |||
| 546 | /* | 741 | /* |
| 547 | * the parent = 0 case comes from cases where we don't actually | 742 | * insert both the head node and the new ref without dropping |
| 548 | * know the parent yet. It will get updated later via a add/drop | 743 | * the spin lock |
| 549 | * pair. | ||
| 550 | */ | 744 | */ |
| 551 | if (parent == 0) | 745 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, |
| 552 | parent = bytenr; | 746 | action, 0); |
| 747 | BUG_ON(ret); | ||
| 748 | |||
| 749 | ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, | ||
| 750 | parent, ref_root, level, action); | ||
| 751 | BUG_ON(ret); | ||
| 752 | spin_unlock(&delayed_refs->lock); | ||
| 753 | return 0; | ||
| 754 | } | ||
| 755 | |||
| 756 | /* | ||
| 757 | * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. | ||
| 758 | */ | ||
| 759 | int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, | ||
| 760 | u64 bytenr, u64 num_bytes, | ||
| 761 | u64 parent, u64 ref_root, | ||
| 762 | u64 owner, u64 offset, int action, | ||
| 763 | struct btrfs_delayed_extent_op *extent_op) | ||
| 764 | { | ||
| 765 | struct btrfs_delayed_data_ref *ref; | ||
| 766 | struct btrfs_delayed_ref_head *head_ref; | ||
| 767 | struct btrfs_delayed_ref_root *delayed_refs; | ||
| 768 | int ret; | ||
| 769 | |||
| 770 | BUG_ON(extent_op && !extent_op->is_data); | ||
| 771 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | ||
| 772 | if (!ref) | ||
| 773 | return -ENOMEM; | ||
| 553 | 774 | ||
| 554 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 775 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); |
| 555 | if (!head_ref) { | 776 | if (!head_ref) { |
| 556 | kfree(ref); | 777 | kfree(ref); |
| 557 | return -ENOMEM; | 778 | return -ENOMEM; |
| 558 | } | 779 | } |
| 780 | |||
| 781 | head_ref->extent_op = extent_op; | ||
| 782 | |||
| 559 | delayed_refs = &trans->transaction->delayed_refs; | 783 | delayed_refs = &trans->transaction->delayed_refs; |
| 560 | spin_lock(&delayed_refs->lock); | 784 | spin_lock(&delayed_refs->lock); |
| 561 | 785 | ||
| @@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 563 | * insert both the head node and the new ref without dropping | 787 | * insert both the head node and the new ref without dropping |
| 564 | * the spin lock | 788 | * the spin lock |
| 565 | */ | 789 | */ |
| 566 | ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, | 790 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, |
| 567 | (u64)-1, 0, 0, 0, action, pin); | 791 | action, 1); |
| 568 | BUG_ON(ret); | 792 | BUG_ON(ret); |
| 569 | 793 | ||
| 570 | ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, | 794 | ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, |
| 571 | parent, ref_root, ref_generation, | 795 | parent, ref_root, owner, offset, action); |
| 572 | owner_objectid, action, pin); | 796 | BUG_ON(ret); |
| 797 | spin_unlock(&delayed_refs->lock); | ||
| 798 | return 0; | ||
| 799 | } | ||
| 800 | |||
| 801 | int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | ||
| 802 | u64 bytenr, u64 num_bytes, | ||
| 803 | struct btrfs_delayed_extent_op *extent_op) | ||
| 804 | { | ||
| 805 | struct btrfs_delayed_ref_head *head_ref; | ||
| 806 | struct btrfs_delayed_ref_root *delayed_refs; | ||
| 807 | int ret; | ||
| 808 | |||
| 809 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | ||
| 810 | if (!head_ref) | ||
| 811 | return -ENOMEM; | ||
| 812 | |||
| 813 | head_ref->extent_op = extent_op; | ||
| 814 | |||
| 815 | delayed_refs = &trans->transaction->delayed_refs; | ||
| 816 | spin_lock(&delayed_refs->lock); | ||
| 817 | |||
| 818 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, | ||
| 819 | num_bytes, BTRFS_UPDATE_DELAYED_HEAD, | ||
| 820 | extent_op->is_data); | ||
| 573 | BUG_ON(ret); | 821 | BUG_ON(ret); |
| 822 | |||
| 574 | spin_unlock(&delayed_refs->lock); | 823 | spin_unlock(&delayed_refs->lock); |
| 575 | return 0; | 824 | return 0; |
| 576 | } | 825 | } |
| @@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
| 587 | struct btrfs_delayed_ref_root *delayed_refs; | 836 | struct btrfs_delayed_ref_root *delayed_refs; |
| 588 | 837 | ||
| 589 | delayed_refs = &trans->transaction->delayed_refs; | 838 | delayed_refs = &trans->transaction->delayed_refs; |
| 590 | ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); | 839 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL); |
| 591 | if (ref) | 840 | if (ref) |
| 592 | return btrfs_delayed_node_to_head(ref); | 841 | return btrfs_delayed_node_to_head(ref); |
| 593 | return NULL; | 842 | return NULL; |
| @@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
| 603 | * | 852 | * |
| 604 | * It is the same as doing a ref add and delete in two separate calls. | 853 | * It is the same as doing a ref add and delete in two separate calls. |
| 605 | */ | 854 | */ |
| 855 | #if 0 | ||
| 606 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, | 856 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, |
| 607 | u64 bytenr, u64 num_bytes, u64 orig_parent, | 857 | u64 bytenr, u64 num_bytes, u64 orig_parent, |
| 608 | u64 parent, u64 orig_ref_root, u64 ref_root, | 858 | u64 parent, u64 orig_ref_root, u64 ref_root, |
| @@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 666 | spin_unlock(&delayed_refs->lock); | 916 | spin_unlock(&delayed_refs->lock); |
| 667 | return 0; | 917 | return 0; |
| 668 | } | 918 | } |
| 919 | #endif | ||
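With btrfs_update_delayed_ref parked under #if 0, callers are expected to go through the split helpers instead. A hedged sketch of the three call shapes follows (hypothetical wrapper and argument names, error handling trimmed, and the three calls are independent examples rather than a real sequence):

static int queue_refs_sketch(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, u64 parent,
			     u64 root_objectid, int level,
			     u64 inode_objectid, u64 file_offset)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	/* queue a reference against a tree block at 'level' */
	ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, parent,
					 root_objectid, level,
					 BTRFS_ADD_DELAYED_REF, NULL);
	if (ret)
		return ret;

	/* queue a reference against a file data extent */
	ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent,
					 root_objectid, inode_objectid,
					 file_offset, BTRFS_ADD_DELAYED_REF,
					 NULL);
	if (ret)
		return ret;

	/* queue only a flag update against the head node */
	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
	if (!extent_op)
		return -ENOMEM;
	extent_op->flags_to_set = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	extent_op->update_flags = 1;
	extent_op->update_key = 0;
	extent_op->is_data = 0;
	return btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
}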
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 3bec2ff0b15c..f6fc67ddad36 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
| @@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node { | |||
| 30 | /* the starting bytenr of the extent */ | 30 | /* the starting bytenr of the extent */ |
| 31 | u64 bytenr; | 31 | u64 bytenr; |
| 32 | 32 | ||
| 33 | /* the parent our backref will point to */ | ||
| 34 | u64 parent; | ||
| 35 | |||
| 36 | /* the size of the extent */ | 33 | /* the size of the extent */ |
| 37 | u64 num_bytes; | 34 | u64 num_bytes; |
| 38 | 35 | ||
| @@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node { | |||
| 50 | */ | 47 | */ |
| 51 | int ref_mod; | 48 | int ref_mod; |
| 52 | 49 | ||
| 50 | unsigned int action:8; | ||
| 51 | unsigned int type:8; | ||
| 53 | /* is this node still in the rbtree? */ | 52 | /* is this node still in the rbtree? */ |
| 53 | unsigned int is_head:1; | ||
| 54 | unsigned int in_tree:1; | 54 | unsigned int in_tree:1; |
| 55 | }; | 55 | }; |
| 56 | 56 | ||
| 57 | struct btrfs_delayed_extent_op { | ||
| 58 | struct btrfs_disk_key key; | ||
| 59 | u64 flags_to_set; | ||
| 60 | unsigned int update_key:1; | ||
| 61 | unsigned int update_flags:1; | ||
| 62 | unsigned int is_data:1; | ||
| 63 | }; | ||
| 64 | |||
| 57 | /* | 65 | /* |
| 58 | * the head refs are used to hold a lock on a given extent, which allows us | 66 | * the head refs are used to hold a lock on a given extent, which allows us |
| 59 | * to make sure that only one process is running the delayed refs | 67 | * to make sure that only one process is running the delayed refs |
| @@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head { | |||
| 71 | 79 | ||
| 72 | struct list_head cluster; | 80 | struct list_head cluster; |
| 73 | 81 | ||
| 82 | struct btrfs_delayed_extent_op *extent_op; | ||
| 74 | /* | 83 | /* |
| 75 | * when a new extent is allocated, it is just reserved in memory | 84 | * when a new extent is allocated, it is just reserved in memory |
| 76 | * The actual extent isn't inserted into the extent allocation tree | 85 | * The actual extent isn't inserted into the extent allocation tree |
| @@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head { | |||
| 84 | * the free has happened. | 93 | * the free has happened. |
| 85 | */ | 94 | */ |
| 86 | unsigned int must_insert_reserved:1; | 95 | unsigned int must_insert_reserved:1; |
| 96 | unsigned int is_data:1; | ||
| 87 | }; | 97 | }; |
| 88 | 98 | ||
| 89 | struct btrfs_delayed_ref { | 99 | struct btrfs_delayed_tree_ref { |
| 90 | struct btrfs_delayed_ref_node node; | 100 | struct btrfs_delayed_ref_node node; |
| 101 | union { | ||
| 102 | u64 root; | ||
| 103 | u64 parent; | ||
| 104 | }; | ||
| 105 | int level; | ||
| 106 | }; | ||
| 91 | 107 | ||
| 92 | /* the root objectid our ref will point to */ | 108 | struct btrfs_delayed_data_ref { |
| 93 | u64 root; | 109 | struct btrfs_delayed_ref_node node; |
| 94 | 110 | union { | |
| 95 | /* the generation for the backref */ | 111 | u64 root; |
| 96 | u64 generation; | 112 | u64 parent; |
| 97 | 113 | }; | |
| 98 | /* owner_objectid of the backref */ | 114 | u64 objectid; |
| 99 | u64 owner_objectid; | 115 | u64 offset; |
| 100 | |||
| 101 | /* operation done by this entry in the rbtree */ | ||
| 102 | u8 action; | ||
| 103 | |||
| 104 | /* if pin == 1, when the extent is freed it will be pinned until | ||
| 105 | * transaction commit | ||
| 106 | */ | ||
| 107 | unsigned int pin:1; | ||
| 108 | }; | 116 | }; |
| 109 | 117 | ||
| 110 | struct btrfs_delayed_ref_root { | 118 | struct btrfs_delayed_ref_root { |
| @@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | |||
| 143 | } | 151 | } |
| 144 | } | 152 | } |
| 145 | 153 | ||
| 146 | int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | 154 | int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, |
| 147 | u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, | 155 | u64 bytenr, u64 num_bytes, u64 parent, |
| 148 | u64 ref_generation, u64 owner_objectid, int action, | 156 | u64 ref_root, int level, int action, |
| 149 | int pin); | 157 | struct btrfs_delayed_extent_op *extent_op); |
| 158 | int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, | ||
| 159 | u64 bytenr, u64 num_bytes, | ||
| 160 | u64 parent, u64 ref_root, | ||
| 161 | u64 owner, u64 offset, int action, | ||
| 162 | struct btrfs_delayed_extent_op *extent_op); | ||
| 163 | int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | ||
| 164 | u64 bytenr, u64 num_bytes, | ||
| 165 | struct btrfs_delayed_extent_op *extent_op); | ||
| 150 | 166 | ||
| 151 | struct btrfs_delayed_ref_head * | 167 | struct btrfs_delayed_ref_head * |
| 152 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); | 168 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); |
| 153 | int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); | 169 | int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); |
| 154 | int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, | 170 | int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, |
| 155 | struct btrfs_root *root, u64 bytenr, | 171 | struct btrfs_root *root, u64 bytenr, |
| 156 | u64 num_bytes, u32 *refs); | 172 | u64 num_bytes, u64 *refs, u64 *flags); |
| 157 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, | 173 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, |
| 158 | u64 bytenr, u64 num_bytes, u64 orig_parent, | 174 | u64 bytenr, u64 num_bytes, u64 orig_parent, |
| 159 | u64 parent, u64 orig_ref_root, u64 ref_root, | 175 | u64 parent, u64 orig_ref_root, u64 ref_root, |
| @@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | |||
| 169 | */ | 185 | */ |
| 170 | static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) | 186 | static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) |
| 171 | { | 187 | { |
| 172 | return node->parent == (u64)-1; | 188 | return node->is_head; |
| 173 | } | 189 | } |
| 174 | 190 | ||
| 175 | /* | 191 | /* |
| 176 | * helper functions to cast a node into its container | 192 | * helper functions to cast a node into its container |
| 177 | */ | 193 | */ |
| 178 | static inline struct btrfs_delayed_ref * | 194 | static inline struct btrfs_delayed_tree_ref * |
| 179 | btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) | 195 | btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) |
| 180 | { | 196 | { |
| 181 | WARN_ON(btrfs_delayed_ref_is_head(node)); | 197 | WARN_ON(btrfs_delayed_ref_is_head(node)); |
| 182 | return container_of(node, struct btrfs_delayed_ref, node); | 198 | return container_of(node, struct btrfs_delayed_tree_ref, node); |
| 199 | } | ||
| 183 | 200 | ||
| 201 | static inline struct btrfs_delayed_data_ref * | ||
| 202 | btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) | ||
| 203 | { | ||
| 204 | WARN_ON(btrfs_delayed_ref_is_head(node)); | ||
| 205 | return container_of(node, struct btrfs_delayed_data_ref, node); | ||
| 184 | } | 206 | } |
| 185 | 207 | ||
| 186 | static inline struct btrfs_delayed_ref_head * | 208 | static inline struct btrfs_delayed_ref_head * |
| @@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) | |||
| 188 | { | 210 | { |
| 189 | WARN_ON(!btrfs_delayed_ref_is_head(node)); | 211 | WARN_ON(!btrfs_delayed_ref_is_head(node)); |
| 190 | return container_of(node, struct btrfs_delayed_ref_head, node); | 212 | return container_of(node, struct btrfs_delayed_ref_head, node); |
| 191 | |||
| 192 | } | 213 | } |
| 193 | #endif | 214 | #endif |
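A rough sketch of how a node pulled out of the rbtree is meant to be down-cast under the new layout (the function name is made up; only fields and helpers introduced in this header are touched):

static void dispatch_ref_sketch(struct btrfs_delayed_ref_node *node)
{
	if (btrfs_delayed_ref_is_head(node)) {
		struct btrfs_delayed_ref_head *head;

		head = btrfs_delayed_node_to_head(node);
		/* head->extent_op and head->must_insert_reserved live here */
	} else if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
		   node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		struct btrfs_delayed_tree_ref *ref;

		ref = btrfs_delayed_node_to_tree_ref(node);
		/* shared refs carry ref->parent, keyed refs carry ref->root */
	} else {
		struct btrfs_delayed_data_ref *ref;

		ref = btrfs_delayed_node_to_data_ref(node);
		/* data refs also record ref->objectid and ref->offset */
	}
}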
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b0ea0b80c23..0d50d49d990a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
| @@ -26,8 +26,8 @@ | |||
| 26 | #include <linux/workqueue.h> | 26 | #include <linux/workqueue.h> |
| 27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
| 28 | #include <linux/freezer.h> | 28 | #include <linux/freezer.h> |
| 29 | #include <linux/crc32c.h> | ||
| 29 | #include "compat.h" | 30 | #include "compat.h" |
| 30 | #include "crc32c.h" | ||
| 31 | #include "ctree.h" | 31 | #include "ctree.h" |
| 32 | #include "disk-io.h" | 32 | #include "disk-io.h" |
| 33 | #include "transaction.h" | 33 | #include "transaction.h" |
| @@ -36,7 +36,6 @@ | |||
| 36 | #include "print-tree.h" | 36 | #include "print-tree.h" |
| 37 | #include "async-thread.h" | 37 | #include "async-thread.h" |
| 38 | #include "locking.h" | 38 | #include "locking.h" |
| 39 | #include "ref-cache.h" | ||
| 40 | #include "tree-log.h" | 39 | #include "tree-log.h" |
| 41 | #include "free-space-cache.h" | 40 | #include "free-space-cache.h" |
| 42 | 41 | ||
| @@ -172,7 +171,7 @@ out: | |||
| 172 | 171 | ||
| 173 | u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) | 172 | u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) |
| 174 | { | 173 | { |
| 175 | return btrfs_crc32c(seed, data, len); | 174 | return crc32c(seed, data, len); |
| 176 | } | 175 | } |
| 177 | 176 | ||
| 178 | void btrfs_csum_final(u32 crc, char *result) | 177 | void btrfs_csum_final(u32 crc, char *result) |
| @@ -884,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
| 884 | { | 883 | { |
| 885 | root->node = NULL; | 884 | root->node = NULL; |
| 886 | root->commit_root = NULL; | 885 | root->commit_root = NULL; |
| 887 | root->ref_tree = NULL; | ||
| 888 | root->sectorsize = sectorsize; | 886 | root->sectorsize = sectorsize; |
| 889 | root->nodesize = nodesize; | 887 | root->nodesize = nodesize; |
| 890 | root->leafsize = leafsize; | 888 | root->leafsize = leafsize; |
| @@ -899,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
| 899 | root->last_inode_alloc = 0; | 897 | root->last_inode_alloc = 0; |
| 900 | root->name = NULL; | 898 | root->name = NULL; |
| 901 | root->in_sysfs = 0; | 899 | root->in_sysfs = 0; |
| 900 | root->inode_tree.rb_node = NULL; | ||
| 902 | 901 | ||
| 903 | INIT_LIST_HEAD(&root->dirty_list); | 902 | INIT_LIST_HEAD(&root->dirty_list); |
| 904 | INIT_LIST_HEAD(&root->orphan_list); | 903 | INIT_LIST_HEAD(&root->orphan_list); |
| 905 | INIT_LIST_HEAD(&root->dead_list); | 904 | INIT_LIST_HEAD(&root->root_list); |
| 906 | spin_lock_init(&root->node_lock); | 905 | spin_lock_init(&root->node_lock); |
| 907 | spin_lock_init(&root->list_lock); | 906 | spin_lock_init(&root->list_lock); |
| 907 | spin_lock_init(&root->inode_lock); | ||
| 908 | mutex_init(&root->objectid_mutex); | 908 | mutex_init(&root->objectid_mutex); |
| 909 | mutex_init(&root->log_mutex); | 909 | mutex_init(&root->log_mutex); |
| 910 | init_waitqueue_head(&root->log_writer_wait); | 910 | init_waitqueue_head(&root->log_writer_wait); |
| @@ -918,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
| 918 | extent_io_tree_init(&root->dirty_log_pages, | 918 | extent_io_tree_init(&root->dirty_log_pages, |
| 919 | fs_info->btree_inode->i_mapping, GFP_NOFS); | 919 | fs_info->btree_inode->i_mapping, GFP_NOFS); |
| 920 | 920 | ||
| 921 | btrfs_leaf_ref_tree_init(&root->ref_tree_struct); | ||
| 922 | root->ref_tree = &root->ref_tree_struct; | ||
| 923 | |||
| 924 | memset(&root->root_key, 0, sizeof(root->root_key)); | 921 | memset(&root->root_key, 0, sizeof(root->root_key)); |
| 925 | memset(&root->root_item, 0, sizeof(root->root_item)); | 922 | memset(&root->root_item, 0, sizeof(root->root_item)); |
| 926 | memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); | 923 | memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); |
| @@ -959,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, | |||
| 959 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); | 956 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); |
| 960 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), | 957 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), |
| 961 | blocksize, generation); | 958 | blocksize, generation); |
| 959 | root->commit_root = btrfs_root_node(root); | ||
| 962 | BUG_ON(!root->node); | 960 | BUG_ON(!root->node); |
| 963 | return 0; | 961 | return 0; |
| 964 | } | 962 | } |
| @@ -1025,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, | |||
| 1025 | */ | 1023 | */ |
| 1026 | root->ref_cows = 0; | 1024 | root->ref_cows = 0; |
| 1027 | 1025 | ||
| 1028 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, | 1026 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, |
| 1029 | 0, BTRFS_TREE_LOG_OBJECTID, | 1027 | BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); |
| 1030 | trans->transid, 0, 0, 0); | ||
| 1031 | if (IS_ERR(leaf)) { | 1028 | if (IS_ERR(leaf)) { |
| 1032 | kfree(root); | 1029 | kfree(root); |
| 1033 | return ERR_CAST(leaf); | 1030 | return ERR_CAST(leaf); |
| 1034 | } | 1031 | } |
| 1035 | 1032 | ||
| 1033 | memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); | ||
| 1034 | btrfs_set_header_bytenr(leaf, leaf->start); | ||
| 1035 | btrfs_set_header_generation(leaf, trans->transid); | ||
| 1036 | btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); | ||
| 1037 | btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); | ||
| 1036 | root->node = leaf; | 1038 | root->node = leaf; |
| 1037 | btrfs_set_header_nritems(root->node, 0); | ||
| 1038 | btrfs_set_header_level(root->node, 0); | ||
| 1039 | btrfs_set_header_bytenr(root->node, root->node->start); | ||
| 1040 | btrfs_set_header_generation(root->node, trans->transid); | ||
| 1041 | btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); | ||
| 1042 | 1039 | ||
| 1043 | write_extent_buffer(root->node, root->fs_info->fsid, | 1040 | write_extent_buffer(root->node, root->fs_info->fsid, |
| 1044 | (unsigned long)btrfs_header_fsid(root->node), | 1041 | (unsigned long)btrfs_header_fsid(root->node), |
| @@ -1081,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, | |||
| 1081 | inode_item->nbytes = cpu_to_le64(root->leafsize); | 1078 | inode_item->nbytes = cpu_to_le64(root->leafsize); |
| 1082 | inode_item->mode = cpu_to_le32(S_IFDIR | 0755); | 1079 | inode_item->mode = cpu_to_le32(S_IFDIR | 0755); |
| 1083 | 1080 | ||
| 1084 | btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); | 1081 | btrfs_set_root_node(&log_root->root_item, log_root->node); |
| 1085 | btrfs_set_root_generation(&log_root->root_item, trans->transid); | ||
| 1086 | 1082 | ||
| 1087 | WARN_ON(root->log_root); | 1083 | WARN_ON(root->log_root); |
| 1088 | root->log_root = log_root; | 1084 | root->log_root = log_root; |
| @@ -1144,6 +1140,7 @@ out: | |||
| 1144 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); | 1140 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); |
| 1145 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), | 1141 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), |
| 1146 | blocksize, generation); | 1142 | blocksize, generation); |
| 1143 | root->commit_root = btrfs_root_node(root); | ||
| 1147 | BUG_ON(!root->node); | 1144 | BUG_ON(!root->node); |
| 1148 | insert: | 1145 | insert: |
| 1149 | if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { | 1146 | if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { |
| @@ -1210,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, | |||
| 1210 | } | 1207 | } |
| 1211 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { | 1208 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { |
| 1212 | ret = btrfs_find_dead_roots(fs_info->tree_root, | 1209 | ret = btrfs_find_dead_roots(fs_info->tree_root, |
| 1213 | root->root_key.objectid, root); | 1210 | root->root_key.objectid); |
| 1214 | BUG_ON(ret); | 1211 | BUG_ON(ret); |
| 1215 | btrfs_orphan_cleanup(root); | 1212 | btrfs_orphan_cleanup(root); |
| 1216 | } | 1213 | } |
| @@ -1569,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1569 | atomic_set(&fs_info->async_delalloc_pages, 0); | 1566 | atomic_set(&fs_info->async_delalloc_pages, 0); |
| 1570 | atomic_set(&fs_info->async_submit_draining, 0); | 1567 | atomic_set(&fs_info->async_submit_draining, 0); |
| 1571 | atomic_set(&fs_info->nr_async_bios, 0); | 1568 | atomic_set(&fs_info->nr_async_bios, 0); |
| 1572 | atomic_set(&fs_info->throttles, 0); | ||
| 1573 | atomic_set(&fs_info->throttle_gen, 0); | ||
| 1574 | fs_info->sb = sb; | 1569 | fs_info->sb = sb; |
| 1575 | fs_info->max_extent = (u64)-1; | 1570 | fs_info->max_extent = (u64)-1; |
| 1576 | fs_info->max_inline = 8192 * 1024; | 1571 | fs_info->max_inline = 8192 * 1024; |
| @@ -1598,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1598 | fs_info->btree_inode->i_mapping->a_ops = &btree_aops; | 1593 | fs_info->btree_inode->i_mapping->a_ops = &btree_aops; |
| 1599 | fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; | 1594 | fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; |
| 1600 | 1595 | ||
| 1596 | RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); | ||
| 1601 | extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, | 1597 | extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, |
| 1602 | fs_info->btree_inode->i_mapping, | 1598 | fs_info->btree_inode->i_mapping, |
| 1603 | GFP_NOFS); | 1599 | GFP_NOFS); |
| @@ -1613,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1613 | fs_info->btree_inode->i_mapping, GFP_NOFS); | 1609 | fs_info->btree_inode->i_mapping, GFP_NOFS); |
| 1614 | fs_info->do_barriers = 1; | 1610 | fs_info->do_barriers = 1; |
| 1615 | 1611 | ||
| 1616 | INIT_LIST_HEAD(&fs_info->dead_reloc_roots); | ||
| 1617 | btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); | ||
| 1618 | btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); | ||
| 1619 | |||
| 1620 | BTRFS_I(fs_info->btree_inode)->root = tree_root; | 1612 | BTRFS_I(fs_info->btree_inode)->root = tree_root; |
| 1621 | memset(&BTRFS_I(fs_info->btree_inode)->location, 0, | 1613 | memset(&BTRFS_I(fs_info->btree_inode)->location, 0, |
| 1622 | sizeof(struct btrfs_key)); | 1614 | sizeof(struct btrfs_key)); |
| @@ -1674,6 +1666,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1674 | goto fail_iput; | 1666 | goto fail_iput; |
| 1675 | } | 1667 | } |
| 1676 | 1668 | ||
| 1669 | features = btrfs_super_incompat_flags(disk_super); | ||
| 1670 | if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { | ||
| 1671 | features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; | ||
| 1672 | btrfs_set_super_incompat_flags(disk_super, features); | ||
| 1673 | } | ||
| 1674 | |||
| 1677 | features = btrfs_super_compat_ro_flags(disk_super) & | 1675 | features = btrfs_super_compat_ro_flags(disk_super) & |
| 1678 | ~BTRFS_FEATURE_COMPAT_RO_SUPP; | 1676 | ~BTRFS_FEATURE_COMPAT_RO_SUPP; |
| 1679 | if (!(sb->s_flags & MS_RDONLY) && features) { | 1677 | if (!(sb->s_flags & MS_RDONLY) && features) { |
| @@ -1771,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1771 | if (ret) { | 1769 | if (ret) { |
| 1772 | printk(KERN_WARNING "btrfs: failed to read the system " | 1770 | printk(KERN_WARNING "btrfs: failed to read the system " |
| 1773 | "array on %s\n", sb->s_id); | 1771 | "array on %s\n", sb->s_id); |
| 1774 | goto fail_sys_array; | 1772 | goto fail_sb_buffer; |
| 1775 | } | 1773 | } |
| 1776 | 1774 | ||
| 1777 | blocksize = btrfs_level_size(tree_root, | 1775 | blocksize = btrfs_level_size(tree_root, |
| @@ -1785,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1785 | btrfs_super_chunk_root(disk_super), | 1783 | btrfs_super_chunk_root(disk_super), |
| 1786 | blocksize, generation); | 1784 | blocksize, generation); |
| 1787 | BUG_ON(!chunk_root->node); | 1785 | BUG_ON(!chunk_root->node); |
| 1786 | btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); | ||
| 1787 | chunk_root->commit_root = btrfs_root_node(chunk_root); | ||
| 1788 | 1788 | ||
| 1789 | read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, | 1789 | read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, |
| 1790 | (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), | 1790 | (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), |
| @@ -1810,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1810 | blocksize, generation); | 1810 | blocksize, generation); |
| 1811 | if (!tree_root->node) | 1811 | if (!tree_root->node) |
| 1812 | goto fail_chunk_root; | 1812 | goto fail_chunk_root; |
| 1813 | 1813 | btrfs_set_root_node(&tree_root->root_item, tree_root->node); | |
| 1814 | tree_root->commit_root = btrfs_root_node(tree_root); | ||
| 1814 | 1815 | ||
| 1815 | ret = find_and_setup_root(tree_root, fs_info, | 1816 | ret = find_and_setup_root(tree_root, fs_info, |
| 1816 | BTRFS_EXTENT_TREE_OBJECTID, extent_root); | 1817 | BTRFS_EXTENT_TREE_OBJECTID, extent_root); |
| @@ -1820,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1820 | 1821 | ||
| 1821 | ret = find_and_setup_root(tree_root, fs_info, | 1822 | ret = find_and_setup_root(tree_root, fs_info, |
| 1822 | BTRFS_DEV_TREE_OBJECTID, dev_root); | 1823 | BTRFS_DEV_TREE_OBJECTID, dev_root); |
| 1823 | dev_root->track_dirty = 1; | ||
| 1824 | if (ret) | 1824 | if (ret) |
| 1825 | goto fail_extent_root; | 1825 | goto fail_extent_root; |
| 1826 | dev_root->track_dirty = 1; | ||
| 1826 | 1827 | ||
| 1827 | ret = find_and_setup_root(tree_root, fs_info, | 1828 | ret = find_and_setup_root(tree_root, fs_info, |
| 1828 | BTRFS_CSUM_TREE_OBJECTID, csum_root); | 1829 | BTRFS_CSUM_TREE_OBJECTID, csum_root); |
| 1829 | if (ret) | 1830 | if (ret) |
| 1830 | goto fail_extent_root; | 1831 | goto fail_dev_root; |
| 1831 | 1832 | ||
| 1832 | csum_root->track_dirty = 1; | 1833 | csum_root->track_dirty = 1; |
| 1833 | 1834 | ||
| @@ -1849,6 +1850,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1849 | if (IS_ERR(fs_info->transaction_kthread)) | 1850 | if (IS_ERR(fs_info->transaction_kthread)) |
| 1850 | goto fail_cleaner; | 1851 | goto fail_cleaner; |
| 1851 | 1852 | ||
| 1853 | if (!btrfs_test_opt(tree_root, SSD) && | ||
| 1854 | !btrfs_test_opt(tree_root, NOSSD) && | ||
| 1855 | !fs_info->fs_devices->rotating) { | ||
| 1856 | printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " | ||
| 1857 | "mode\n"); | ||
| 1858 | btrfs_set_opt(fs_info->mount_opt, SSD); | ||
| 1859 | } | ||
| 1860 | |||
| 1852 | if (btrfs_super_log_root(disk_super) != 0) { | 1861 | if (btrfs_super_log_root(disk_super) != 0) { |
| 1853 | u64 bytenr = btrfs_super_log_root(disk_super); | 1862 | u64 bytenr = btrfs_super_log_root(disk_super); |
| 1854 | 1863 | ||
| @@ -1881,7 +1890,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1881 | } | 1890 | } |
| 1882 | 1891 | ||
| 1883 | if (!(sb->s_flags & MS_RDONLY)) { | 1892 | if (!(sb->s_flags & MS_RDONLY)) { |
| 1884 | ret = btrfs_cleanup_reloc_trees(tree_root); | 1893 | ret = btrfs_recover_relocation(tree_root); |
| 1885 | BUG_ON(ret); | 1894 | BUG_ON(ret); |
| 1886 | } | 1895 | } |
| 1887 | 1896 | ||
| @@ -1892,6 +1901,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1892 | fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); | 1901 | fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); |
| 1893 | if (!fs_info->fs_root) | 1902 | if (!fs_info->fs_root) |
| 1894 | goto fail_trans_kthread; | 1903 | goto fail_trans_kthread; |
| 1904 | |||
| 1895 | return tree_root; | 1905 | return tree_root; |
| 1896 | 1906 | ||
| 1897 | fail_trans_kthread: | 1907 | fail_trans_kthread: |
| @@ -1908,14 +1918,19 @@ fail_cleaner: | |||
| 1908 | 1918 | ||
| 1909 | fail_csum_root: | 1919 | fail_csum_root: |
| 1910 | free_extent_buffer(csum_root->node); | 1920 | free_extent_buffer(csum_root->node); |
| 1921 | free_extent_buffer(csum_root->commit_root); | ||
| 1922 | fail_dev_root: | ||
| 1923 | free_extent_buffer(dev_root->node); | ||
| 1924 | free_extent_buffer(dev_root->commit_root); | ||
| 1911 | fail_extent_root: | 1925 | fail_extent_root: |
| 1912 | free_extent_buffer(extent_root->node); | 1926 | free_extent_buffer(extent_root->node); |
| 1927 | free_extent_buffer(extent_root->commit_root); | ||
| 1913 | fail_tree_root: | 1928 | fail_tree_root: |
| 1914 | free_extent_buffer(tree_root->node); | 1929 | free_extent_buffer(tree_root->node); |
| 1930 | free_extent_buffer(tree_root->commit_root); | ||
| 1915 | fail_chunk_root: | 1931 | fail_chunk_root: |
| 1916 | free_extent_buffer(chunk_root->node); | 1932 | free_extent_buffer(chunk_root->node); |
| 1917 | fail_sys_array: | 1933 | free_extent_buffer(chunk_root->commit_root); |
| 1918 | free_extent_buffer(dev_root->node); | ||
| 1919 | fail_sb_buffer: | 1934 | fail_sb_buffer: |
| 1920 | btrfs_stop_workers(&fs_info->fixup_workers); | 1935 | btrfs_stop_workers(&fs_info->fixup_workers); |
| 1921 | btrfs_stop_workers(&fs_info->delalloc_workers); | 1936 | btrfs_stop_workers(&fs_info->delalloc_workers); |
| @@ -2005,6 +2020,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) | |||
| 2005 | return latest; | 2020 | return latest; |
| 2006 | } | 2021 | } |
| 2007 | 2022 | ||
| 2023 | /* | ||
| 2024 | * this should be called twice, once with wait == 0 and | ||
| 2025 | * once with wait == 1. When wait == 0 is done, all the buffer heads | ||
| 2026 | * we write are pinned. | ||
| 2027 | * | ||
| 2028 | * They are released when wait == 1 is done. | ||
| 2029 | * max_mirrors must be the same for both runs, and it indicates how | ||
| 2030 | * many supers on this one device should be written. | ||
| 2031 | * | ||
| 2032 | * max_mirrors == 0 means to write them all. | ||
| 2033 | */ | ||
| 2008 | static int write_dev_supers(struct btrfs_device *device, | 2034 | static int write_dev_supers(struct btrfs_device *device, |
| 2009 | struct btrfs_super_block *sb, | 2035 | struct btrfs_super_block *sb, |
| 2010 | int do_barriers, int wait, int max_mirrors) | 2036 | int do_barriers, int wait, int max_mirrors) |
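A minimal sketch of the two-pass calling pattern the new comment describes, for a single device; the wrapper name is hypothetical, and in practice write_all_supers fans pass one out across every device before waiting:

static int write_supers_sketch(struct btrfs_device *dev,
			       struct btrfs_super_block *sb,
			       int do_barriers, int max_mirrors)
{
	int ret;

	/* pass 1: checksum and submit; every buffer head stays pinned */
	ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
	if (ret)
		return ret;

	/* pass 2: wait on the same buffer heads and drop both references */
	return write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
}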
| @@ -2040,12 +2066,16 @@ static int write_dev_supers(struct btrfs_device *device, | |||
| 2040 | bh = __find_get_block(device->bdev, bytenr / 4096, | 2066 | bh = __find_get_block(device->bdev, bytenr / 4096, |
| 2041 | BTRFS_SUPER_INFO_SIZE); | 2067 | BTRFS_SUPER_INFO_SIZE); |
| 2042 | BUG_ON(!bh); | 2068 | BUG_ON(!bh); |
| 2043 | brelse(bh); | ||
| 2044 | wait_on_buffer(bh); | 2069 | wait_on_buffer(bh); |
| 2045 | if (buffer_uptodate(bh)) { | 2070 | if (!buffer_uptodate(bh)) |
| 2046 | brelse(bh); | 2071 | errors++; |
| 2047 | continue; | 2072 | |
| 2048 | } | 2073 | /* drop our reference */ |
| 2074 | brelse(bh); | ||
| 2075 | |||
| 2076 | /* drop the reference from the wait == 0 run */ | ||
| 2077 | brelse(bh); | ||
| 2078 | continue; | ||
| 2049 | } else { | 2079 | } else { |
| 2050 | btrfs_set_super_bytenr(sb, bytenr); | 2080 | btrfs_set_super_bytenr(sb, bytenr); |
| 2051 | 2081 | ||
| @@ -2056,12 +2086,18 @@ static int write_dev_supers(struct btrfs_device *device, | |||
| 2056 | BTRFS_CSUM_SIZE); | 2086 | BTRFS_CSUM_SIZE); |
| 2057 | btrfs_csum_final(crc, sb->csum); | 2087 | btrfs_csum_final(crc, sb->csum); |
| 2058 | 2088 | ||
| 2089 | /* | ||
| 2090 | * one reference for us, and we leave it for the | ||
| 2091 | * caller | ||
| 2092 | */ | ||
| 2059 | bh = __getblk(device->bdev, bytenr / 4096, | 2093 | bh = __getblk(device->bdev, bytenr / 4096, |
| 2060 | BTRFS_SUPER_INFO_SIZE); | 2094 | BTRFS_SUPER_INFO_SIZE); |
| 2061 | memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); | 2095 | memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); |
| 2062 | 2096 | ||
| 2063 | set_buffer_uptodate(bh); | 2097 | /* one reference for submit_bh */ |
| 2064 | get_bh(bh); | 2098 | get_bh(bh); |
| 2099 | |||
| 2100 | set_buffer_uptodate(bh); | ||
| 2065 | lock_buffer(bh); | 2101 | lock_buffer(bh); |
| 2066 | bh->b_end_io = btrfs_end_buffer_write_sync; | 2102 | bh->b_end_io = btrfs_end_buffer_write_sync; |
| 2067 | } | 2103 | } |
| @@ -2073,6 +2109,7 @@ static int write_dev_supers(struct btrfs_device *device, | |||
| 2073 | device->name); | 2109 | device->name); |
| 2074 | set_buffer_uptodate(bh); | 2110 | set_buffer_uptodate(bh); |
| 2075 | device->barriers = 0; | 2111 | device->barriers = 0; |
| 2112 | /* one reference for submit_bh */ | ||
| 2076 | get_bh(bh); | 2113 | get_bh(bh); |
| 2077 | lock_buffer(bh); | 2114 | lock_buffer(bh); |
| 2078 | ret = submit_bh(WRITE_SYNC, bh); | 2115 | ret = submit_bh(WRITE_SYNC, bh); |
| @@ -2081,22 +2118,15 @@ static int write_dev_supers(struct btrfs_device *device, | |||
| 2081 | ret = submit_bh(WRITE_SYNC, bh); | 2118 | ret = submit_bh(WRITE_SYNC, bh); |
| 2082 | } | 2119 | } |
| 2083 | 2120 | ||
| 2084 | if (!ret && wait) { | 2121 | if (ret) |
| 2085 | wait_on_buffer(bh); | ||
| 2086 | if (!buffer_uptodate(bh)) | ||
| 2087 | errors++; | ||
| 2088 | } else if (ret) { | ||
| 2089 | errors++; | 2122 | errors++; |
| 2090 | } | ||
| 2091 | if (wait) | ||
| 2092 | brelse(bh); | ||
| 2093 | } | 2123 | } |
| 2094 | return errors < i ? 0 : -1; | 2124 | return errors < i ? 0 : -1; |
| 2095 | } | 2125 | } |
| 2096 | 2126 | ||
| 2097 | int write_all_supers(struct btrfs_root *root, int max_mirrors) | 2127 | int write_all_supers(struct btrfs_root *root, int max_mirrors) |
| 2098 | { | 2128 | { |
| 2099 | struct list_head *head = &root->fs_info->fs_devices->devices; | 2129 | struct list_head *head; |
| 2100 | struct btrfs_device *dev; | 2130 | struct btrfs_device *dev; |
| 2101 | struct btrfs_super_block *sb; | 2131 | struct btrfs_super_block *sb; |
| 2102 | struct btrfs_dev_item *dev_item; | 2132 | struct btrfs_dev_item *dev_item; |
| @@ -2111,6 +2141,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) | |||
| 2111 | 2141 | ||
| 2112 | sb = &root->fs_info->super_for_commit; | 2142 | sb = &root->fs_info->super_for_commit; |
| 2113 | dev_item = &sb->dev_item; | 2143 | dev_item = &sb->dev_item; |
| 2144 | |||
| 2145 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 2146 | head = &root->fs_info->fs_devices->devices; | ||
| 2114 | list_for_each_entry(dev, head, dev_list) { | 2147 | list_for_each_entry(dev, head, dev_list) { |
| 2115 | if (!dev->bdev) { | 2148 | if (!dev->bdev) { |
| 2116 | total_errors++; | 2149 | total_errors++; |
| @@ -2154,6 +2187,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) | |||
| 2154 | if (ret) | 2187 | if (ret) |
| 2155 | total_errors++; | 2188 | total_errors++; |
| 2156 | } | 2189 | } |
| 2190 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 2157 | if (total_errors > max_errors) { | 2191 | if (total_errors > max_errors) { |
| 2158 | printk(KERN_ERR "btrfs: %d errors while writing supers\n", | 2192 | printk(KERN_ERR "btrfs: %d errors while writing supers\n", |
| 2159 | total_errors); | 2193 | total_errors); |
| @@ -2173,6 +2207,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, | |||
| 2173 | 2207 | ||
| 2174 | int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) | 2208 | int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) |
| 2175 | { | 2209 | { |
| 2210 | WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); | ||
| 2176 | radix_tree_delete(&fs_info->fs_roots_radix, | 2211 | radix_tree_delete(&fs_info->fs_roots_radix, |
| 2177 | (unsigned long)root->root_key.objectid); | 2212 | (unsigned long)root->root_key.objectid); |
| 2178 | if (root->anon_super.s_dev) { | 2213 | if (root->anon_super.s_dev) { |
| @@ -2219,10 +2254,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) | |||
| 2219 | ARRAY_SIZE(gang)); | 2254 | ARRAY_SIZE(gang)); |
| 2220 | if (!ret) | 2255 | if (!ret) |
| 2221 | break; | 2256 | break; |
| 2257 | |||
| 2258 | root_objectid = gang[ret - 1]->root_key.objectid + 1; | ||
| 2222 | for (i = 0; i < ret; i++) { | 2259 | for (i = 0; i < ret; i++) { |
| 2223 | root_objectid = gang[i]->root_key.objectid; | 2260 | root_objectid = gang[i]->root_key.objectid; |
| 2224 | ret = btrfs_find_dead_roots(fs_info->tree_root, | 2261 | ret = btrfs_find_dead_roots(fs_info->tree_root, |
| 2225 | root_objectid, gang[i]); | 2262 | root_objectid); |
| 2226 | BUG_ON(ret); | 2263 | BUG_ON(ret); |
| 2227 | btrfs_orphan_cleanup(gang[i]); | 2264 | btrfs_orphan_cleanup(gang[i]); |
| 2228 | } | 2265 | } |
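The cleanup loop above uses the usual radix-tree gang-lookup idiom: fetch a batch of roots, record where the next lookup should resume (just past the last entry returned), then process the batch. A generic sketch of the idiom, with the per-root processing left as a placeholder:

/* Sketch: batched walk over fs_roots_radix; the resume point is saved before processing. */
struct btrfs_root *gang[8];
u64 root_objectid = 0;
int ret, i;

while (1) {
	ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
				     (void **)gang, root_objectid,
				     ARRAY_SIZE(gang));
	if (!ret)
		break;

	/* the next iteration starts just past the last root we were handed */
	root_objectid = gang[ret - 1]->root_key.objectid + 1;

	for (i = 0; i < ret; i++) {
		/* ... handle gang[i], e.g. dead root and orphan cleanup ... */
	}
}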
| @@ -2278,20 +2315,16 @@ int close_ctree(struct btrfs_root *root) | |||
| 2278 | (unsigned long long)fs_info->total_ref_cache_size); | 2315 | (unsigned long long)fs_info->total_ref_cache_size); |
| 2279 | } | 2316 | } |
| 2280 | 2317 | ||
| 2281 | if (fs_info->extent_root->node) | 2318 | free_extent_buffer(fs_info->extent_root->node); |
| 2282 | free_extent_buffer(fs_info->extent_root->node); | 2319 | free_extent_buffer(fs_info->extent_root->commit_root); |
| 2283 | 2320 | free_extent_buffer(fs_info->tree_root->node); | |
| 2284 | if (fs_info->tree_root->node) | 2321 | free_extent_buffer(fs_info->tree_root->commit_root); |
| 2285 | free_extent_buffer(fs_info->tree_root->node); | 2322 | free_extent_buffer(root->fs_info->chunk_root->node); |
| 2286 | 2323 | free_extent_buffer(root->fs_info->chunk_root->commit_root); | |
| 2287 | if (root->fs_info->chunk_root->node) | 2324 | free_extent_buffer(root->fs_info->dev_root->node); |
| 2288 | free_extent_buffer(root->fs_info->chunk_root->node); | 2325 | free_extent_buffer(root->fs_info->dev_root->commit_root); |
| 2289 | 2326 | free_extent_buffer(root->fs_info->csum_root->node); | |
| 2290 | if (root->fs_info->dev_root->node) | 2327 | free_extent_buffer(root->fs_info->csum_root->commit_root); |
| 2291 | free_extent_buffer(root->fs_info->dev_root->node); | ||
| 2292 | |||
| 2293 | if (root->fs_info->csum_root->node) | ||
| 2294 | free_extent_buffer(root->fs_info->csum_root->node); | ||
| 2295 | 2328 | ||
| 2296 | btrfs_free_block_groups(root->fs_info); | 2329 | btrfs_free_block_groups(root->fs_info); |
| 2297 | 2330 | ||
| @@ -2373,17 +2406,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | |||
| 2373 | * looks as though older kernels can get into trouble with | 2406 | * looks as though older kernels can get into trouble with |
| 2374 | * this code, they end up stuck in balance_dirty_pages forever | 2407 | * this code, they end up stuck in balance_dirty_pages forever |
| 2375 | */ | 2408 | */ |
| 2376 | struct extent_io_tree *tree; | ||
| 2377 | u64 num_dirty; | 2409 | u64 num_dirty; |
| 2378 | u64 start = 0; | ||
| 2379 | unsigned long thresh = 32 * 1024 * 1024; | 2410 | unsigned long thresh = 32 * 1024 * 1024; |
| 2380 | tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; | ||
| 2381 | 2411 | ||
| 2382 | if (current->flags & PF_MEMALLOC) | 2412 | if (current->flags & PF_MEMALLOC) |
| 2383 | return; | 2413 | return; |
| 2384 | 2414 | ||
| 2385 | num_dirty = count_range_bits(tree, &start, (u64)-1, | 2415 | num_dirty = root->fs_info->dirty_metadata_bytes; |
| 2386 | thresh, EXTENT_DIRTY); | 2416 | |
| 2387 | if (num_dirty > thresh) { | 2417 | if (num_dirty > thresh) { |
| 2388 | balance_dirty_pages_ratelimited_nr( | 2418 | balance_dirty_pages_ratelimited_nr( |
| 2389 | root->fs_info->btree_inode->i_mapping, 1); | 2419 | root->fs_info->btree_inode->i_mapping, 1); |
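With this hunk the throttling decision in btrfs_btree_balance_dirty() no longer scans the btree inode's extent_io tree; it simply compares the fs-wide dirty_metadata_bytes counter against a fixed threshold. Condensed to its essentials, the check now amounts to the following sketch:

/* Sketch: throttle btree dirtiers once dirty metadata exceeds a fixed threshold. */
unsigned long thresh = 32 * 1024 * 1024;
u64 num_dirty = root->fs_info->dirty_metadata_bytes;

if (!(current->flags & PF_MEMALLOC) && num_dirty > thresh)
	balance_dirty_pages_ratelimited_nr(
			root->fs_info->btree_inode->i_mapping, 1);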
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 85315d2c90de..9596b40caa4e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c | |||
| @@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, | |||
| 78 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | 78 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); |
| 79 | key.offset = 0; | 79 | key.offset = 0; |
| 80 | 80 | ||
| 81 | inode = btrfs_iget(sb, &key, root, NULL); | 81 | inode = btrfs_iget(sb, &key, root); |
| 82 | if (IS_ERR(inode)) | 82 | if (IS_ERR(inode)) |
| 83 | return (void *)inode; | 83 | return (void *)inode; |
| 84 | 84 | ||
| @@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) | |||
| 192 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | 192 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); |
| 193 | key.offset = 0; | 193 | key.offset = 0; |
| 194 | 194 | ||
| 195 | return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); | 195 | return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); |
| 196 | } | 196 | } |
| 197 | 197 | ||
| 198 | const struct export_operations btrfs_export_ops = { | 198 | const struct export_operations btrfs_export_ops = { |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 35af93355063..edc7d208c5ce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
| @@ -23,50 +23,39 @@ | |||
| 23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
| 24 | #include "compat.h" | 24 | #include "compat.h" |
| 25 | #include "hash.h" | 25 | #include "hash.h" |
| 26 | #include "crc32c.h" | ||
| 27 | #include "ctree.h" | 26 | #include "ctree.h" |
| 28 | #include "disk-io.h" | 27 | #include "disk-io.h" |
| 29 | #include "print-tree.h" | 28 | #include "print-tree.h" |
| 30 | #include "transaction.h" | 29 | #include "transaction.h" |
| 31 | #include "volumes.h" | 30 | #include "volumes.h" |
| 32 | #include "locking.h" | 31 | #include "locking.h" |
| 33 | #include "ref-cache.h" | ||
| 34 | #include "free-space-cache.h" | 32 | #include "free-space-cache.h" |
| 35 | 33 | ||
| 36 | #define PENDING_EXTENT_INSERT 0 | ||
| 37 | #define PENDING_EXTENT_DELETE 1 | ||
| 38 | #define PENDING_BACKREF_UPDATE 2 | ||
| 39 | |||
| 40 | struct pending_extent_op { | ||
| 41 | int type; | ||
| 42 | u64 bytenr; | ||
| 43 | u64 num_bytes; | ||
| 44 | u64 parent; | ||
| 45 | u64 orig_parent; | ||
| 46 | u64 generation; | ||
| 47 | u64 orig_generation; | ||
| 48 | int level; | ||
| 49 | struct list_head list; | ||
| 50 | int del; | ||
| 51 | }; | ||
| 52 | |||
| 53 | static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | ||
| 54 | struct btrfs_root *root, u64 parent, | ||
| 55 | u64 root_objectid, u64 ref_generation, | ||
| 56 | u64 owner, struct btrfs_key *ins, | ||
| 57 | int ref_mod); | ||
| 58 | static int update_reserved_extents(struct btrfs_root *root, | 34 | static int update_reserved_extents(struct btrfs_root *root, |
| 59 | u64 bytenr, u64 num, int reserve); | 35 | u64 bytenr, u64 num, int reserve); |
| 60 | static int update_block_group(struct btrfs_trans_handle *trans, | 36 | static int update_block_group(struct btrfs_trans_handle *trans, |
| 61 | struct btrfs_root *root, | 37 | struct btrfs_root *root, |
| 62 | u64 bytenr, u64 num_bytes, int alloc, | 38 | u64 bytenr, u64 num_bytes, int alloc, |
| 63 | int mark_free); | 39 | int mark_free); |
| 64 | static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, | 40 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
| 65 | struct btrfs_root *root, | 41 | struct btrfs_root *root, |
| 66 | u64 bytenr, u64 num_bytes, u64 parent, | 42 | u64 bytenr, u64 num_bytes, u64 parent, |
| 67 | u64 root_objectid, u64 ref_generation, | 43 | u64 root_objectid, u64 owner_objectid, |
| 68 | u64 owner_objectid, int pin, | 44 | u64 owner_offset, int refs_to_drop, |
| 69 | int ref_to_drop); | 45 | struct btrfs_delayed_extent_op *extra_op); |
| 46 | static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, | ||
| 47 | struct extent_buffer *leaf, | ||
| 48 | struct btrfs_extent_item *ei); | ||
| 49 | static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | ||
| 50 | struct btrfs_root *root, | ||
| 51 | u64 parent, u64 root_objectid, | ||
| 52 | u64 flags, u64 owner, u64 offset, | ||
| 53 | struct btrfs_key *ins, int ref_mod); | ||
| 54 | static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, | ||
| 55 | struct btrfs_root *root, | ||
| 56 | u64 parent, u64 root_objectid, | ||
| 57 | u64 flags, struct btrfs_disk_key *key, | ||
| 58 | int level, struct btrfs_key *ins); | ||
| 70 | 59 | ||
| 71 | static int do_chunk_alloc(struct btrfs_trans_handle *trans, | 60 | static int do_chunk_alloc(struct btrfs_trans_handle *trans, |
| 72 | struct btrfs_root *extent_root, u64 alloc_bytes, | 61 | struct btrfs_root *extent_root, u64 alloc_bytes, |
| @@ -453,199 +442,969 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) | |||
| 453 | * maintenance. This is actually the same as #2, but with a slightly | 442 | * maintenance. This is actually the same as #2, but with a slightly |
| 454 | * different use case. | 443 | * different use case. |
| 455 | * | 444 | * |
| 445 | * There are two kinds of back refs. The implicit back refs is optimized | ||
| 446 | * for pointers in non-shared tree blocks. For a given pointer in a block, | ||
| 447 | * back refs of this kind provide information about the block's owner tree | ||
| 448 | * and the pointer's key. This information allows us to find the block by | ||
| 449 | * b-tree searching. The full back refs is for pointers in tree blocks not | ||
| 450 | * referenced by their owner trees. The location of tree block is recorded | ||
| 451 | * in the back refs. Actually the full back refs is generic, and can be | ||
| 452 | * used in all cases where the implicit back refs is used. The major shortcoming | ||
| 453 | * of the full back refs is its overhead. Every time a tree block gets | ||
| 454 | * COWed, we have to update the back ref entries for all pointers in it. | ||
| 455 | * | ||
| 456 | * For a newly allocated tree block, we use implicit back refs for | ||
| 457 | * pointers in it. This means most tree related operations only involve | ||
| 458 | * implicit back refs. For a tree block created in an old transaction, the | ||
| 459 | * only way to drop a reference to it is to COW it. So we can detect the | ||
| 460 | * event that tree block loses its owner tree's reference and do the | ||
| 461 | * back refs conversion. | ||
| 462 | * | ||
| 463 | * When a tree block is COW'd through a tree, there are four cases: | ||
| 464 | * | ||
| 465 | * The reference count of the block is one and the tree is the block's | ||
| 466 | * owner tree. Nothing to do in this case. | ||
| 467 | * | ||
| 468 | * The reference count of the block is one and the tree is not the | ||
| 469 | * block's owner tree. In this case, full back refs is used for pointers | ||
| 470 | * in the block. Remove these full back refs, add implicit back refs for | ||
| 471 | * every pointer in the new block. | ||
| 472 | * | ||
| 473 | * The reference count of the block is greater than one and the tree is | ||
| 474 | * the block's owner tree. In this case, implicit back refs is used for | ||
| 475 | * pointers in the block. Add full back refs for every pointer in the | ||
| 476 | * block, increase lower level extents' reference counts. The original | ||
| 477 | * implicit back refs are carried over to the new block. | ||
| 478 | * | ||
| 479 | * The reference count of the block is greater than one and the tree is | ||
| 480 | * not the block's owner tree. Add implicit back refs for every pointer in | ||
| 481 | * the new block, increase lower level extents' reference count. | ||
| 482 | * | ||
| 483 | * Back Reference Key composing: | ||
| 484 | * | ||
| 485 | * The key objectid corresponds to the first byte in the extent, | ||
| 486 | * The key type is used to differentiate between types of back refs. | ||
| 487 | * There are different meanings of the key offset for different types | ||
| 488 | * of back refs. | ||
| 489 | * | ||
| 456 | * File extents can be referenced by: | 490 | * File extents can be referenced by: |
| 457 | * | 491 | * |
| 458 | * - multiple snapshots, subvolumes, or different generations in one subvol | 492 | * - multiple snapshots, subvolumes, or different generations in one subvol |
| 459 | * - different files inside a single subvolume | 493 | * - different files inside a single subvolume |
| 460 | * - different offsets inside a file (bookend extents in file.c) | 494 | * - different offsets inside a file (bookend extents in file.c) |
| 461 | * | 495 | * |
| 462 | * The extent ref structure has fields for: | 496 | * The extent ref structure for the implicit back refs has fields for: |
| 463 | * | 497 | * |
| 464 | * - Objectid of the subvolume root | 498 | * - Objectid of the subvolume root |
| 465 | * - Generation number of the tree holding the reference | ||
| 466 | * - objectid of the file holding the reference | 499 | * - objectid of the file holding the reference |
| 467 | * - number of references holding by parent node (alway 1 for tree blocks) | 500 | * - original offset in the file |
| 468 | * | 501 | * - how many bookend extents |
| 469 | * Btree leaf may hold multiple references to a file extent. In most cases, | ||
| 470 | * these references are from same file and the corresponding offsets inside | ||
| 471 | * the file are close together. | ||
| 472 | * | ||
| 473 | * When a file extent is allocated the fields are filled in: | ||
| 474 | * (root_key.objectid, trans->transid, inode objectid, 1) | ||
| 475 | * | 502 | * |
| 476 | * When a leaf is cow'd new references are added for every file extent found | 503 | * The key offset for the implicit back refs is the hash of the first |
| 477 | * in the leaf. It looks similar to the create case, but trans->transid will | 504 | * three fields. |
| 478 | * be different when the block is cow'd. | ||
| 479 | * | 505 | * |
| 480 | * (root_key.objectid, trans->transid, inode objectid, | 506 | * The extent ref structure for the full back refs has a field for: |
| 481 | * number of references in the leaf) | ||
| 482 | * | 507 | * |
| 483 | * When a file extent is removed either during snapshot deletion or | 508 | * - number of pointers in the tree leaf |
| 484 | * file truncation, we find the corresponding back reference and check | ||
| 485 | * the following fields: | ||
| 486 | * | 509 | * |
| 487 | * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), | 510 | * The key offset for the full back refs is the first byte of |
| 488 | * inode objectid) | 511 | * the tree leaf |
| 489 | * | 512 | * |
| 490 | * Btree extents can be referenced by: | 513 | * When a file extent is allocated, the implicit back refs is used. |
| 491 | * | 514 | * The fields are filled in: |
| 492 | * - Different subvolumes | ||
| 493 | * - Different generations of the same subvolume | ||
| 494 | * | ||
| 495 | * When a tree block is created, back references are inserted: | ||
| 496 | * | 515 | * |
| 497 | * (root->root_key.objectid, trans->transid, level, 1) | 516 | * (root_key.objectid, inode objectid, offset in file, 1) |
| 498 | * | 517 | * |
| 499 | * When a tree block is cow'd, new back references are added for all the | 518 | * When a file extent is removed during file truncation, we find the |
| 500 | * blocks it points to. If the tree block isn't in reference counted root, | 519 | * corresponding implicit back refs and check the following fields: |
| 501 | * the old back references are removed. These new back references are of | ||
| 502 | * the form (trans->transid will have increased since creation): | ||
| 503 | * | 520 | * |
| 504 | * (root->root_key.objectid, trans->transid, level, 1) | 521 | * (btrfs_header_owner(leaf), inode objectid, offset in file) |
| 505 | * | 522 | * |
| 506 | * When a backref is in deleting, the following fields are checked: | 523 | * Btree extents can be referenced by: |
| 507 | * | 524 | * |
| 508 | * if backref was for a tree root: | 525 | * - Different subvolumes |
| 509 | * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) | ||
| 510 | * else | ||
| 511 | * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) | ||
| 512 | * | 526 | * |
| 513 | * Back Reference Key composing: | 527 | * Both the implicit back refs and the full back refs for tree blocks |
| 528 | * only consist of a key. The key offset for the implicit back refs is | ||
| 529 | * the objectid of the block's owner tree. The key offset for the full back refs | ||
| 530 | * is the first byte of the parent block. | ||
| 514 | * | 531 | * |
| 515 | * The key objectid corresponds to the first byte in the extent, the key | 532 | * When implicit back refs is used, information about the lowest key and |
| 516 | * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first | 533 | * level of the tree block is required. This information is stored in |
| 517 | * byte of parent extent. If a extent is tree root, the key offset is set | 534 | * the tree block info structure. |
| 518 | * to the key objectid. | ||
| 519 | */ | 535 | */ |
| 520 | 536 | ||
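To make the key layout described in the comment above concrete, this is how the two kinds of data extent back references end up being keyed. It is a summary of the scheme (and matches the key construction in lookup_extent_data_ref() further down), not code taken verbatim from the patch:

/* Sketch: keys for the two kinds of data extent back references. */
struct btrfs_key key;

key.objectid = bytenr;			/* first byte of the extent */
if (parent) {
	/* full (shared) back ref: offset is the parent tree block's bytenr */
	key.type = BTRFS_SHARED_DATA_REF_KEY;
	key.offset = parent;
} else {
	/* implicit back ref: offset is the hash of root, owner and offset */
	key.type = BTRFS_EXTENT_DATA_REF_KEY;
	key.offset = hash_extent_data_ref(root_objectid, owner, offset);
}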
| 521 | static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, | 537 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 |
| 522 | struct btrfs_root *root, | 538 | static int convert_extent_item_v0(struct btrfs_trans_handle *trans, |
| 523 | struct btrfs_path *path, | 539 | struct btrfs_root *root, |
| 524 | u64 bytenr, u64 parent, | 540 | struct btrfs_path *path, |
| 525 | u64 ref_root, u64 ref_generation, | 541 | u64 owner, u32 extra_size) |
| 526 | u64 owner_objectid, int del) | ||
| 527 | { | 542 | { |
| 543 | struct btrfs_extent_item *item; | ||
| 544 | struct btrfs_extent_item_v0 *ei0; | ||
| 545 | struct btrfs_extent_ref_v0 *ref0; | ||
| 546 | struct btrfs_tree_block_info *bi; | ||
| 547 | struct extent_buffer *leaf; | ||
| 528 | struct btrfs_key key; | 548 | struct btrfs_key key; |
| 529 | struct btrfs_extent_ref *ref; | 549 | struct btrfs_key found_key; |
| 550 | u32 new_size = sizeof(*item); | ||
| 551 | u64 refs; | ||
| 552 | int ret; | ||
| 553 | |||
| 554 | leaf = path->nodes[0]; | ||
| 555 | BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); | ||
| 556 | |||
| 557 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
| 558 | ei0 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 559 | struct btrfs_extent_item_v0); | ||
| 560 | refs = btrfs_extent_refs_v0(leaf, ei0); | ||
| 561 | |||
| 562 | if (owner == (u64)-1) { | ||
| 563 | while (1) { | ||
| 564 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
| 565 | ret = btrfs_next_leaf(root, path); | ||
| 566 | if (ret < 0) | ||
| 567 | return ret; | ||
| 568 | BUG_ON(ret > 0); | ||
| 569 | leaf = path->nodes[0]; | ||
| 570 | } | ||
| 571 | btrfs_item_key_to_cpu(leaf, &found_key, | ||
| 572 | path->slots[0]); | ||
| 573 | BUG_ON(key.objectid != found_key.objectid); | ||
| 574 | if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { | ||
| 575 | path->slots[0]++; | ||
| 576 | continue; | ||
| 577 | } | ||
| 578 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 579 | struct btrfs_extent_ref_v0); | ||
| 580 | owner = btrfs_ref_objectid_v0(leaf, ref0); | ||
| 581 | break; | ||
| 582 | } | ||
| 583 | } | ||
| 584 | btrfs_release_path(root, path); | ||
| 585 | |||
| 586 | if (owner < BTRFS_FIRST_FREE_OBJECTID) | ||
| 587 | new_size += sizeof(*bi); | ||
| 588 | |||
| 589 | new_size -= sizeof(*ei0); | ||
| 590 | ret = btrfs_search_slot(trans, root, &key, path, | ||
| 591 | new_size + extra_size, 1); | ||
| 592 | if (ret < 0) | ||
| 593 | return ret; | ||
| 594 | BUG_ON(ret); | ||
| 595 | |||
| 596 | ret = btrfs_extend_item(trans, root, path, new_size); | ||
| 597 | BUG_ON(ret); | ||
| 598 | |||
| 599 | leaf = path->nodes[0]; | ||
| 600 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
| 601 | btrfs_set_extent_refs(leaf, item, refs); | ||
| 602 | /* FIXME: get real generation */ | ||
| 603 | btrfs_set_extent_generation(leaf, item, 0); | ||
| 604 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
| 605 | btrfs_set_extent_flags(leaf, item, | ||
| 606 | BTRFS_EXTENT_FLAG_TREE_BLOCK | | ||
| 607 | BTRFS_BLOCK_FLAG_FULL_BACKREF); | ||
| 608 | bi = (struct btrfs_tree_block_info *)(item + 1); | ||
| 609 | /* FIXME: get first key of the block */ | ||
| 610 | memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); | ||
| 611 | btrfs_set_tree_block_level(leaf, bi, (int)owner); | ||
| 612 | } else { | ||
| 613 | btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); | ||
| 614 | } | ||
| 615 | btrfs_mark_buffer_dirty(leaf); | ||
| 616 | return 0; | ||
| 617 | } | ||
| 618 | #endif | ||
| 619 | |||
| 620 | static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) | ||
| 621 | { | ||
| 622 | u32 high_crc = ~(u32)0; | ||
| 623 | u32 low_crc = ~(u32)0; | ||
| 624 | __le64 lenum; | ||
| 625 | |||
| 626 | lenum = cpu_to_le64(root_objectid); | ||
| 627 | high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); | ||
| 628 | lenum = cpu_to_le64(owner); | ||
| 629 | low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); | ||
| 630 | lenum = cpu_to_le64(offset); | ||
| 631 | low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); | ||
| 632 | |||
| 633 | return ((u64)high_crc << 31) ^ (u64)low_crc; | ||
| 634 | } | ||
| 635 | |||
| 636 | static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, | ||
| 637 | struct btrfs_extent_data_ref *ref) | ||
| 638 | { | ||
| 639 | return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), | ||
| 640 | btrfs_extent_data_ref_objectid(leaf, ref), | ||
| 641 | btrfs_extent_data_ref_offset(leaf, ref)); | ||
| 642 | } | ||
| 643 | |||
| 644 | static int match_extent_data_ref(struct extent_buffer *leaf, | ||
| 645 | struct btrfs_extent_data_ref *ref, | ||
| 646 | u64 root_objectid, u64 owner, u64 offset) | ||
| 647 | { | ||
| 648 | if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || | ||
| 649 | btrfs_extent_data_ref_objectid(leaf, ref) != owner || | ||
| 650 | btrfs_extent_data_ref_offset(leaf, ref) != offset) | ||
| 651 | return 0; | ||
| 652 | return 1; | ||
| 653 | } | ||
| 654 | |||
| 655 | static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, | ||
| 656 | struct btrfs_root *root, | ||
| 657 | struct btrfs_path *path, | ||
| 658 | u64 bytenr, u64 parent, | ||
| 659 | u64 root_objectid, | ||
| 660 | u64 owner, u64 offset) | ||
| 661 | { | ||
| 662 | struct btrfs_key key; | ||
| 663 | struct btrfs_extent_data_ref *ref; | ||
| 530 | struct extent_buffer *leaf; | 664 | struct extent_buffer *leaf; |
| 531 | u64 ref_objectid; | 665 | u32 nritems; |
| 532 | int ret; | 666 | int ret; |
| 667 | int recow; | ||
| 668 | int err = -ENOENT; | ||
| 533 | 669 | ||
| 534 | key.objectid = bytenr; | 670 | key.objectid = bytenr; |
| 535 | key.type = BTRFS_EXTENT_REF_KEY; | 671 | if (parent) { |
| 536 | key.offset = parent; | 672 | key.type = BTRFS_SHARED_DATA_REF_KEY; |
| 673 | key.offset = parent; | ||
| 674 | } else { | ||
| 675 | key.type = BTRFS_EXTENT_DATA_REF_KEY; | ||
| 676 | key.offset = hash_extent_data_ref(root_objectid, | ||
| 677 | owner, offset); | ||
| 678 | } | ||
| 679 | again: | ||
| 680 | recow = 0; | ||
| 681 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
| 682 | if (ret < 0) { | ||
| 683 | err = ret; | ||
| 684 | goto fail; | ||
| 685 | } | ||
| 537 | 686 | ||
| 538 | ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); | 687 | if (parent) { |
| 539 | if (ret < 0) | 688 | if (!ret) |
| 540 | goto out; | 689 | return 0; |
| 541 | if (ret > 0) { | 690 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 |
| 542 | ret = -ENOENT; | 691 | key.type = BTRFS_EXTENT_REF_V0_KEY; |
| 543 | goto out; | 692 | btrfs_release_path(root, path); |
| 693 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
| 694 | if (ret < 0) { | ||
| 695 | err = ret; | ||
| 696 | goto fail; | ||
| 697 | } | ||
| 698 | if (!ret) | ||
| 699 | return 0; | ||
| 700 | #endif | ||
| 701 | goto fail; | ||
| 544 | } | 702 | } |
| 545 | 703 | ||
| 546 | leaf = path->nodes[0]; | 704 | leaf = path->nodes[0]; |
| 547 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | 705 | nritems = btrfs_header_nritems(leaf); |
| 548 | ref_objectid = btrfs_ref_objectid(leaf, ref); | 706 | while (1) { |
| 549 | if (btrfs_ref_root(leaf, ref) != ref_root || | 707 | if (path->slots[0] >= nritems) { |
| 550 | btrfs_ref_generation(leaf, ref) != ref_generation || | 708 | ret = btrfs_next_leaf(root, path); |
| 551 | (ref_objectid != owner_objectid && | 709 | if (ret < 0) |
| 552 | ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { | 710 | err = ret; |
| 553 | ret = -EIO; | 711 | if (ret) |
| 554 | WARN_ON(1); | 712 | goto fail; |
| 555 | goto out; | 713 | |
| 714 | leaf = path->nodes[0]; | ||
| 715 | nritems = btrfs_header_nritems(leaf); | ||
| 716 | recow = 1; | ||
| 717 | } | ||
| 718 | |||
| 719 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
| 720 | if (key.objectid != bytenr || | ||
| 721 | key.type != BTRFS_EXTENT_DATA_REF_KEY) | ||
| 722 | goto fail; | ||
| 723 | |||
| 724 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
| 725 | struct btrfs_extent_data_ref); | ||
| 726 | |||
| 727 | if (match_extent_data_ref(leaf, ref, root_objectid, | ||
| 728 | owner, offset)) { | ||
| 729 | if (recow) { | ||
| 730 | btrfs_release_path(root, path); | ||
| 731 | goto again; | ||
| 732 | } | ||
| 733 | err = 0; | ||
| 734 | break; | ||
| 735 | } | ||
| 736 | path->slots[0]++; | ||
| 556 | } | 737 | } |
| 557 | ret = 0; | 738 | fail: |
| 558 | out: | 739 | return err; |
| 559 | return ret; | ||
| 560 | } | 740 | } |
| 561 | 741 | ||
| 562 | static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, | 742 | static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, |
| 563 | struct btrfs_root *root, | 743 | struct btrfs_root *root, |
| 564 | struct btrfs_path *path, | 744 | struct btrfs_path *path, |
| 565 | u64 bytenr, u64 parent, | 745 | u64 bytenr, u64 parent, |
| 566 | u64 ref_root, u64 ref_generation, | 746 | u64 root_objectid, u64 owner, |
| 567 | u64 owner_objectid, | 747 | u64 offset, int refs_to_add) |
| 568 | int refs_to_add) | ||
| 569 | { | 748 | { |
| 570 | struct btrfs_key key; | 749 | struct btrfs_key key; |
| 571 | struct extent_buffer *leaf; | 750 | struct extent_buffer *leaf; |
| 572 | struct btrfs_extent_ref *ref; | 751 | u32 size; |
| 573 | u32 num_refs; | 752 | u32 num_refs; |
| 574 | int ret; | 753 | int ret; |
| 575 | 754 | ||
| 576 | key.objectid = bytenr; | 755 | key.objectid = bytenr; |
| 577 | key.type = BTRFS_EXTENT_REF_KEY; | 756 | if (parent) { |
| 578 | key.offset = parent; | 757 | key.type = BTRFS_SHARED_DATA_REF_KEY; |
| 758 | key.offset = parent; | ||
| 759 | size = sizeof(struct btrfs_shared_data_ref); | ||
| 760 | } else { | ||
| 761 | key.type = BTRFS_EXTENT_DATA_REF_KEY; | ||
| 762 | key.offset = hash_extent_data_ref(root_objectid, | ||
| 763 | owner, offset); | ||
| 764 | size = sizeof(struct btrfs_extent_data_ref); | ||
| 765 | } | ||
| 579 | 766 | ||
| 580 | ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); | 767 | ret = btrfs_insert_empty_item(trans, root, path, &key, size); |
| 581 | if (ret == 0) { | 768 | if (ret && ret != -EEXIST) |
| 582 | leaf = path->nodes[0]; | 769 | goto fail; |
| 583 | ref = btrfs_item_ptr(leaf, path->slots[0], | 770 | |
| 584 | struct btrfs_extent_ref); | 771 | leaf = path->nodes[0]; |
| 585 | btrfs_set_ref_root(leaf, ref, ref_root); | 772 | if (parent) { |
| 586 | btrfs_set_ref_generation(leaf, ref, ref_generation); | 773 | struct btrfs_shared_data_ref *ref; |
| 587 | btrfs_set_ref_objectid(leaf, ref, owner_objectid); | ||
| 588 | btrfs_set_ref_num_refs(leaf, ref, refs_to_add); | ||
| 589 | } else if (ret == -EEXIST) { | ||
| 590 | u64 existing_owner; | ||
| 591 | |||
| 592 | BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); | ||
| 593 | leaf = path->nodes[0]; | ||
| 594 | ref = btrfs_item_ptr(leaf, path->slots[0], | 774 | ref = btrfs_item_ptr(leaf, path->slots[0], |
| 595 | struct btrfs_extent_ref); | 775 | struct btrfs_shared_data_ref); |
| 596 | if (btrfs_ref_root(leaf, ref) != ref_root || | 776 | if (ret == 0) { |
| 597 | btrfs_ref_generation(leaf, ref) != ref_generation) { | 777 | btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); |
| 598 | ret = -EIO; | 778 | } else { |
| 599 | WARN_ON(1); | 779 | num_refs = btrfs_shared_data_ref_count(leaf, ref); |
| 600 | goto out; | 780 | num_refs += refs_to_add; |
| 781 | btrfs_set_shared_data_ref_count(leaf, ref, num_refs); | ||
| 601 | } | 782 | } |
| 783 | } else { | ||
| 784 | struct btrfs_extent_data_ref *ref; | ||
| 785 | while (ret == -EEXIST) { | ||
| 786 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
| 787 | struct btrfs_extent_data_ref); | ||
| 788 | if (match_extent_data_ref(leaf, ref, root_objectid, | ||
| 789 | owner, offset)) | ||
| 790 | break; | ||
| 791 | btrfs_release_path(root, path); | ||
| 792 | key.offset++; | ||
| 793 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
| 794 | size); | ||
| 795 | if (ret && ret != -EEXIST) | ||
| 796 | goto fail; | ||
| 602 | 797 | ||
| 603 | num_refs = btrfs_ref_num_refs(leaf, ref); | 798 | leaf = path->nodes[0]; |
| 604 | BUG_ON(num_refs == 0); | 799 | } |
| 605 | btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); | 800 | ref = btrfs_item_ptr(leaf, path->slots[0], |
| 606 | 801 | struct btrfs_extent_data_ref); | |
| 607 | existing_owner = btrfs_ref_objectid(leaf, ref); | 802 | if (ret == 0) { |
| 608 | if (existing_owner != owner_objectid && | 803 | btrfs_set_extent_data_ref_root(leaf, ref, |
| 609 | existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { | 804 | root_objectid); |
| 610 | btrfs_set_ref_objectid(leaf, ref, | 805 | btrfs_set_extent_data_ref_objectid(leaf, ref, owner); |
| 611 | BTRFS_MULTIPLE_OBJECTIDS); | 806 | btrfs_set_extent_data_ref_offset(leaf, ref, offset); |
| 807 | btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); | ||
| 808 | } else { | ||
| 809 | num_refs = btrfs_extent_data_ref_count(leaf, ref); | ||
| 810 | num_refs += refs_to_add; | ||
| 811 | btrfs_set_extent_data_ref_count(leaf, ref, num_refs); | ||
| 612 | } | 812 | } |
| 613 | ret = 0; | ||
| 614 | } else { | ||
| 615 | goto out; | ||
| 616 | } | 813 | } |
| 617 | btrfs_unlock_up_safe(path, 1); | 814 | btrfs_mark_buffer_dirty(leaf); |
| 618 | btrfs_mark_buffer_dirty(path->nodes[0]); | 815 | ret = 0; |
| 619 | out: | 816 | fail: |
| 620 | btrfs_release_path(root, path); | 817 | btrfs_release_path(root, path); |
| 621 | return ret; | 818 | return ret; |
| 622 | } | 819 | } |
| 623 | 820 | ||
| 624 | static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, | 821 | static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, |
| 625 | struct btrfs_root *root, | 822 | struct btrfs_root *root, |
| 626 | struct btrfs_path *path, | 823 | struct btrfs_path *path, |
| 627 | int refs_to_drop) | 824 | int refs_to_drop) |
| 628 | { | 825 | { |
| 826 | struct btrfs_key key; | ||
| 827 | struct btrfs_extent_data_ref *ref1 = NULL; | ||
| 828 | struct btrfs_shared_data_ref *ref2 = NULL; | ||
| 629 | struct extent_buffer *leaf; | 829 | struct extent_buffer *leaf; |
| 630 | struct btrfs_extent_ref *ref; | 830 | u32 num_refs = 0; |
| 631 | u32 num_refs; | ||
| 632 | int ret = 0; | 831 | int ret = 0; |
| 633 | 832 | ||
| 634 | leaf = path->nodes[0]; | 833 | leaf = path->nodes[0]; |
| 635 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | 834 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); |
| 636 | num_refs = btrfs_ref_num_refs(leaf, ref); | 835 | |
| 836 | if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 837 | ref1 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 838 | struct btrfs_extent_data_ref); | ||
| 839 | num_refs = btrfs_extent_data_ref_count(leaf, ref1); | ||
| 840 | } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
| 841 | ref2 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 842 | struct btrfs_shared_data_ref); | ||
| 843 | num_refs = btrfs_shared_data_ref_count(leaf, ref2); | ||
| 844 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 845 | } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
| 846 | struct btrfs_extent_ref_v0 *ref0; | ||
| 847 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 848 | struct btrfs_extent_ref_v0); | ||
| 849 | num_refs = btrfs_ref_count_v0(leaf, ref0); | ||
| 850 | #endif | ||
| 851 | } else { | ||
| 852 | BUG(); | ||
| 853 | } | ||
| 854 | |||
| 637 | BUG_ON(num_refs < refs_to_drop); | 855 | BUG_ON(num_refs < refs_to_drop); |
| 638 | num_refs -= refs_to_drop; | 856 | num_refs -= refs_to_drop; |
| 857 | |||
| 639 | if (num_refs == 0) { | 858 | if (num_refs == 0) { |
| 640 | ret = btrfs_del_item(trans, root, path); | 859 | ret = btrfs_del_item(trans, root, path); |
| 641 | } else { | 860 | } else { |
| 642 | btrfs_set_ref_num_refs(leaf, ref, num_refs); | 861 | if (key.type == BTRFS_EXTENT_DATA_REF_KEY) |
| 862 | btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); | ||
| 863 | else if (key.type == BTRFS_SHARED_DATA_REF_KEY) | ||
| 864 | btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); | ||
| 865 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 866 | else { | ||
| 867 | struct btrfs_extent_ref_v0 *ref0; | ||
| 868 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 869 | struct btrfs_extent_ref_v0); | ||
| 870 | btrfs_set_ref_count_v0(leaf, ref0, num_refs); | ||
| 871 | } | ||
| 872 | #endif | ||
| 643 | btrfs_mark_buffer_dirty(leaf); | 873 | btrfs_mark_buffer_dirty(leaf); |
| 644 | } | 874 | } |
| 875 | return ret; | ||
| 876 | } | ||
| 877 | |||
| 878 | static noinline u32 extent_data_ref_count(struct btrfs_root *root, | ||
| 879 | struct btrfs_path *path, | ||
| 880 | struct btrfs_extent_inline_ref *iref) | ||
| 881 | { | ||
| 882 | struct btrfs_key key; | ||
| 883 | struct extent_buffer *leaf; | ||
| 884 | struct btrfs_extent_data_ref *ref1; | ||
| 885 | struct btrfs_shared_data_ref *ref2; | ||
| 886 | u32 num_refs = 0; | ||
| 887 | |||
| 888 | leaf = path->nodes[0]; | ||
| 889 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
| 890 | if (iref) { | ||
| 891 | if (btrfs_extent_inline_ref_type(leaf, iref) == | ||
| 892 | BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 893 | ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
| 894 | num_refs = btrfs_extent_data_ref_count(leaf, ref1); | ||
| 895 | } else { | ||
| 896 | ref2 = (struct btrfs_shared_data_ref *)(iref + 1); | ||
| 897 | num_refs = btrfs_shared_data_ref_count(leaf, ref2); | ||
| 898 | } | ||
| 899 | } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 900 | ref1 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 901 | struct btrfs_extent_data_ref); | ||
| 902 | num_refs = btrfs_extent_data_ref_count(leaf, ref1); | ||
| 903 | } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
| 904 | ref2 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 905 | struct btrfs_shared_data_ref); | ||
| 906 | num_refs = btrfs_shared_data_ref_count(leaf, ref2); | ||
| 907 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 908 | } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
| 909 | struct btrfs_extent_ref_v0 *ref0; | ||
| 910 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
| 911 | struct btrfs_extent_ref_v0); | ||
| 912 | num_refs = btrfs_ref_count_v0(leaf, ref0); | ||
| 913 | #endif | ||
| 914 | } else { | ||
| 915 | WARN_ON(1); | ||
| 916 | } | ||
| 917 | return num_refs; | ||
| 918 | } | ||
| 919 | |||
| 920 | static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, | ||
| 921 | struct btrfs_root *root, | ||
| 922 | struct btrfs_path *path, | ||
| 923 | u64 bytenr, u64 parent, | ||
| 924 | u64 root_objectid) | ||
| 925 | { | ||
| 926 | struct btrfs_key key; | ||
| 927 | int ret; | ||
| 928 | |||
| 929 | key.objectid = bytenr; | ||
| 930 | if (parent) { | ||
| 931 | key.type = BTRFS_SHARED_BLOCK_REF_KEY; | ||
| 932 | key.offset = parent; | ||
| 933 | } else { | ||
| 934 | key.type = BTRFS_TREE_BLOCK_REF_KEY; | ||
| 935 | key.offset = root_objectid; | ||
| 936 | } | ||
| 937 | |||
| 938 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
| 939 | if (ret > 0) | ||
| 940 | ret = -ENOENT; | ||
| 941 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 942 | if (ret == -ENOENT && parent) { | ||
| 943 | btrfs_release_path(root, path); | ||
| 944 | key.type = BTRFS_EXTENT_REF_V0_KEY; | ||
| 945 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
| 946 | if (ret > 0) | ||
| 947 | ret = -ENOENT; | ||
| 948 | } | ||
| 949 | #endif | ||
| 950 | return ret; | ||
| 951 | } | ||
| 952 | |||
| 953 | static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, | ||
| 954 | struct btrfs_root *root, | ||
| 955 | struct btrfs_path *path, | ||
| 956 | u64 bytenr, u64 parent, | ||
| 957 | u64 root_objectid) | ||
| 958 | { | ||
| 959 | struct btrfs_key key; | ||
| 960 | int ret; | ||
| 961 | |||
| 962 | key.objectid = bytenr; | ||
| 963 | if (parent) { | ||
| 964 | key.type = BTRFS_SHARED_BLOCK_REF_KEY; | ||
| 965 | key.offset = parent; | ||
| 966 | } else { | ||
| 967 | key.type = BTRFS_TREE_BLOCK_REF_KEY; | ||
| 968 | key.offset = root_objectid; | ||
| 969 | } | ||
| 970 | |||
| 971 | ret = btrfs_insert_empty_item(trans, root, path, &key, 0); | ||
| 645 | btrfs_release_path(root, path); | 972 | btrfs_release_path(root, path); |
| 646 | return ret; | 973 | return ret; |
| 647 | } | 974 | } |
| 648 | 975 | ||
| 976 | static inline int extent_ref_type(u64 parent, u64 owner) | ||
| 977 | { | ||
| 978 | int type; | ||
| 979 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
| 980 | if (parent > 0) | ||
| 981 | type = BTRFS_SHARED_BLOCK_REF_KEY; | ||
| 982 | else | ||
| 983 | type = BTRFS_TREE_BLOCK_REF_KEY; | ||
| 984 | } else { | ||
| 985 | if (parent > 0) | ||
| 986 | type = BTRFS_SHARED_DATA_REF_KEY; | ||
| 987 | else | ||
| 988 | type = BTRFS_EXTENT_DATA_REF_KEY; | ||
| 989 | } | ||
| 990 | return type; | ||
| 991 | } | ||
| 992 | |||
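extent_ref_type() above picks the back ref key type from just two inputs: whether the owner is a tree level (below BTRFS_FIRST_FREE_OBJECTID) or an inode number, and whether a parent block is supplied. The four outcomes, written out as a usage sketch ('level' and 'ino' are placeholder values, not names from the patch):

/* Sketch: the four (parent, owner) combinations and the key type returned. */
type = extent_ref_type(0,      level);	/* tree block, no parent -> BTRFS_TREE_BLOCK_REF_KEY   */
type = extent_ref_type(parent, level);	/* tree block, shared    -> BTRFS_SHARED_BLOCK_REF_KEY */
type = extent_ref_type(0,      ino);	/* file data, no parent  -> BTRFS_EXTENT_DATA_REF_KEY  */
type = extent_ref_type(parent, ino);	/* file data, shared     -> BTRFS_SHARED_DATA_REF_KEY  */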
| 993 | static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) | ||
| 994 | |||
| 995 | { | ||
| 996 | int level; | ||
| 997 | BUG_ON(!path->keep_locks); | ||
| 998 | for (level = 0; level < BTRFS_MAX_LEVEL; level++) { | ||
| 999 | if (!path->nodes[level]) | ||
| 1000 | break; | ||
| 1001 | btrfs_assert_tree_locked(path->nodes[level]); | ||
| 1002 | if (path->slots[level] + 1 >= | ||
| 1003 | btrfs_header_nritems(path->nodes[level])) | ||
| 1004 | continue; | ||
| 1005 | if (level == 0) | ||
| 1006 | btrfs_item_key_to_cpu(path->nodes[level], key, | ||
| 1007 | path->slots[level] + 1); | ||
| 1008 | else | ||
| 1009 | btrfs_node_key_to_cpu(path->nodes[level], key, | ||
| 1010 | path->slots[level] + 1); | ||
| 1011 | return 0; | ||
| 1012 | } | ||
| 1013 | return 1; | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | /* | ||
| 1017 | * look for inline back ref. if back ref is found, *ref_ret is set | ||
| 1018 | * to the address of inline back ref, and 0 is returned. | ||
| 1019 | * | ||
| 1020 | * if back ref isn't found, *ref_ret is set to the address where it | ||
| 1021 | * should be inserted, and -ENOENT is returned. | ||
| 1022 | * | ||
| 1023 | * if insert is true and there are too many inline back refs, the path | ||
| 1024 | * points to the extent item, and -EAGAIN is returned. | ||
| 1025 | * | ||
| 1026 | * NOTE: inline back refs are ordered in the same way that back ref | ||
| 1027 | * items in the tree are ordered. | ||
| 1028 | */ | ||
| 1029 | static noinline_for_stack | ||
| 1030 | int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
| 1031 | struct btrfs_root *root, | ||
| 1032 | struct btrfs_path *path, | ||
| 1033 | struct btrfs_extent_inline_ref **ref_ret, | ||
| 1034 | u64 bytenr, u64 num_bytes, | ||
| 1035 | u64 parent, u64 root_objectid, | ||
| 1036 | u64 owner, u64 offset, int insert) | ||
| 1037 | { | ||
| 1038 | struct btrfs_key key; | ||
| 1039 | struct extent_buffer *leaf; | ||
| 1040 | struct btrfs_extent_item *ei; | ||
| 1041 | struct btrfs_extent_inline_ref *iref; | ||
| 1042 | u64 flags; | ||
| 1043 | u64 item_size; | ||
| 1044 | unsigned long ptr; | ||
| 1045 | unsigned long end; | ||
| 1046 | int extra_size; | ||
| 1047 | int type; | ||
| 1048 | int want; | ||
| 1049 | int ret; | ||
| 1050 | int err = 0; | ||
| 1051 | |||
| 1052 | key.objectid = bytenr; | ||
| 1053 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 1054 | key.offset = num_bytes; | ||
| 1055 | |||
| 1056 | want = extent_ref_type(parent, owner); | ||
| 1057 | if (insert) { | ||
| 1058 | extra_size = btrfs_extent_inline_ref_size(want); | ||
| 1059 | path->keep_locks = 1; | ||
| 1060 | } else | ||
| 1061 | extra_size = -1; | ||
| 1062 | ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); | ||
| 1063 | if (ret < 0) { | ||
| 1064 | err = ret; | ||
| 1065 | goto out; | ||
| 1066 | } | ||
| 1067 | BUG_ON(ret); | ||
| 1068 | |||
| 1069 | leaf = path->nodes[0]; | ||
| 1070 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
| 1071 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 1072 | if (item_size < sizeof(*ei)) { | ||
| 1073 | if (!insert) { | ||
| 1074 | err = -ENOENT; | ||
| 1075 | goto out; | ||
| 1076 | } | ||
| 1077 | ret = convert_extent_item_v0(trans, root, path, owner, | ||
| 1078 | extra_size); | ||
| 1079 | if (ret < 0) { | ||
| 1080 | err = ret; | ||
| 1081 | goto out; | ||
| 1082 | } | ||
| 1083 | leaf = path->nodes[0]; | ||
| 1084 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
| 1085 | } | ||
| 1086 | #endif | ||
| 1087 | BUG_ON(item_size < sizeof(*ei)); | ||
| 1088 | |||
| 1089 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
| 1090 | flags = btrfs_extent_flags(leaf, ei); | ||
| 1091 | |||
| 1092 | ptr = (unsigned long)(ei + 1); | ||
| 1093 | end = (unsigned long)ei + item_size; | ||
| 1094 | |||
| 1095 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
| 1096 | ptr += sizeof(struct btrfs_tree_block_info); | ||
| 1097 | BUG_ON(ptr > end); | ||
| 1098 | } else { | ||
| 1099 | BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | err = -ENOENT; | ||
| 1103 | while (1) { | ||
| 1104 | if (ptr >= end) { | ||
| 1105 | WARN_ON(ptr > end); | ||
| 1106 | break; | ||
| 1107 | } | ||
| 1108 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
| 1109 | type = btrfs_extent_inline_ref_type(leaf, iref); | ||
| 1110 | if (want < type) | ||
| 1111 | break; | ||
| 1112 | if (want > type) { | ||
| 1113 | ptr += btrfs_extent_inline_ref_size(type); | ||
| 1114 | continue; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | if (type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 1118 | struct btrfs_extent_data_ref *dref; | ||
| 1119 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
| 1120 | if (match_extent_data_ref(leaf, dref, root_objectid, | ||
| 1121 | owner, offset)) { | ||
| 1122 | err = 0; | ||
| 1123 | break; | ||
| 1124 | } | ||
| 1125 | if (hash_extent_data_ref_item(leaf, dref) < | ||
| 1126 | hash_extent_data_ref(root_objectid, owner, offset)) | ||
| 1127 | break; | ||
| 1128 | } else { | ||
| 1129 | u64 ref_offset; | ||
| 1130 | ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); | ||
| 1131 | if (parent > 0) { | ||
| 1132 | if (parent == ref_offset) { | ||
| 1133 | err = 0; | ||
| 1134 | break; | ||
| 1135 | } | ||
| 1136 | if (ref_offset < parent) | ||
| 1137 | break; | ||
| 1138 | } else { | ||
| 1139 | if (root_objectid == ref_offset) { | ||
| 1140 | err = 0; | ||
| 1141 | break; | ||
| 1142 | } | ||
| 1143 | if (ref_offset < root_objectid) | ||
| 1144 | break; | ||
| 1145 | } | ||
| 1146 | } | ||
| 1147 | ptr += btrfs_extent_inline_ref_size(type); | ||
| 1148 | } | ||
| 1149 | if (err == -ENOENT && insert) { | ||
| 1150 | if (item_size + extra_size >= | ||
| 1151 | BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { | ||
| 1152 | err = -EAGAIN; | ||
| 1153 | goto out; | ||
| 1154 | } | ||
| 1155 | /* | ||
| 1156 | * To add new inline back ref, we have to make sure | ||
| 1157 | * there is no corresponding back ref item. | ||
| 1158 | * For simplicity, we just do not add new inline back | ||
| 1159 | * ref if there is any kind of item for this block | ||
| 1160 | */ | ||
| 1161 | if (find_next_key(path, &key) == 0 && key.objectid == bytenr && | ||
| 1162 | key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { | ||
| 1163 | err = -EAGAIN; | ||
| 1164 | goto out; | ||
| 1165 | } | ||
| 1166 | } | ||
| 1167 | *ref_ret = (struct btrfs_extent_inline_ref *)ptr; | ||
| 1168 | out: | ||
| 1169 | if (insert) { | ||
| 1170 | path->keep_locks = 0; | ||
| 1171 | btrfs_unlock_up_safe(path, 1); | ||
| 1172 | } | ||
| 1173 | return err; | ||
| 1174 | } | ||
| 1175 | |||
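The contract spelled out in the comment before lookup_inline_extent_backref() (0 with *ref_ret pointing at the inline ref, -ENOENT with *ref_ret at the insertion point, -EAGAIN when the extent item cannot take another inline ref) is what insert_inline_extent_backref() below relies on. A rough caller-side sketch of that handling; the -EAGAIN fallback to a separate back ref item is an assumption about how callers react, not code from this hunk:

/* Sketch: reacting to the three return values documented above. */
ret = lookup_inline_extent_backref(trans, root, path, &iref, bytenr,
				   num_bytes, parent, root_objectid,
				   owner, offset, 1);
if (ret == 0) {
	/* found an existing inline ref, bump its count */
	ret = update_inline_extent_backref(trans, root, path, iref,
					   refs_to_add, extent_op);
} else if (ret == -ENOENT) {
	/* room left in the item, add a new inline ref at *iref */
	ret = setup_inline_extent_backref(trans, root, path, iref, parent,
					  root_objectid, owner, offset,
					  refs_to_add, extent_op);
} else if (ret == -EAGAIN) {
	/* item is full: fall back to a separate back ref item */
	btrfs_release_path(root, path);
	ret = insert_extent_backref(trans, root, path, bytenr, parent,
				    root_objectid, owner, offset,
				    refs_to_add);
}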
| 1176 | /* | ||
| 1177 | * helper to add new inline back ref | ||
| 1178 | */ | ||
| 1179 | static noinline_for_stack | ||
| 1180 | int setup_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
| 1181 | struct btrfs_root *root, | ||
| 1182 | struct btrfs_path *path, | ||
| 1183 | struct btrfs_extent_inline_ref *iref, | ||
| 1184 | u64 parent, u64 root_objectid, | ||
| 1185 | u64 owner, u64 offset, int refs_to_add, | ||
| 1186 | struct btrfs_delayed_extent_op *extent_op) | ||
| 1187 | { | ||
| 1188 | struct extent_buffer *leaf; | ||
| 1189 | struct btrfs_extent_item *ei; | ||
| 1190 | unsigned long ptr; | ||
| 1191 | unsigned long end; | ||
| 1192 | unsigned long item_offset; | ||
| 1193 | u64 refs; | ||
| 1194 | int size; | ||
| 1195 | int type; | ||
| 1196 | int ret; | ||
| 1197 | |||
| 1198 | leaf = path->nodes[0]; | ||
| 1199 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
| 1200 | item_offset = (unsigned long)iref - (unsigned long)ei; | ||
| 1201 | |||
| 1202 | type = extent_ref_type(parent, owner); | ||
| 1203 | size = btrfs_extent_inline_ref_size(type); | ||
| 1204 | |||
| 1205 | ret = btrfs_extend_item(trans, root, path, size); | ||
| 1206 | BUG_ON(ret); | ||
| 1207 | |||
| 1208 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
| 1209 | refs = btrfs_extent_refs(leaf, ei); | ||
| 1210 | refs += refs_to_add; | ||
| 1211 | btrfs_set_extent_refs(leaf, ei, refs); | ||
| 1212 | if (extent_op) | ||
| 1213 | __run_delayed_extent_op(extent_op, leaf, ei); | ||
| 1214 | |||
| 1215 | ptr = (unsigned long)ei + item_offset; | ||
| 1216 | end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); | ||
| 1217 | if (ptr < end - size) | ||
| 1218 | memmove_extent_buffer(leaf, ptr + size, ptr, | ||
| 1219 | end - size - ptr); | ||
| 1220 | |||
| 1221 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
| 1222 | btrfs_set_extent_inline_ref_type(leaf, iref, type); | ||
| 1223 | if (type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 1224 | struct btrfs_extent_data_ref *dref; | ||
| 1225 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
| 1226 | btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); | ||
| 1227 | btrfs_set_extent_data_ref_objectid(leaf, dref, owner); | ||
| 1228 | btrfs_set_extent_data_ref_offset(leaf, dref, offset); | ||
| 1229 | btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); | ||
| 1230 | } else if (type == BTRFS_SHARED_DATA_REF_KEY) { | ||
| 1231 | struct btrfs_shared_data_ref *sref; | ||
| 1232 | sref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
| 1233 | btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); | ||
| 1234 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
| 1235 | } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { | ||
| 1236 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
| 1237 | } else { | ||
| 1238 | btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); | ||
| 1239 | } | ||
| 1240 | btrfs_mark_buffer_dirty(leaf); | ||
| 1241 | return 0; | ||
| 1242 | } | ||
| 1243 | |||
| 1244 | static int lookup_extent_backref(struct btrfs_trans_handle *trans, | ||
| 1245 | struct btrfs_root *root, | ||
| 1246 | struct btrfs_path *path, | ||
| 1247 | struct btrfs_extent_inline_ref **ref_ret, | ||
| 1248 | u64 bytenr, u64 num_bytes, u64 parent, | ||
| 1249 | u64 root_objectid, u64 owner, u64 offset) | ||
| 1250 | { | ||
| 1251 | int ret; | ||
| 1252 | |||
| 1253 | ret = lookup_inline_extent_backref(trans, root, path, ref_ret, | ||
| 1254 | bytenr, num_bytes, parent, | ||
| 1255 | root_objectid, owner, offset, 0); | ||
| 1256 | if (ret != -ENOENT) | ||
| 1257 | return ret; | ||
| 1258 | |||
| 1259 | btrfs_release_path(root, path); | ||
| 1260 | *ref_ret = NULL; | ||
| 1261 | |||
| 1262 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
| 1263 | ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, | ||
| 1264 | root_objectid); | ||
| 1265 | } else { | ||
| 1266 | ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, | ||
| 1267 | root_objectid, owner, offset); | ||
| 1268 | } | ||
| 1269 | return ret; | ||
| 1270 | } | ||
| 1271 | |||
| 1272 | /* | ||
| 1273 | * helper to update/remove inline back ref | ||
| 1274 | */ | ||
| 1275 | static noinline_for_stack | ||
| 1276 | int update_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
| 1277 | struct btrfs_root *root, | ||
| 1278 | struct btrfs_path *path, | ||
| 1279 | struct btrfs_extent_inline_ref *iref, | ||
| 1280 | int refs_to_mod, | ||
| 1281 | struct btrfs_delayed_extent_op *extent_op) | ||
| 1282 | { | ||
| 1283 | struct extent_buffer *leaf; | ||
| 1284 | struct btrfs_extent_item *ei; | ||
| 1285 | struct btrfs_extent_data_ref *dref = NULL; | ||
| 1286 | struct btrfs_shared_data_ref *sref = NULL; | ||
| 1287 | unsigned long ptr; | ||
| 1288 | unsigned long end; | ||
| 1289 | u32 item_size; | ||
| 1290 | int size; | ||
| 1291 | int type; | ||
| 1292 | int ret; | ||
| 1293 | u64 refs; | ||
| 1294 | |||
| 1295 | leaf = path->nodes[0]; | ||
| 1296 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
| 1297 | refs = btrfs_extent_refs(leaf, ei); | ||
| 1298 | WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); | ||
| 1299 | refs += refs_to_mod; | ||
| 1300 | btrfs_set_extent_refs(leaf, ei, refs); | ||
| 1301 | if (extent_op) | ||
| 1302 | __run_delayed_extent_op(extent_op, leaf, ei); | ||
| 1303 | |||
| 1304 | type = btrfs_extent_inline_ref_type(leaf, iref); | ||
| 1305 | |||
| 1306 | if (type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 1307 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
| 1308 | refs = btrfs_extent_data_ref_count(leaf, dref); | ||
| 1309 | } else if (type == BTRFS_SHARED_DATA_REF_KEY) { | ||
| 1310 | sref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
| 1311 | refs = btrfs_shared_data_ref_count(leaf, sref); | ||
| 1312 | } else { | ||
| 1313 | refs = 1; | ||
| 1314 | BUG_ON(refs_to_mod != -1); | ||
| 1315 | } | ||
| 1316 | |||
| 1317 | BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); | ||
| 1318 | refs += refs_to_mod; | ||
| 1319 | |||
| 1320 | if (refs > 0) { | ||
| 1321 | if (type == BTRFS_EXTENT_DATA_REF_KEY) | ||
| 1322 | btrfs_set_extent_data_ref_count(leaf, dref, refs); | ||
| 1323 | else | ||
| 1324 | btrfs_set_shared_data_ref_count(leaf, sref, refs); | ||
| 1325 | } else { | ||
| 1326 | size = btrfs_extent_inline_ref_size(type); | ||
| 1327 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
| 1328 | ptr = (unsigned long)iref; | ||
| 1329 | end = (unsigned long)ei + item_size; | ||
| 1330 | if (ptr + size < end) | ||
| 1331 | memmove_extent_buffer(leaf, ptr, ptr + size, | ||
| 1332 | end - ptr - size); | ||
| 1333 | item_size -= size; | ||
| 1334 | ret = btrfs_truncate_item(trans, root, path, item_size, 1); | ||
| 1335 | BUG_ON(ret); | ||
| 1336 | } | ||
| 1337 | btrfs_mark_buffer_dirty(leaf); | ||
| 1338 | return 0; | ||
| 1339 | } | ||
| 1340 | |||
| 1341 | static noinline_for_stack | ||
| 1342 | int insert_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
| 1343 | struct btrfs_root *root, | ||
| 1344 | struct btrfs_path *path, | ||
| 1345 | u64 bytenr, u64 num_bytes, u64 parent, | ||
| 1346 | u64 root_objectid, u64 owner, | ||
| 1347 | u64 offset, int refs_to_add, | ||
| 1348 | struct btrfs_delayed_extent_op *extent_op) | ||
| 1349 | { | ||
| 1350 | struct btrfs_extent_inline_ref *iref; | ||
| 1351 | int ret; | ||
| 1352 | |||
| 1353 | ret = lookup_inline_extent_backref(trans, root, path, &iref, | ||
| 1354 | bytenr, num_bytes, parent, | ||
| 1355 | root_objectid, owner, offset, 1); | ||
| 1356 | if (ret == 0) { | ||
| 1357 | BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); | ||
| 1358 | ret = update_inline_extent_backref(trans, root, path, iref, | ||
| 1359 | refs_to_add, extent_op); | ||
| 1360 | } else if (ret == -ENOENT) { | ||
| 1361 | ret = setup_inline_extent_backref(trans, root, path, iref, | ||
| 1362 | parent, root_objectid, | ||
| 1363 | owner, offset, refs_to_add, | ||
| 1364 | extent_op); | ||
| 1365 | } | ||
| 1366 | return ret; | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | static int insert_extent_backref(struct btrfs_trans_handle *trans, | ||
| 1370 | struct btrfs_root *root, | ||
| 1371 | struct btrfs_path *path, | ||
| 1372 | u64 bytenr, u64 parent, u64 root_objectid, | ||
| 1373 | u64 owner, u64 offset, int refs_to_add) | ||
| 1374 | { | ||
| 1375 | int ret; | ||
| 1376 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
| 1377 | BUG_ON(refs_to_add != 1); | ||
| 1378 | ret = insert_tree_block_ref(trans, root, path, bytenr, | ||
| 1379 | parent, root_objectid); | ||
| 1380 | } else { | ||
| 1381 | ret = insert_extent_data_ref(trans, root, path, bytenr, | ||
| 1382 | parent, root_objectid, | ||
| 1383 | owner, offset, refs_to_add); | ||
| 1384 | } | ||
| 1385 | return ret; | ||
| 1386 | } | ||
| 1387 | |||
| 1388 | static int remove_extent_backref(struct btrfs_trans_handle *trans, | ||
| 1389 | struct btrfs_root *root, | ||
| 1390 | struct btrfs_path *path, | ||
| 1391 | struct btrfs_extent_inline_ref *iref, | ||
| 1392 | int refs_to_drop, int is_data) | ||
| 1393 | { | ||
| 1394 | int ret; | ||
| 1395 | |||
| 1396 | BUG_ON(!is_data && refs_to_drop != 1); | ||
| 1397 | if (iref) { | ||
| 1398 | ret = update_inline_extent_backref(trans, root, path, iref, | ||
| 1399 | -refs_to_drop, NULL); | ||
| 1400 | } else if (is_data) { | ||
| 1401 | ret = remove_extent_data_ref(trans, root, path, refs_to_drop); | ||
| 1402 | } else { | ||
| 1403 | ret = btrfs_del_item(trans, root, path); | ||
| 1404 | } | ||
| 1405 | return ret; | ||
| 1406 | } | ||
| 1407 | |||
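The helpers above split backref maintenance into two paths: inline refs embedded in the extent item are looked up and updated in place, while keyed ref items are inserted or deleted as separate tree items. A rough standalone model of that dispatch, using stand-in types rather than the real btrfs structs:

    #include <stdbool.h>

    enum ref_kind { TREE_REF, DATA_REF };

    struct backref {
        enum ref_kind kind;
        bool is_inline;   /* stored inside the extent item? */
        int count;        /* reference count */
    };

    /* insert: tree blocks always add exactly one ref, data extents may add many */
    static int add_backref(struct backref *ref, int refs_to_add)
    {
        if (ref->kind == TREE_REF && refs_to_add != 1)
            return -1;                  /* mirrors the BUG_ON above */
        ref->count += refs_to_add;
        return 0;
    }

    /* remove: inline and data refs have their count adjusted; a keyed tree
     * backref is simply deleted, since its count can only be 1 */
    static int drop_backref(struct backref *ref, int refs_to_drop)
    {
        if (ref->is_inline || ref->kind == DATA_REF)
            ref->count -= refs_to_drop;
        else
            ref->count = 0;             /* btrfs_del_item() on the keyed item */
        return 0;
    }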
| 649 | #ifdef BIO_RW_DISCARD | 1408 | #ifdef BIO_RW_DISCARD |
| 650 | static void btrfs_issue_discard(struct block_device *bdev, | 1409 | static void btrfs_issue_discard(struct block_device *bdev, |
| 651 | u64 start, u64 len) | 1410 | u64 start, u64 len) |
| @@ -686,71 +1445,40 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
| 686 | #endif | 1445 | #endif |
| 687 | } | 1446 | } |
| 688 | 1447 | ||
| 689 | static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | 1448 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
| 690 | struct btrfs_root *root, u64 bytenr, | 1449 | struct btrfs_root *root, |
| 691 | u64 num_bytes, | 1450 | u64 bytenr, u64 num_bytes, u64 parent, |
| 692 | u64 orig_parent, u64 parent, | 1451 | u64 root_objectid, u64 owner, u64 offset) |
| 693 | u64 orig_root, u64 ref_root, | ||
| 694 | u64 orig_generation, u64 ref_generation, | ||
| 695 | u64 owner_objectid) | ||
| 696 | { | 1452 | { |
| 697 | int ret; | 1453 | int ret; |
| 698 | int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; | 1454 | BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && |
| 1455 | root_objectid == BTRFS_TREE_LOG_OBJECTID); | ||
| 699 | 1456 | ||
| 700 | ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, | 1457 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { |
| 701 | orig_parent, parent, orig_root, | 1458 | ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, |
| 702 | ref_root, orig_generation, | 1459 | parent, root_objectid, (int)owner, |
| 703 | ref_generation, owner_objectid, pin); | 1460 | BTRFS_ADD_DELAYED_REF, NULL); |
| 704 | BUG_ON(ret); | 1461 | } else { |
| 1462 | ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, | ||
| 1463 | parent, root_objectid, owner, offset, | ||
| 1464 | BTRFS_ADD_DELAYED_REF, NULL); | ||
| 1465 | } | ||
| 705 | return ret; | 1466 | return ret; |
| 706 | } | 1467 | } |
| 707 | 1468 | ||
| 708 | int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | ||
| 709 | struct btrfs_root *root, u64 bytenr, | ||
| 710 | u64 num_bytes, u64 orig_parent, u64 parent, | ||
| 711 | u64 ref_root, u64 ref_generation, | ||
| 712 | u64 owner_objectid) | ||
| 713 | { | ||
| 714 | int ret; | ||
| 715 | if (ref_root == BTRFS_TREE_LOG_OBJECTID && | ||
| 716 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) | ||
| 717 | return 0; | ||
| 718 | |||
| 719 | ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, | ||
| 720 | orig_parent, parent, ref_root, | ||
| 721 | ref_root, ref_generation, | ||
| 722 | ref_generation, owner_objectid); | ||
| 723 | return ret; | ||
| 724 | } | ||
| 725 | static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 1469 | static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
| 726 | struct btrfs_root *root, u64 bytenr, | 1470 | struct btrfs_root *root, |
| 727 | u64 num_bytes, | 1471 | u64 bytenr, u64 num_bytes, |
| 728 | u64 orig_parent, u64 parent, | 1472 | u64 parent, u64 root_objectid, |
| 729 | u64 orig_root, u64 ref_root, | 1473 | u64 owner, u64 offset, int refs_to_add, |
| 730 | u64 orig_generation, u64 ref_generation, | 1474 | struct btrfs_delayed_extent_op *extent_op) |
| 731 | u64 owner_objectid) | ||
| 732 | { | ||
| 733 | int ret; | ||
| 734 | |||
| 735 | ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, | ||
| 736 | ref_generation, owner_objectid, | ||
| 737 | BTRFS_ADD_DELAYED_REF, 0); | ||
| 738 | BUG_ON(ret); | ||
| 739 | return ret; | ||
| 740 | } | ||
| 741 | |||
| 742 | static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, | ||
| 743 | struct btrfs_root *root, u64 bytenr, | ||
| 744 | u64 num_bytes, u64 parent, u64 ref_root, | ||
| 745 | u64 ref_generation, u64 owner_objectid, | ||
| 746 | int refs_to_add) | ||
| 747 | { | 1475 | { |
| 748 | struct btrfs_path *path; | 1476 | struct btrfs_path *path; |
| 749 | int ret; | 1477 | struct extent_buffer *leaf; |
| 750 | struct btrfs_key key; | ||
| 751 | struct extent_buffer *l; | ||
| 752 | struct btrfs_extent_item *item; | 1478 | struct btrfs_extent_item *item; |
| 753 | u32 refs; | 1479 | u64 refs; |
| 1480 | int ret; | ||
| 1481 | int err = 0; | ||
| 754 | 1482 | ||
| 755 | path = btrfs_alloc_path(); | 1483 | path = btrfs_alloc_path(); |
| 756 | if (!path) | 1484 | if (!path) |
| @@ -758,43 +1486,27 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, | |||
| 758 | 1486 | ||
| 759 | path->reada = 1; | 1487 | path->reada = 1; |
| 760 | path->leave_spinning = 1; | 1488 | path->leave_spinning = 1; |
| 761 | key.objectid = bytenr; | 1489 | /* this will setup the path even if it fails to insert the back ref */ |
| 762 | key.type = BTRFS_EXTENT_ITEM_KEY; | 1490 | ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, |
| 763 | key.offset = num_bytes; | 1491 | path, bytenr, num_bytes, parent, |
| 764 | 1492 | root_objectid, owner, offset, | |
| 765 | /* first find the extent item and update its reference count */ | 1493 | refs_to_add, extent_op); |
| 766 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, | 1494 | if (ret == 0) |
| 767 | path, 0, 1); | 1495 | goto out; |
| 768 | if (ret < 0) { | ||
| 769 | btrfs_set_path_blocking(path); | ||
| 770 | return ret; | ||
| 771 | } | ||
| 772 | |||
| 773 | if (ret > 0) { | ||
| 774 | WARN_ON(1); | ||
| 775 | btrfs_free_path(path); | ||
| 776 | return -EIO; | ||
| 777 | } | ||
| 778 | l = path->nodes[0]; | ||
| 779 | 1496 | ||
| 780 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | 1497 | if (ret != -EAGAIN) { |
| 781 | if (key.objectid != bytenr) { | 1498 | err = ret; |
| 782 | btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); | 1499 | goto out; |
| 783 | printk(KERN_ERR "btrfs wanted %llu found %llu\n", | ||
| 784 | (unsigned long long)bytenr, | ||
| 785 | (unsigned long long)key.objectid); | ||
| 786 | BUG(); | ||
| 787 | } | 1500 | } |
| 788 | BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); | ||
| 789 | |||
| 790 | item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); | ||
| 791 | |||
| 792 | refs = btrfs_extent_refs(l, item); | ||
| 793 | btrfs_set_extent_refs(l, item, refs + refs_to_add); | ||
| 794 | btrfs_unlock_up_safe(path, 1); | ||
| 795 | 1501 | ||
| 796 | btrfs_mark_buffer_dirty(path->nodes[0]); | 1502 | leaf = path->nodes[0]; |
| 1503 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
| 1504 | refs = btrfs_extent_refs(leaf, item); | ||
| 1505 | btrfs_set_extent_refs(leaf, item, refs + refs_to_add); | ||
| 1506 | if (extent_op) | ||
| 1507 | __run_delayed_extent_op(extent_op, leaf, item); | ||
| 797 | 1508 | ||
| 1509 | btrfs_mark_buffer_dirty(leaf); | ||
| 798 | btrfs_release_path(root->fs_info->extent_root, path); | 1510 | btrfs_release_path(root->fs_info->extent_root, path); |
| 799 | 1511 | ||
| 800 | path->reada = 1; | 1512 | path->reada = 1; |
| @@ -802,56 +1514,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, | |||
| 802 | 1514 | ||
| 803 | /* now insert the actual backref */ | 1515 | /* now insert the actual backref */ |
| 804 | ret = insert_extent_backref(trans, root->fs_info->extent_root, | 1516 | ret = insert_extent_backref(trans, root->fs_info->extent_root, |
| 805 | path, bytenr, parent, | 1517 | path, bytenr, parent, root_objectid, |
| 806 | ref_root, ref_generation, | 1518 | owner, offset, refs_to_add); |
| 807 | owner_objectid, refs_to_add); | ||
| 808 | BUG_ON(ret); | 1519 | BUG_ON(ret); |
| 1520 | out: | ||
| 809 | btrfs_free_path(path); | 1521 | btrfs_free_path(path); |
| 810 | return 0; | 1522 | return err; |
| 811 | } | 1523 | } |
| 812 | 1524 | ||
| 813 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 1525 | static int run_delayed_data_ref(struct btrfs_trans_handle *trans, |
| 814 | struct btrfs_root *root, | 1526 | struct btrfs_root *root, |
| 815 | u64 bytenr, u64 num_bytes, u64 parent, | 1527 | struct btrfs_delayed_ref_node *node, |
| 816 | u64 ref_root, u64 ref_generation, | 1528 | struct btrfs_delayed_extent_op *extent_op, |
| 817 | u64 owner_objectid) | 1529 | int insert_reserved) |
| 818 | { | 1530 | { |
| 819 | int ret; | 1531 | int ret = 0; |
| 820 | if (ref_root == BTRFS_TREE_LOG_OBJECTID && | 1532 | struct btrfs_delayed_data_ref *ref; |
| 821 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) | 1533 | struct btrfs_key ins; |
| 822 | return 0; | 1534 | u64 parent = 0; |
| 1535 | u64 ref_root = 0; | ||
| 1536 | u64 flags = 0; | ||
| 823 | 1537 | ||
| 824 | ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, | 1538 | ins.objectid = node->bytenr; |
| 825 | 0, ref_root, 0, ref_generation, | 1539 | ins.offset = node->num_bytes; |
| 826 | owner_objectid); | 1540 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
| 1541 | |||
| 1542 | ref = btrfs_delayed_node_to_data_ref(node); | ||
| 1543 | if (node->type == BTRFS_SHARED_DATA_REF_KEY) | ||
| 1544 | parent = ref->parent; | ||
| 1545 | else | ||
| 1546 | ref_root = ref->root; | ||
| 1547 | |||
| 1548 | if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { | ||
| 1549 | if (extent_op) { | ||
| 1550 | BUG_ON(extent_op->update_key); | ||
| 1551 | flags |= extent_op->flags_to_set; | ||
| 1552 | } | ||
| 1553 | ret = alloc_reserved_file_extent(trans, root, | ||
| 1554 | parent, ref_root, flags, | ||
| 1555 | ref->objectid, ref->offset, | ||
| 1556 | &ins, node->ref_mod); | ||
| 1557 | update_reserved_extents(root, ins.objectid, ins.offset, 0); | ||
| 1558 | } else if (node->action == BTRFS_ADD_DELAYED_REF) { | ||
| 1559 | ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, | ||
| 1560 | node->num_bytes, parent, | ||
| 1561 | ref_root, ref->objectid, | ||
| 1562 | ref->offset, node->ref_mod, | ||
| 1563 | extent_op); | ||
| 1564 | } else if (node->action == BTRFS_DROP_DELAYED_REF) { | ||
| 1565 | ret = __btrfs_free_extent(trans, root, node->bytenr, | ||
| 1566 | node->num_bytes, parent, | ||
| 1567 | ref_root, ref->objectid, | ||
| 1568 | ref->offset, node->ref_mod, | ||
| 1569 | extent_op); | ||
| 1570 | } else { | ||
| 1571 | BUG(); | ||
| 1572 | } | ||
| 827 | return ret; | 1573 | return ret; |
| 828 | } | 1574 | } |
| 829 | 1575 | ||
| 830 | static int drop_delayed_ref(struct btrfs_trans_handle *trans, | 1576 | static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, |
| 831 | struct btrfs_root *root, | 1577 | struct extent_buffer *leaf, |
| 832 | struct btrfs_delayed_ref_node *node) | 1578 | struct btrfs_extent_item *ei) |
| 1579 | { | ||
| 1580 | u64 flags = btrfs_extent_flags(leaf, ei); | ||
| 1581 | if (extent_op->update_flags) { | ||
| 1582 | flags |= extent_op->flags_to_set; | ||
| 1583 | btrfs_set_extent_flags(leaf, ei, flags); | ||
| 1584 | } | ||
| 1585 | |||
| 1586 | if (extent_op->update_key) { | ||
| 1587 | struct btrfs_tree_block_info *bi; | ||
| 1588 | BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); | ||
| 1589 | bi = (struct btrfs_tree_block_info *)(ei + 1); | ||
| 1590 | btrfs_set_tree_block_key(leaf, bi, &extent_op->key); | ||
| 1591 | } | ||
| 1592 | } | ||
| 1593 | |||
| 1594 | static int run_delayed_extent_op(struct btrfs_trans_handle *trans, | ||
| 1595 | struct btrfs_root *root, | ||
| 1596 | struct btrfs_delayed_ref_node *node, | ||
| 1597 | struct btrfs_delayed_extent_op *extent_op) | ||
| 1598 | { | ||
| 1599 | struct btrfs_key key; | ||
| 1600 | struct btrfs_path *path; | ||
| 1601 | struct btrfs_extent_item *ei; | ||
| 1602 | struct extent_buffer *leaf; | ||
| 1603 | u32 item_size; | ||
| 1604 | int ret; | ||
| 1605 | int err = 0; | ||
| 1606 | |||
| 1607 | path = btrfs_alloc_path(); | ||
| 1608 | if (!path) | ||
| 1609 | return -ENOMEM; | ||
| 1610 | |||
| 1611 | key.objectid = node->bytenr; | ||
| 1612 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 1613 | key.offset = node->num_bytes; | ||
| 1614 | |||
| 1615 | path->reada = 1; | ||
| 1616 | path->leave_spinning = 1; | ||
| 1617 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, | ||
| 1618 | path, 0, 1); | ||
| 1619 | if (ret < 0) { | ||
| 1620 | err = ret; | ||
| 1621 | goto out; | ||
| 1622 | } | ||
| 1623 | if (ret > 0) { | ||
| 1624 | err = -EIO; | ||
| 1625 | goto out; | ||
| 1626 | } | ||
| 1627 | |||
| 1628 | leaf = path->nodes[0]; | ||
| 1629 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
| 1630 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 1631 | if (item_size < sizeof(*ei)) { | ||
| 1632 | ret = convert_extent_item_v0(trans, root->fs_info->extent_root, | ||
| 1633 | path, (u64)-1, 0); | ||
| 1634 | if (ret < 0) { | ||
| 1635 | err = ret; | ||
| 1636 | goto out; | ||
| 1637 | } | ||
| 1638 | leaf = path->nodes[0]; | ||
| 1639 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
| 1640 | } | ||
| 1641 | #endif | ||
| 1642 | BUG_ON(item_size < sizeof(*ei)); | ||
| 1643 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
| 1644 | __run_delayed_extent_op(extent_op, leaf, ei); | ||
| 1645 | |||
| 1646 | btrfs_mark_buffer_dirty(leaf); | ||
| 1647 | out: | ||
| 1648 | btrfs_free_path(path); | ||
| 1649 | return err; | ||
| 1650 | } | ||
| 1651 | |||
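A delayed extent_op carries pending flag and key updates that are folded into the on-disk extent item only when its delayed ref is eventually run. A minimal sketch of that fold, with illustrative field names in place of the real btrfs structures:

    #include <stdint.h>
    #include <stdbool.h>

    struct extent_op {
        uint64_t flags_to_set;
        bool update_flags;
        bool update_key;
        uint64_t key;            /* stands in for struct btrfs_disk_key */
    };

    struct extent_item {
        uint64_t flags;
        uint64_t tree_block_key; /* only meaningful for tree blocks */
    };

    static void apply_extent_op(const struct extent_op *op, struct extent_item *ei)
    {
        if (op->update_flags)
            ei->flags |= op->flags_to_set;   /* flags are only ever OR-ed in */
        if (op->update_key)
            ei->tree_block_key = op->key;    /* btrfs_set_tree_block_key() */
    }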
| 1652 | static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, | ||
| 1653 | struct btrfs_root *root, | ||
| 1654 | struct btrfs_delayed_ref_node *node, | ||
| 1655 | struct btrfs_delayed_extent_op *extent_op, | ||
| 1656 | int insert_reserved) | ||
| 833 | { | 1657 | { |
| 834 | int ret = 0; | 1658 | int ret = 0; |
| 835 | struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); | 1659 | struct btrfs_delayed_tree_ref *ref; |
| 1660 | struct btrfs_key ins; | ||
| 1661 | u64 parent = 0; | ||
| 1662 | u64 ref_root = 0; | ||
| 836 | 1663 | ||
| 837 | BUG_ON(node->ref_mod == 0); | 1664 | ins.objectid = node->bytenr; |
| 838 | ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, | 1665 | ins.offset = node->num_bytes; |
| 839 | node->parent, ref->root, ref->generation, | 1666 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
| 840 | ref->owner_objectid, ref->pin, node->ref_mod); | ||
| 841 | 1667 | ||
| 1668 | ref = btrfs_delayed_node_to_tree_ref(node); | ||
| 1669 | if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) | ||
| 1670 | parent = ref->parent; | ||
| 1671 | else | ||
| 1672 | ref_root = ref->root; | ||
| 1673 | |||
| 1674 | BUG_ON(node->ref_mod != 1); | ||
| 1675 | if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { | ||
| 1676 | BUG_ON(!extent_op || !extent_op->update_flags || | ||
| 1677 | !extent_op->update_key); | ||
| 1678 | ret = alloc_reserved_tree_block(trans, root, | ||
| 1679 | parent, ref_root, | ||
| 1680 | extent_op->flags_to_set, | ||
| 1681 | &extent_op->key, | ||
| 1682 | ref->level, &ins); | ||
| 1683 | update_reserved_extents(root, ins.objectid, ins.offset, 0); | ||
| 1684 | } else if (node->action == BTRFS_ADD_DELAYED_REF) { | ||
| 1685 | ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, | ||
| 1686 | node->num_bytes, parent, ref_root, | ||
| 1687 | ref->level, 0, 1, extent_op); | ||
| 1688 | } else if (node->action == BTRFS_DROP_DELAYED_REF) { | ||
| 1689 | ret = __btrfs_free_extent(trans, root, node->bytenr, | ||
| 1690 | node->num_bytes, parent, ref_root, | ||
| 1691 | ref->level, 0, 1, extent_op); | ||
| 1692 | } else { | ||
| 1693 | BUG(); | ||
| 1694 | } | ||
| 842 | return ret; | 1695 | return ret; |
| 843 | } | 1696 | } |
| 844 | 1697 | ||
| 1698 | |||
| 845 | /* helper function to actually process a single delayed ref entry */ | 1699 | /* helper function to actually process a single delayed ref entry */ |
| 846 | static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, | 1700 | static int run_one_delayed_ref(struct btrfs_trans_handle *trans, |
| 847 | struct btrfs_root *root, | 1701 | struct btrfs_root *root, |
| 848 | struct btrfs_delayed_ref_node *node, | 1702 | struct btrfs_delayed_ref_node *node, |
| 849 | int insert_reserved) | 1703 | struct btrfs_delayed_extent_op *extent_op, |
| 1704 | int insert_reserved) | ||
| 850 | { | 1705 | { |
| 851 | int ret; | 1706 | int ret; |
| 852 | struct btrfs_delayed_ref *ref; | 1707 | if (btrfs_delayed_ref_is_head(node)) { |
| 853 | |||
| 854 | if (node->parent == (u64)-1) { | ||
| 855 | struct btrfs_delayed_ref_head *head; | 1708 | struct btrfs_delayed_ref_head *head; |
| 856 | /* | 1709 | /* |
| 857 | * we've hit the end of the chain and we were supposed | 1710 | * we've hit the end of the chain and we were supposed |
| @@ -859,44 +1712,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 859 | * deleted before we ever needed to insert it, so all | 1712 | * deleted before we ever needed to insert it, so all |
| 860 | * we have to do is clean up the accounting | 1713 | * we have to do is clean up the accounting |
| 861 | */ | 1714 | */ |
| 1715 | BUG_ON(extent_op); | ||
| 1716 | head = btrfs_delayed_node_to_head(node); | ||
| 862 | if (insert_reserved) { | 1717 | if (insert_reserved) { |
| 1718 | if (head->is_data) { | ||
| 1719 | ret = btrfs_del_csums(trans, root, | ||
| 1720 | node->bytenr, | ||
| 1721 | node->num_bytes); | ||
| 1722 | BUG_ON(ret); | ||
| 1723 | } | ||
| 1724 | btrfs_update_pinned_extents(root, node->bytenr, | ||
| 1725 | node->num_bytes, 1); | ||
| 863 | update_reserved_extents(root, node->bytenr, | 1726 | update_reserved_extents(root, node->bytenr, |
| 864 | node->num_bytes, 0); | 1727 | node->num_bytes, 0); |
| 865 | } | 1728 | } |
| 866 | head = btrfs_delayed_node_to_head(node); | ||
| 867 | mutex_unlock(&head->mutex); | 1729 | mutex_unlock(&head->mutex); |
| 868 | return 0; | 1730 | return 0; |
| 869 | } | 1731 | } |
| 870 | 1732 | ||
| 871 | ref = btrfs_delayed_node_to_ref(node); | 1733 | if (node->type == BTRFS_TREE_BLOCK_REF_KEY || |
| 872 | if (ref->action == BTRFS_ADD_DELAYED_REF) { | 1734 | node->type == BTRFS_SHARED_BLOCK_REF_KEY) |
| 873 | if (insert_reserved) { | 1735 | ret = run_delayed_tree_ref(trans, root, node, extent_op, |
| 874 | struct btrfs_key ins; | 1736 | insert_reserved); |
| 875 | 1737 | else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || | |
| 876 | ins.objectid = node->bytenr; | 1738 | node->type == BTRFS_SHARED_DATA_REF_KEY) |
| 877 | ins.offset = node->num_bytes; | 1739 | ret = run_delayed_data_ref(trans, root, node, extent_op, |
| 878 | ins.type = BTRFS_EXTENT_ITEM_KEY; | 1740 | insert_reserved); |
| 879 | 1741 | else | |
| 880 | /* record the full extent allocation */ | 1742 | BUG(); |
| 881 | ret = __btrfs_alloc_reserved_extent(trans, root, | 1743 | return ret; |
| 882 | node->parent, ref->root, | ||
| 883 | ref->generation, ref->owner_objectid, | ||
| 884 | &ins, node->ref_mod); | ||
| 885 | update_reserved_extents(root, node->bytenr, | ||
| 886 | node->num_bytes, 0); | ||
| 887 | } else { | ||
| 888 | /* just add one backref */ | ||
| 889 | ret = add_extent_ref(trans, root, node->bytenr, | ||
| 890 | node->num_bytes, | ||
| 891 | node->parent, ref->root, ref->generation, | ||
| 892 | ref->owner_objectid, node->ref_mod); | ||
| 893 | } | ||
| 894 | BUG_ON(ret); | ||
| 895 | } else if (ref->action == BTRFS_DROP_DELAYED_REF) { | ||
| 896 | WARN_ON(insert_reserved); | ||
| 897 | ret = drop_delayed_ref(trans, root, node); | ||
| 898 | } | ||
| 899 | return 0; | ||
| 900 | } | 1744 | } |
| 901 | 1745 | ||
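run_one_delayed_ref() now routes purely on the node type: head nodes only settle accounting, tree-block refs and data refs each go to their own handler. A compact standalone model of that dispatch, with stub handlers and a simplified enum:

    enum node_type {
        REF_HEAD,
        TREE_BLOCK_REF, SHARED_BLOCK_REF,
        EXTENT_DATA_REF, SHARED_DATA_REF,
    };

    struct ref_node { enum node_type type; };

    static int run_head(struct ref_node *n)      { (void)n; return 0; }
    static int run_tree_ref(struct ref_node *n)  { (void)n; return 0; }
    static int run_data_ref(struct ref_node *n)  { (void)n; return 0; }

    static int run_one_ref(struct ref_node *node)
    {
        switch (node->type) {
        case REF_HEAD:
            return run_head(node);           /* free csums, drop reservation */
        case TREE_BLOCK_REF:
        case SHARED_BLOCK_REF:
            return run_tree_ref(node);       /* run_delayed_tree_ref() */
        case EXTENT_DATA_REF:
        case SHARED_DATA_REF:
            return run_data_ref(node);       /* run_delayed_data_ref() */
        }
        return -1;                           /* unreachable: would BUG() */
    }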
| 902 | static noinline struct btrfs_delayed_ref_node * | 1746 | static noinline struct btrfs_delayed_ref_node * |
| @@ -919,7 +1763,7 @@ again: | |||
| 919 | rb_node); | 1763 | rb_node); |
| 920 | if (ref->bytenr != head->node.bytenr) | 1764 | if (ref->bytenr != head->node.bytenr) |
| 921 | break; | 1765 | break; |
| 922 | if (btrfs_delayed_node_to_ref(ref)->action == action) | 1766 | if (ref->action == action) |
| 923 | return ref; | 1767 | return ref; |
| 924 | node = rb_prev(node); | 1768 | node = rb_prev(node); |
| 925 | } | 1769 | } |
| @@ -937,6 +1781,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 937 | struct btrfs_delayed_ref_root *delayed_refs; | 1781 | struct btrfs_delayed_ref_root *delayed_refs; |
| 938 | struct btrfs_delayed_ref_node *ref; | 1782 | struct btrfs_delayed_ref_node *ref; |
| 939 | struct btrfs_delayed_ref_head *locked_ref = NULL; | 1783 | struct btrfs_delayed_ref_head *locked_ref = NULL; |
| 1784 | struct btrfs_delayed_extent_op *extent_op; | ||
| 940 | int ret; | 1785 | int ret; |
| 941 | int count = 0; | 1786 | int count = 0; |
| 942 | int must_insert_reserved = 0; | 1787 | int must_insert_reserved = 0; |
| @@ -975,6 +1820,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 975 | must_insert_reserved = locked_ref->must_insert_reserved; | 1820 | must_insert_reserved = locked_ref->must_insert_reserved; |
| 976 | locked_ref->must_insert_reserved = 0; | 1821 | locked_ref->must_insert_reserved = 0; |
| 977 | 1822 | ||
| 1823 | extent_op = locked_ref->extent_op; | ||
| 1824 | locked_ref->extent_op = NULL; | ||
| 1825 | |||
| 978 | /* | 1826 | /* |
| 979 | * locked_ref is the head node, so we have to go one | 1827 | * locked_ref is the head node, so we have to go one |
| 980 | * node back for any delayed ref updates | 1828 | * node back for any delayed ref updates |
| @@ -986,6 +1834,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 986 | * so that any accounting fixes can happen | 1834 | * so that any accounting fixes can happen |
| 987 | */ | 1835 | */ |
| 988 | ref = &locked_ref->node; | 1836 | ref = &locked_ref->node; |
| 1837 | |||
| 1838 | if (extent_op && must_insert_reserved) { | ||
| 1839 | kfree(extent_op); | ||
| 1840 | extent_op = NULL; | ||
| 1841 | } | ||
| 1842 | |||
| 1843 | if (extent_op) { | ||
| 1844 | spin_unlock(&delayed_refs->lock); | ||
| 1845 | |||
| 1846 | ret = run_delayed_extent_op(trans, root, | ||
| 1847 | ref, extent_op); | ||
| 1848 | BUG_ON(ret); | ||
| 1849 | kfree(extent_op); | ||
| 1850 | |||
| 1851 | cond_resched(); | ||
| 1852 | spin_lock(&delayed_refs->lock); | ||
| 1853 | continue; | ||
| 1854 | } | ||
| 1855 | |||
| 989 | list_del_init(&locked_ref->cluster); | 1856 | list_del_init(&locked_ref->cluster); |
| 990 | locked_ref = NULL; | 1857 | locked_ref = NULL; |
| 991 | } | 1858 | } |
| @@ -993,14 +1860,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 993 | ref->in_tree = 0; | 1860 | ref->in_tree = 0; |
| 994 | rb_erase(&ref->rb_node, &delayed_refs->root); | 1861 | rb_erase(&ref->rb_node, &delayed_refs->root); |
| 995 | delayed_refs->num_entries--; | 1862 | delayed_refs->num_entries--; |
| 1863 | |||
| 996 | spin_unlock(&delayed_refs->lock); | 1864 | spin_unlock(&delayed_refs->lock); |
| 997 | 1865 | ||
| 998 | ret = run_one_delayed_ref(trans, root, ref, | 1866 | ret = run_one_delayed_ref(trans, root, ref, extent_op, |
| 999 | must_insert_reserved); | 1867 | must_insert_reserved); |
| 1000 | BUG_ON(ret); | 1868 | BUG_ON(ret); |
| 1001 | btrfs_put_delayed_ref(ref); | ||
| 1002 | 1869 | ||
| 1870 | btrfs_put_delayed_ref(ref); | ||
| 1871 | kfree(extent_op); | ||
| 1003 | count++; | 1872 | count++; |
| 1873 | |||
| 1004 | cond_resched(); | 1874 | cond_resched(); |
| 1005 | spin_lock(&delayed_refs->lock); | 1875 | spin_lock(&delayed_refs->lock); |
| 1006 | } | 1876 | } |
| @@ -1095,25 +1965,112 @@ out: | |||
| 1095 | return 0; | 1965 | return 0; |
| 1096 | } | 1966 | } |
| 1097 | 1967 | ||
| 1098 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | 1968 | int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, |
| 1099 | struct btrfs_root *root, u64 objectid, u64 bytenr) | 1969 | struct btrfs_root *root, |
| 1970 | u64 bytenr, u64 num_bytes, u64 flags, | ||
| 1971 | int is_data) | ||
| 1972 | { | ||
| 1973 | struct btrfs_delayed_extent_op *extent_op; | ||
| 1974 | int ret; | ||
| 1975 | |||
| 1976 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | ||
| 1977 | if (!extent_op) | ||
| 1978 | return -ENOMEM; | ||
| 1979 | |||
| 1980 | extent_op->flags_to_set = flags; | ||
| 1981 | extent_op->update_flags = 1; | ||
| 1982 | extent_op->update_key = 0; | ||
| 1983 | extent_op->is_data = is_data ? 1 : 0; | ||
| 1984 | |||
| 1985 | ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); | ||
| 1986 | if (ret) | ||
| 1987 | kfree(extent_op); | ||
| 1988 | return ret; | ||
| 1989 | } | ||
| 1990 | |||
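btrfs_set_disk_extent_flags() queues the flag change as a delayed extent_op instead of touching the extent item directly; on success, ownership of the allocation passes to the delayed-ref code, on failure the caller frees it. A sketch of that same alloc/queue/free-on-error pattern with stubbed helpers:

    #include <stdlib.h>
    #include <errno.h>

    struct pending_op { unsigned long flags; int update_flags; };

    /* stand-in for btrfs_add_delayed_extent_op(); returns 0 or -errno */
    static int queue_op(struct pending_op *op) { (void)op; return 0; }

    static int set_flags_deferred(unsigned long flags)
    {
        struct pending_op *op = malloc(sizeof(*op));
        if (!op)
            return -ENOMEM;
        op->flags = flags;
        op->update_flags = 1;
        int ret = queue_op(op);    /* on success the queue owns op */
        if (ret)
            free(op);              /* on failure we still own it */
        return ret;
    }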
| 1991 | static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, | ||
| 1992 | struct btrfs_root *root, | ||
| 1993 | struct btrfs_path *path, | ||
| 1994 | u64 objectid, u64 offset, u64 bytenr) | ||
| 1995 | { | ||
| 1996 | struct btrfs_delayed_ref_head *head; | ||
| 1997 | struct btrfs_delayed_ref_node *ref; | ||
| 1998 | struct btrfs_delayed_data_ref *data_ref; | ||
| 1999 | struct btrfs_delayed_ref_root *delayed_refs; | ||
| 2000 | struct rb_node *node; | ||
| 2001 | int ret = 0; | ||
| 2002 | |||
| 2003 | ret = -ENOENT; | ||
| 2004 | delayed_refs = &trans->transaction->delayed_refs; | ||
| 2005 | spin_lock(&delayed_refs->lock); | ||
| 2006 | head = btrfs_find_delayed_ref_head(trans, bytenr); | ||
| 2007 | if (!head) | ||
| 2008 | goto out; | ||
| 2009 | |||
| 2010 | if (!mutex_trylock(&head->mutex)) { | ||
| 2011 | atomic_inc(&head->node.refs); | ||
| 2012 | spin_unlock(&delayed_refs->lock); | ||
| 2013 | |||
| 2014 | btrfs_release_path(root->fs_info->extent_root, path); | ||
| 2015 | |||
| 2016 | mutex_lock(&head->mutex); | ||
| 2017 | mutex_unlock(&head->mutex); | ||
| 2018 | btrfs_put_delayed_ref(&head->node); | ||
| 2019 | return -EAGAIN; | ||
| 2020 | } | ||
| 2021 | |||
| 2022 | node = rb_prev(&head->node.rb_node); | ||
| 2023 | if (!node) | ||
| 2024 | goto out_unlock; | ||
| 2025 | |||
| 2026 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
| 2027 | |||
| 2028 | if (ref->bytenr != bytenr) | ||
| 2029 | goto out_unlock; | ||
| 2030 | |||
| 2031 | ret = 1; | ||
| 2032 | if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) | ||
| 2033 | goto out_unlock; | ||
| 2034 | |||
| 2035 | data_ref = btrfs_delayed_node_to_data_ref(ref); | ||
| 2036 | |||
| 2037 | node = rb_prev(node); | ||
| 2038 | if (node) { | ||
| 2039 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
| 2040 | if (ref->bytenr == bytenr) | ||
| 2041 | goto out_unlock; | ||
| 2042 | } | ||
| 2043 | |||
| 2044 | if (data_ref->root != root->root_key.objectid || | ||
| 2045 | data_ref->objectid != objectid || data_ref->offset != offset) | ||
| 2046 | goto out_unlock; | ||
| 2047 | |||
| 2048 | ret = 0; | ||
| 2049 | out_unlock: | ||
| 2050 | mutex_unlock(&head->mutex); | ||
| 2051 | out: | ||
| 2052 | spin_unlock(&delayed_refs->lock); | ||
| 2053 | return ret; | ||
| 2054 | } | ||
| 2055 | |||
| 2056 | static noinline int check_committed_ref(struct btrfs_trans_handle *trans, | ||
| 2057 | struct btrfs_root *root, | ||
| 2058 | struct btrfs_path *path, | ||
| 2059 | u64 objectid, u64 offset, u64 bytenr) | ||
| 1100 | { | 2060 | { |
| 1101 | struct btrfs_root *extent_root = root->fs_info->extent_root; | 2061 | struct btrfs_root *extent_root = root->fs_info->extent_root; |
| 1102 | struct btrfs_path *path; | ||
| 1103 | struct extent_buffer *leaf; | 2062 | struct extent_buffer *leaf; |
| 1104 | struct btrfs_extent_ref *ref_item; | 2063 | struct btrfs_extent_data_ref *ref; |
| 2064 | struct btrfs_extent_inline_ref *iref; | ||
| 2065 | struct btrfs_extent_item *ei; | ||
| 1105 | struct btrfs_key key; | 2066 | struct btrfs_key key; |
| 1106 | struct btrfs_key found_key; | 2067 | u32 item_size; |
| 1107 | u64 ref_root; | ||
| 1108 | u64 last_snapshot; | ||
| 1109 | u32 nritems; | ||
| 1110 | int ret; | 2068 | int ret; |
| 1111 | 2069 | ||
| 1112 | key.objectid = bytenr; | 2070 | key.objectid = bytenr; |
| 1113 | key.offset = (u64)-1; | 2071 | key.offset = (u64)-1; |
| 1114 | key.type = BTRFS_EXTENT_ITEM_KEY; | 2072 | key.type = BTRFS_EXTENT_ITEM_KEY; |
| 1115 | 2073 | ||
| 1116 | path = btrfs_alloc_path(); | ||
| 1117 | ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); | 2074 | ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); |
| 1118 | if (ret < 0) | 2075 | if (ret < 0) |
| 1119 | goto out; | 2076 | goto out; |
| @@ -1125,55 +2082,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | |||
| 1125 | 2082 | ||
| 1126 | path->slots[0]--; | 2083 | path->slots[0]--; |
| 1127 | leaf = path->nodes[0]; | 2084 | leaf = path->nodes[0]; |
| 1128 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | 2085 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); |
| 1129 | 2086 | ||
| 1130 | if (found_key.objectid != bytenr || | 2087 | if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) |
| 1131 | found_key.type != BTRFS_EXTENT_ITEM_KEY) | ||
| 1132 | goto out; | 2088 | goto out; |
| 1133 | 2089 | ||
| 1134 | last_snapshot = btrfs_root_last_snapshot(&root->root_item); | 2090 | ret = 1; |
| 1135 | while (1) { | 2091 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); |
| 1136 | leaf = path->nodes[0]; | 2092 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 |
| 1137 | nritems = btrfs_header_nritems(leaf); | 2093 | if (item_size < sizeof(*ei)) { |
| 1138 | if (path->slots[0] >= nritems) { | 2094 | WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); |
| 1139 | ret = btrfs_next_leaf(extent_root, path); | 2095 | goto out; |
| 1140 | if (ret < 0) | 2096 | } |
| 1141 | goto out; | 2097 | #endif |
| 1142 | if (ret == 0) | 2098 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); |
| 1143 | continue; | ||
| 1144 | break; | ||
| 1145 | } | ||
| 1146 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
| 1147 | if (found_key.objectid != bytenr) | ||
| 1148 | break; | ||
| 1149 | 2099 | ||
| 1150 | if (found_key.type != BTRFS_EXTENT_REF_KEY) { | 2100 | if (item_size != sizeof(*ei) + |
| 1151 | path->slots[0]++; | 2101 | btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) |
| 1152 | continue; | 2102 | goto out; |
| 1153 | } | ||
| 1154 | 2103 | ||
| 1155 | ref_item = btrfs_item_ptr(leaf, path->slots[0], | 2104 | if (btrfs_extent_generation(leaf, ei) <= |
| 1156 | struct btrfs_extent_ref); | 2105 | btrfs_root_last_snapshot(&root->root_item)) |
| 1157 | ref_root = btrfs_ref_root(leaf, ref_item); | 2106 | goto out; |
| 1158 | if ((ref_root != root->root_key.objectid && | 2107 | |
| 1159 | ref_root != BTRFS_TREE_LOG_OBJECTID) || | 2108 | iref = (struct btrfs_extent_inline_ref *)(ei + 1); |
| 1160 | objectid != btrfs_ref_objectid(leaf, ref_item)) { | 2109 | if (btrfs_extent_inline_ref_type(leaf, iref) != |
| 1161 | ret = 1; | 2110 | BTRFS_EXTENT_DATA_REF_KEY) |
| 1162 | goto out; | 2111 | goto out; |
| 1163 | } | 2112 | |
| 1164 | if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { | 2113 | ref = (struct btrfs_extent_data_ref *)(&iref->offset); |
| 1165 | ret = 1; | 2114 | if (btrfs_extent_refs(leaf, ei) != |
| 2115 | btrfs_extent_data_ref_count(leaf, ref) || | ||
| 2116 | btrfs_extent_data_ref_root(leaf, ref) != | ||
| 2117 | root->root_key.objectid || | ||
| 2118 | btrfs_extent_data_ref_objectid(leaf, ref) != objectid || | ||
| 2119 | btrfs_extent_data_ref_offset(leaf, ref) != offset) | ||
| 2120 | goto out; | ||
| 2121 | |||
| 2122 | ret = 0; | ||
| 2123 | out: | ||
| 2124 | return ret; | ||
| 2125 | } | ||
| 2126 | |||
| 2127 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | ||
| 2128 | struct btrfs_root *root, | ||
| 2129 | u64 objectid, u64 offset, u64 bytenr) | ||
| 2130 | { | ||
| 2131 | struct btrfs_path *path; | ||
| 2132 | int ret; | ||
| 2133 | int ret2; | ||
| 2134 | |||
| 2135 | path = btrfs_alloc_path(); | ||
| 2136 | if (!path) | ||
| 2137 | return -ENOENT; | ||
| 2138 | |||
| 2139 | do { | ||
| 2140 | ret = check_committed_ref(trans, root, path, objectid, | ||
| 2141 | offset, bytenr); | ||
| 2142 | if (ret && ret != -ENOENT) | ||
| 1166 | goto out; | 2143 | goto out; |
| 1167 | } | ||
| 1168 | 2144 | ||
| 1169 | path->slots[0]++; | 2145 | ret2 = check_delayed_ref(trans, root, path, objectid, |
| 2146 | offset, bytenr); | ||
| 2147 | } while (ret2 == -EAGAIN); | ||
| 2148 | |||
| 2149 | if (ret2 && ret2 != -ENOENT) { | ||
| 2150 | ret = ret2; | ||
| 2151 | goto out; | ||
| 1170 | } | 2152 | } |
| 1171 | ret = 0; | 2153 | |
| 2154 | if (ret != -ENOENT || ret2 != -ENOENT) | ||
| 2155 | ret = 0; | ||
| 1172 | out: | 2156 | out: |
| 1173 | btrfs_free_path(path); | 2157 | btrfs_free_path(path); |
| 1174 | return ret; | 2158 | return ret; |
| 1175 | } | 2159 | } |
| 1176 | 2160 | ||
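btrfs_cross_ref_exist() now has to consult both the committed extent tree and any pending delayed refs, and must retry the delayed check whenever it has to drop locks (-EAGAIN). Roughly the shape of that loop, with stub predicates standing in for check_committed_ref()/check_delayed_ref():

    #include <errno.h>

    static int committed_ref_shared(void) { return -ENOENT; }  /* stub */
    static int delayed_ref_shared(void)   { return -ENOENT; }  /* stub */

    static int cross_ref_exists(void)
    {
        int committed, delayed;

        do {
            committed = committed_ref_shared();
            if (committed && committed != -ENOENT)
                return committed;            /* shared (1) or hard error */
            delayed = delayed_ref_shared();
        } while (delayed == -EAGAIN);        /* lost a lock race, try again */

        if (delayed && delayed != -ENOENT)
            return delayed;
        return 0;                            /* no other reference seen */
    }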
| 2161 | #if 0 | ||
| 1177 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2162 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1178 | struct extent_buffer *buf, u32 nr_extents) | 2163 | struct extent_buffer *buf, u32 nr_extents) |
| 1179 | { | 2164 | { |
| @@ -1291,62 +2276,44 @@ static int refsort_cmp(const void *a_void, const void *b_void) | |||
| 1291 | return 1; | 2276 | return 1; |
| 1292 | return 0; | 2277 | return 0; |
| 1293 | } | 2278 | } |
| 2279 | #endif | ||
| 1294 | 2280 | ||
| 1295 | 2281 | static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, | |
| 1296 | noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, | ||
| 1297 | struct btrfs_root *root, | 2282 | struct btrfs_root *root, |
| 1298 | struct extent_buffer *orig_buf, | 2283 | struct extent_buffer *buf, |
| 1299 | struct extent_buffer *buf, u32 *nr_extents) | 2284 | int full_backref, int inc) |
| 1300 | { | 2285 | { |
| 1301 | u64 bytenr; | 2286 | u64 bytenr; |
| 2287 | u64 num_bytes; | ||
| 2288 | u64 parent; | ||
| 1302 | u64 ref_root; | 2289 | u64 ref_root; |
| 1303 | u64 orig_root; | ||
| 1304 | u64 ref_generation; | ||
| 1305 | u64 orig_generation; | ||
| 1306 | struct refsort *sorted; | ||
| 1307 | u32 nritems; | 2290 | u32 nritems; |
| 1308 | u32 nr_file_extents = 0; | ||
| 1309 | struct btrfs_key key; | 2291 | struct btrfs_key key; |
| 1310 | struct btrfs_file_extent_item *fi; | 2292 | struct btrfs_file_extent_item *fi; |
| 1311 | int i; | 2293 | int i; |
| 1312 | int level; | 2294 | int level; |
| 1313 | int ret = 0; | 2295 | int ret = 0; |
| 1314 | int faili = 0; | ||
| 1315 | int refi = 0; | ||
| 1316 | int slot; | ||
| 1317 | int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, | 2296 | int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, |
| 1318 | u64, u64, u64, u64, u64, u64, u64, u64, u64); | 2297 | u64, u64, u64, u64, u64, u64); |
| 1319 | 2298 | ||
| 1320 | ref_root = btrfs_header_owner(buf); | 2299 | ref_root = btrfs_header_owner(buf); |
| 1321 | ref_generation = btrfs_header_generation(buf); | ||
| 1322 | orig_root = btrfs_header_owner(orig_buf); | ||
| 1323 | orig_generation = btrfs_header_generation(orig_buf); | ||
| 1324 | |||
| 1325 | nritems = btrfs_header_nritems(buf); | 2300 | nritems = btrfs_header_nritems(buf); |
| 1326 | level = btrfs_header_level(buf); | 2301 | level = btrfs_header_level(buf); |
| 1327 | 2302 | ||
| 1328 | sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); | 2303 | if (!root->ref_cows && level == 0) |
| 1329 | BUG_ON(!sorted); | 2304 | return 0; |
| 1330 | 2305 | ||
| 1331 | if (root->ref_cows) { | 2306 | if (inc) |
| 1332 | process_func = __btrfs_inc_extent_ref; | 2307 | process_func = btrfs_inc_extent_ref; |
| 1333 | } else { | 2308 | else |
| 1334 | if (level == 0 && | 2309 | process_func = btrfs_free_extent; |
| 1335 | root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | 2310 | |
| 1336 | goto out; | 2311 | if (full_backref) |
| 1337 | if (level != 0 && | 2312 | parent = buf->start; |
| 1338 | root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) | 2313 | else |
| 1339 | goto out; | 2314 | parent = 0; |
| 1340 | process_func = __btrfs_update_extent_ref; | ||
| 1341 | } | ||
| 1342 | 2315 | ||
| 1343 | /* | ||
| 1344 | * we make two passes through the items. In the first pass we | ||
| 1345 | * only record the byte number and slot. Then we sort based on | ||
| 1346 | * byte number and do the actual work based on the sorted results | ||
| 1347 | */ | ||
| 1348 | for (i = 0; i < nritems; i++) { | 2316 | for (i = 0; i < nritems; i++) { |
| 1349 | cond_resched(); | ||
| 1350 | if (level == 0) { | 2317 | if (level == 0) { |
| 1351 | btrfs_item_key_to_cpu(buf, &key, i); | 2318 | btrfs_item_key_to_cpu(buf, &key, i); |
| 1352 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | 2319 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) |
| @@ -1360,151 +2327,38 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, | |||
| 1360 | if (bytenr == 0) | 2327 | if (bytenr == 0) |
| 1361 | continue; | 2328 | continue; |
| 1362 | 2329 | ||
| 1363 | nr_file_extents++; | 2330 | num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); |
| 1364 | sorted[refi].bytenr = bytenr; | 2331 | key.offset -= btrfs_file_extent_offset(buf, fi); |
| 1365 | sorted[refi].slot = i; | 2332 | ret = process_func(trans, root, bytenr, num_bytes, |
| 1366 | refi++; | 2333 | parent, ref_root, key.objectid, |
| 1367 | } else { | 2334 | key.offset); |
| 1368 | bytenr = btrfs_node_blockptr(buf, i); | 2335 | if (ret) |
| 1369 | sorted[refi].bytenr = bytenr; | ||
| 1370 | sorted[refi].slot = i; | ||
| 1371 | refi++; | ||
| 1372 | } | ||
| 1373 | } | ||
| 1374 | /* | ||
| 1375 | * if refi == 0, we didn't actually put anything into the sorted | ||
| 1376 | * array and we're done | ||
| 1377 | */ | ||
| 1378 | if (refi == 0) | ||
| 1379 | goto out; | ||
| 1380 | |||
| 1381 | sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); | ||
| 1382 | |||
| 1383 | for (i = 0; i < refi; i++) { | ||
| 1384 | cond_resched(); | ||
| 1385 | slot = sorted[i].slot; | ||
| 1386 | bytenr = sorted[i].bytenr; | ||
| 1387 | |||
| 1388 | if (level == 0) { | ||
| 1389 | btrfs_item_key_to_cpu(buf, &key, slot); | ||
| 1390 | fi = btrfs_item_ptr(buf, slot, | ||
| 1391 | struct btrfs_file_extent_item); | ||
| 1392 | |||
| 1393 | bytenr = btrfs_file_extent_disk_bytenr(buf, fi); | ||
| 1394 | if (bytenr == 0) | ||
| 1395 | continue; | ||
| 1396 | |||
| 1397 | ret = process_func(trans, root, bytenr, | ||
| 1398 | btrfs_file_extent_disk_num_bytes(buf, fi), | ||
| 1399 | orig_buf->start, buf->start, | ||
| 1400 | orig_root, ref_root, | ||
| 1401 | orig_generation, ref_generation, | ||
| 1402 | key.objectid); | ||
| 1403 | |||
| 1404 | if (ret) { | ||
| 1405 | faili = slot; | ||
| 1406 | WARN_ON(1); | ||
| 1407 | goto fail; | 2336 | goto fail; |
| 1408 | } | ||
| 1409 | } else { | 2337 | } else { |
| 1410 | ret = process_func(trans, root, bytenr, buf->len, | 2338 | bytenr = btrfs_node_blockptr(buf, i); |
| 1411 | orig_buf->start, buf->start, | 2339 | num_bytes = btrfs_level_size(root, level - 1); |
| 1412 | orig_root, ref_root, | 2340 | ret = process_func(trans, root, bytenr, num_bytes, |
| 1413 | orig_generation, ref_generation, | 2341 | parent, ref_root, level - 1, 0); |
| 1414 | level - 1); | 2342 | if (ret) |
| 1415 | if (ret) { | ||
| 1416 | faili = slot; | ||
| 1417 | WARN_ON(1); | ||
| 1418 | goto fail; | 2343 | goto fail; |
| 1419 | } | ||
| 1420 | } | 2344 | } |
| 1421 | } | 2345 | } |
| 1422 | out: | ||
| 1423 | kfree(sorted); | ||
| 1424 | if (nr_extents) { | ||
| 1425 | if (level == 0) | ||
| 1426 | *nr_extents = nr_file_extents; | ||
| 1427 | else | ||
| 1428 | *nr_extents = nritems; | ||
| 1429 | } | ||
| 1430 | return 0; | 2346 | return 0; |
| 1431 | fail: | 2347 | fail: |
| 1432 | kfree(sorted); | 2348 | BUG(); |
| 1433 | WARN_ON(1); | ||
| 1434 | return ret; | 2349 | return ret; |
| 1435 | } | 2350 | } |
| 1436 | 2351 | ||
| 1437 | int btrfs_update_ref(struct btrfs_trans_handle *trans, | 2352 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1438 | struct btrfs_root *root, struct extent_buffer *orig_buf, | 2353 | struct extent_buffer *buf, int full_backref) |
| 1439 | struct extent_buffer *buf, int start_slot, int nr) | ||
| 1440 | |||
| 1441 | { | 2354 | { |
| 1442 | u64 bytenr; | 2355 | return __btrfs_mod_ref(trans, root, buf, full_backref, 1); |
| 1443 | u64 ref_root; | 2356 | } |
| 1444 | u64 orig_root; | ||
| 1445 | u64 ref_generation; | ||
| 1446 | u64 orig_generation; | ||
| 1447 | struct btrfs_key key; | ||
| 1448 | struct btrfs_file_extent_item *fi; | ||
| 1449 | int i; | ||
| 1450 | int ret; | ||
| 1451 | int slot; | ||
| 1452 | int level; | ||
| 1453 | |||
| 1454 | BUG_ON(start_slot < 0); | ||
| 1455 | BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); | ||
| 1456 | |||
| 1457 | ref_root = btrfs_header_owner(buf); | ||
| 1458 | ref_generation = btrfs_header_generation(buf); | ||
| 1459 | orig_root = btrfs_header_owner(orig_buf); | ||
| 1460 | orig_generation = btrfs_header_generation(orig_buf); | ||
| 1461 | level = btrfs_header_level(buf); | ||
| 1462 | |||
| 1463 | if (!root->ref_cows) { | ||
| 1464 | if (level == 0 && | ||
| 1465 | root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | ||
| 1466 | return 0; | ||
| 1467 | if (level != 0 && | ||
| 1468 | root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) | ||
| 1469 | return 0; | ||
| 1470 | } | ||
| 1471 | 2357 | ||
| 1472 | for (i = 0, slot = start_slot; i < nr; i++, slot++) { | 2358 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1473 | cond_resched(); | 2359 | struct extent_buffer *buf, int full_backref) |
| 1474 | if (level == 0) { | 2360 | { |
| 1475 | btrfs_item_key_to_cpu(buf, &key, slot); | 2361 | return __btrfs_mod_ref(trans, root, buf, full_backref, 0); |
| 1476 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
| 1477 | continue; | ||
| 1478 | fi = btrfs_item_ptr(buf, slot, | ||
| 1479 | struct btrfs_file_extent_item); | ||
| 1480 | if (btrfs_file_extent_type(buf, fi) == | ||
| 1481 | BTRFS_FILE_EXTENT_INLINE) | ||
| 1482 | continue; | ||
| 1483 | bytenr = btrfs_file_extent_disk_bytenr(buf, fi); | ||
| 1484 | if (bytenr == 0) | ||
| 1485 | continue; | ||
| 1486 | ret = __btrfs_update_extent_ref(trans, root, bytenr, | ||
| 1487 | btrfs_file_extent_disk_num_bytes(buf, fi), | ||
| 1488 | orig_buf->start, buf->start, | ||
| 1489 | orig_root, ref_root, orig_generation, | ||
| 1490 | ref_generation, key.objectid); | ||
| 1491 | if (ret) | ||
| 1492 | goto fail; | ||
| 1493 | } else { | ||
| 1494 | bytenr = btrfs_node_blockptr(buf, slot); | ||
| 1495 | ret = __btrfs_update_extent_ref(trans, root, bytenr, | ||
| 1496 | buf->len, orig_buf->start, | ||
| 1497 | buf->start, orig_root, ref_root, | ||
| 1498 | orig_generation, ref_generation, | ||
| 1499 | level - 1); | ||
| 1500 | if (ret) | ||
| 1501 | goto fail; | ||
| 1502 | } | ||
| 1503 | } | ||
| 1504 | return 0; | ||
| 1505 | fail: | ||
| 1506 | WARN_ON(1); | ||
| 1507 | return -1; | ||
| 1508 | } | 2362 | } |
| 1509 | 2363 | ||
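btrfs_inc_ref()/btrfs_dec_ref() collapse into a single walker, __btrfs_mod_ref(), which picks parent = buf->start for full (shared) backrefs and 0 for keyed ones, then calls either the add or the drop function once per referenced extent. A simplified model of the walk:

    #include <stdint.h>

    struct buf {
        int level;           /* 0 = leaf, >0 = node */
        int nritems;
        uint64_t start;
        uint64_t ptrs[16];   /* child block pointers / data extent bytenrs */
    };

    typedef int (*ref_fn)(uint64_t bytenr, uint64_t parent);

    static int mod_refs(struct buf *b, int full_backref, ref_fn process)
    {
        /* shared (full) backrefs record the owning block, keyed refs do not */
        uint64_t parent = full_backref ? b->start : 0;

        for (int i = 0; i < b->nritems; i++) {
            uint64_t bytenr = b->ptrs[i];
            if (bytenr == 0)
                continue;                    /* holes / inline extents skipped */
            int ret = process(bytenr, parent);
            if (ret)
                return ret;
        }
        return 0;
    }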
| 1510 | static int write_one_cache_group(struct btrfs_trans_handle *trans, | 2364 | static int write_one_cache_group(struct btrfs_trans_handle *trans, |
| @@ -2007,6 +2861,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
| 2007 | u64 old_val; | 2861 | u64 old_val; |
| 2008 | u64 byte_in_group; | 2862 | u64 byte_in_group; |
| 2009 | 2863 | ||
| 2864 | /* block accounting for super block */ | ||
| 2865 | spin_lock(&info->delalloc_lock); | ||
| 2866 | old_val = btrfs_super_bytes_used(&info->super_copy); | ||
| 2867 | if (alloc) | ||
| 2868 | old_val += num_bytes; | ||
| 2869 | else | ||
| 2870 | old_val -= num_bytes; | ||
| 2871 | btrfs_set_super_bytes_used(&info->super_copy, old_val); | ||
| 2872 | |||
| 2873 | /* block accounting for root item */ | ||
| 2874 | old_val = btrfs_root_used(&root->root_item); | ||
| 2875 | if (alloc) | ||
| 2876 | old_val += num_bytes; | ||
| 2877 | else | ||
| 2878 | old_val -= num_bytes; | ||
| 2879 | btrfs_set_root_used(&root->root_item, old_val); | ||
| 2880 | spin_unlock(&info->delalloc_lock); | ||
| 2881 | |||
| 2010 | while (total) { | 2882 | while (total) { |
| 2011 | cache = btrfs_lookup_block_group(info, bytenr); | 2883 | cache = btrfs_lookup_block_group(info, bytenr); |
| 2012 | if (!cache) | 2884 | if (!cache) |
| @@ -2216,8 +3088,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, | |||
| 2216 | u64 header_owner = btrfs_header_owner(buf); | 3088 | u64 header_owner = btrfs_header_owner(buf); |
| 2217 | u64 header_transid = btrfs_header_generation(buf); | 3089 | u64 header_transid = btrfs_header_generation(buf); |
| 2218 | if (header_owner != BTRFS_TREE_LOG_OBJECTID && | 3090 | if (header_owner != BTRFS_TREE_LOG_OBJECTID && |
| 2219 | header_owner != BTRFS_TREE_RELOC_OBJECTID && | ||
| 2220 | header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && | ||
| 2221 | header_transid == trans->transid && | 3091 | header_transid == trans->transid && |
| 2222 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { | 3092 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { |
| 2223 | *must_clean = buf; | 3093 | *must_clean = buf; |
| @@ -2235,63 +3105,77 @@ pinit: | |||
| 2235 | return 0; | 3105 | return 0; |
| 2236 | } | 3106 | } |
| 2237 | 3107 | ||
| 2238 | /* | 3108 | |
| 2239 | * remove an extent from the root, returns 0 on success | 3109 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
| 2240 | */ | 3110 | struct btrfs_root *root, |
| 2241 | static int __free_extent(struct btrfs_trans_handle *trans, | 3111 | u64 bytenr, u64 num_bytes, u64 parent, |
| 2242 | struct btrfs_root *root, | 3112 | u64 root_objectid, u64 owner_objectid, |
| 2243 | u64 bytenr, u64 num_bytes, u64 parent, | 3113 | u64 owner_offset, int refs_to_drop, |
| 2244 | u64 root_objectid, u64 ref_generation, | 3114 | struct btrfs_delayed_extent_op *extent_op) |
| 2245 | u64 owner_objectid, int pin, int mark_free, | ||
| 2246 | int refs_to_drop) | ||
| 2247 | { | 3115 | { |
| 2248 | struct btrfs_path *path; | ||
| 2249 | struct btrfs_key key; | 3116 | struct btrfs_key key; |
| 3117 | struct btrfs_path *path; | ||
| 2250 | struct btrfs_fs_info *info = root->fs_info; | 3118 | struct btrfs_fs_info *info = root->fs_info; |
| 2251 | struct btrfs_root *extent_root = info->extent_root; | 3119 | struct btrfs_root *extent_root = info->extent_root; |
| 2252 | struct extent_buffer *leaf; | 3120 | struct extent_buffer *leaf; |
| 3121 | struct btrfs_extent_item *ei; | ||
| 3122 | struct btrfs_extent_inline_ref *iref; | ||
| 2253 | int ret; | 3123 | int ret; |
| 3124 | int is_data; | ||
| 2254 | int extent_slot = 0; | 3125 | int extent_slot = 0; |
| 2255 | int found_extent = 0; | 3126 | int found_extent = 0; |
| 2256 | int num_to_del = 1; | 3127 | int num_to_del = 1; |
| 2257 | struct btrfs_extent_item *ei; | 3128 | u32 item_size; |
| 2258 | u32 refs; | 3129 | u64 refs; |
| 2259 | 3130 | ||
| 2260 | key.objectid = bytenr; | ||
| 2261 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); | ||
| 2262 | key.offset = num_bytes; | ||
| 2263 | path = btrfs_alloc_path(); | 3131 | path = btrfs_alloc_path(); |
| 2264 | if (!path) | 3132 | if (!path) |
| 2265 | return -ENOMEM; | 3133 | return -ENOMEM; |
| 2266 | 3134 | ||
| 2267 | path->reada = 1; | 3135 | path->reada = 1; |
| 2268 | path->leave_spinning = 1; | 3136 | path->leave_spinning = 1; |
| 2269 | ret = lookup_extent_backref(trans, extent_root, path, | 3137 | |
| 2270 | bytenr, parent, root_objectid, | 3138 | is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; |
| 2271 | ref_generation, owner_objectid, 1); | 3139 | BUG_ON(!is_data && refs_to_drop != 1); |
| 3140 | |||
| 3141 | ret = lookup_extent_backref(trans, extent_root, path, &iref, | ||
| 3142 | bytenr, num_bytes, parent, | ||
| 3143 | root_objectid, owner_objectid, | ||
| 3144 | owner_offset); | ||
| 2272 | if (ret == 0) { | 3145 | if (ret == 0) { |
| 2273 | struct btrfs_key found_key; | ||
| 2274 | extent_slot = path->slots[0]; | 3146 | extent_slot = path->slots[0]; |
| 2275 | while (extent_slot > 0) { | 3147 | while (extent_slot >= 0) { |
| 2276 | extent_slot--; | 3148 | btrfs_item_key_to_cpu(path->nodes[0], &key, |
| 2277 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
| 2278 | extent_slot); | 3149 | extent_slot); |
| 2279 | if (found_key.objectid != bytenr) | 3150 | if (key.objectid != bytenr) |
| 2280 | break; | 3151 | break; |
| 2281 | if (found_key.type == BTRFS_EXTENT_ITEM_KEY && | 3152 | if (key.type == BTRFS_EXTENT_ITEM_KEY && |
| 2282 | found_key.offset == num_bytes) { | 3153 | key.offset == num_bytes) { |
| 2283 | found_extent = 1; | 3154 | found_extent = 1; |
| 2284 | break; | 3155 | break; |
| 2285 | } | 3156 | } |
| 2286 | if (path->slots[0] - extent_slot > 5) | 3157 | if (path->slots[0] - extent_slot > 5) |
| 2287 | break; | 3158 | break; |
| 3159 | extent_slot--; | ||
| 2288 | } | 3160 | } |
| 3161 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 3162 | item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); | ||
| 3163 | if (found_extent && item_size < sizeof(*ei)) | ||
| 3164 | found_extent = 0; | ||
| 3165 | #endif | ||
| 2289 | if (!found_extent) { | 3166 | if (!found_extent) { |
| 3167 | BUG_ON(iref); | ||
| 2290 | ret = remove_extent_backref(trans, extent_root, path, | 3168 | ret = remove_extent_backref(trans, extent_root, path, |
| 2291 | refs_to_drop); | 3169 | NULL, refs_to_drop, |
| 3170 | is_data); | ||
| 2292 | BUG_ON(ret); | 3171 | BUG_ON(ret); |
| 2293 | btrfs_release_path(extent_root, path); | 3172 | btrfs_release_path(extent_root, path); |
| 2294 | path->leave_spinning = 1; | 3173 | path->leave_spinning = 1; |
| 3174 | |||
| 3175 | key.objectid = bytenr; | ||
| 3176 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 3177 | key.offset = num_bytes; | ||
| 3178 | |||
| 2295 | ret = btrfs_search_slot(trans, extent_root, | 3179 | ret = btrfs_search_slot(trans, extent_root, |
| 2296 | &key, path, -1, 1); | 3180 | &key, path, -1, 1); |
| 2297 | if (ret) { | 3181 | if (ret) { |
| @@ -2307,82 +3191,98 @@ static int __free_extent(struct btrfs_trans_handle *trans, | |||
| 2307 | btrfs_print_leaf(extent_root, path->nodes[0]); | 3191 | btrfs_print_leaf(extent_root, path->nodes[0]); |
| 2308 | WARN_ON(1); | 3192 | WARN_ON(1); |
| 2309 | printk(KERN_ERR "btrfs unable to find ref byte nr %llu " | 3193 | printk(KERN_ERR "btrfs unable to find ref byte nr %llu " |
| 2310 | "parent %llu root %llu gen %llu owner %llu\n", | 3194 | "parent %llu root %llu owner %llu offset %llu\n", |
| 2311 | (unsigned long long)bytenr, | 3195 | (unsigned long long)bytenr, |
| 2312 | (unsigned long long)parent, | 3196 | (unsigned long long)parent, |
| 2313 | (unsigned long long)root_objectid, | 3197 | (unsigned long long)root_objectid, |
| 2314 | (unsigned long long)ref_generation, | 3198 | (unsigned long long)owner_objectid, |
| 2315 | (unsigned long long)owner_objectid); | 3199 | (unsigned long long)owner_offset); |
| 2316 | } | 3200 | } |
| 2317 | 3201 | ||
| 2318 | leaf = path->nodes[0]; | 3202 | leaf = path->nodes[0]; |
| 3203 | item_size = btrfs_item_size_nr(leaf, extent_slot); | ||
| 3204 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 3205 | if (item_size < sizeof(*ei)) { | ||
| 3206 | BUG_ON(found_extent || extent_slot != path->slots[0]); | ||
| 3207 | ret = convert_extent_item_v0(trans, extent_root, path, | ||
| 3208 | owner_objectid, 0); | ||
| 3209 | BUG_ON(ret < 0); | ||
| 3210 | |||
| 3211 | btrfs_release_path(extent_root, path); | ||
| 3212 | path->leave_spinning = 1; | ||
| 3213 | |||
| 3214 | key.objectid = bytenr; | ||
| 3215 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 3216 | key.offset = num_bytes; | ||
| 3217 | |||
| 3218 | ret = btrfs_search_slot(trans, extent_root, &key, path, | ||
| 3219 | -1, 1); | ||
| 3220 | if (ret) { | ||
| 3221 | printk(KERN_ERR "umm, got %d back from search" | ||
| 3222 | ", was looking for %llu\n", ret, | ||
| 3223 | (unsigned long long)bytenr); | ||
| 3224 | btrfs_print_leaf(extent_root, path->nodes[0]); | ||
| 3225 | } | ||
| 3226 | BUG_ON(ret); | ||
| 3227 | extent_slot = path->slots[0]; | ||
| 3228 | leaf = path->nodes[0]; | ||
| 3229 | item_size = btrfs_item_size_nr(leaf, extent_slot); | ||
| 3230 | } | ||
| 3231 | #endif | ||
| 3232 | BUG_ON(item_size < sizeof(*ei)); | ||
| 2319 | ei = btrfs_item_ptr(leaf, extent_slot, | 3233 | ei = btrfs_item_ptr(leaf, extent_slot, |
| 2320 | struct btrfs_extent_item); | 3234 | struct btrfs_extent_item); |
| 2321 | refs = btrfs_extent_refs(leaf, ei); | 3235 | if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { |
| 2322 | 3236 | struct btrfs_tree_block_info *bi; | |
| 2323 | /* | 3237 | BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); |
| 2324 | * we're not allowed to delete the extent item if there | 3238 | bi = (struct btrfs_tree_block_info *)(ei + 1); |
| 2325 | * are other delayed ref updates pending | 3239 | WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); |
| 2326 | */ | 3240 | } |
| 2327 | 3241 | ||
| 3242 | refs = btrfs_extent_refs(leaf, ei); | ||
| 2328 | BUG_ON(refs < refs_to_drop); | 3243 | BUG_ON(refs < refs_to_drop); |
| 2329 | refs -= refs_to_drop; | 3244 | refs -= refs_to_drop; |
| 2330 | btrfs_set_extent_refs(leaf, ei, refs); | ||
| 2331 | btrfs_mark_buffer_dirty(leaf); | ||
| 2332 | 3245 | ||
| 2333 | if (refs == 0 && found_extent && | 3246 | if (refs > 0) { |
| 2334 | path->slots[0] == extent_slot + 1) { | 3247 | if (extent_op) |
| 2335 | struct btrfs_extent_ref *ref; | 3248 | __run_delayed_extent_op(extent_op, leaf, ei); |
| 2336 | ref = btrfs_item_ptr(leaf, path->slots[0], | 3249 | /* |
| 2337 | struct btrfs_extent_ref); | 3250 | * In the case of inline back ref, reference count will |
| 2338 | BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); | 3251 | * be updated by remove_extent_backref |
| 2339 | /* if the back ref and the extent are next to each other | ||
| 2340 | * they get deleted below in one shot | ||
| 2341 | */ | 3252 | */ |
| 2342 | path->slots[0] = extent_slot; | 3253 | if (iref) { |
| 2343 | num_to_del = 2; | 3254 | BUG_ON(!found_extent); |
| 2344 | } else if (found_extent) { | 3255 | } else { |
| 2345 | /* otherwise delete the extent back ref */ | 3256 | btrfs_set_extent_refs(leaf, ei, refs); |
| 2346 | ret = remove_extent_backref(trans, extent_root, path, | 3257 | btrfs_mark_buffer_dirty(leaf); |
| 2347 | refs_to_drop); | 3258 | } |
| 2348 | BUG_ON(ret); | 3259 | if (found_extent) { |
| 2349 | /* if refs are 0, we need to setup the path for deletion */ | 3260 | ret = remove_extent_backref(trans, extent_root, path, |
| 2350 | if (refs == 0) { | 3261 | iref, refs_to_drop, |
| 2351 | btrfs_release_path(extent_root, path); | 3262 | is_data); |
| 2352 | path->leave_spinning = 1; | ||
| 2353 | ret = btrfs_search_slot(trans, extent_root, &key, path, | ||
| 2354 | -1, 1); | ||
| 2355 | BUG_ON(ret); | 3263 | BUG_ON(ret); |
| 2356 | } | 3264 | } |
| 2357 | } | 3265 | } else { |
| 2358 | 3266 | int mark_free = 0; | |
| 2359 | if (refs == 0) { | ||
| 2360 | u64 super_used; | ||
| 2361 | u64 root_used; | ||
| 2362 | struct extent_buffer *must_clean = NULL; | 3267 | struct extent_buffer *must_clean = NULL; |
| 2363 | 3268 | ||
| 2364 | if (pin) { | 3269 | if (found_extent) { |
| 2365 | ret = pin_down_bytes(trans, root, path, | 3270 | BUG_ON(is_data && refs_to_drop != |
| 2366 | bytenr, num_bytes, | 3271 | extent_data_ref_count(root, path, iref)); |
| 2367 | owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, | 3272 | if (iref) { |
| 2368 | &must_clean); | 3273 | BUG_ON(path->slots[0] != extent_slot); |
| 2369 | if (ret > 0) | 3274 | } else { |
| 2370 | mark_free = 1; | 3275 | BUG_ON(path->slots[0] != extent_slot + 1); |
| 2371 | BUG_ON(ret < 0); | 3276 | path->slots[0] = extent_slot; |
| 3277 | num_to_del = 2; | ||
| 3278 | } | ||
| 2372 | } | 3279 | } |
| 2373 | 3280 | ||
| 2374 | /* block accounting for super block */ | 3281 | ret = pin_down_bytes(trans, root, path, bytenr, |
| 2375 | spin_lock(&info->delalloc_lock); | 3282 | num_bytes, is_data, &must_clean); |
| 2376 | super_used = btrfs_super_bytes_used(&info->super_copy); | 3283 | if (ret > 0) |
| 2377 | btrfs_set_super_bytes_used(&info->super_copy, | 3284 | mark_free = 1; |
| 2378 | super_used - num_bytes); | 3285 | BUG_ON(ret < 0); |
| 2379 | |||
| 2380 | /* block accounting for root item */ | ||
| 2381 | root_used = btrfs_root_used(&root->root_item); | ||
| 2382 | btrfs_set_root_used(&root->root_item, | ||
| 2383 | root_used - num_bytes); | ||
| 2384 | spin_unlock(&info->delalloc_lock); | ||
| 2385 | |||
| 2386 | /* | 3286 | /* |
| 2387 | * it is going to be very rare for someone to be waiting | 3287 | * it is going to be very rare for someone to be waiting |
| 2388 | * on the block we're freeing. del_items might need to | 3288 | * on the block we're freeing. del_items might need to |
| @@ -2403,7 +3303,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, | |||
| 2403 | free_extent_buffer(must_clean); | 3303 | free_extent_buffer(must_clean); |
| 2404 | } | 3304 | } |
| 2405 | 3305 | ||
| 2406 | if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { | 3306 | if (is_data) { |
| 2407 | ret = btrfs_del_csums(trans, root, bytenr, num_bytes); | 3307 | ret = btrfs_del_csums(trans, root, bytenr, num_bytes); |
| 2408 | BUG_ON(ret); | 3308 | BUG_ON(ret); |
| 2409 | } else { | 3309 | } else { |
| @@ -2421,34 +3321,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, | |||
| 2421 | } | 3321 | } |
| 2422 | 3322 | ||
| 2423 | /* | 3323 | /* |
| 2424 | * remove an extent from the root, returns 0 on success | ||
| 2425 | */ | ||
| 2426 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | ||
| 2427 | struct btrfs_root *root, | ||
| 2428 | u64 bytenr, u64 num_bytes, u64 parent, | ||
| 2429 | u64 root_objectid, u64 ref_generation, | ||
| 2430 | u64 owner_objectid, int pin, | ||
| 2431 | int refs_to_drop) | ||
| 2432 | { | ||
| 2433 | WARN_ON(num_bytes < root->sectorsize); | ||
| 2434 | |||
| 2435 | /* | ||
| 2436 | * if metadata always pin | ||
| 2437 | * if data pin when any transaction has committed this | ||
| 2438 | */ | ||
| 2439 | if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || | ||
| 2440 | ref_generation != trans->transid) | ||
| 2441 | pin = 1; | ||
| 2442 | |||
| 2443 | if (ref_generation != trans->transid) | ||
| 2444 | pin = 1; | ||
| 2445 | |||
| 2446 | return __free_extent(trans, root, bytenr, num_bytes, parent, | ||
| 2447 | root_objectid, ref_generation, | ||
| 2448 | owner_objectid, pin, pin == 0, refs_to_drop); | ||
| 2449 | } | ||
| 2450 | |||
| 2451 | /* | ||
| 2452 | * when we free an extent, it is possible (and likely) that we free the last | 3324 | * when we free an extent, it is possible (and likely) that we free the last |
| 2453 | * delayed ref for that extent as well. This searches the delayed ref tree for | 3325 | * delayed ref for that extent as well. This searches the delayed ref tree for |
| 2454 | * a given extent, and if there are no other delayed refs to be processed, it | 3326 | * a given extent, and if there are no other delayed refs to be processed, it |
| @@ -2479,6 +3351,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
| 2479 | if (ref->bytenr == bytenr) | 3351 | if (ref->bytenr == bytenr) |
| 2480 | goto out; | 3352 | goto out; |
| 2481 | 3353 | ||
| 3354 | if (head->extent_op) { | ||
| 3355 | if (!head->must_insert_reserved) | ||
| 3356 | goto out; | ||
| 3357 | kfree(head->extent_op); | ||
| 3358 | head->extent_op = NULL; | ||
| 3359 | } | ||
| 3360 | |||
| 2482 | /* | 3361 | /* |
| 2483 | * waiting for the lock here would deadlock. If someone else has it | 3362 | * waiting for the lock here would deadlock. If someone else has it |
| 2484 | * locked they are already in the process of dropping it anyway | 3363 | * locked they are already in the process of dropping it anyway |
| @@ -2507,7 +3386,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
| 2507 | spin_unlock(&delayed_refs->lock); | 3386 | spin_unlock(&delayed_refs->lock); |
| 2508 | 3387 | ||
| 2509 | ret = run_one_delayed_ref(trans, root->fs_info->tree_root, | 3388 | ret = run_one_delayed_ref(trans, root->fs_info->tree_root, |
| 2510 | &head->node, head->must_insert_reserved); | 3389 | &head->node, head->extent_op, |
| 3390 | head->must_insert_reserved); | ||
| 2511 | BUG_ON(ret); | 3391 | BUG_ON(ret); |
| 2512 | btrfs_put_delayed_ref(&head->node); | 3392 | btrfs_put_delayed_ref(&head->node); |
| 2513 | return 0; | 3393 | return 0; |
| @@ -2519,32 +3399,32 @@ out: | |||
| 2519 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | 3399 | int btrfs_free_extent(struct btrfs_trans_handle *trans, |
| 2520 | struct btrfs_root *root, | 3400 | struct btrfs_root *root, |
| 2521 | u64 bytenr, u64 num_bytes, u64 parent, | 3401 | u64 bytenr, u64 num_bytes, u64 parent, |
| 2522 | u64 root_objectid, u64 ref_generation, | 3402 | u64 root_objectid, u64 owner, u64 offset) |
| 2523 | u64 owner_objectid, int pin) | ||
| 2524 | { | 3403 | { |
| 2525 | int ret; | 3404 | int ret; |
| 2526 | 3405 | ||
| 2527 | /* | 3406 | /* |
| 2528 | * tree log blocks never actually go into the extent allocation | 3407 | * tree log blocks never actually go into the extent allocation |
| 2529 | * tree, just update pinning info and exit early. | 3408 | * tree, just update pinning info and exit early. |
| 2530 | * | ||
| 2531 | * data extents referenced by the tree log do need to have | ||
| 2532 | * their reference counts bumped. | ||
| 2533 | */ | 3409 | */ |
| 2534 | if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && | 3410 | if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { |
| 2535 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { | 3411 | WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); |
| 2536 | /* unlocks the pinned mutex */ | 3412 | /* unlocks the pinned mutex */ |
| 2537 | btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); | 3413 | btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); |
| 2538 | update_reserved_extents(root, bytenr, num_bytes, 0); | 3414 | update_reserved_extents(root, bytenr, num_bytes, 0); |
| 2539 | ret = 0; | 3415 | ret = 0; |
| 2540 | } else { | 3416 | } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { |
| 2541 | ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, | 3417 | ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, |
| 2542 | root_objectid, ref_generation, | 3418 | parent, root_objectid, (int)owner, |
| 2543 | owner_objectid, | 3419 | BTRFS_DROP_DELAYED_REF, NULL); |
| 2544 | BTRFS_DROP_DELAYED_REF, 1); | ||
| 2545 | BUG_ON(ret); | 3420 | BUG_ON(ret); |
| 2546 | ret = check_ref_cleanup(trans, root, bytenr); | 3421 | ret = check_ref_cleanup(trans, root, bytenr); |
| 2547 | BUG_ON(ret); | 3422 | BUG_ON(ret); |
| 3423 | } else { | ||
| 3424 | ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, | ||
| 3425 | parent, root_objectid, owner, | ||
| 3426 | offset, BTRFS_DROP_DELAYED_REF, NULL); | ||
| 3427 | BUG_ON(ret); | ||
| 2548 | } | 3428 | } |
| 2549 | return ret; | 3429 | return ret; |
| 2550 | } | 3430 | } |
| @@ -2719,7 +3599,7 @@ refill_cluster: | |||
| 2719 | last_ptr_loop = 0; | 3599 | last_ptr_loop = 0; |
| 2720 | 3600 | ||
| 2721 | /* allocate a cluster in this block group */ | 3601 | /* allocate a cluster in this block group */ |
| 2722 | ret = btrfs_find_space_cluster(trans, | 3602 | ret = btrfs_find_space_cluster(trans, root, |
| 2723 | block_group, last_ptr, | 3603 | block_group, last_ptr, |
| 2724 | offset, num_bytes, | 3604 | offset, num_bytes, |
| 2725 | empty_cluster + empty_size); | 3605 | empty_cluster + empty_size); |
| @@ -2969,99 +3849,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | |||
| 2969 | return ret; | 3849 | return ret; |
| 2970 | } | 3850 | } |
| 2971 | 3851 | ||
| 2972 | static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | 3852 | static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, |
| 2973 | struct btrfs_root *root, u64 parent, | 3853 | struct btrfs_root *root, |
| 2974 | u64 root_objectid, u64 ref_generation, | 3854 | u64 parent, u64 root_objectid, |
| 2975 | u64 owner, struct btrfs_key *ins, | 3855 | u64 flags, u64 owner, u64 offset, |
| 2976 | int ref_mod) | 3856 | struct btrfs_key *ins, int ref_mod) |
| 2977 | { | 3857 | { |
| 2978 | int ret; | 3858 | int ret; |
| 2979 | u64 super_used; | 3859 | struct btrfs_fs_info *fs_info = root->fs_info; |
| 2980 | u64 root_used; | ||
| 2981 | u64 num_bytes = ins->offset; | ||
| 2982 | u32 sizes[2]; | ||
| 2983 | struct btrfs_fs_info *info = root->fs_info; | ||
| 2984 | struct btrfs_root *extent_root = info->extent_root; | ||
| 2985 | struct btrfs_extent_item *extent_item; | 3860 | struct btrfs_extent_item *extent_item; |
| 2986 | struct btrfs_extent_ref *ref; | 3861 | struct btrfs_extent_inline_ref *iref; |
| 2987 | struct btrfs_path *path; | 3862 | struct btrfs_path *path; |
| 2988 | struct btrfs_key keys[2]; | 3863 | struct extent_buffer *leaf; |
| 2989 | 3864 | int type; | |
| 2990 | if (parent == 0) | 3865 | u32 size; |
| 2991 | parent = ins->objectid; | ||
| 2992 | |||
| 2993 | /* block accounting for super block */ | ||
| 2994 | spin_lock(&info->delalloc_lock); | ||
| 2995 | super_used = btrfs_super_bytes_used(&info->super_copy); | ||
| 2996 | btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); | ||
| 2997 | 3866 | ||
| 2998 | /* block accounting for root item */ | 3867 | if (parent > 0) |
| 2999 | root_used = btrfs_root_used(&root->root_item); | 3868 | type = BTRFS_SHARED_DATA_REF_KEY; |
| 3000 | btrfs_set_root_used(&root->root_item, root_used + num_bytes); | 3869 | else |
| 3001 | spin_unlock(&info->delalloc_lock); | 3870 | type = BTRFS_EXTENT_DATA_REF_KEY; |
| 3002 | 3871 | ||
| 3003 | memcpy(&keys[0], ins, sizeof(*ins)); | 3872 | size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); |
| 3004 | keys[1].objectid = ins->objectid; | ||
| 3005 | keys[1].type = BTRFS_EXTENT_REF_KEY; | ||
| 3006 | keys[1].offset = parent; | ||
| 3007 | sizes[0] = sizeof(*extent_item); | ||
| 3008 | sizes[1] = sizeof(*ref); | ||
| 3009 | 3873 | ||
| 3010 | path = btrfs_alloc_path(); | 3874 | path = btrfs_alloc_path(); |
| 3011 | BUG_ON(!path); | 3875 | BUG_ON(!path); |
| 3012 | 3876 | ||
| 3013 | path->leave_spinning = 1; | 3877 | path->leave_spinning = 1; |
| 3014 | ret = btrfs_insert_empty_items(trans, extent_root, path, keys, | 3878 | ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, |
| 3015 | sizes, 2); | 3879 | ins, size); |
| 3016 | BUG_ON(ret); | 3880 | BUG_ON(ret); |
| 3017 | 3881 | ||
| 3018 | extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], | 3882 | leaf = path->nodes[0]; |
| 3883 | extent_item = btrfs_item_ptr(leaf, path->slots[0], | ||
| 3019 | struct btrfs_extent_item); | 3884 | struct btrfs_extent_item); |
| 3020 | btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); | 3885 | btrfs_set_extent_refs(leaf, extent_item, ref_mod); |
| 3021 | ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, | 3886 | btrfs_set_extent_generation(leaf, extent_item, trans->transid); |
| 3022 | struct btrfs_extent_ref); | 3887 | btrfs_set_extent_flags(leaf, extent_item, |
| 3023 | 3888 | flags | BTRFS_EXTENT_FLAG_DATA); | |
| 3024 | btrfs_set_ref_root(path->nodes[0], ref, root_objectid); | 3889 | |
| 3025 | btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); | 3890 | iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); |
| 3026 | btrfs_set_ref_objectid(path->nodes[0], ref, owner); | 3891 | btrfs_set_extent_inline_ref_type(leaf, iref, type); |
| 3027 | btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); | 3892 | if (parent > 0) { |
| 3893 | struct btrfs_shared_data_ref *ref; | ||
| 3894 | ref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
| 3895 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
| 3896 | btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); | ||
| 3897 | } else { | ||
| 3898 | struct btrfs_extent_data_ref *ref; | ||
| 3899 | ref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
| 3900 | btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); | ||
| 3901 | btrfs_set_extent_data_ref_objectid(leaf, ref, owner); | ||
| 3902 | btrfs_set_extent_data_ref_offset(leaf, ref, offset); | ||
| 3903 | btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); | ||
| 3904 | } | ||
| 3028 | 3905 | ||
| 3029 | btrfs_mark_buffer_dirty(path->nodes[0]); | 3906 | btrfs_mark_buffer_dirty(path->nodes[0]); |
| 3030 | |||
| 3031 | trans->alloc_exclude_start = 0; | ||
| 3032 | trans->alloc_exclude_nr = 0; | ||
| 3033 | btrfs_free_path(path); | 3907 | btrfs_free_path(path); |
| 3034 | 3908 | ||
| 3035 | if (ret) | 3909 | ret = update_block_group(trans, root, ins->objectid, ins->offset, |
| 3036 | goto out; | 3910 | 1, 0); |
| 3037 | |||
| 3038 | ret = update_block_group(trans, root, ins->objectid, | ||
| 3039 | ins->offset, 1, 0); | ||
| 3040 | if (ret) { | 3911 | if (ret) { |
| 3041 | printk(KERN_ERR "btrfs update block group failed for %llu " | 3912 | printk(KERN_ERR "btrfs update block group failed for %llu " |
| 3042 | "%llu\n", (unsigned long long)ins->objectid, | 3913 | "%llu\n", (unsigned long long)ins->objectid, |
| 3043 | (unsigned long long)ins->offset); | 3914 | (unsigned long long)ins->offset); |
| 3044 | BUG(); | 3915 | BUG(); |
| 3045 | } | 3916 | } |
| 3046 | out: | ||
| 3047 | return ret; | 3917 | return ret; |
| 3048 | } | 3918 | } |
| 3049 | 3919 | ||
| 3050 | int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | 3920 | static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, |
| 3051 | struct btrfs_root *root, u64 parent, | 3921 | struct btrfs_root *root, |
| 3052 | u64 root_objectid, u64 ref_generation, | 3922 | u64 parent, u64 root_objectid, |
| 3053 | u64 owner, struct btrfs_key *ins) | 3923 | u64 flags, struct btrfs_disk_key *key, |
| 3924 | int level, struct btrfs_key *ins) | ||
| 3054 | { | 3925 | { |
| 3055 | int ret; | 3926 | int ret; |
| 3927 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
| 3928 | struct btrfs_extent_item *extent_item; | ||
| 3929 | struct btrfs_tree_block_info *block_info; | ||
| 3930 | struct btrfs_extent_inline_ref *iref; | ||
| 3931 | struct btrfs_path *path; | ||
| 3932 | struct extent_buffer *leaf; | ||
| 3933 | u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); | ||
| 3056 | 3934 | ||
| 3057 | if (root_objectid == BTRFS_TREE_LOG_OBJECTID) | 3935 | path = btrfs_alloc_path(); |
| 3058 | return 0; | 3936 | BUG_ON(!path); |
| 3059 | 3937 | ||
| 3060 | ret = btrfs_add_delayed_ref(trans, ins->objectid, | 3938 | path->leave_spinning = 1; |
| 3061 | ins->offset, parent, root_objectid, | 3939 | ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, |
| 3062 | ref_generation, owner, | 3940 | ins, size); |
| 3063 | BTRFS_ADD_DELAYED_EXTENT, 0); | ||
| 3064 | BUG_ON(ret); | 3941 | BUG_ON(ret); |
| 3942 | |||
| 3943 | leaf = path->nodes[0]; | ||
| 3944 | extent_item = btrfs_item_ptr(leaf, path->slots[0], | ||
| 3945 | struct btrfs_extent_item); | ||
| 3946 | btrfs_set_extent_refs(leaf, extent_item, 1); | ||
| 3947 | btrfs_set_extent_generation(leaf, extent_item, trans->transid); | ||
| 3948 | btrfs_set_extent_flags(leaf, extent_item, | ||
| 3949 | flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); | ||
| 3950 | block_info = (struct btrfs_tree_block_info *)(extent_item + 1); | ||
| 3951 | |||
| 3952 | btrfs_set_tree_block_key(leaf, block_info, key); | ||
| 3953 | btrfs_set_tree_block_level(leaf, block_info, level); | ||
| 3954 | |||
| 3955 | iref = (struct btrfs_extent_inline_ref *)(block_info + 1); | ||
| 3956 | if (parent > 0) { | ||
| 3957 | BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
| 3958 | btrfs_set_extent_inline_ref_type(leaf, iref, | ||
| 3959 | BTRFS_SHARED_BLOCK_REF_KEY); | ||
| 3960 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
| 3961 | } else { | ||
| 3962 | btrfs_set_extent_inline_ref_type(leaf, iref, | ||
| 3963 | BTRFS_TREE_BLOCK_REF_KEY); | ||
| 3964 | btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); | ||
| 3965 | } | ||
| 3966 | |||
| 3967 | btrfs_mark_buffer_dirty(leaf); | ||
| 3968 | btrfs_free_path(path); | ||
| 3969 | |||
| 3970 | ret = update_block_group(trans, root, ins->objectid, ins->offset, | ||
| 3971 | 1, 0); | ||
| 3972 | if (ret) { | ||
| 3973 | printk(KERN_ERR "btrfs update block group failed for %llu " | ||
| 3974 | "%llu\n", (unsigned long long)ins->objectid, | ||
| 3975 | (unsigned long long)ins->offset); | ||
| 3976 | BUG(); | ||
| 3977 | } | ||
| 3978 | return ret; | ||
| 3979 | } | ||
| 3980 | |||
| 3981 | int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | ||
| 3982 | struct btrfs_root *root, | ||
| 3983 | u64 root_objectid, u64 owner, | ||
| 3984 | u64 offset, struct btrfs_key *ins) | ||
| 3985 | { | ||
| 3986 | int ret; | ||
| 3987 | |||
| 3988 | BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); | ||
| 3989 | |||
| 3990 | ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, | ||
| 3991 | 0, root_objectid, owner, offset, | ||
| 3992 | BTRFS_ADD_DELAYED_EXTENT, NULL); | ||
| 3065 | return ret; | 3993 | return ret; |
| 3066 | } | 3994 | } |
| 3067 | 3995 | ||
| @@ -3070,10 +3998,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | |||
| 3070 | * an extent has been allocated and makes sure to clear the free | 3998 | * an extent has been allocated and makes sure to clear the free |
| 3071 | * space cache bits as well | 3999 | * space cache bits as well |
| 3072 | */ | 4000 | */ |
| 3073 | int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | 4001 | int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, |
| 3074 | struct btrfs_root *root, u64 parent, | 4002 | struct btrfs_root *root, |
| 3075 | u64 root_objectid, u64 ref_generation, | 4003 | u64 root_objectid, u64 owner, u64 offset, |
| 3076 | u64 owner, struct btrfs_key *ins) | 4004 | struct btrfs_key *ins) |
| 3077 | { | 4005 | { |
| 3078 | int ret; | 4006 | int ret; |
| 3079 | struct btrfs_block_group_cache *block_group; | 4007 | struct btrfs_block_group_cache *block_group; |
| @@ -3087,8 +4015,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | |||
| 3087 | ins->offset); | 4015 | ins->offset); |
| 3088 | BUG_ON(ret); | 4016 | BUG_ON(ret); |
| 3089 | btrfs_put_block_group(block_group); | 4017 | btrfs_put_block_group(block_group); |
| 3090 | ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, | 4018 | ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, |
| 3091 | ref_generation, owner, ins, 1); | 4019 | 0, owner, offset, ins, 1); |
| 3092 | return ret; | 4020 | return ret; |
| 3093 | } | 4021 | } |
| 3094 | 4022 | ||
| @@ -3099,26 +4027,48 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | |||
| 3099 | * | 4027 | * |
| 3100 | * returns 0 if everything worked, non-zero otherwise. | 4028 | * returns 0 if everything worked, non-zero otherwise. |
| 3101 | */ | 4029 | */ |
| 3102 | int btrfs_alloc_extent(struct btrfs_trans_handle *trans, | 4030 | static int alloc_tree_block(struct btrfs_trans_handle *trans, |
| 3103 | struct btrfs_root *root, | 4031 | struct btrfs_root *root, |
| 3104 | u64 num_bytes, u64 parent, u64 min_alloc_size, | 4032 | u64 num_bytes, u64 parent, u64 root_objectid, |
| 3105 | u64 root_objectid, u64 ref_generation, | 4033 | struct btrfs_disk_key *key, int level, |
| 3106 | u64 owner_objectid, u64 empty_size, u64 hint_byte, | 4034 | u64 empty_size, u64 hint_byte, u64 search_end, |
| 3107 | u64 search_end, struct btrfs_key *ins, u64 data) | 4035 | struct btrfs_key *ins) |
| 3108 | { | 4036 | { |
| 3109 | int ret; | 4037 | int ret; |
| 3110 | ret = __btrfs_reserve_extent(trans, root, num_bytes, | 4038 | u64 flags = 0; |
| 3111 | min_alloc_size, empty_size, hint_byte, | 4039 | |
| 3112 | search_end, ins, data); | 4040 | ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, |
| 4041 | empty_size, hint_byte, search_end, | ||
| 4042 | ins, 0); | ||
| 3113 | BUG_ON(ret); | 4043 | BUG_ON(ret); |
| 4044 | |||
| 4045 | if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
| 4046 | if (parent == 0) | ||
| 4047 | parent = ins->objectid; | ||
| 4048 | flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
| 4049 | } else | ||
| 4050 | BUG_ON(parent > 0); | ||
| 4051 | |||
| 4052 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | ||
| 3114 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { | 4053 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { |
| 3115 | ret = btrfs_add_delayed_ref(trans, ins->objectid, | 4054 | struct btrfs_delayed_extent_op *extent_op; |
| 3116 | ins->offset, parent, root_objectid, | 4055 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); |
| 3117 | ref_generation, owner_objectid, | 4056 | BUG_ON(!extent_op); |
| 3118 | BTRFS_ADD_DELAYED_EXTENT, 0); | 4057 | if (key) |
| 4058 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); | ||
| 4059 | else | ||
| 4060 | memset(&extent_op->key, 0, sizeof(extent_op->key)); | ||
| 4061 | extent_op->flags_to_set = flags; | ||
| 4062 | extent_op->update_key = 1; | ||
| 4063 | extent_op->update_flags = 1; | ||
| 4064 | extent_op->is_data = 0; | ||
| 4065 | |||
| 4066 | ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, | ||
| 4067 | ins->offset, parent, root_objectid, | ||
| 4068 | level, BTRFS_ADD_DELAYED_EXTENT, | ||
| 4069 | extent_op); | ||
| 3119 | BUG_ON(ret); | 4070 | BUG_ON(ret); |
| 3120 | } | 4071 | } |
| 3121 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | ||
| 3122 | return ret; | 4072 | return ret; |
| 3123 | } | 4073 | } |
| 3124 | 4074 | ||
| @@ -3157,21 +4107,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, | |||
| 3157 | * returns the tree buffer or NULL. | 4107 | * returns the tree buffer or NULL. |
| 3158 | */ | 4108 | */ |
| 3159 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | 4109 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, |
| 3160 | struct btrfs_root *root, | 4110 | struct btrfs_root *root, u32 blocksize, |
| 3161 | u32 blocksize, u64 parent, | 4111 | u64 parent, u64 root_objectid, |
| 3162 | u64 root_objectid, | 4112 | struct btrfs_disk_key *key, int level, |
| 3163 | u64 ref_generation, | 4113 | u64 hint, u64 empty_size) |
| 3164 | int level, | ||
| 3165 | u64 hint, | ||
| 3166 | u64 empty_size) | ||
| 3167 | { | 4114 | { |
| 3168 | struct btrfs_key ins; | 4115 | struct btrfs_key ins; |
| 3169 | int ret; | 4116 | int ret; |
| 3170 | struct extent_buffer *buf; | 4117 | struct extent_buffer *buf; |
| 3171 | 4118 | ||
| 3172 | ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, | 4119 | ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, |
| 3173 | root_objectid, ref_generation, level, | 4120 | key, level, empty_size, hint, (u64)-1, &ins); |
| 3174 | empty_size, hint, (u64)-1, &ins, 0); | ||
| 3175 | if (ret) { | 4121 | if (ret) { |
| 3176 | BUG_ON(ret > 0); | 4122 | BUG_ON(ret > 0); |
| 3177 | return ERR_PTR(ret); | 4123 | return ERR_PTR(ret); |
| @@ -3185,32 +4131,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
| 3185 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | 4131 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, |
| 3186 | struct btrfs_root *root, struct extent_buffer *leaf) | 4132 | struct btrfs_root *root, struct extent_buffer *leaf) |
| 3187 | { | 4133 | { |
| 3188 | u64 leaf_owner; | 4134 | u64 disk_bytenr; |
| 3189 | u64 leaf_generation; | 4135 | u64 num_bytes; |
| 3190 | struct refsort *sorted; | ||
| 3191 | struct btrfs_key key; | 4136 | struct btrfs_key key; |
| 3192 | struct btrfs_file_extent_item *fi; | 4137 | struct btrfs_file_extent_item *fi; |
| 4138 | u32 nritems; | ||
| 3193 | int i; | 4139 | int i; |
| 3194 | int nritems; | ||
| 3195 | int ret; | 4140 | int ret; |
| 3196 | int refi = 0; | ||
| 3197 | int slot; | ||
| 3198 | 4141 | ||
| 3199 | BUG_ON(!btrfs_is_leaf(leaf)); | 4142 | BUG_ON(!btrfs_is_leaf(leaf)); |
| 3200 | nritems = btrfs_header_nritems(leaf); | 4143 | nritems = btrfs_header_nritems(leaf); |
| 3201 | leaf_owner = btrfs_header_owner(leaf); | ||
| 3202 | leaf_generation = btrfs_header_generation(leaf); | ||
| 3203 | 4144 | ||
| 3204 | sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); | ||
| 3205 | /* we do this loop twice. The first time we build a list | ||
| 3206 | * of the extents we have a reference on, then we sort the list | ||
| 3207 | * by bytenr. The second time around we actually do the | ||
| 3208 | * extent freeing. | ||
| 3209 | */ | ||
| 3210 | for (i = 0; i < nritems; i++) { | 4145 | for (i = 0; i < nritems; i++) { |
| 3211 | u64 disk_bytenr; | ||
| 3212 | cond_resched(); | 4146 | cond_resched(); |
| 3213 | |||
| 3214 | btrfs_item_key_to_cpu(leaf, &key, i); | 4147 | btrfs_item_key_to_cpu(leaf, &key, i); |
| 3215 | 4148 | ||
| 3216 | /* only extents have references, skip everything else */ | 4149 | /* only extents have references, skip everything else */ |
| @@ -3230,45 +4163,16 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | |||
| 3230 | if (disk_bytenr == 0) | 4163 | if (disk_bytenr == 0) |
| 3231 | continue; | 4164 | continue; |
| 3232 | 4165 | ||
| 3233 | sorted[refi].bytenr = disk_bytenr; | 4166 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); |
| 3234 | sorted[refi].slot = i; | 4167 | ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, |
| 3235 | refi++; | 4168 | leaf->start, 0, key.objectid, 0); |
| 3236 | } | ||
| 3237 | |||
| 3238 | if (refi == 0) | ||
| 3239 | goto out; | ||
| 3240 | |||
| 3241 | sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); | ||
| 3242 | |||
| 3243 | for (i = 0; i < refi; i++) { | ||
| 3244 | u64 disk_bytenr; | ||
| 3245 | |||
| 3246 | disk_bytenr = sorted[i].bytenr; | ||
| 3247 | slot = sorted[i].slot; | ||
| 3248 | |||
| 3249 | cond_resched(); | ||
| 3250 | |||
| 3251 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
| 3252 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
| 3253 | continue; | ||
| 3254 | |||
| 3255 | fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); | ||
| 3256 | |||
| 3257 | ret = btrfs_free_extent(trans, root, disk_bytenr, | ||
| 3258 | btrfs_file_extent_disk_num_bytes(leaf, fi), | ||
| 3259 | leaf->start, leaf_owner, leaf_generation, | ||
| 3260 | key.objectid, 0); | ||
| 3261 | BUG_ON(ret); | 4169 | BUG_ON(ret); |
| 3262 | |||
| 3263 | atomic_inc(&root->fs_info->throttle_gen); | ||
| 3264 | wake_up(&root->fs_info->transaction_throttle); | ||
| 3265 | cond_resched(); | ||
| 3266 | } | 4170 | } |
| 3267 | out: | ||
| 3268 | kfree(sorted); | ||
| 3269 | return 0; | 4171 | return 0; |
| 3270 | } | 4172 | } |
| 3271 | 4173 | ||
| 4174 | #if 0 | ||
| 4175 | |||
| 3272 | static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, | 4176 | static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, |
| 3273 | struct btrfs_root *root, | 4177 | struct btrfs_root *root, |
| 3274 | struct btrfs_leaf_ref *ref) | 4178 | struct btrfs_leaf_ref *ref) |
| @@ -3311,13 +4215,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, | |||
| 3311 | return 0; | 4215 | return 0; |
| 3312 | } | 4216 | } |
| 3313 | 4217 | ||
| 4218 | |||
| 3314 | static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, | 4219 | static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, |
| 3315 | struct btrfs_root *root, u64 start, | 4220 | struct btrfs_root *root, u64 start, |
| 3316 | u64 len, u32 *refs) | 4221 | u64 len, u32 *refs) |
| 3317 | { | 4222 | { |
| 3318 | int ret; | 4223 | int ret; |
| 3319 | 4224 | ||
| 3320 | ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); | 4225 | ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); |
| 3321 | BUG_ON(ret); | 4226 | BUG_ON(ret); |
| 3322 | 4227 | ||
| 3323 | #if 0 /* some debugging code in case we see problems here */ | 4228 | #if 0 /* some debugging code in case we see problems here */ |
| @@ -3352,6 +4257,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, | |||
| 3352 | return ret; | 4257 | return ret; |
| 3353 | } | 4258 | } |
| 3354 | 4259 | ||
| 4260 | |||
| 3355 | /* | 4261 | /* |
| 3356 | * this is used while deleting old snapshots, and it drops the refs | 4262 | * this is used while deleting old snapshots, and it drops the refs |
| 3357 | * on a whole subtree starting from a level 1 node. | 4263 | * on a whole subtree starting from a level 1 node. |
| @@ -3645,32 +4551,36 @@ out: | |||
| 3645 | cond_resched(); | 4551 | cond_resched(); |
| 3646 | return 0; | 4552 | return 0; |
| 3647 | } | 4553 | } |
| 4554 | #endif | ||
| 3648 | 4555 | ||
| 3649 | /* | 4556 | /* |
| 3650 | * helper function for drop_subtree, this function is similar to | 4557 | * helper function for drop_subtree, this function is similar to |
| 3651 | * walk_down_tree. The main difference is that it checks reference | 4558 | * walk_down_tree. The main difference is that it checks reference |
| 3652 | * counts while tree blocks are locked. | 4559 | * counts while tree blocks are locked. |
| 3653 | */ | 4560 | */ |
| 3654 | static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | 4561 | static noinline int walk_down_tree(struct btrfs_trans_handle *trans, |
| 3655 | struct btrfs_root *root, | 4562 | struct btrfs_root *root, |
| 3656 | struct btrfs_path *path, int *level) | 4563 | struct btrfs_path *path, int *level) |
| 3657 | { | 4564 | { |
| 3658 | struct extent_buffer *next; | 4565 | struct extent_buffer *next; |
| 3659 | struct extent_buffer *cur; | 4566 | struct extent_buffer *cur; |
| 3660 | struct extent_buffer *parent; | 4567 | struct extent_buffer *parent; |
| 3661 | u64 bytenr; | 4568 | u64 bytenr; |
| 3662 | u64 ptr_gen; | 4569 | u64 ptr_gen; |
| 4570 | u64 refs; | ||
| 4571 | u64 flags; | ||
| 3663 | u32 blocksize; | 4572 | u32 blocksize; |
| 3664 | u32 refs; | ||
| 3665 | int ret; | 4573 | int ret; |
| 3666 | 4574 | ||
| 3667 | cur = path->nodes[*level]; | 4575 | cur = path->nodes[*level]; |
| 3668 | ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, | 4576 | ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, |
| 3669 | &refs); | 4577 | &refs, &flags); |
| 3670 | BUG_ON(ret); | 4578 | BUG_ON(ret); |
| 3671 | if (refs > 1) | 4579 | if (refs > 1) |
| 3672 | goto out; | 4580 | goto out; |
| 3673 | 4581 | ||
| 4582 | BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
| 4583 | |||
| 3674 | while (*level >= 0) { | 4584 | while (*level >= 0) { |
| 3675 | cur = path->nodes[*level]; | 4585 | cur = path->nodes[*level]; |
| 3676 | if (*level == 0) { | 4586 | if (*level == 0) { |
| @@ -3692,16 +4602,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | |||
| 3692 | btrfs_tree_lock(next); | 4602 | btrfs_tree_lock(next); |
| 3693 | btrfs_set_lock_blocking(next); | 4603 | btrfs_set_lock_blocking(next); |
| 3694 | 4604 | ||
| 3695 | ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, | 4605 | ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, |
| 3696 | &refs); | 4606 | &refs, &flags); |
| 3697 | BUG_ON(ret); | 4607 | BUG_ON(ret); |
| 3698 | if (refs > 1) { | 4608 | if (refs > 1) { |
| 3699 | parent = path->nodes[*level]; | 4609 | parent = path->nodes[*level]; |
| 3700 | ret = btrfs_free_extent(trans, root, bytenr, | 4610 | ret = btrfs_free_extent(trans, root, bytenr, |
| 3701 | blocksize, parent->start, | 4611 | blocksize, parent->start, |
| 3702 | btrfs_header_owner(parent), | 4612 | btrfs_header_owner(parent), |
| 3703 | btrfs_header_generation(parent), | 4613 | *level - 1, 0); |
| 3704 | *level - 1, 1); | ||
| 3705 | BUG_ON(ret); | 4614 | BUG_ON(ret); |
| 3706 | path->slots[*level]++; | 4615 | path->slots[*level]++; |
| 3707 | btrfs_tree_unlock(next); | 4616 | btrfs_tree_unlock(next); |
| @@ -3709,6 +4618,8 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | |||
| 3709 | continue; | 4618 | continue; |
| 3710 | } | 4619 | } |
| 3711 | 4620 | ||
| 4621 | BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
| 4622 | |||
| 3712 | *level = btrfs_header_level(next); | 4623 | *level = btrfs_header_level(next); |
| 3713 | path->nodes[*level] = next; | 4624 | path->nodes[*level] = next; |
| 3714 | path->slots[*level] = 0; | 4625 | path->slots[*level] = 0; |
| @@ -3716,13 +4627,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | |||
| 3716 | cond_resched(); | 4627 | cond_resched(); |
| 3717 | } | 4628 | } |
| 3718 | out: | 4629 | out: |
| 3719 | parent = path->nodes[*level + 1]; | 4630 | if (path->nodes[*level] == root->node) |
| 4631 | parent = path->nodes[*level]; | ||
| 4632 | else | ||
| 4633 | parent = path->nodes[*level + 1]; | ||
| 3720 | bytenr = path->nodes[*level]->start; | 4634 | bytenr = path->nodes[*level]->start; |
| 3721 | blocksize = path->nodes[*level]->len; | 4635 | blocksize = path->nodes[*level]->len; |
| 3722 | 4636 | ||
| 3723 | ret = btrfs_free_extent(trans, root, bytenr, blocksize, | 4637 | ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, |
| 3724 | parent->start, btrfs_header_owner(parent), | 4638 | btrfs_header_owner(parent), *level, 0); |
| 3725 | btrfs_header_generation(parent), *level, 1); | ||
| 3726 | BUG_ON(ret); | 4639 | BUG_ON(ret); |
| 3727 | 4640 | ||
| 3728 | if (path->locks[*level]) { | 4641 | if (path->locks[*level]) { |
| @@ -3746,8 +4659,6 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
| 3746 | struct btrfs_path *path, | 4659 | struct btrfs_path *path, |
| 3747 | int *level, int max_level) | 4660 | int *level, int max_level) |
| 3748 | { | 4661 | { |
| 3749 | u64 root_owner; | ||
| 3750 | u64 root_gen; | ||
| 3751 | struct btrfs_root_item *root_item = &root->root_item; | 4662 | struct btrfs_root_item *root_item = &root->root_item; |
| 3752 | int i; | 4663 | int i; |
| 3753 | int slot; | 4664 | int slot; |
| @@ -3755,24 +4666,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
| 3755 | 4666 | ||
| 3756 | for (i = *level; i < max_level && path->nodes[i]; i++) { | 4667 | for (i = *level; i < max_level && path->nodes[i]; i++) { |
| 3757 | slot = path->slots[i]; | 4668 | slot = path->slots[i]; |
| 3758 | if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { | 4669 | if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { |
| 3759 | struct extent_buffer *node; | ||
| 3760 | struct btrfs_disk_key disk_key; | ||
| 3761 | |||
| 3762 | /* | 4670 | /* |
| 3763 | * there is more work to do in this level. | 4671 | * there is more work to do in this level. |
| 3764 | * Update the drop_progress marker to reflect | 4672 | * Update the drop_progress marker to reflect |
| 3765 | * the work we've done so far, and then bump | 4673 | * the work we've done so far, and then bump |
| 3766 | * the slot number | 4674 | * the slot number |
| 3767 | */ | 4675 | */ |
| 3768 | node = path->nodes[i]; | ||
| 3769 | path->slots[i]++; | 4676 | path->slots[i]++; |
| 3770 | *level = i; | ||
| 3771 | WARN_ON(*level == 0); | 4677 | WARN_ON(*level == 0); |
| 3772 | btrfs_node_key(node, &disk_key, path->slots[i]); | 4678 | if (max_level == BTRFS_MAX_LEVEL) { |
| 3773 | memcpy(&root_item->drop_progress, | 4679 | btrfs_node_key(path->nodes[i], |
| 3774 | &disk_key, sizeof(disk_key)); | 4680 | &root_item->drop_progress, |
| 3775 | root_item->drop_level = i; | 4681 | path->slots[i]); |
| 4682 | root_item->drop_level = i; | ||
| 4683 | } | ||
| 4684 | *level = i; | ||
| 3776 | return 0; | 4685 | return 0; |
| 3777 | } else { | 4686 | } else { |
| 3778 | struct extent_buffer *parent; | 4687 | struct extent_buffer *parent; |
| @@ -3786,22 +4695,20 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
| 3786 | else | 4695 | else |
| 3787 | parent = path->nodes[*level + 1]; | 4696 | parent = path->nodes[*level + 1]; |
| 3788 | 4697 | ||
| 3789 | root_owner = btrfs_header_owner(parent); | 4698 | clean_tree_block(trans, root, path->nodes[i]); |
| 3790 | root_gen = btrfs_header_generation(parent); | ||
| 3791 | |||
| 3792 | clean_tree_block(trans, root, path->nodes[*level]); | ||
| 3793 | ret = btrfs_free_extent(trans, root, | 4699 | ret = btrfs_free_extent(trans, root, |
| 3794 | path->nodes[*level]->start, | 4700 | path->nodes[i]->start, |
| 3795 | path->nodes[*level]->len, | 4701 | path->nodes[i]->len, |
| 3796 | parent->start, root_owner, | 4702 | parent->start, |
| 3797 | root_gen, *level, 1); | 4703 | btrfs_header_owner(parent), |
| 4704 | *level, 0); | ||
| 3798 | BUG_ON(ret); | 4705 | BUG_ON(ret); |
| 3799 | if (path->locks[*level]) { | 4706 | if (path->locks[*level]) { |
| 3800 | btrfs_tree_unlock(path->nodes[*level]); | 4707 | btrfs_tree_unlock(path->nodes[i]); |
| 3801 | path->locks[*level] = 0; | 4708 | path->locks[i] = 0; |
| 3802 | } | 4709 | } |
| 3803 | free_extent_buffer(path->nodes[*level]); | 4710 | free_extent_buffer(path->nodes[i]); |
| 3804 | path->nodes[*level] = NULL; | 4711 | path->nodes[i] = NULL; |
| 3805 | *level = i + 1; | 4712 | *level = i + 1; |
| 3806 | } | 4713 | } |
| 3807 | } | 4714 | } |
| @@ -3820,21 +4727,18 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
| 3820 | int wret; | 4727 | int wret; |
| 3821 | int level; | 4728 | int level; |
| 3822 | struct btrfs_path *path; | 4729 | struct btrfs_path *path; |
| 3823 | int i; | ||
| 3824 | int orig_level; | ||
| 3825 | int update_count; | 4730 | int update_count; |
| 3826 | struct btrfs_root_item *root_item = &root->root_item; | 4731 | struct btrfs_root_item *root_item = &root->root_item; |
| 3827 | 4732 | ||
| 3828 | WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); | ||
| 3829 | path = btrfs_alloc_path(); | 4733 | path = btrfs_alloc_path(); |
| 3830 | BUG_ON(!path); | 4734 | BUG_ON(!path); |
| 3831 | 4735 | ||
| 3832 | level = btrfs_header_level(root->node); | 4736 | level = btrfs_header_level(root->node); |
| 3833 | orig_level = level; | ||
| 3834 | if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { | 4737 | if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { |
| 3835 | path->nodes[level] = root->node; | 4738 | path->nodes[level] = btrfs_lock_root_node(root); |
| 3836 | extent_buffer_get(root->node); | 4739 | btrfs_set_lock_blocking(path->nodes[level]); |
| 3837 | path->slots[level] = 0; | 4740 | path->slots[level] = 0; |
| 4741 | path->locks[level] = 1; | ||
| 3838 | } else { | 4742 | } else { |
| 3839 | struct btrfs_key key; | 4743 | struct btrfs_key key; |
| 3840 | struct btrfs_disk_key found_key; | 4744 | struct btrfs_disk_key found_key; |
| @@ -3856,12 +4760,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
| 3856 | * unlock our path, this is safe because only this | 4760 | * unlock our path, this is safe because only this |
| 3857 | * function is allowed to delete this snapshot | 4761 | * function is allowed to delete this snapshot |
| 3858 | */ | 4762 | */ |
| 3859 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) { | 4763 | btrfs_unlock_up_safe(path, 0); |
| 3860 | if (path->nodes[i] && path->locks[i]) { | ||
| 3861 | path->locks[i] = 0; | ||
| 3862 | btrfs_tree_unlock(path->nodes[i]); | ||
| 3863 | } | ||
| 3864 | } | ||
| 3865 | } | 4764 | } |
| 3866 | while (1) { | 4765 | while (1) { |
| 3867 | unsigned long update; | 4766 | unsigned long update; |
| @@ -3882,8 +4781,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
| 3882 | ret = -EAGAIN; | 4781 | ret = -EAGAIN; |
| 3883 | break; | 4782 | break; |
| 3884 | } | 4783 | } |
| 3885 | atomic_inc(&root->fs_info->throttle_gen); | ||
| 3886 | wake_up(&root->fs_info->transaction_throttle); | ||
| 3887 | for (update_count = 0; update_count < 16; update_count++) { | 4784 | for (update_count = 0; update_count < 16; update_count++) { |
| 3888 | update = trans->delayed_ref_updates; | 4785 | update = trans->delayed_ref_updates; |
| 3889 | trans->delayed_ref_updates = 0; | 4786 | trans->delayed_ref_updates = 0; |
| @@ -3893,12 +4790,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
| 3893 | break; | 4790 | break; |
| 3894 | } | 4791 | } |
| 3895 | } | 4792 | } |
| 3896 | for (i = 0; i <= orig_level; i++) { | ||
| 3897 | if (path->nodes[i]) { | ||
| 3898 | free_extent_buffer(path->nodes[i]); | ||
| 3899 | path->nodes[i] = NULL; | ||
| 3900 | } | ||
| 3901 | } | ||
| 3902 | out: | 4793 | out: |
| 3903 | btrfs_free_path(path); | 4794 | btrfs_free_path(path); |
| 3904 | return ret; | 4795 | return ret; |
| @@ -3931,7 +4822,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | |||
| 3931 | path->slots[level] = 0; | 4822 | path->slots[level] = 0; |
| 3932 | 4823 | ||
| 3933 | while (1) { | 4824 | while (1) { |
| 3934 | wret = walk_down_subtree(trans, root, path, &level); | 4825 | wret = walk_down_tree(trans, root, path, &level); |
| 3935 | if (wret < 0) | 4826 | if (wret < 0) |
| 3936 | ret = wret; | 4827 | ret = wret; |
| 3937 | if (wret != 0) | 4828 | if (wret != 0) |
| @@ -3948,6 +4839,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | |||
| 3948 | return ret; | 4839 | return ret; |
| 3949 | } | 4840 | } |
| 3950 | 4841 | ||
| 4842 | #if 0 | ||
| 3951 | static unsigned long calc_ra(unsigned long start, unsigned long last, | 4843 | static unsigned long calc_ra(unsigned long start, unsigned long last, |
| 3952 | unsigned long nr) | 4844 | unsigned long nr) |
| 3953 | { | 4845 | { |
| @@ -5429,6 +6321,7 @@ out: | |||
| 5429 | kfree(ref_path); | 6321 | kfree(ref_path); |
| 5430 | return ret; | 6322 | return ret; |
| 5431 | } | 6323 | } |
| 6324 | #endif | ||
| 5432 | 6325 | ||
| 5433 | static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | 6326 | static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) |
| 5434 | { | 6327 | { |
| @@ -5477,7 +6370,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, | |||
| 5477 | u64 calc; | 6370 | u64 calc; |
| 5478 | 6371 | ||
| 5479 | spin_lock(&shrink_block_group->lock); | 6372 | spin_lock(&shrink_block_group->lock); |
| 5480 | if (btrfs_block_group_used(&shrink_block_group->item) > 0) { | 6373 | if (btrfs_block_group_used(&shrink_block_group->item) + |
| 6374 | shrink_block_group->reserved > 0) { | ||
| 5481 | spin_unlock(&shrink_block_group->lock); | 6375 | spin_unlock(&shrink_block_group->lock); |
| 5482 | 6376 | ||
| 5483 | trans = btrfs_start_transaction(root, 1); | 6377 | trans = btrfs_start_transaction(root, 1); |
| @@ -5502,6 +6396,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, | |||
| 5502 | return 0; | 6396 | return 0; |
| 5503 | } | 6397 | } |
| 5504 | 6398 | ||
| 6399 | |||
| 6400 | int btrfs_prepare_block_group_relocation(struct btrfs_root *root, | ||
| 6401 | struct btrfs_block_group_cache *group) | ||
| 6402 | |||
| 6403 | { | ||
| 6404 | __alloc_chunk_for_shrink(root, group, 1); | ||
| 6405 | set_block_group_readonly(group); | ||
| 6406 | return 0; | ||
| 6407 | } | ||
| 6408 | |||
| 6409 | #if 0 | ||
| 5505 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, | 6410 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, |
| 5506 | struct btrfs_root *root, | 6411 | struct btrfs_root *root, |
| 5507 | u64 objectid, u64 size) | 6412 | u64 objectid, u64 size) |
| @@ -5781,6 +6686,7 @@ out: | |||
| 5781 | btrfs_free_path(path); | 6686 | btrfs_free_path(path); |
| 5782 | return ret; | 6687 | return ret; |
| 5783 | } | 6688 | } |
| 6689 | #endif | ||
| 5784 | 6690 | ||
| 5785 | static int find_first_block_group(struct btrfs_root *root, | 6691 | static int find_first_block_group(struct btrfs_root *root, |
| 5786 | struct btrfs_path *path, struct btrfs_key *key) | 6692 | struct btrfs_path *path, struct btrfs_key *key) |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fe9eb990e443..68260180f587 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
| @@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | |||
| 476 | struct extent_state *state; | 476 | struct extent_state *state; |
| 477 | struct extent_state *prealloc = NULL; | 477 | struct extent_state *prealloc = NULL; |
| 478 | struct rb_node *node; | 478 | struct rb_node *node; |
| 479 | u64 last_end; | ||
| 479 | int err; | 480 | int err; |
| 480 | int set = 0; | 481 | int set = 0; |
| 481 | 482 | ||
| @@ -498,6 +499,7 @@ again: | |||
| 498 | if (state->start > end) | 499 | if (state->start > end) |
| 499 | goto out; | 500 | goto out; |
| 500 | WARN_ON(state->end < start); | 501 | WARN_ON(state->end < start); |
| 502 | last_end = state->end; | ||
| 501 | 503 | ||
| 502 | /* | 504 | /* |
| 503 | * | ---- desired range ---- | | 505 | * | ---- desired range ---- | |
| @@ -524,9 +526,11 @@ again: | |||
| 524 | if (err) | 526 | if (err) |
| 525 | goto out; | 527 | goto out; |
| 526 | if (state->end <= end) { | 528 | if (state->end <= end) { |
| 527 | start = state->end + 1; | ||
| 528 | set |= clear_state_bit(tree, state, bits, | 529 | set |= clear_state_bit(tree, state, bits, |
| 529 | wake, delete); | 530 | wake, delete); |
| 531 | if (last_end == (u64)-1) | ||
| 532 | goto out; | ||
| 533 | start = last_end + 1; | ||
| 530 | } else { | 534 | } else { |
| 531 | start = state->start; | 535 | start = state->start; |
| 532 | } | 536 | } |
| @@ -552,8 +556,10 @@ again: | |||
| 552 | goto out; | 556 | goto out; |
| 553 | } | 557 | } |
| 554 | 558 | ||
| 555 | start = state->end + 1; | ||
| 556 | set |= clear_state_bit(tree, state, bits, wake, delete); | 559 | set |= clear_state_bit(tree, state, bits, wake, delete); |
| 560 | if (last_end == (u64)-1) | ||
| 561 | goto out; | ||
| 562 | start = last_end + 1; | ||
| 557 | goto search_again; | 563 | goto search_again; |
| 558 | 564 | ||
| 559 | out: | 565 | out: |
| @@ -707,8 +713,10 @@ again: | |||
| 707 | goto out; | 713 | goto out; |
| 708 | } | 714 | } |
| 709 | set_state_bits(tree, state, bits); | 715 | set_state_bits(tree, state, bits); |
| 710 | start = state->end + 1; | ||
| 711 | merge_state(tree, state); | 716 | merge_state(tree, state); |
| 717 | if (last_end == (u64)-1) | ||
| 718 | goto out; | ||
| 719 | start = last_end + 1; | ||
| 712 | goto search_again; | 720 | goto search_again; |
| 713 | } | 721 | } |
| 714 | 722 | ||
| @@ -742,8 +750,10 @@ again: | |||
| 742 | goto out; | 750 | goto out; |
| 743 | if (state->end <= end) { | 751 | if (state->end <= end) { |
| 744 | set_state_bits(tree, state, bits); | 752 | set_state_bits(tree, state, bits); |
| 745 | start = state->end + 1; | ||
| 746 | merge_state(tree, state); | 753 | merge_state(tree, state); |
| 754 | if (last_end == (u64)-1) | ||
| 755 | goto out; | ||
| 756 | start = last_end + 1; | ||
| 747 | } else { | 757 | } else { |
| 748 | start = state->start; | 758 | start = state->start; |
| 749 | } | 759 | } |
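A side note on the clear_extent_bit/set_extent_bit hunks just above: they capture last_end = state->end before the state is modified and bail out when it equals (u64)-1. A minimal, self-contained sketch of the unsigned wraparound that the new check avoids (the variable names here are only for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* An extent state that ends at the largest possible offset: computing
	 * "end + 1" wraps to 0, so the old "start = state->end + 1" would
	 * restart the walk from offset 0 instead of terminating. */
	uint64_t last_end = (uint64_t)-1;

	if (last_end == (uint64_t)-1)
		printf("range reaches the end of the keyspace, stop here\n");
	else
		printf("next start would be %llu\n",
		       (unsigned long long)(last_end + 1));
	return 0;
}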
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1d51dc38bb49..126477eaecf5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
| @@ -291,16 +291,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, | |||
| 291 | { | 291 | { |
| 292 | u64 extent_end = 0; | 292 | u64 extent_end = 0; |
| 293 | u64 search_start = start; | 293 | u64 search_start = start; |
| 294 | u64 leaf_start; | ||
| 295 | u64 ram_bytes = 0; | 294 | u64 ram_bytes = 0; |
| 296 | u64 orig_parent = 0; | ||
| 297 | u64 disk_bytenr = 0; | 295 | u64 disk_bytenr = 0; |
| 298 | u64 orig_locked_end = locked_end; | 296 | u64 orig_locked_end = locked_end; |
| 299 | u8 compression; | 297 | u8 compression; |
| 300 | u8 encryption; | 298 | u8 encryption; |
| 301 | u16 other_encoding = 0; | 299 | u16 other_encoding = 0; |
| 302 | u64 root_gen; | ||
| 303 | u64 root_owner; | ||
| 304 | struct extent_buffer *leaf; | 300 | struct extent_buffer *leaf; |
| 305 | struct btrfs_file_extent_item *extent; | 301 | struct btrfs_file_extent_item *extent; |
| 306 | struct btrfs_path *path; | 302 | struct btrfs_path *path; |
| @@ -340,9 +336,6 @@ next_slot: | |||
| 340 | bookend = 0; | 336 | bookend = 0; |
| 341 | found_extent = 0; | 337 | found_extent = 0; |
| 342 | found_inline = 0; | 338 | found_inline = 0; |
| 343 | leaf_start = 0; | ||
| 344 | root_gen = 0; | ||
| 345 | root_owner = 0; | ||
| 346 | compression = 0; | 339 | compression = 0; |
| 347 | encryption = 0; | 340 | encryption = 0; |
| 348 | extent = NULL; | 341 | extent = NULL; |
| @@ -417,9 +410,6 @@ next_slot: | |||
| 417 | if (found_extent) { | 410 | if (found_extent) { |
| 418 | read_extent_buffer(leaf, &old, (unsigned long)extent, | 411 | read_extent_buffer(leaf, &old, (unsigned long)extent, |
| 419 | sizeof(old)); | 412 | sizeof(old)); |
| 420 | root_gen = btrfs_header_generation(leaf); | ||
| 421 | root_owner = btrfs_header_owner(leaf); | ||
| 422 | leaf_start = leaf->start; | ||
| 423 | } | 413 | } |
| 424 | 414 | ||
| 425 | if (end < extent_end && end >= key.offset) { | 415 | if (end < extent_end && end >= key.offset) { |
| @@ -443,14 +433,14 @@ next_slot: | |||
| 443 | } | 433 | } |
| 444 | locked_end = extent_end; | 434 | locked_end = extent_end; |
| 445 | } | 435 | } |
| 446 | orig_parent = path->nodes[0]->start; | ||
| 447 | disk_bytenr = le64_to_cpu(old.disk_bytenr); | 436 | disk_bytenr = le64_to_cpu(old.disk_bytenr); |
| 448 | if (disk_bytenr != 0) { | 437 | if (disk_bytenr != 0) { |
| 449 | ret = btrfs_inc_extent_ref(trans, root, | 438 | ret = btrfs_inc_extent_ref(trans, root, |
| 450 | disk_bytenr, | 439 | disk_bytenr, |
| 451 | le64_to_cpu(old.disk_num_bytes), | 440 | le64_to_cpu(old.disk_num_bytes), 0, |
| 452 | orig_parent, root->root_key.objectid, | 441 | root->root_key.objectid, |
| 453 | trans->transid, inode->i_ino); | 442 | key.objectid, key.offset - |
| 443 | le64_to_cpu(old.offset)); | ||
| 454 | BUG_ON(ret); | 444 | BUG_ON(ret); |
| 455 | } | 445 | } |
| 456 | } | 446 | } |
| @@ -568,17 +558,6 @@ next_slot: | |||
| 568 | btrfs_mark_buffer_dirty(path->nodes[0]); | 558 | btrfs_mark_buffer_dirty(path->nodes[0]); |
| 569 | btrfs_set_lock_blocking(path->nodes[0]); | 559 | btrfs_set_lock_blocking(path->nodes[0]); |
| 570 | 560 | ||
| 571 | if (disk_bytenr != 0) { | ||
| 572 | ret = btrfs_update_extent_ref(trans, root, | ||
| 573 | disk_bytenr, | ||
| 574 | le64_to_cpu(old.disk_num_bytes), | ||
| 575 | orig_parent, | ||
| 576 | leaf->start, | ||
| 577 | root->root_key.objectid, | ||
| 578 | trans->transid, ins.objectid); | ||
| 579 | |||
| 580 | BUG_ON(ret); | ||
| 581 | } | ||
| 582 | path->leave_spinning = 0; | 561 | path->leave_spinning = 0; |
| 583 | btrfs_release_path(root, path); | 562 | btrfs_release_path(root, path); |
| 584 | if (disk_bytenr != 0) | 563 | if (disk_bytenr != 0) |
| @@ -594,8 +573,9 @@ next_slot: | |||
| 594 | ret = btrfs_free_extent(trans, root, | 573 | ret = btrfs_free_extent(trans, root, |
| 595 | old_disk_bytenr, | 574 | old_disk_bytenr, |
| 596 | le64_to_cpu(old.disk_num_bytes), | 575 | le64_to_cpu(old.disk_num_bytes), |
| 597 | leaf_start, root_owner, | 576 | 0, root->root_key.objectid, |
| 598 | root_gen, key.objectid, 0); | 577 | key.objectid, key.offset - |
| 578 | le64_to_cpu(old.offset)); | ||
| 599 | BUG_ON(ret); | 579 | BUG_ON(ret); |
| 600 | *hint_byte = old_disk_bytenr; | 580 | *hint_byte = old_disk_bytenr; |
| 601 | } | 581 | } |
| @@ -664,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, | |||
| 664 | u64 bytenr; | 644 | u64 bytenr; |
| 665 | u64 num_bytes; | 645 | u64 num_bytes; |
| 666 | u64 extent_end; | 646 | u64 extent_end; |
| 667 | u64 extent_offset; | 647 | u64 orig_offset; |
| 668 | u64 other_start; | 648 | u64 other_start; |
| 669 | u64 other_end; | 649 | u64 other_end; |
| 670 | u64 split = start; | 650 | u64 split = start; |
| 671 | u64 locked_end = end; | 651 | u64 locked_end = end; |
| 672 | u64 orig_parent; | ||
| 673 | int extent_type; | 652 | int extent_type; |
| 674 | int split_end = 1; | 653 | int split_end = 1; |
| 675 | int ret; | 654 | int ret; |
| @@ -703,7 +682,7 @@ again: | |||
| 703 | 682 | ||
| 704 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | 683 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); |
| 705 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); | 684 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); |
| 706 | extent_offset = btrfs_file_extent_offset(leaf, fi); | 685 | orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); |
| 707 | 686 | ||
| 708 | if (key.offset == start) | 687 | if (key.offset == start) |
| 709 | split = end; | 688 | split = end; |
| @@ -711,8 +690,6 @@ again: | |||
| 711 | if (key.offset == start && extent_end == end) { | 690 | if (key.offset == start && extent_end == end) { |
| 712 | int del_nr = 0; | 691 | int del_nr = 0; |
| 713 | int del_slot = 0; | 692 | int del_slot = 0; |
| 714 | u64 leaf_owner = btrfs_header_owner(leaf); | ||
| 715 | u64 leaf_gen = btrfs_header_generation(leaf); | ||
| 716 | other_start = end; | 693 | other_start = end; |
| 717 | other_end = 0; | 694 | other_end = 0; |
| 718 | if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, | 695 | if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, |
| @@ -721,8 +698,8 @@ again: | |||
| 721 | del_slot = path->slots[0] + 1; | 698 | del_slot = path->slots[0] + 1; |
| 722 | del_nr++; | 699 | del_nr++; |
| 723 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | 700 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, |
| 724 | leaf->start, leaf_owner, | 701 | 0, root->root_key.objectid, |
| 725 | leaf_gen, inode->i_ino, 0); | 702 | inode->i_ino, orig_offset); |
| 726 | BUG_ON(ret); | 703 | BUG_ON(ret); |
| 727 | } | 704 | } |
| 728 | other_start = 0; | 705 | other_start = 0; |
| @@ -733,8 +710,8 @@ again: | |||
| 733 | del_slot = path->slots[0]; | 710 | del_slot = path->slots[0]; |
| 734 | del_nr++; | 711 | del_nr++; |
| 735 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | 712 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, |
| 736 | leaf->start, leaf_owner, | 713 | 0, root->root_key.objectid, |
| 737 | leaf_gen, inode->i_ino, 0); | 714 | inode->i_ino, orig_offset); |
| 738 | BUG_ON(ret); | 715 | BUG_ON(ret); |
| 739 | } | 716 | } |
| 740 | split_end = 0; | 717 | split_end = 0; |
| @@ -768,13 +745,12 @@ again: | |||
| 768 | locked_end = extent_end; | 745 | locked_end = extent_end; |
| 769 | } | 746 | } |
| 770 | btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); | 747 | btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); |
| 771 | extent_offset += split - key.offset; | ||
| 772 | } else { | 748 | } else { |
| 773 | BUG_ON(key.offset != start); | 749 | BUG_ON(key.offset != start); |
| 774 | btrfs_set_file_extent_offset(leaf, fi, extent_offset + | ||
| 775 | split - key.offset); | ||
| 776 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); | ||
| 777 | key.offset = split; | 750 | key.offset = split; |
| 751 | btrfs_set_file_extent_offset(leaf, fi, key.offset - | ||
| 752 | orig_offset); | ||
| 753 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); | ||
| 778 | btrfs_set_item_key_safe(trans, root, path, &key); | 754 | btrfs_set_item_key_safe(trans, root, path, &key); |
| 779 | extent_end = split; | 755 | extent_end = split; |
| 780 | } | 756 | } |
| @@ -793,7 +769,8 @@ again: | |||
| 793 | struct btrfs_file_extent_item); | 769 | struct btrfs_file_extent_item); |
| 794 | key.offset = split; | 770 | key.offset = split; |
| 795 | btrfs_set_item_key_safe(trans, root, path, &key); | 771 | btrfs_set_item_key_safe(trans, root, path, &key); |
| 796 | btrfs_set_file_extent_offset(leaf, fi, extent_offset); | 772 | btrfs_set_file_extent_offset(leaf, fi, key.offset - |
| 773 | orig_offset); | ||
| 797 | btrfs_set_file_extent_num_bytes(leaf, fi, | 774 | btrfs_set_file_extent_num_bytes(leaf, fi, |
| 798 | other_end - split); | 775 | other_end - split); |
| 799 | goto done; | 776 | goto done; |
| @@ -815,10 +792,9 @@ again: | |||
| 815 | 792 | ||
| 816 | btrfs_mark_buffer_dirty(leaf); | 793 | btrfs_mark_buffer_dirty(leaf); |
| 817 | 794 | ||
| 818 | orig_parent = leaf->start; | 795 | ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, |
| 819 | ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, | 796 | root->root_key.objectid, |
| 820 | orig_parent, root->root_key.objectid, | 797 | inode->i_ino, orig_offset); |
| 821 | trans->transid, inode->i_ino); | ||
| 822 | BUG_ON(ret); | 798 | BUG_ON(ret); |
| 823 | btrfs_release_path(root, path); | 799 | btrfs_release_path(root, path); |
| 824 | 800 | ||
| @@ -833,20 +809,12 @@ again: | |||
| 833 | btrfs_set_file_extent_type(leaf, fi, extent_type); | 809 | btrfs_set_file_extent_type(leaf, fi, extent_type); |
| 834 | btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); | 810 | btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); |
| 835 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); | 811 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); |
| 836 | btrfs_set_file_extent_offset(leaf, fi, extent_offset); | 812 | btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); |
| 837 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); | 813 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); |
| 838 | btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); | 814 | btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); |
| 839 | btrfs_set_file_extent_compression(leaf, fi, 0); | 815 | btrfs_set_file_extent_compression(leaf, fi, 0); |
| 840 | btrfs_set_file_extent_encryption(leaf, fi, 0); | 816 | btrfs_set_file_extent_encryption(leaf, fi, 0); |
| 841 | btrfs_set_file_extent_other_encoding(leaf, fi, 0); | 817 | btrfs_set_file_extent_other_encoding(leaf, fi, 0); |
| 842 | |||
| 843 | if (orig_parent != leaf->start) { | ||
| 844 | ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, | ||
| 845 | orig_parent, leaf->start, | ||
| 846 | root->root_key.objectid, | ||
| 847 | trans->transid, inode->i_ino); | ||
| 848 | BUG_ON(ret); | ||
| 849 | } | ||
| 850 | done: | 818 | done: |
| 851 | btrfs_mark_buffer_dirty(leaf); | 819 | btrfs_mark_buffer_dirty(leaf); |
| 852 | 820 | ||
| @@ -1189,6 +1157,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
| 1189 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | 1157 | btrfs_wait_ordered_range(inode, 0, (u64)-1); |
| 1190 | root->log_batch++; | 1158 | root->log_batch++; |
| 1191 | 1159 | ||
| 1160 | if (datasync && !(inode->i_state & I_DIRTY_PAGES)) | ||
| 1161 | goto out; | ||
| 1192 | /* | 1162 | /* |
| 1193 | * ok we haven't committed the transaction yet, lets do a commit | 1163 | * ok we haven't committed the transaction yet, lets do a commit |
| 1194 | */ | 1164 | */ |
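The new early return in btrfs_sync_file() above skips the log/transaction commit when the call is a datasync and the inode has no dirty pages left. fdatasync(2) is the caller that reaches this path with datasync == 1; a minimal userspace sketch of that caller (illustrative only, not part of the patch):

#include <fcntl.h>
#include <unistd.h>

/* fdatasync() only promises to persist file data (and the metadata needed
 * to retrieve it); if the pages were already flushed by the ordered-extent
 * wait, btrfs can now return without committing a transaction. */
int flush_file_data(const char *path)
{
        int fd = open(path, O_WRONLY);
        int ret;

        if (fd < 0)
                return -1;
        ret = fdatasync(fd);
        close(fd);
        return ret;
}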
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0bc93657b460..4538e48581a5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
| @@ -579,6 +579,7 @@ out: | |||
| 579 | * it returns -enospc | 579 | * it returns -enospc |
| 580 | */ | 580 | */ |
| 581 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | 581 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, |
| 582 | struct btrfs_root *root, | ||
| 582 | struct btrfs_block_group_cache *block_group, | 583 | struct btrfs_block_group_cache *block_group, |
| 583 | struct btrfs_free_cluster *cluster, | 584 | struct btrfs_free_cluster *cluster, |
| 584 | u64 offset, u64 bytes, u64 empty_size) | 585 | u64 offset, u64 bytes, u64 empty_size) |
| @@ -595,7 +596,9 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
| 595 | int ret; | 596 | int ret; |
| 596 | 597 | ||
| 597 | /* for metadata, allow allocations with more holes */ | 598 | /* for metadata, allow allocations with more holes */ |
| 598 | if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { | 599 | if (btrfs_test_opt(root, SSD_SPREAD)) { |
| 600 | min_bytes = bytes + empty_size; | ||
| 601 | } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { | ||
| 599 | /* | 602 | /* |
| 600 | * we want to do larger allocations when we are | 603 | * we want to do larger allocations when we are |
| 601 | * flushing out the delayed refs, it helps prevent | 604 | * flushing out the delayed refs, it helps prevent |
| @@ -645,14 +648,15 @@ again: | |||
| 645 | * we haven't filled the empty size and the window is | 648 | * we haven't filled the empty size and the window is |
| 646 | * very large. reset and try again | 649 | * very large. reset and try again |
| 647 | */ | 650 | */ |
| 648 | if (next->offset - window_start > (bytes + empty_size) * 2) { | 651 | if (next->offset - (last->offset + last->bytes) > 128 * 1024 || |
| 652 | next->offset - window_start > (bytes + empty_size) * 2) { | ||
| 649 | entry = next; | 653 | entry = next; |
| 650 | window_start = entry->offset; | 654 | window_start = entry->offset; |
| 651 | window_free = entry->bytes; | 655 | window_free = entry->bytes; |
| 652 | last = entry; | 656 | last = entry; |
| 653 | max_extent = 0; | 657 | max_extent = 0; |
| 654 | total_retries++; | 658 | total_retries++; |
| 655 | if (total_retries % 256 == 0) { | 659 | if (total_retries % 64 == 0) { |
| 656 | if (min_bytes >= (bytes + empty_size)) { | 660 | if (min_bytes >= (bytes + empty_size)) { |
| 657 | ret = -ENOSPC; | 661 | ret = -ENOSPC; |
| 658 | goto out; | 662 | goto out; |
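The cluster search in btrfs_find_space_cluster() slides a window over free-space entries sorted by offset and restarts it when the space is too fragmented; the hunk above tightens the restart test (a gap of more than 128K between consecutive entries, or a window more than twice the requested size) and, under the ssd_spread mount option, raises min_bytes so the whole request must fit in one cluster. A self-contained sketch of the windowing idea, using illustrative names rather than the kernel's btrfs_free_space structures:

#include <stddef.h>
#include <stdint.h>

struct free_entry {                /* stand-in for a free-space entry */
        uint64_t offset;
        uint64_t bytes;
        struct free_entry *next;   /* entries sorted by offset */
};

/* Return the start of the first window that accumulates `needed` bytes
 * without ever skipping a gap larger than `max_gap`, or 0 if none. */
static uint64_t find_cluster_window(struct free_entry *e, uint64_t needed,
                                    uint64_t max_gap)
{
        uint64_t window_start = 0, window_free = 0;
        struct free_entry *last = NULL;

        for (; e; last = e, e = e->next) {
                if (!last ||
                    e->offset - (last->offset + last->bytes) > max_gap) {
                        /* too fragmented: restart the window at this entry */
                        window_start = e->offset;
                        window_free = e->bytes;
                } else {
                        window_free += e->bytes;
                }
                if (window_free >= needed)
                        return window_start;
        }
        return 0;
}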
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ab0bdc0a63ce..266fb8764054 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h | |||
| @@ -31,6 +31,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, | |||
| 31 | u64 bytes); | 31 | u64 bytes); |
| 32 | u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); | 32 | u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); |
| 33 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | 33 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, |
| 34 | struct btrfs_root *root, | ||
| 34 | struct btrfs_block_group_cache *block_group, | 35 | struct btrfs_block_group_cache *block_group, |
| 35 | struct btrfs_free_cluster *cluster, | 36 | struct btrfs_free_cluster *cluster, |
| 36 | u64 offset, u64 bytes, u64 empty_size); | 37 | u64 offset, u64 bytes, u64 empty_size); |
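Because btrfs_find_space_cluster() now also takes the root, its caller (in extent-tree.c, outside this excerpt) has to pass it through so the ssd_spread option can be consulted. A hedged sketch of the updated call, reusing the parameter names from the prototype above:

int ret;

ret = btrfs_find_space_cluster(trans, root, block_group, cluster,
                               offset, bytes, empty_size);
if (ret == -ENOSPC) {
        /* no window was large enough; the caller falls back to a
         * plain free-space allocation */
}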
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 2a020b276768..db2ff9773b99 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h | |||
| @@ -19,9 +19,9 @@ | |||
| 19 | #ifndef __HASH__ | 19 | #ifndef __HASH__ |
| 20 | #define __HASH__ | 20 | #define __HASH__ |
| 21 | 21 | ||
| 22 | #include "crc32c.h" | 22 | #include <linux/crc32c.h> |
| 23 | static inline u64 btrfs_name_hash(const char *name, int len) | 23 | static inline u64 btrfs_name_hash(const char *name, int len) |
| 24 | { | 24 | { |
| 25 | return btrfs_crc32c((u32)~1, name, len); | 25 | return crc32c((u32)~1, name, len); |
| 26 | } | 26 | } |
| 27 | #endif | 27 | #endif |
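hash.h now uses the shared <linux/crc32c.h> helper instead of the private btrfs wrapper; the old btrfs_crc32c() was a thin layer over the same CRC, so on-disk directory hashes should be unchanged. A minimal sketch of the resulting computation (kernel context assumed):

#include <linux/crc32c.h>

/* Same calculation btrfs_name_hash() performs after this change:
 * crc32c over the name, seeded with (u32)~1. */
static u64 name_hash_sketch(const char *name, int len)
{
        return crc32c((u32)~1, name, len);
}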
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1c8b0190d031..5b68330f8585 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -48,7 +48,6 @@ | |||
| 48 | #include "ordered-data.h" | 48 | #include "ordered-data.h" |
| 49 | #include "xattr.h" | 49 | #include "xattr.h" |
| 50 | #include "tree-log.h" | 50 | #include "tree-log.h" |
| 51 | #include "ref-cache.h" | ||
| 52 | #include "compression.h" | 51 | #include "compression.h" |
| 53 | #include "locking.h" | 52 | #include "locking.h" |
| 54 | 53 | ||
| @@ -369,7 +368,7 @@ again: | |||
| 369 | * inode has not been flagged as nocompress. This flag can | 368 | * inode has not been flagged as nocompress. This flag can |
| 370 | * change at any time if we discover bad compression ratios. | 369 | * change at any time if we discover bad compression ratios. |
| 371 | */ | 370 | */ |
| 372 | if (!btrfs_test_flag(inode, NOCOMPRESS) && | 371 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && |
| 373 | btrfs_test_opt(root, COMPRESS)) { | 372 | btrfs_test_opt(root, COMPRESS)) { |
| 374 | WARN_ON(pages); | 373 | WARN_ON(pages); |
| 375 | pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); | 374 | pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); |
| @@ -470,7 +469,7 @@ again: | |||
| 470 | nr_pages_ret = 0; | 469 | nr_pages_ret = 0; |
| 471 | 470 | ||
| 472 | /* flag the file so we don't compress in the future */ | 471 | /* flag the file so we don't compress in the future */ |
| 473 | btrfs_set_flag(inode, NOCOMPRESS); | 472 | BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; |
| 474 | } | 473 | } |
| 475 | if (will_compress) { | 474 | if (will_compress) { |
| 476 | *num_added += 1; | 475 | *num_added += 1; |
| @@ -863,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, | |||
| 863 | async_cow->locked_page = locked_page; | 862 | async_cow->locked_page = locked_page; |
| 864 | async_cow->start = start; | 863 | async_cow->start = start; |
| 865 | 864 | ||
| 866 | if (btrfs_test_flag(inode, NOCOMPRESS)) | 865 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) |
| 867 | cur_end = end; | 866 | cur_end = end; |
| 868 | else | 867 | else |
| 869 | cur_end = min(end, start + 512 * 1024 - 1); | 868 | cur_end = min(end, start + 512 * 1024 - 1); |
| @@ -944,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, | |||
| 944 | u64 cow_start; | 943 | u64 cow_start; |
| 945 | u64 cur_offset; | 944 | u64 cur_offset; |
| 946 | u64 extent_end; | 945 | u64 extent_end; |
| 946 | u64 extent_offset; | ||
| 947 | u64 disk_bytenr; | 947 | u64 disk_bytenr; |
| 948 | u64 num_bytes; | 948 | u64 num_bytes; |
| 949 | int extent_type; | 949 | int extent_type; |
| @@ -1005,6 +1005,7 @@ next_slot: | |||
| 1005 | if (extent_type == BTRFS_FILE_EXTENT_REG || | 1005 | if (extent_type == BTRFS_FILE_EXTENT_REG || |
| 1006 | extent_type == BTRFS_FILE_EXTENT_PREALLOC) { | 1006 | extent_type == BTRFS_FILE_EXTENT_PREALLOC) { |
| 1007 | disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | 1007 | disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); |
| 1008 | extent_offset = btrfs_file_extent_offset(leaf, fi); | ||
| 1008 | extent_end = found_key.offset + | 1009 | extent_end = found_key.offset + |
| 1009 | btrfs_file_extent_num_bytes(leaf, fi); | 1010 | btrfs_file_extent_num_bytes(leaf, fi); |
| 1010 | if (extent_end <= start) { | 1011 | if (extent_end <= start) { |
| @@ -1022,9 +1023,10 @@ next_slot: | |||
| 1022 | if (btrfs_extent_readonly(root, disk_bytenr)) | 1023 | if (btrfs_extent_readonly(root, disk_bytenr)) |
| 1023 | goto out_check; | 1024 | goto out_check; |
| 1024 | if (btrfs_cross_ref_exist(trans, root, inode->i_ino, | 1025 | if (btrfs_cross_ref_exist(trans, root, inode->i_ino, |
| 1025 | disk_bytenr)) | 1026 | found_key.offset - |
| 1027 | extent_offset, disk_bytenr)) | ||
| 1026 | goto out_check; | 1028 | goto out_check; |
| 1027 | disk_bytenr += btrfs_file_extent_offset(leaf, fi); | 1029 | disk_bytenr += extent_offset; |
| 1028 | disk_bytenr += cur_offset - found_key.offset; | 1030 | disk_bytenr += cur_offset - found_key.offset; |
| 1029 | num_bytes = min(end + 1, extent_end) - cur_offset; | 1031 | num_bytes = min(end + 1, extent_end) - cur_offset; |
| 1030 | /* | 1032 | /* |
| @@ -1131,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, | |||
| 1131 | int ret; | 1133 | int ret; |
| 1132 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1134 | struct btrfs_root *root = BTRFS_I(inode)->root; |
| 1133 | 1135 | ||
| 1134 | if (btrfs_test_flag(inode, NODATACOW)) | 1136 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) |
| 1135 | ret = run_delalloc_nocow(inode, locked_page, start, end, | 1137 | ret = run_delalloc_nocow(inode, locked_page, start, end, |
| 1136 | page_started, 1, nr_written); | 1138 | page_started, 1, nr_written); |
| 1137 | else if (btrfs_test_flag(inode, PREALLOC)) | 1139 | else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) |
| 1138 | ret = run_delalloc_nocow(inode, locked_page, start, end, | 1140 | ret = run_delalloc_nocow(inode, locked_page, start, end, |
| 1139 | page_started, 0, nr_written); | 1141 | page_started, 0, nr_written); |
| 1140 | else if (!btrfs_test_opt(root, COMPRESS)) | 1142 | else if (!btrfs_test_opt(root, COMPRESS)) |
| @@ -1288,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | |||
| 1288 | int ret = 0; | 1290 | int ret = 0; |
| 1289 | int skip_sum; | 1291 | int skip_sum; |
| 1290 | 1292 | ||
| 1291 | skip_sum = btrfs_test_flag(inode, NODATASUM); | 1293 | skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; |
| 1292 | 1294 | ||
| 1293 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); | 1295 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); |
| 1294 | BUG_ON(ret); | 1296 | BUG_ON(ret); |
| @@ -1489,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
| 1489 | ins.objectid = disk_bytenr; | 1491 | ins.objectid = disk_bytenr; |
| 1490 | ins.offset = disk_num_bytes; | 1492 | ins.offset = disk_num_bytes; |
| 1491 | ins.type = BTRFS_EXTENT_ITEM_KEY; | 1493 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
| 1492 | ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, | 1494 | ret = btrfs_alloc_reserved_file_extent(trans, root, |
| 1493 | root->root_key.objectid, | 1495 | root->root_key.objectid, |
| 1494 | trans->transid, inode->i_ino, &ins); | 1496 | inode->i_ino, file_pos, &ins); |
| 1495 | BUG_ON(ret); | 1497 | BUG_ON(ret); |
| 1496 | btrfs_free_path(path); | 1498 | btrfs_free_path(path); |
| 1497 | 1499 | ||
| @@ -1788,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | |||
| 1788 | ClearPageChecked(page); | 1790 | ClearPageChecked(page); |
| 1789 | goto good; | 1791 | goto good; |
| 1790 | } | 1792 | } |
| 1791 | if (btrfs_test_flag(inode, NODATASUM)) | 1793 | |
| 1794 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) | ||
| 1792 | return 0; | 1795 | return 0; |
| 1793 | 1796 | ||
| 1794 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && | 1797 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && |
| @@ -1956,23 +1959,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) | |||
| 1956 | * crossing root thing. we store the inode number in the | 1959 | * crossing root thing. we store the inode number in the |
| 1957 | * offset of the orphan item. | 1960 | * offset of the orphan item. |
| 1958 | */ | 1961 | */ |
| 1959 | inode = btrfs_iget_locked(root->fs_info->sb, | 1962 | found_key.objectid = found_key.offset; |
| 1960 | found_key.offset, root); | 1963 | found_key.type = BTRFS_INODE_ITEM_KEY; |
| 1961 | if (!inode) | 1964 | found_key.offset = 0; |
| 1965 | inode = btrfs_iget(root->fs_info->sb, &found_key, root); | ||
| 1966 | if (IS_ERR(inode)) | ||
| 1962 | break; | 1967 | break; |
| 1963 | 1968 | ||
| 1964 | if (inode->i_state & I_NEW) { | ||
| 1965 | BTRFS_I(inode)->root = root; | ||
| 1966 | |||
| 1967 | /* have to set the location manually */ | ||
| 1968 | BTRFS_I(inode)->location.objectid = inode->i_ino; | ||
| 1969 | BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; | ||
| 1970 | BTRFS_I(inode)->location.offset = 0; | ||
| 1971 | |||
| 1972 | btrfs_read_locked_inode(inode); | ||
| 1973 | unlock_new_inode(inode); | ||
| 1974 | } | ||
| 1975 | |||
| 1976 | /* | 1969 | /* |
| 1977 | * add this inode to the orphan list so btrfs_orphan_del does | 1970 | * add this inode to the orphan list so btrfs_orphan_del does |
| 1978 | * the proper thing when we hit it | 1971 | * the proper thing when we hit it |
| @@ -2069,7 +2062,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, | |||
| 2069 | /* | 2062 | /* |
| 2070 | * read an inode from the btree into the in-memory inode | 2063 | * read an inode from the btree into the in-memory inode |
| 2071 | */ | 2064 | */ |
| 2072 | void btrfs_read_locked_inode(struct inode *inode) | 2065 | static void btrfs_read_locked_inode(struct inode *inode) |
| 2073 | { | 2066 | { |
| 2074 | struct btrfs_path *path; | 2067 | struct btrfs_path *path; |
| 2075 | struct extent_buffer *leaf; | 2068 | struct extent_buffer *leaf; |
| @@ -2164,6 +2157,8 @@ void btrfs_read_locked_inode(struct inode *inode) | |||
| 2164 | init_special_inode(inode, inode->i_mode, rdev); | 2157 | init_special_inode(inode, inode->i_mode, rdev); |
| 2165 | break; | 2158 | break; |
| 2166 | } | 2159 | } |
| 2160 | |||
| 2161 | btrfs_update_iflags(inode); | ||
| 2167 | return; | 2162 | return; |
| 2168 | 2163 | ||
| 2169 | make_bad: | 2164 | make_bad: |
| @@ -2599,9 +2594,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
| 2599 | struct btrfs_file_extent_item *fi; | 2594 | struct btrfs_file_extent_item *fi; |
| 2600 | u64 extent_start = 0; | 2595 | u64 extent_start = 0; |
| 2601 | u64 extent_num_bytes = 0; | 2596 | u64 extent_num_bytes = 0; |
| 2597 | u64 extent_offset = 0; | ||
| 2602 | u64 item_end = 0; | 2598 | u64 item_end = 0; |
| 2603 | u64 root_gen = 0; | ||
| 2604 | u64 root_owner = 0; | ||
| 2605 | int found_extent; | 2599 | int found_extent; |
| 2606 | int del_item; | 2600 | int del_item; |
| 2607 | int pending_del_nr = 0; | 2601 | int pending_del_nr = 0; |
| @@ -2716,6 +2710,9 @@ search_again: | |||
| 2716 | extent_num_bytes = | 2710 | extent_num_bytes = |
| 2717 | btrfs_file_extent_disk_num_bytes(leaf, | 2711 | btrfs_file_extent_disk_num_bytes(leaf, |
| 2718 | fi); | 2712 | fi); |
| 2713 | extent_offset = found_key.offset - | ||
| 2714 | btrfs_file_extent_offset(leaf, fi); | ||
| 2715 | |||
| 2719 | /* FIXME blocksize != 4096 */ | 2716 | /* FIXME blocksize != 4096 */ |
| 2720 | num_dec = btrfs_file_extent_num_bytes(leaf, fi); | 2717 | num_dec = btrfs_file_extent_num_bytes(leaf, fi); |
| 2721 | if (extent_start != 0) { | 2718 | if (extent_start != 0) { |
| @@ -2723,8 +2720,6 @@ search_again: | |||
| 2723 | if (root->ref_cows) | 2720 | if (root->ref_cows) |
| 2724 | inode_sub_bytes(inode, num_dec); | 2721 | inode_sub_bytes(inode, num_dec); |
| 2725 | } | 2722 | } |
| 2726 | root_gen = btrfs_header_generation(leaf); | ||
| 2727 | root_owner = btrfs_header_owner(leaf); | ||
| 2728 | } | 2723 | } |
| 2729 | } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { | 2724 | } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { |
| 2730 | /* | 2725 | /* |
| @@ -2768,12 +2763,12 @@ delete: | |||
| 2768 | } else { | 2763 | } else { |
| 2769 | break; | 2764 | break; |
| 2770 | } | 2765 | } |
| 2771 | if (found_extent) { | 2766 | if (found_extent && root->ref_cows) { |
| 2772 | btrfs_set_path_blocking(path); | 2767 | btrfs_set_path_blocking(path); |
| 2773 | ret = btrfs_free_extent(trans, root, extent_start, | 2768 | ret = btrfs_free_extent(trans, root, extent_start, |
| 2774 | extent_num_bytes, | 2769 | extent_num_bytes, 0, |
| 2775 | leaf->start, root_owner, | 2770 | btrfs_header_owner(leaf), |
| 2776 | root_gen, inode->i_ino, 0); | 2771 | inode->i_ino, extent_offset); |
| 2777 | BUG_ON(ret); | 2772 | BUG_ON(ret); |
| 2778 | } | 2773 | } |
| 2779 | next: | 2774 | next: |
| @@ -3105,6 +3100,45 @@ static int fixup_tree_root_location(struct btrfs_root *root, | |||
| 3105 | return 0; | 3100 | return 0; |
| 3106 | } | 3101 | } |
| 3107 | 3102 | ||
| 3103 | static void inode_tree_add(struct inode *inode) | ||
| 3104 | { | ||
| 3105 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 3106 | struct btrfs_inode *entry; | ||
| 3107 | struct rb_node **p = &root->inode_tree.rb_node; | ||
| 3108 | struct rb_node *parent = NULL; | ||
| 3109 | |||
| 3110 | spin_lock(&root->inode_lock); | ||
| 3111 | while (*p) { | ||
| 3112 | parent = *p; | ||
| 3113 | entry = rb_entry(parent, struct btrfs_inode, rb_node); | ||
| 3114 | |||
| 3115 | if (inode->i_ino < entry->vfs_inode.i_ino) | ||
| 3116 | p = &(*p)->rb_left; | ||
| 3117 | else if (inode->i_ino > entry->vfs_inode.i_ino) | ||
| 3118 | p = &(*p)->rb_right; | ||
| 3119 | else { | ||
| 3120 | WARN_ON(!(entry->vfs_inode.i_state & | ||
| 3121 | (I_WILL_FREE | I_FREEING | I_CLEAR))); | ||
| 3122 | break; | ||
| 3123 | } | ||
| 3124 | } | ||
| 3125 | rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); | ||
| 3126 | rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); | ||
| 3127 | spin_unlock(&root->inode_lock); | ||
| 3128 | } | ||
| 3129 | |||
| 3130 | static void inode_tree_del(struct inode *inode) | ||
| 3131 | { | ||
| 3132 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 3133 | |||
| 3134 | if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { | ||
| 3135 | spin_lock(&root->inode_lock); | ||
| 3136 | rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); | ||
| 3137 | spin_unlock(&root->inode_lock); | ||
| 3138 | RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); | ||
| 3139 | } | ||
| 3140 | } | ||
| 3141 | |||
| 3108 | static noinline void init_btrfs_i(struct inode *inode) | 3142 | static noinline void init_btrfs_i(struct inode *inode) |
| 3109 | { | 3143 | { |
| 3110 | struct btrfs_inode *bi = BTRFS_I(inode); | 3144 | struct btrfs_inode *bi = BTRFS_I(inode); |
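inode_tree_add() and inode_tree_del() above keep every in-memory btrfs inode in a per-root red-black tree ordered by inode number, protected by root->inode_lock; the relocation code added by this patch walks that tree to find a root's inodes. The lookup side is not part of this hunk, so the helper below is only a hedged sketch of what such a lookup looks like (the name is illustrative):

static struct inode *inode_tree_lookup_sketch(struct btrfs_root *root,
                                              u64 objectid)
{
        struct rb_node *node;
        struct btrfs_inode *entry;
        struct inode *inode = NULL;

        spin_lock(&root->inode_lock);
        node = root->inode_tree.rb_node;
        while (node) {
                entry = rb_entry(node, struct btrfs_inode, rb_node);
                if (objectid < entry->vfs_inode.i_ino)
                        node = node->rb_left;
                else if (objectid > entry->vfs_inode.i_ino)
                        node = node->rb_right;
                else {
                        /* grab a reference while still under the lock */
                        inode = igrab(&entry->vfs_inode);
                        break;
                }
        }
        spin_unlock(&root->inode_lock);
        return inode;
}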
| @@ -3130,6 +3164,7 @@ static noinline void init_btrfs_i(struct inode *inode) | |||
| 3130 | inode->i_mapping, GFP_NOFS); | 3164 | inode->i_mapping, GFP_NOFS); |
| 3131 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); | 3165 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); |
| 3132 | INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); | 3166 | INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); |
| 3167 | RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); | ||
| 3133 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); | 3168 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); |
| 3134 | mutex_init(&BTRFS_I(inode)->extent_mutex); | 3169 | mutex_init(&BTRFS_I(inode)->extent_mutex); |
| 3135 | mutex_init(&BTRFS_I(inode)->log_mutex); | 3170 | mutex_init(&BTRFS_I(inode)->log_mutex); |
| @@ -3152,26 +3187,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) | |||
| 3152 | args->root == BTRFS_I(inode)->root; | 3187 | args->root == BTRFS_I(inode)->root; |
| 3153 | } | 3188 | } |
| 3154 | 3189 | ||
| 3155 | struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, | 3190 | static struct inode *btrfs_iget_locked(struct super_block *s, |
| 3156 | struct btrfs_root *root, int wait) | 3191 | u64 objectid, |
| 3157 | { | 3192 | struct btrfs_root *root) |
| 3158 | struct inode *inode; | ||
| 3159 | struct btrfs_iget_args args; | ||
| 3160 | args.ino = objectid; | ||
| 3161 | args.root = root; | ||
| 3162 | |||
| 3163 | if (wait) { | ||
| 3164 | inode = ilookup5(s, objectid, btrfs_find_actor, | ||
| 3165 | (void *)&args); | ||
| 3166 | } else { | ||
| 3167 | inode = ilookup5_nowait(s, objectid, btrfs_find_actor, | ||
| 3168 | (void *)&args); | ||
| 3169 | } | ||
| 3170 | return inode; | ||
| 3171 | } | ||
| 3172 | |||
| 3173 | struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | ||
| 3174 | struct btrfs_root *root) | ||
| 3175 | { | 3193 | { |
| 3176 | struct inode *inode; | 3194 | struct inode *inode; |
| 3177 | struct btrfs_iget_args args; | 3195 | struct btrfs_iget_args args; |
| @@ -3188,24 +3206,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | |||
| 3188 | * Returns in *is_new if the inode was read from disk | 3206 | * Returns in *is_new if the inode was read from disk |
| 3189 | */ | 3207 | */ |
| 3190 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | 3208 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, |
| 3191 | struct btrfs_root *root, int *is_new) | 3209 | struct btrfs_root *root) |
| 3192 | { | 3210 | { |
| 3193 | struct inode *inode; | 3211 | struct inode *inode; |
| 3194 | 3212 | ||
| 3195 | inode = btrfs_iget_locked(s, location->objectid, root); | 3213 | inode = btrfs_iget_locked(s, location->objectid, root); |
| 3196 | if (!inode) | 3214 | if (!inode) |
| 3197 | return ERR_PTR(-EACCES); | 3215 | return ERR_PTR(-ENOMEM); |
| 3198 | 3216 | ||
| 3199 | if (inode->i_state & I_NEW) { | 3217 | if (inode->i_state & I_NEW) { |
| 3200 | BTRFS_I(inode)->root = root; | 3218 | BTRFS_I(inode)->root = root; |
| 3201 | memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); | 3219 | memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); |
| 3202 | btrfs_read_locked_inode(inode); | 3220 | btrfs_read_locked_inode(inode); |
| 3221 | |||
| 3222 | inode_tree_add(inode); | ||
| 3203 | unlock_new_inode(inode); | 3223 | unlock_new_inode(inode); |
| 3204 | if (is_new) | ||
| 3205 | *is_new = 1; | ||
| 3206 | } else { | ||
| 3207 | if (is_new) | ||
| 3208 | *is_new = 0; | ||
| 3209 | } | 3224 | } |
| 3210 | 3225 | ||
| 3211 | return inode; | 3226 | return inode; |
| @@ -3218,7 +3233,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | |||
| 3218 | struct btrfs_root *root = bi->root; | 3233 | struct btrfs_root *root = bi->root; |
| 3219 | struct btrfs_root *sub_root = root; | 3234 | struct btrfs_root *sub_root = root; |
| 3220 | struct btrfs_key location; | 3235 | struct btrfs_key location; |
| 3221 | int ret, new; | 3236 | int ret; |
| 3222 | 3237 | ||
| 3223 | if (dentry->d_name.len > BTRFS_NAME_LEN) | 3238 | if (dentry->d_name.len > BTRFS_NAME_LEN) |
| 3224 | return ERR_PTR(-ENAMETOOLONG); | 3239 | return ERR_PTR(-ENAMETOOLONG); |
| @@ -3236,7 +3251,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | |||
| 3236 | return ERR_PTR(ret); | 3251 | return ERR_PTR(ret); |
| 3237 | if (ret > 0) | 3252 | if (ret > 0) |
| 3238 | return ERR_PTR(-ENOENT); | 3253 | return ERR_PTR(-ENOENT); |
| 3239 | inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); | 3254 | inode = btrfs_iget(dir->i_sb, &location, sub_root); |
| 3240 | if (IS_ERR(inode)) | 3255 | if (IS_ERR(inode)) |
| 3241 | return ERR_CAST(inode); | 3256 | return ERR_CAST(inode); |
| 3242 | } | 3257 | } |
| @@ -3574,9 +3589,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 3574 | btrfs_find_block_group(root, 0, alloc_hint, owner); | 3589 | btrfs_find_block_group(root, 0, alloc_hint, owner); |
| 3575 | if ((mode & S_IFREG)) { | 3590 | if ((mode & S_IFREG)) { |
| 3576 | if (btrfs_test_opt(root, NODATASUM)) | 3591 | if (btrfs_test_opt(root, NODATASUM)) |
| 3577 | btrfs_set_flag(inode, NODATASUM); | 3592 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; |
| 3578 | if (btrfs_test_opt(root, NODATACOW)) | 3593 | if (btrfs_test_opt(root, NODATACOW)) |
| 3579 | btrfs_set_flag(inode, NODATACOW); | 3594 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; |
| 3580 | } | 3595 | } |
| 3581 | 3596 | ||
| 3582 | key[0].objectid = objectid; | 3597 | key[0].objectid = objectid; |
| @@ -3630,7 +3645,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 3630 | location->offset = 0; | 3645 | location->offset = 0; |
| 3631 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); | 3646 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); |
| 3632 | 3647 | ||
| 3648 | btrfs_inherit_iflags(inode, dir); | ||
| 3649 | |||
| 3633 | insert_inode_hash(inode); | 3650 | insert_inode_hash(inode); |
| 3651 | inode_tree_add(inode); | ||
| 3634 | return inode; | 3652 | return inode; |
| 3635 | fail: | 3653 | fail: |
| 3636 | if (dir) | 3654 | if (dir) |
| @@ -4683,6 +4701,7 @@ void btrfs_destroy_inode(struct inode *inode) | |||
| 4683 | btrfs_put_ordered_extent(ordered); | 4701 | btrfs_put_ordered_extent(ordered); |
| 4684 | } | 4702 | } |
| 4685 | } | 4703 | } |
| 4704 | inode_tree_del(inode); | ||
| 4686 | btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); | 4705 | btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); |
| 4687 | kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); | 4706 | kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); |
| 4688 | } | 4707 | } |
| @@ -5061,7 +5080,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, | |||
| 5061 | out: | 5080 | out: |
| 5062 | if (cur_offset > start) { | 5081 | if (cur_offset > start) { |
| 5063 | inode->i_ctime = CURRENT_TIME; | 5082 | inode->i_ctime = CURRENT_TIME; |
| 5064 | btrfs_set_flag(inode, PREALLOC); | 5083 | BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; |
| 5065 | if (!(mode & FALLOC_FL_KEEP_SIZE) && | 5084 | if (!(mode & FALLOC_FL_KEEP_SIZE) && |
| 5066 | cur_offset > i_size_read(inode)) | 5085 | cur_offset > i_size_read(inode)) |
| 5067 | btrfs_i_size_write(inode, cur_offset); | 5086 | btrfs_i_size_write(inode, cur_offset); |
| @@ -5182,7 +5201,7 @@ static int btrfs_set_page_dirty(struct page *page) | |||
| 5182 | 5201 | ||
| 5183 | static int btrfs_permission(struct inode *inode, int mask) | 5202 | static int btrfs_permission(struct inode *inode, int mask) |
| 5184 | { | 5203 | { |
| 5185 | if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) | 5204 | if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) |
| 5186 | return -EACCES; | 5205 | return -EACCES; |
| 5187 | return generic_permission(inode, mask, btrfs_check_acl); | 5206 | return generic_permission(inode, mask, btrfs_check_acl); |
| 5188 | } | 5207 | } |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2624b53ea783..eff18f5b5362 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
| @@ -50,7 +50,177 @@ | |||
| 50 | #include "volumes.h" | 50 | #include "volumes.h" |
| 51 | #include "locking.h" | 51 | #include "locking.h" |
| 52 | 52 | ||
| 53 | /* Mask out flags that are inappropriate for the given type of inode. */ | ||
| 54 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) | ||
| 55 | { | ||
| 56 | if (S_ISDIR(mode)) | ||
| 57 | return flags; | ||
| 58 | else if (S_ISREG(mode)) | ||
| 59 | return flags & ~FS_DIRSYNC_FL; | ||
| 60 | else | ||
| 61 | return flags & (FS_NODUMP_FL | FS_NOATIME_FL); | ||
| 62 | } | ||
| 63 | |||
| 64 | /* | ||
| 65 | * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. | ||
| 66 | */ | ||
| 67 | static unsigned int btrfs_flags_to_ioctl(unsigned int flags) | ||
| 68 | { | ||
| 69 | unsigned int iflags = 0; | ||
| 70 | |||
| 71 | if (flags & BTRFS_INODE_SYNC) | ||
| 72 | iflags |= FS_SYNC_FL; | ||
| 73 | if (flags & BTRFS_INODE_IMMUTABLE) | ||
| 74 | iflags |= FS_IMMUTABLE_FL; | ||
| 75 | if (flags & BTRFS_INODE_APPEND) | ||
| 76 | iflags |= FS_APPEND_FL; | ||
| 77 | if (flags & BTRFS_INODE_NODUMP) | ||
| 78 | iflags |= FS_NODUMP_FL; | ||
| 79 | if (flags & BTRFS_INODE_NOATIME) | ||
| 80 | iflags |= FS_NOATIME_FL; | ||
| 81 | if (flags & BTRFS_INODE_DIRSYNC) | ||
| 82 | iflags |= FS_DIRSYNC_FL; | ||
| 83 | |||
| 84 | return iflags; | ||
| 85 | } | ||
| 86 | |||
| 87 | /* | ||
| 88 | * Update inode->i_flags based on the btrfs internal flags. | ||
| 89 | */ | ||
| 90 | void btrfs_update_iflags(struct inode *inode) | ||
| 91 | { | ||
| 92 | struct btrfs_inode *ip = BTRFS_I(inode); | ||
| 93 | |||
| 94 | inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); | ||
| 95 | |||
| 96 | if (ip->flags & BTRFS_INODE_SYNC) | ||
| 97 | inode->i_flags |= S_SYNC; | ||
| 98 | if (ip->flags & BTRFS_INODE_IMMUTABLE) | ||
| 99 | inode->i_flags |= S_IMMUTABLE; | ||
| 100 | if (ip->flags & BTRFS_INODE_APPEND) | ||
| 101 | inode->i_flags |= S_APPEND; | ||
| 102 | if (ip->flags & BTRFS_INODE_NOATIME) | ||
| 103 | inode->i_flags |= S_NOATIME; | ||
| 104 | if (ip->flags & BTRFS_INODE_DIRSYNC) | ||
| 105 | inode->i_flags |= S_DIRSYNC; | ||
| 106 | } | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Inherit flags from the parent inode. | ||
| 110 | * | ||
| 111 | * Unlike extN we don't have any flags we don't want to inherit currently. | ||
| 112 | */ | ||
| 113 | void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) | ||
| 114 | { | ||
| 115 | unsigned int flags; | ||
| 116 | |||
| 117 | if (!dir) | ||
| 118 | return; | ||
| 119 | |||
| 120 | flags = BTRFS_I(dir)->flags; | ||
| 121 | |||
| 122 | if (S_ISREG(inode->i_mode)) | ||
| 123 | flags &= ~BTRFS_INODE_DIRSYNC; | ||
| 124 | else if (!S_ISDIR(inode->i_mode)) | ||
| 125 | flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); | ||
| 126 | |||
| 127 | BTRFS_I(inode)->flags = flags; | ||
| 128 | btrfs_update_iflags(inode); | ||
| 129 | } | ||
| 130 | |||
| 131 | static int btrfs_ioctl_getflags(struct file *file, void __user *arg) | ||
| 132 | { | ||
| 133 | struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); | ||
| 134 | unsigned int flags = btrfs_flags_to_ioctl(ip->flags); | ||
| 135 | |||
| 136 | if (copy_to_user(arg, &flags, sizeof(flags))) | ||
| 137 | return -EFAULT; | ||
| 138 | return 0; | ||
| 139 | } | ||
| 140 | |||
| 141 | static int btrfs_ioctl_setflags(struct file *file, void __user *arg) | ||
| 142 | { | ||
| 143 | struct inode *inode = file->f_path.dentry->d_inode; | ||
| 144 | struct btrfs_inode *ip = BTRFS_I(inode); | ||
| 145 | struct btrfs_root *root = ip->root; | ||
| 146 | struct btrfs_trans_handle *trans; | ||
| 147 | unsigned int flags, oldflags; | ||
| 148 | int ret; | ||
| 149 | |||
| 150 | if (copy_from_user(&flags, arg, sizeof(flags))) | ||
| 151 | return -EFAULT; | ||
| 152 | |||
| 153 | if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ | ||
| 154 | FS_NOATIME_FL | FS_NODUMP_FL | \ | ||
| 155 | FS_SYNC_FL | FS_DIRSYNC_FL)) | ||
| 156 | return -EOPNOTSUPP; | ||
| 53 | 157 | ||
| 158 | if (!is_owner_or_cap(inode)) | ||
| 159 | return -EACCES; | ||
| 160 | |||
| 161 | mutex_lock(&inode->i_mutex); | ||
| 162 | |||
| 163 | flags = btrfs_mask_flags(inode->i_mode, flags); | ||
| 164 | oldflags = btrfs_flags_to_ioctl(ip->flags); | ||
| 165 | if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { | ||
| 166 | if (!capable(CAP_LINUX_IMMUTABLE)) { | ||
| 167 | ret = -EPERM; | ||
| 168 | goto out_unlock; | ||
| 169 | } | ||
| 170 | } | ||
| 171 | |||
| 172 | ret = mnt_want_write(file->f_path.mnt); | ||
| 173 | if (ret) | ||
| 174 | goto out_unlock; | ||
| 175 | |||
| 176 | if (flags & FS_SYNC_FL) | ||
| 177 | ip->flags |= BTRFS_INODE_SYNC; | ||
| 178 | else | ||
| 179 | ip->flags &= ~BTRFS_INODE_SYNC; | ||
| 180 | if (flags & FS_IMMUTABLE_FL) | ||
| 181 | ip->flags |= BTRFS_INODE_IMMUTABLE; | ||
| 182 | else | ||
| 183 | ip->flags &= ~BTRFS_INODE_IMMUTABLE; | ||
| 184 | if (flags & FS_APPEND_FL) | ||
| 185 | ip->flags |= BTRFS_INODE_APPEND; | ||
| 186 | else | ||
| 187 | ip->flags &= ~BTRFS_INODE_APPEND; | ||
| 188 | if (flags & FS_NODUMP_FL) | ||
| 189 | ip->flags |= BTRFS_INODE_NODUMP; | ||
| 190 | else | ||
| 191 | ip->flags &= ~BTRFS_INODE_NODUMP; | ||
| 192 | if (flags & FS_NOATIME_FL) | ||
| 193 | ip->flags |= BTRFS_INODE_NOATIME; | ||
| 194 | else | ||
| 195 | ip->flags &= ~BTRFS_INODE_NOATIME; | ||
| 196 | if (flags & FS_DIRSYNC_FL) | ||
| 197 | ip->flags |= BTRFS_INODE_DIRSYNC; | ||
| 198 | else | ||
| 199 | ip->flags &= ~BTRFS_INODE_DIRSYNC; | ||
| 200 | |||
| 201 | |||
| 202 | trans = btrfs_join_transaction(root, 1); | ||
| 203 | BUG_ON(!trans); | ||
| 204 | |||
| 205 | ret = btrfs_update_inode(trans, root, inode); | ||
| 206 | BUG_ON(ret); | ||
| 207 | |||
| 208 | btrfs_update_iflags(inode); | ||
| 209 | inode->i_ctime = CURRENT_TIME; | ||
| 210 | btrfs_end_transaction(trans, root); | ||
| 211 | |||
| 212 | mnt_drop_write(file->f_path.mnt); | ||
| 213 | out_unlock: | ||
| 214 | mutex_unlock(&inode->i_mutex); | ||
| 215 | return 0; | ||
| 216 | } | ||
| 217 | |||
| 218 | static int btrfs_ioctl_getversion(struct file *file, int __user *arg) | ||
| 219 | { | ||
| 220 | struct inode *inode = file->f_path.dentry->d_inode; | ||
| 221 | |||
| 222 | return put_user(inode->i_generation, arg); | ||
| 223 | } | ||
| 54 | 224 | ||
| 55 | static noinline int create_subvol(struct btrfs_root *root, | 225 | static noinline int create_subvol(struct btrfs_root *root, |
| 56 | struct dentry *dentry, | 226 | struct dentry *dentry, |
| @@ -82,22 +252,25 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
| 82 | if (ret) | 252 | if (ret) |
| 83 | goto fail; | 253 | goto fail; |
| 84 | 254 | ||
| 85 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | 255 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, |
| 86 | objectid, trans->transid, 0, 0, 0); | 256 | 0, objectid, NULL, 0, 0, 0); |
| 87 | if (IS_ERR(leaf)) { | 257 | if (IS_ERR(leaf)) { |
| 88 | ret = PTR_ERR(leaf); | 258 | ret = PTR_ERR(leaf); |
| 89 | goto fail; | 259 | goto fail; |
| 90 | } | 260 | } |
| 91 | 261 | ||
| 92 | btrfs_set_header_nritems(leaf, 0); | 262 | memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); |
| 93 | btrfs_set_header_level(leaf, 0); | ||
| 94 | btrfs_set_header_bytenr(leaf, leaf->start); | 263 | btrfs_set_header_bytenr(leaf, leaf->start); |
| 95 | btrfs_set_header_generation(leaf, trans->transid); | 264 | btrfs_set_header_generation(leaf, trans->transid); |
| 265 | btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); | ||
| 96 | btrfs_set_header_owner(leaf, objectid); | 266 | btrfs_set_header_owner(leaf, objectid); |
| 97 | 267 | ||
| 98 | write_extent_buffer(leaf, root->fs_info->fsid, | 268 | write_extent_buffer(leaf, root->fs_info->fsid, |
| 99 | (unsigned long)btrfs_header_fsid(leaf), | 269 | (unsigned long)btrfs_header_fsid(leaf), |
| 100 | BTRFS_FSID_SIZE); | 270 | BTRFS_FSID_SIZE); |
| 271 | write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, | ||
| 272 | (unsigned long)btrfs_header_chunk_tree_uuid(leaf), | ||
| 273 | BTRFS_UUID_SIZE); | ||
| 101 | btrfs_mark_buffer_dirty(leaf); | 274 | btrfs_mark_buffer_dirty(leaf); |
| 102 | 275 | ||
| 103 | inode_item = &root_item.inode; | 276 | inode_item = &root_item.inode; |
| @@ -125,7 +298,7 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
| 125 | btrfs_set_root_dirid(&root_item, new_dirid); | 298 | btrfs_set_root_dirid(&root_item, new_dirid); |
| 126 | 299 | ||
| 127 | key.objectid = objectid; | 300 | key.objectid = objectid; |
| 128 | key.offset = 1; | 301 | key.offset = 0; |
| 129 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | 302 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); |
| 130 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, | 303 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, |
| 131 | &root_item); | 304 | &root_item); |
| @@ -911,10 +1084,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
| 911 | if (disko) { | 1084 | if (disko) { |
| 912 | inode_add_bytes(inode, datal); | 1085 | inode_add_bytes(inode, datal); |
| 913 | ret = btrfs_inc_extent_ref(trans, root, | 1086 | ret = btrfs_inc_extent_ref(trans, root, |
| 914 | disko, diskl, leaf->start, | 1087 | disko, diskl, 0, |
| 915 | root->root_key.objectid, | 1088 | root->root_key.objectid, |
| 916 | trans->transid, | 1089 | inode->i_ino, |
| 917 | inode->i_ino); | 1090 | new_key.offset - datao); |
| 918 | BUG_ON(ret); | 1091 | BUG_ON(ret); |
| 919 | } | 1092 | } |
| 920 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { | 1093 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { |
| @@ -1074,6 +1247,12 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
| 1074 | void __user *argp = (void __user *)arg; | 1247 | void __user *argp = (void __user *)arg; |
| 1075 | 1248 | ||
| 1076 | switch (cmd) { | 1249 | switch (cmd) { |
| 1250 | case FS_IOC_GETFLAGS: | ||
| 1251 | return btrfs_ioctl_getflags(file, argp); | ||
| 1252 | case FS_IOC_SETFLAGS: | ||
| 1253 | return btrfs_ioctl_setflags(file, argp); | ||
| 1254 | case FS_IOC_GETVERSION: | ||
| 1255 | return btrfs_ioctl_getversion(file, argp); | ||
| 1077 | case BTRFS_IOC_SNAP_CREATE: | 1256 | case BTRFS_IOC_SNAP_CREATE: |
| 1078 | return btrfs_ioctl_snap_create(file, argp, 0); | 1257 | return btrfs_ioctl_snap_create(file, argp, 0); |
| 1079 | case BTRFS_IOC_SUBVOL_CREATE: | 1258 | case BTRFS_IOC_SUBVOL_CREATE: |
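With FS_IOC_GETFLAGS, FS_IOC_SETFLAGS and FS_IOC_GETVERSION handled in btrfs_ioctl(), the standard attribute tools (lsattr, chattr) start working on btrfs files. A hedged userspace sketch of the round trip, setting the noatime attribute on a caller-supplied path (error handling kept minimal):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd, flags;

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
                perror("FS_IOC_GETFLAGS");
                close(fd);
                return 1;
        }
        flags |= FS_NOATIME_FL;                 /* like chattr +A */
        if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
                perror("FS_IOC_SETFLAGS");
        close(fd);
        return 0;
}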
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5f8f218c1005..6d6523da0a30 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
| @@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb, | |||
| 45 | (unsigned long long)btrfs_device_total_bytes(eb, dev_item), | 45 | (unsigned long long)btrfs_device_total_bytes(eb, dev_item), |
| 46 | (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); | 46 | (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); |
| 47 | } | 47 | } |
| 48 | static void print_extent_data_ref(struct extent_buffer *eb, | ||
| 49 | struct btrfs_extent_data_ref *ref) | ||
| 50 | { | ||
| 51 | printk(KERN_INFO "\t\textent data backref root %llu " | ||
| 52 | "objectid %llu offset %llu count %u\n", | ||
| 53 | (unsigned long long)btrfs_extent_data_ref_root(eb, ref), | ||
| 54 | (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), | ||
| 55 | (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), | ||
| 56 | btrfs_extent_data_ref_count(eb, ref)); | ||
| 57 | } | ||
| 58 | |||
| 59 | static void print_extent_item(struct extent_buffer *eb, int slot) | ||
| 60 | { | ||
| 61 | struct btrfs_extent_item *ei; | ||
| 62 | struct btrfs_extent_inline_ref *iref; | ||
| 63 | struct btrfs_extent_data_ref *dref; | ||
| 64 | struct btrfs_shared_data_ref *sref; | ||
| 65 | struct btrfs_disk_key key; | ||
| 66 | unsigned long end; | ||
| 67 | unsigned long ptr; | ||
| 68 | int type; | ||
| 69 | u32 item_size = btrfs_item_size_nr(eb, slot); | ||
| 70 | u64 flags; | ||
| 71 | u64 offset; | ||
| 72 | |||
| 73 | if (item_size < sizeof(*ei)) { | ||
| 74 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 75 | struct btrfs_extent_item_v0 *ei0; | ||
| 76 | BUG_ON(item_size != sizeof(*ei0)); | ||
| 77 | ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); | ||
| 78 | printk(KERN_INFO "\t\textent refs %u\n", | ||
| 79 | btrfs_extent_refs_v0(eb, ei0)); | ||
| 80 | return; | ||
| 81 | #else | ||
| 82 | BUG(); | ||
| 83 | #endif | ||
| 84 | } | ||
| 85 | |||
| 86 | ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); | ||
| 87 | flags = btrfs_extent_flags(eb, ei); | ||
| 88 | |||
| 89 | printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", | ||
| 90 | (unsigned long long)btrfs_extent_refs(eb, ei), | ||
| 91 | (unsigned long long)btrfs_extent_generation(eb, ei), | ||
| 92 | (unsigned long long)flags); | ||
| 93 | |||
| 94 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
| 95 | struct btrfs_tree_block_info *info; | ||
| 96 | info = (struct btrfs_tree_block_info *)(ei + 1); | ||
| 97 | btrfs_tree_block_key(eb, info, &key); | ||
| 98 | printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " | ||
| 99 | "level %d\n", | ||
| 100 | (unsigned long long)btrfs_disk_key_objectid(&key), | ||
| 101 | key.type, | ||
| 102 | (unsigned long long)btrfs_disk_key_offset(&key), | ||
| 103 | btrfs_tree_block_level(eb, info)); | ||
| 104 | iref = (struct btrfs_extent_inline_ref *)(info + 1); | ||
| 105 | } else { | ||
| 106 | iref = (struct btrfs_extent_inline_ref *)(ei + 1); | ||
| 107 | } | ||
| 108 | |||
| 109 | ptr = (unsigned long)iref; | ||
| 110 | end = (unsigned long)ei + item_size; | ||
| 111 | while (ptr < end) { | ||
| 112 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
| 113 | type = btrfs_extent_inline_ref_type(eb, iref); | ||
| 114 | offset = btrfs_extent_inline_ref_offset(eb, iref); | ||
| 115 | switch (type) { | ||
| 116 | case BTRFS_TREE_BLOCK_REF_KEY: | ||
| 117 | printk(KERN_INFO "\t\ttree block backref " | ||
| 118 | "root %llu\n", (unsigned long long)offset); | ||
| 119 | break; | ||
| 120 | case BTRFS_SHARED_BLOCK_REF_KEY: | ||
| 121 | printk(KERN_INFO "\t\tshared block backref " | ||
| 122 | "parent %llu\n", (unsigned long long)offset); | ||
| 123 | break; | ||
| 124 | case BTRFS_EXTENT_DATA_REF_KEY: | ||
| 125 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
| 126 | print_extent_data_ref(eb, dref); | ||
| 127 | break; | ||
| 128 | case BTRFS_SHARED_DATA_REF_KEY: | ||
| 129 | sref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
| 130 | printk(KERN_INFO "\t\tshared data backref " | ||
| 131 | "parent %llu count %u\n", | ||
| 132 | (unsigned long long)offset, | ||
| 133 | btrfs_shared_data_ref_count(eb, sref)); | ||
| 134 | break; | ||
| 135 | default: | ||
| 136 | BUG(); | ||
| 137 | } | ||
| 138 | ptr += btrfs_extent_inline_ref_size(type); | ||
| 139 | } | ||
| 140 | WARN_ON(ptr > end); | ||
| 141 | } | ||
| 142 | |||
| 143 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 144 | static void print_extent_ref_v0(struct extent_buffer *eb, int slot) | ||
| 145 | { | ||
| 146 | struct btrfs_extent_ref_v0 *ref0; | ||
| 147 | |||
| 148 | ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); | ||
| 149 | printk("\t\textent back ref root %llu gen %llu " | ||
| 150 | "owner %llu num_refs %lu\n", | ||
| 151 | (unsigned long long)btrfs_ref_root_v0(eb, ref0), | ||
| 152 | (unsigned long long)btrfs_ref_generation_v0(eb, ref0), | ||
| 153 | (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), | ||
| 154 | (unsigned long)btrfs_ref_count_v0(eb, ref0)); | ||
| 155 | } | ||
| 156 | #endif | ||
| 157 | |||
| 48 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | 158 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) |
| 49 | { | 159 | { |
| 50 | int i; | 160 | int i; |
| 161 | u32 type; | ||
| 51 | u32 nr = btrfs_header_nritems(l); | 162 | u32 nr = btrfs_header_nritems(l); |
| 52 | struct btrfs_item *item; | 163 | struct btrfs_item *item; |
| 53 | struct btrfs_extent_item *ei; | ||
| 54 | struct btrfs_root_item *ri; | 164 | struct btrfs_root_item *ri; |
| 55 | struct btrfs_dir_item *di; | 165 | struct btrfs_dir_item *di; |
| 56 | struct btrfs_inode_item *ii; | 166 | struct btrfs_inode_item *ii; |
| 57 | struct btrfs_block_group_item *bi; | 167 | struct btrfs_block_group_item *bi; |
| 58 | struct btrfs_file_extent_item *fi; | 168 | struct btrfs_file_extent_item *fi; |
| 169 | struct btrfs_extent_data_ref *dref; | ||
| 170 | struct btrfs_shared_data_ref *sref; | ||
| 171 | struct btrfs_dev_extent *dev_extent; | ||
| 59 | struct btrfs_key key; | 172 | struct btrfs_key key; |
| 60 | struct btrfs_key found_key; | 173 | struct btrfs_key found_key; |
| 61 | struct btrfs_extent_ref *ref; | ||
| 62 | struct btrfs_dev_extent *dev_extent; | ||
| 63 | u32 type; | ||
| 64 | 174 | ||
| 65 | printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", | 175 | printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", |
| 66 | (unsigned long long)btrfs_header_bytenr(l), nr, | 176 | (unsigned long long)btrfs_header_bytenr(l), nr, |
| @@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
| 100 | btrfs_disk_root_refs(l, ri)); | 210 | btrfs_disk_root_refs(l, ri)); |
| 101 | break; | 211 | break; |
| 102 | case BTRFS_EXTENT_ITEM_KEY: | 212 | case BTRFS_EXTENT_ITEM_KEY: |
| 103 | ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); | 213 | print_extent_item(l, i); |
| 104 | printk(KERN_INFO "\t\textent data refs %u\n", | 214 | break; |
| 105 | btrfs_extent_refs(l, ei)); | 215 | case BTRFS_TREE_BLOCK_REF_KEY: |
| 106 | break; | 216 | printk(KERN_INFO "\t\ttree block backref\n"); |
| 107 | case BTRFS_EXTENT_REF_KEY: | 217 | break; |
| 108 | ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); | 218 | case BTRFS_SHARED_BLOCK_REF_KEY: |
| 109 | printk(KERN_INFO "\t\textent back ref root %llu " | 219 | printk(KERN_INFO "\t\tshared block backref\n"); |
| 110 | "gen %llu owner %llu num_refs %lu\n", | 220 | break; |
| 111 | (unsigned long long)btrfs_ref_root(l, ref), | 221 | case BTRFS_EXTENT_DATA_REF_KEY: |
| 112 | (unsigned long long)btrfs_ref_generation(l, ref), | 222 | dref = btrfs_item_ptr(l, i, |
| 113 | (unsigned long long)btrfs_ref_objectid(l, ref), | 223 | struct btrfs_extent_data_ref); |
| 114 | (unsigned long)btrfs_ref_num_refs(l, ref)); | 224 | print_extent_data_ref(l, dref); |
| 225 | break; | ||
| 226 | case BTRFS_SHARED_DATA_REF_KEY: | ||
| 227 | sref = btrfs_item_ptr(l, i, | ||
| 228 | struct btrfs_shared_data_ref); | ||
| 229 | printk(KERN_INFO "\t\tshared data backref count %u\n", | ||
| 230 | btrfs_shared_data_ref_count(l, sref)); | ||
| 115 | break; | 231 | break; |
| 116 | |||
| 117 | case BTRFS_EXTENT_DATA_KEY: | 232 | case BTRFS_EXTENT_DATA_KEY: |
| 118 | fi = btrfs_item_ptr(l, i, | 233 | fi = btrfs_item_ptr(l, i, |
| 119 | struct btrfs_file_extent_item); | 234 | struct btrfs_file_extent_item); |
| @@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
| 139 | (unsigned long long) | 254 | (unsigned long long) |
| 140 | btrfs_file_extent_ram_bytes(l, fi)); | 255 | btrfs_file_extent_ram_bytes(l, fi)); |
| 141 | break; | 256 | break; |
| 257 | case BTRFS_EXTENT_REF_V0_KEY: | ||
| 258 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 259 | print_extent_ref_v0(l, i); | ||
| 260 | #else | ||
| 261 | BUG(); | ||
| 262 | #endif | ||
| 142 | case BTRFS_BLOCK_GROUP_ITEM_KEY: | 263 | case BTRFS_BLOCK_GROUP_ITEM_KEY: |
| 143 | bi = btrfs_item_ptr(l, i, | 264 | bi = btrfs_item_ptr(l, i, |
| 144 | struct btrfs_block_group_item); | 265 | struct btrfs_block_group_item); |
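print_extent_item() above walks the new variable-length extent item by stepping a byte pointer over each inline backref, and the step size depends on the ref type. A hedged, simplified sketch of that size rule, mirroring the btrfs_extent_inline_ref_size() helper the walker relies on (the real helper lives in ctree.h and BUG()s on unknown types):

static u32 inline_ref_size_sketch(int type)
{
        switch (type) {
        case BTRFS_TREE_BLOCK_REF_KEY:
        case BTRFS_SHARED_BLOCK_REF_KEY:
                /* just the { type, offset } header */
                return sizeof(struct btrfs_extent_inline_ref);
        case BTRFS_SHARED_DATA_REF_KEY:
                /* header followed by a shared_data_ref (refcount) */
                return sizeof(struct btrfs_extent_inline_ref) +
                       sizeof(struct btrfs_shared_data_ref);
        case BTRFS_EXTENT_DATA_REF_KEY:
                /* the data ref overlays the offset field of the header */
                return offsetof(struct btrfs_extent_inline_ref, offset) +
                       sizeof(struct btrfs_extent_data_ref);
        default:
                return 0;
        }
}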
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 000000000000..b23dc209ae10 --- /dev/null +++ b/fs/btrfs/relocation.c | |||
| @@ -0,0 +1,3711 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2009 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public | ||
| 6 | * License v2 as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, | ||
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | * | ||
| 13 | * You should have received a copy of the GNU General Public | ||
| 14 | * License along with this program; if not, write to the | ||
| 15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 16 | * Boston, MA 021110-1307, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/sched.h> | ||
| 20 | #include <linux/pagemap.h> | ||
| 21 | #include <linux/writeback.h> | ||
| 22 | #include <linux/blkdev.h> | ||
| 23 | #include <linux/rbtree.h> | ||
| 24 | #include "ctree.h" | ||
| 25 | #include "disk-io.h" | ||
| 26 | #include "transaction.h" | ||
| 27 | #include "volumes.h" | ||
| 28 | #include "locking.h" | ||
| 29 | #include "btrfs_inode.h" | ||
| 30 | #include "async-thread.h" | ||
| 31 | |||
| 32 | /* | ||
| 33 | * backref_node, mapping_node and tree_block start with this | ||
| 34 | */ | ||
| 35 | struct tree_entry { | ||
| 36 | struct rb_node rb_node; | ||
| 37 | u64 bytenr; | ||
| 38 | }; | ||
| 39 | |||
| 40 | /* | ||
| 41 | * represents a tree block in the backref cache | ||
| 42 | */ | ||
| 43 | struct backref_node { | ||
| 44 | struct rb_node rb_node; | ||
| 45 | u64 bytenr; | ||
| 46 | /* objectid tree block owner */ | ||
| 47 | u64 owner; | ||
| 48 | /* list of upper level blocks that reference this block */ | ||
| 49 | struct list_head upper; | ||
| 50 | /* list of child blocks in the cache */ | ||
| 51 | struct list_head lower; | ||
| 52 | /* NULL if this node is not tree root */ | ||
| 53 | struct btrfs_root *root; | ||
| 54 | /* extent buffer got by COW the block */ | ||
| 55 | struct extent_buffer *eb; | ||
| 56 | /* level of tree block */ | ||
| 57 | unsigned int level:8; | ||
| 58 | /* 1 if the block is root of old snapshot */ | ||
| 59 | unsigned int old_root:1; | ||
| 60 | /* 1 if no child blocks in the cache */ | ||
| 61 | unsigned int lowest:1; | ||
| 62 | /* is the extent buffer locked */ | ||
| 63 | unsigned int locked:1; | ||
| 64 | /* has the block been processed */ | ||
| 65 | unsigned int processed:1; | ||
| 66 | /* have backrefs of this block been checked */ | ||
| 67 | unsigned int checked:1; | ||
| 68 | }; | ||
| 69 | |||
| 70 | /* | ||
| 71 | * represents a block pointer in the backref cache | ||
| 72 | */ | ||
| 73 | struct backref_edge { | ||
| 74 | struct list_head list[2]; | ||
| 75 | struct backref_node *node[2]; | ||
| 76 | u64 blockptr; | ||
| 77 | }; | ||
| 78 | |||
| 79 | #define LOWER 0 | ||
| 80 | #define UPPER 1 | ||
| 81 | |||
| 82 | struct backref_cache { | ||
| 83 | /* red black tree of all backref nodes in the cache */ | ||
| 84 | struct rb_root rb_root; | ||
| 85 | /* list of backref nodes with no child block in the cache */ | ||
| 86 | struct list_head pending[BTRFS_MAX_LEVEL]; | ||
| 87 | spinlock_t lock; | ||
| 88 | }; | ||
| 89 | |||
| 90 | /* | ||
| 91 | * maps the address of a tree root to its tree | ||
| 92 | */ | ||
| 93 | struct mapping_node { | ||
| 94 | struct rb_node rb_node; | ||
| 95 | u64 bytenr; | ||
| 96 | void *data; | ||
| 97 | }; | ||
| 98 | |||
| 99 | struct mapping_tree { | ||
| 100 | struct rb_root rb_root; | ||
| 101 | spinlock_t lock; | ||
| 102 | }; | ||
| 103 | |||
| 104 | /* | ||
| 105 | * represents a tree block to process | ||
| 106 | */ | ||
| 107 | struct tree_block { | ||
| 108 | struct rb_node rb_node; | ||
| 109 | u64 bytenr; | ||
| 110 | struct btrfs_key key; | ||
| 111 | unsigned int level:8; | ||
| 112 | unsigned int key_ready:1; | ||
| 113 | }; | ||
| 114 | |||
| 115 | /* inode vector */ | ||
| 116 | #define INODEVEC_SIZE 16 | ||
| 117 | |||
| 118 | struct inodevec { | ||
| 119 | struct list_head list; | ||
| 120 | struct inode *inode[INODEVEC_SIZE]; | ||
| 121 | int nr; | ||
| 122 | }; | ||
| 123 | |||
| 124 | struct reloc_control { | ||
| 125 | /* block group to relocate */ | ||
| 126 | struct btrfs_block_group_cache *block_group; | ||
| 127 | /* extent tree */ | ||
| 128 | struct btrfs_root *extent_root; | ||
| 129 | /* inode for moving data */ | ||
| 130 | struct inode *data_inode; | ||
| 131 | struct btrfs_workers workers; | ||
| 132 | /* tree blocks have been processed */ | ||
| 133 | struct extent_io_tree processed_blocks; | ||
| 134 | /* map start of tree root to corresponding reloc tree */ | ||
| 135 | struct mapping_tree reloc_root_tree; | ||
| 136 | /* list of reloc trees */ | ||
| 137 | struct list_head reloc_roots; | ||
| 138 | u64 search_start; | ||
| 139 | u64 extents_found; | ||
| 140 | u64 extents_skipped; | ||
| 141 | int stage; | ||
| 142 | int create_reloc_root; | ||
| 143 | unsigned int found_file_extent:1; | ||
| 144 | unsigned int found_old_snapshot:1; | ||
| 145 | }; | ||
| 146 | |||
| 147 | /* stages of data relocation */ | ||
| 148 | #define MOVE_DATA_EXTENTS 0 | ||
| 149 | #define UPDATE_DATA_PTRS 1 | ||
| 150 | |||
| 151 | /* | ||
| 152 | * merge reloc tree to corresponding fs tree in worker threads | ||
| 153 | */ | ||
| 154 | struct async_merge { | ||
| 155 | struct btrfs_work work; | ||
| 156 | struct reloc_control *rc; | ||
| 157 | struct btrfs_root *root; | ||
| 158 | struct completion *done; | ||
| 159 | atomic_t *num_pending; | ||
| 160 | }; | ||
| 161 | |||
| 162 | static void mapping_tree_init(struct mapping_tree *tree) | ||
| 163 | { | ||
| 164 | tree->rb_root.rb_node = NULL; | ||
| 165 | spin_lock_init(&tree->lock); | ||
| 166 | } | ||
| 167 | |||
| 168 | static void backref_cache_init(struct backref_cache *cache) | ||
| 169 | { | ||
| 170 | int i; | ||
| 171 | cache->rb_root.rb_node = NULL; | ||
| 172 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) | ||
| 173 | INIT_LIST_HEAD(&cache->pending[i]); | ||
| 174 | spin_lock_init(&cache->lock); | ||
| 175 | } | ||
| 176 | |||
| 177 | static void backref_node_init(struct backref_node *node) | ||
| 178 | { | ||
| 179 | memset(node, 0, sizeof(*node)); | ||
| 180 | INIT_LIST_HEAD(&node->upper); | ||
| 181 | INIT_LIST_HEAD(&node->lower); | ||
| 182 | RB_CLEAR_NODE(&node->rb_node); | ||
| 183 | } | ||
| 184 | |||
| 185 | static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, | ||
| 186 | struct rb_node *node) | ||
| 187 | { | ||
| 188 | struct rb_node **p = &root->rb_node; | ||
| 189 | struct rb_node *parent = NULL; | ||
| 190 | struct tree_entry *entry; | ||
| 191 | |||
| 192 | while (*p) { | ||
| 193 | parent = *p; | ||
| 194 | entry = rb_entry(parent, struct tree_entry, rb_node); | ||
| 195 | |||
| 196 | if (bytenr < entry->bytenr) | ||
| 197 | p = &(*p)->rb_left; | ||
| 198 | else if (bytenr > entry->bytenr) | ||
| 199 | p = &(*p)->rb_right; | ||
| 200 | else | ||
| 201 | return parent; | ||
| 202 | } | ||
| 203 | |||
| 204 | rb_link_node(node, parent, p); | ||
| 205 | rb_insert_color(node, root); | ||
| 206 | return NULL; | ||
| 207 | } | ||
| 208 | |||
| 209 | static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) | ||
| 210 | { | ||
| 211 | struct rb_node *n = root->rb_node; | ||
| 212 | struct tree_entry *entry; | ||
| 213 | |||
| 214 | while (n) { | ||
| 215 | entry = rb_entry(n, struct tree_entry, rb_node); | ||
| 216 | |||
| 217 | if (bytenr < entry->bytenr) | ||
| 218 | n = n->rb_left; | ||
| 219 | else if (bytenr > entry->bytenr) | ||
| 220 | n = n->rb_right; | ||
| 221 | else | ||
| 222 | return n; | ||
| 223 | } | ||
| 224 | return NULL; | ||
| 225 | } | ||
| 226 | |||
| 227 | /* | ||
| 228 | * walk up backref nodes until we reach the node that represents the tree root | ||
| 229 | */ | ||
| 230 | static struct backref_node *walk_up_backref(struct backref_node *node, | ||
| 231 | struct backref_edge *edges[], | ||
| 232 | int *index) | ||
| 233 | { | ||
| 234 | struct backref_edge *edge; | ||
| 235 | int idx = *index; | ||
| 236 | |||
| 237 | while (!list_empty(&node->upper)) { | ||
| 238 | edge = list_entry(node->upper.next, | ||
| 239 | struct backref_edge, list[LOWER]); | ||
| 240 | edges[idx++] = edge; | ||
| 241 | node = edge->node[UPPER]; | ||
| 242 | } | ||
| 243 | *index = idx; | ||
| 244 | return node; | ||
| 245 | } | ||
| 246 | |||
| 247 | /* | ||
| 248 | * walk down backref nodes to find start of next reference path | ||
| 249 | */ | ||
| 250 | static struct backref_node *walk_down_backref(struct backref_edge *edges[], | ||
| 251 | int *index) | ||
| 252 | { | ||
| 253 | struct backref_edge *edge; | ||
| 254 | struct backref_node *lower; | ||
| 255 | int idx = *index; | ||
| 256 | |||
| 257 | while (idx > 0) { | ||
| 258 | edge = edges[idx - 1]; | ||
| 259 | lower = edge->node[LOWER]; | ||
| 260 | if (list_is_last(&edge->list[LOWER], &lower->upper)) { | ||
| 261 | idx--; | ||
| 262 | continue; | ||
| 263 | } | ||
| 264 | edge = list_entry(edge->list[LOWER].next, | ||
| 265 | struct backref_edge, list[LOWER]); | ||
| 266 | edges[idx - 1] = edge; | ||
| 267 | *index = idx; | ||
| 268 | return edge->node[UPPER]; | ||
| 269 | } | ||
| 270 | *index = 0; | ||
| 271 | return NULL; | ||
| 272 | } | ||
| 273 | |||
| 274 | static void drop_node_buffer(struct backref_node *node) | ||
| 275 | { | ||
| 276 | if (node->eb) { | ||
| 277 | if (node->locked) { | ||
| 278 | btrfs_tree_unlock(node->eb); | ||
| 279 | node->locked = 0; | ||
| 280 | } | ||
| 281 | free_extent_buffer(node->eb); | ||
| 282 | node->eb = NULL; | ||
| 283 | } | ||
| 284 | } | ||
| 285 | |||
| 286 | static void drop_backref_node(struct backref_cache *tree, | ||
| 287 | struct backref_node *node) | ||
| 288 | { | ||
| 289 | BUG_ON(!node->lowest); | ||
| 290 | BUG_ON(!list_empty(&node->upper)); | ||
| 291 | |||
| 292 | drop_node_buffer(node); | ||
| 293 | list_del(&node->lower); | ||
| 294 | |||
| 295 | rb_erase(&node->rb_node, &tree->rb_root); | ||
| 296 | kfree(node); | ||
| 297 | } | ||
| 298 | |||
| 299 | /* | ||
| 300 | * remove a backref node from the backref cache | ||
| 301 | */ | ||
| 302 | static void remove_backref_node(struct backref_cache *cache, | ||
| 303 | struct backref_node *node) | ||
| 304 | { | ||
| 305 | struct backref_node *upper; | ||
| 306 | struct backref_edge *edge; | ||
| 307 | |||
| 308 | if (!node) | ||
| 309 | return; | ||
| 310 | |||
| 311 | BUG_ON(!node->lowest); | ||
| 312 | while (!list_empty(&node->upper)) { | ||
| 313 | edge = list_entry(node->upper.next, struct backref_edge, | ||
| 314 | list[LOWER]); | ||
| 315 | upper = edge->node[UPPER]; | ||
| 316 | list_del(&edge->list[LOWER]); | ||
| 317 | list_del(&edge->list[UPPER]); | ||
| 318 | kfree(edge); | ||
| 319 | /* | ||
| 320 | * add the node to pending list if no other | ||
| 321 | * child block is cached. | ||
| 322 | */ | ||
| 323 | if (list_empty(&upper->lower)) { | ||
| 324 | list_add_tail(&upper->lower, | ||
| 325 | &cache->pending[upper->level]); | ||
| 326 | upper->lowest = 1; | ||
| 327 | } | ||
| 328 | } | ||
| 329 | drop_backref_node(cache, node); | ||
| 330 | } | ||
| 331 | |||
| 332 | /* | ||
| 333 | * find reloc tree by address of tree root | ||
| 334 | */ | ||
| 335 | static struct btrfs_root *find_reloc_root(struct reloc_control *rc, | ||
| 336 | u64 bytenr) | ||
| 337 | { | ||
| 338 | struct rb_node *rb_node; | ||
| 339 | struct mapping_node *node; | ||
| 340 | struct btrfs_root *root = NULL; | ||
| 341 | |||
| 342 | spin_lock(&rc->reloc_root_tree.lock); | ||
| 343 | rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); | ||
| 344 | if (rb_node) { | ||
| 345 | node = rb_entry(rb_node, struct mapping_node, rb_node); | ||
| 346 | root = (struct btrfs_root *)node->data; | ||
| 347 | } | ||
| 348 | spin_unlock(&rc->reloc_root_tree.lock); | ||
| 349 | return root; | ||
| 350 | } | ||
| 351 | |||
| 352 | static int is_cowonly_root(u64 root_objectid) | ||
| 353 | { | ||
| 354 | if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || | ||
| 355 | root_objectid == BTRFS_EXTENT_TREE_OBJECTID || | ||
| 356 | root_objectid == BTRFS_CHUNK_TREE_OBJECTID || | ||
| 357 | root_objectid == BTRFS_DEV_TREE_OBJECTID || | ||
| 358 | root_objectid == BTRFS_TREE_LOG_OBJECTID || | ||
| 359 | root_objectid == BTRFS_CSUM_TREE_OBJECTID) | ||
| 360 | return 1; | ||
| 361 | return 0; | ||
| 362 | } | ||
| 363 | |||
| 364 | static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, | ||
| 365 | u64 root_objectid) | ||
| 366 | { | ||
| 367 | struct btrfs_key key; | ||
| 368 | |||
| 369 | key.objectid = root_objectid; | ||
| 370 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
| 371 | if (is_cowonly_root(root_objectid)) | ||
| 372 | key.offset = 0; | ||
| 373 | else | ||
| 374 | key.offset = (u64)-1; | ||
| 375 | |||
| 376 | return btrfs_read_fs_root_no_name(fs_info, &key); | ||
| 377 | } | ||
| 378 | |||
| 379 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 380 | static noinline_for_stack | ||
| 381 | struct btrfs_root *find_tree_root(struct reloc_control *rc, | ||
| 382 | struct extent_buffer *leaf, | ||
| 383 | struct btrfs_extent_ref_v0 *ref0) | ||
| 384 | { | ||
| 385 | struct btrfs_root *root; | ||
| 386 | u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); | ||
| 387 | u64 generation = btrfs_ref_generation_v0(leaf, ref0); | ||
| 388 | |||
| 389 | BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); | ||
| 390 | |||
| 391 | root = read_fs_root(rc->extent_root->fs_info, root_objectid); | ||
| 392 | BUG_ON(IS_ERR(root)); | ||
| 393 | |||
| 394 | if (root->ref_cows && | ||
| 395 | generation != btrfs_root_generation(&root->root_item)) | ||
| 396 | return NULL; | ||
| 397 | |||
| 398 | return root; | ||
| 399 | } | ||
| 400 | #endif | ||
| 401 | |||
| 402 | static noinline_for_stack | ||
| 403 | int find_inline_backref(struct extent_buffer *leaf, int slot, | ||
| 404 | unsigned long *ptr, unsigned long *end) | ||
| 405 | { | ||
| 406 | struct btrfs_extent_item *ei; | ||
| 407 | struct btrfs_tree_block_info *bi; | ||
| 408 | u32 item_size; | ||
| 409 | |||
| 410 | item_size = btrfs_item_size_nr(leaf, slot); | ||
| 411 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 412 | if (item_size < sizeof(*ei)) { | ||
| 413 | WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); | ||
| 414 | return 1; | ||
| 415 | } | ||
| 416 | #endif | ||
| 417 | ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); | ||
| 418 | WARN_ON(!(btrfs_extent_flags(leaf, ei) & | ||
| 419 | BTRFS_EXTENT_FLAG_TREE_BLOCK)); | ||
| 420 | |||
| 421 | if (item_size <= sizeof(*ei) + sizeof(*bi)) { | ||
| 422 | WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); | ||
| 423 | return 1; | ||
| 424 | } | ||
| 425 | |||
| 426 | bi = (struct btrfs_tree_block_info *)(ei + 1); | ||
| 427 | *ptr = (unsigned long)(bi + 1); | ||
| 428 | *end = (unsigned long)ei + item_size; | ||
| 429 | return 0; | ||
| 430 | } | ||
| 431 | |||
| 432 | /* | ||
| 433 | * build backref tree for a given tree block. root of the backref tree | ||
| 434 | * corresponds to the tree block, leaves of the backref tree correspond | ||
| 435 | * to roots of b-trees that reference the tree block. | ||
| 436 | * | ||
| 437 | * the basic idea of this function is to check backrefs of a given block | ||
| 438 | * to find upper level blocks that reference the block, and then check | ||
| 439 | * backrefs of these upper level blocks recursively. the recursion stops | ||
| 440 | * when the tree root is reached or backrefs for the block are cached. | ||
| 441 | * | ||
| 442 | * NOTE: if we find backrefs for a block are cached, we know backrefs | ||
| 443 | * for all upper level blocks that directly/indirectly reference the | ||
| 444 | * block are also cached. | ||
| 445 | */ | ||
| 446 | static struct backref_node *build_backref_tree(struct reloc_control *rc, | ||
| 447 | struct backref_cache *cache, | ||
| 448 | struct btrfs_key *node_key, | ||
| 449 | int level, u64 bytenr) | ||
| 450 | { | ||
| 451 | struct btrfs_path *path1; | ||
| 452 | struct btrfs_path *path2; | ||
| 453 | struct extent_buffer *eb; | ||
| 454 | struct btrfs_root *root; | ||
| 455 | struct backref_node *cur; | ||
| 456 | struct backref_node *upper; | ||
| 457 | struct backref_node *lower; | ||
| 458 | struct backref_node *node = NULL; | ||
| 459 | struct backref_node *exist = NULL; | ||
| 460 | struct backref_edge *edge; | ||
| 461 | struct rb_node *rb_node; | ||
| 462 | struct btrfs_key key; | ||
| 463 | unsigned long end; | ||
| 464 | unsigned long ptr; | ||
| 465 | LIST_HEAD(list); | ||
| 466 | int ret; | ||
| 467 | int err = 0; | ||
| 468 | |||
| 469 | path1 = btrfs_alloc_path(); | ||
| 470 | path2 = btrfs_alloc_path(); | ||
| 471 | if (!path1 || !path2) { | ||
| 472 | err = -ENOMEM; | ||
| 473 | goto out; | ||
| 474 | } | ||
| 475 | |||
| 476 | node = kmalloc(sizeof(*node), GFP_NOFS); | ||
| 477 | if (!node) { | ||
| 478 | err = -ENOMEM; | ||
| 479 | goto out; | ||
| 480 | } | ||
| 481 | |||
| 482 | backref_node_init(node); | ||
| 483 | node->bytenr = bytenr; | ||
| 484 | node->owner = 0; | ||
| 485 | node->level = level; | ||
| 486 | node->lowest = 1; | ||
| 487 | cur = node; | ||
| 488 | again: | ||
| 489 | end = 0; | ||
| 490 | ptr = 0; | ||
| 491 | key.objectid = cur->bytenr; | ||
| 492 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 493 | key.offset = (u64)-1; | ||
| 494 | |||
| 495 | path1->search_commit_root = 1; | ||
| 496 | path1->skip_locking = 1; | ||
| 497 | ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, | ||
| 498 | 0, 0); | ||
| 499 | if (ret < 0) { | ||
| 500 | err = ret; | ||
| 501 | goto out; | ||
| 502 | } | ||
| 503 | BUG_ON(!ret || !path1->slots[0]); | ||
| 504 | |||
| 505 | path1->slots[0]--; | ||
| 506 | |||
| 507 | WARN_ON(cur->checked); | ||
| 508 | if (!list_empty(&cur->upper)) { | ||
| 509 | /* | ||
| 510 | * the backref was added previously when processing | ||
| 511 | * backref of type BTRFS_TREE_BLOCK_REF_KEY | ||
| 512 | */ | ||
| 513 | BUG_ON(!list_is_singular(&cur->upper)); | ||
| 514 | edge = list_entry(cur->upper.next, struct backref_edge, | ||
| 515 | list[LOWER]); | ||
| 516 | BUG_ON(!list_empty(&edge->list[UPPER])); | ||
| 517 | exist = edge->node[UPPER]; | ||
| 518 | /* | ||
| 519 | * add the upper level block to pending list if we need | ||
| 520 | * check its backrefs | ||
| 521 | */ | ||
| 522 | if (!exist->checked) | ||
| 523 | list_add_tail(&edge->list[UPPER], &list); | ||
| 524 | } else { | ||
| 525 | exist = NULL; | ||
| 526 | } | ||
| 527 | |||
| 528 | while (1) { | ||
| 529 | cond_resched(); | ||
| 530 | eb = path1->nodes[0]; | ||
| 531 | |||
| 532 | if (ptr >= end) { | ||
| 533 | if (path1->slots[0] >= btrfs_header_nritems(eb)) { | ||
| 534 | ret = btrfs_next_leaf(rc->extent_root, path1); | ||
| 535 | if (ret < 0) { | ||
| 536 | err = ret; | ||
| 537 | goto out; | ||
| 538 | } | ||
| 539 | if (ret > 0) | ||
| 540 | break; | ||
| 541 | eb = path1->nodes[0]; | ||
| 542 | } | ||
| 543 | |||
| 544 | btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); | ||
| 545 | if (key.objectid != cur->bytenr) { | ||
| 546 | WARN_ON(exist); | ||
| 547 | break; | ||
| 548 | } | ||
| 549 | |||
| 550 | if (key.type == BTRFS_EXTENT_ITEM_KEY) { | ||
| 551 | ret = find_inline_backref(eb, path1->slots[0], | ||
| 552 | &ptr, &end); | ||
| 553 | if (ret) | ||
| 554 | goto next; | ||
| 555 | } | ||
| 556 | } | ||
| 557 | |||
| 558 | if (ptr < end) { | ||
| 559 | /* update key for inline back ref */ | ||
| 560 | struct btrfs_extent_inline_ref *iref; | ||
| 561 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
| 562 | key.type = btrfs_extent_inline_ref_type(eb, iref); | ||
| 563 | key.offset = btrfs_extent_inline_ref_offset(eb, iref); | ||
| 564 | WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && | ||
| 565 | key.type != BTRFS_SHARED_BLOCK_REF_KEY); | ||
| 566 | } | ||
| 567 | |||
| 568 | if (exist && | ||
| 569 | ((key.type == BTRFS_TREE_BLOCK_REF_KEY && | ||
| 570 | exist->owner == key.offset) || | ||
| 571 | (key.type == BTRFS_SHARED_BLOCK_REF_KEY && | ||
| 572 | exist->bytenr == key.offset))) { | ||
| 573 | exist = NULL; | ||
| 574 | goto next; | ||
| 575 | } | ||
| 576 | |||
| 577 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 578 | if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || | ||
| 579 | key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
| 580 | if (key.objectid == key.offset && | ||
| 581 | key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
| 582 | struct btrfs_extent_ref_v0 *ref0; | ||
| 583 | ref0 = btrfs_item_ptr(eb, path1->slots[0], | ||
| 584 | struct btrfs_extent_ref_v0); | ||
| 585 | root = find_tree_root(rc, eb, ref0); | ||
| 586 | if (root) | ||
| 587 | cur->root = root; | ||
| 588 | else | ||
| 589 | cur->old_root = 1; | ||
| 590 | break; | ||
| 591 | } | ||
| 592 | #else | ||
| 593 | BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); | ||
| 594 | if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { | ||
| 595 | #endif | ||
| 596 | if (key.objectid == key.offset) { | ||
| 597 | /* | ||
| 598 | * only root blocks of reloc trees use | ||
| 599 | * backrefs of this type. | ||
| 600 | */ | ||
| 601 | root = find_reloc_root(rc, cur->bytenr); | ||
| 602 | BUG_ON(!root); | ||
| 603 | cur->root = root; | ||
| 604 | break; | ||
| 605 | } | ||
| 606 | |||
| 607 | edge = kzalloc(sizeof(*edge), GFP_NOFS); | ||
| 608 | if (!edge) { | ||
| 609 | err = -ENOMEM; | ||
| 610 | goto out; | ||
| 611 | } | ||
| 612 | rb_node = tree_search(&cache->rb_root, key.offset); | ||
| 613 | if (!rb_node) { | ||
| 614 | upper = kmalloc(sizeof(*upper), GFP_NOFS); | ||
| 615 | if (!upper) { | ||
| 616 | kfree(edge); | ||
| 617 | err = -ENOMEM; | ||
| 618 | goto out; | ||
| 619 | } | ||
| 620 | backref_node_init(upper); | ||
| 621 | upper->bytenr = key.offset; | ||
| 622 | upper->owner = 0; | ||
| 623 | upper->level = cur->level + 1; | ||
| 624 | /* | ||
| 625 | * backrefs for the upper level block aren't | ||
| 626 | * cached, add the block to pending list | ||
| 627 | */ | ||
| 628 | list_add_tail(&edge->list[UPPER], &list); | ||
| 629 | } else { | ||
| 630 | upper = rb_entry(rb_node, struct backref_node, | ||
| 631 | rb_node); | ||
| 632 | INIT_LIST_HEAD(&edge->list[UPPER]); | ||
| 633 | } | ||
| 634 | list_add(&edge->list[LOWER], &cur->upper); | ||
| 635 | edge->node[UPPER] = upper; | ||
| 636 | edge->node[LOWER] = cur; | ||
| 637 | |||
| 638 | goto next; | ||
| 639 | } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { | ||
| 640 | goto next; | ||
| 641 | } | ||
| 642 | |||
| 643 | /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ | ||
| 644 | root = read_fs_root(rc->extent_root->fs_info, key.offset); | ||
| 645 | if (IS_ERR(root)) { | ||
| 646 | err = PTR_ERR(root); | ||
| 647 | goto out; | ||
| 648 | } | ||
| 649 | |||
| 650 | if (btrfs_root_level(&root->root_item) == cur->level) { | ||
| 651 | /* tree root */ | ||
| 652 | BUG_ON(btrfs_root_bytenr(&root->root_item) != | ||
| 653 | cur->bytenr); | ||
| 654 | cur->root = root; | ||
| 655 | break; | ||
| 656 | } | ||
| 657 | |||
| 658 | level = cur->level + 1; | ||
| 659 | |||
| 660 | /* | ||
| 661 | * search the tree to find upper level blocks that | ||
| 662 | * reference the block. | ||
| 663 | */ | ||
| 664 | path2->search_commit_root = 1; | ||
| 665 | path2->skip_locking = 1; | ||
| 666 | path2->lowest_level = level; | ||
| 667 | ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); | ||
| 668 | path2->lowest_level = 0; | ||
| 669 | if (ret < 0) { | ||
| 670 | err = ret; | ||
| 671 | goto out; | ||
| 672 | } | ||
| 673 | |||
| 674 | eb = path2->nodes[level]; | ||
| 675 | WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != | ||
| 676 | cur->bytenr); | ||
| 677 | |||
| 678 | lower = cur; | ||
| 679 | for (; level < BTRFS_MAX_LEVEL; level++) { | ||
| 680 | if (!path2->nodes[level]) { | ||
| 681 | BUG_ON(btrfs_root_bytenr(&root->root_item) != | ||
| 682 | lower->bytenr); | ||
| 683 | lower->root = root; | ||
| 684 | break; | ||
| 685 | } | ||
| 686 | |||
| 687 | edge = kzalloc(sizeof(*edge), GFP_NOFS); | ||
| 688 | if (!edge) { | ||
| 689 | err = -ENOMEM; | ||
| 690 | goto out; | ||
| 691 | } | ||
| 692 | |||
| 693 | eb = path2->nodes[level]; | ||
| 694 | rb_node = tree_search(&cache->rb_root, eb->start); | ||
| 695 | if (!rb_node) { | ||
| 696 | upper = kmalloc(sizeof(*upper), GFP_NOFS); | ||
| 697 | if (!upper) { | ||
| 698 | kfree(edge); | ||
| 699 | err = -ENOMEM; | ||
| 700 | goto out; | ||
| 701 | } | ||
| 702 | backref_node_init(upper); | ||
| 703 | upper->bytenr = eb->start; | ||
| 704 | upper->owner = btrfs_header_owner(eb); | ||
| 705 | upper->level = lower->level + 1; | ||
| 706 | |||
| 707 | /* | ||
| 708 | * if we know the block isn't shared | ||
| 709 | * we can avoid checking its backrefs. | ||
| 710 | */ | ||
| 711 | if (btrfs_block_can_be_shared(root, eb)) | ||
| 712 | upper->checked = 0; | ||
| 713 | else | ||
| 714 | upper->checked = 1; | ||
| 715 | |||
| 716 | /* | ||
| 717 | * add the block to pending list if we | ||
| 718 | * need to check its backrefs. only blocks | ||
| 719 | * at 'cur->level + 1' are added to the | ||
| 720 | * tail of pending list. this guarantees | ||
| 721 | * we check backrefs from lower level | ||
| 722 | * blocks to upper level blocks. | ||
| 723 | */ | ||
| 724 | if (!upper->checked && | ||
| 725 | level == cur->level + 1) { | ||
| 726 | list_add_tail(&edge->list[UPPER], | ||
| 727 | &list); | ||
| 728 | } else | ||
| 729 | INIT_LIST_HEAD(&edge->list[UPPER]); | ||
| 730 | } else { | ||
| 731 | upper = rb_entry(rb_node, struct backref_node, | ||
| 732 | rb_node); | ||
| 733 | BUG_ON(!upper->checked); | ||
| 734 | INIT_LIST_HEAD(&edge->list[UPPER]); | ||
| 735 | } | ||
| 736 | list_add_tail(&edge->list[LOWER], &lower->upper); | ||
| 737 | edge->node[UPPER] = upper; | ||
| 738 | edge->node[LOWER] = lower; | ||
| 739 | |||
| 740 | if (rb_node) | ||
| 741 | break; | ||
| 742 | lower = upper; | ||
| 743 | upper = NULL; | ||
| 744 | } | ||
| 745 | btrfs_release_path(root, path2); | ||
| 746 | next: | ||
| 747 | if (ptr < end) { | ||
| 748 | ptr += btrfs_extent_inline_ref_size(key.type); | ||
| 749 | if (ptr >= end) { | ||
| 750 | WARN_ON(ptr > end); | ||
| 751 | ptr = 0; | ||
| 752 | end = 0; | ||
| 753 | } | ||
| 754 | } | ||
| 755 | if (ptr >= end) | ||
| 756 | path1->slots[0]++; | ||
| 757 | } | ||
| 758 | btrfs_release_path(rc->extent_root, path1); | ||
| 759 | |||
| 760 | cur->checked = 1; | ||
| 761 | WARN_ON(exist); | ||
| 762 | |||
| 763 | /* the pending list isn't empty, take the first block to process */ | ||
| 764 | if (!list_empty(&list)) { | ||
| 765 | edge = list_entry(list.next, struct backref_edge, list[UPPER]); | ||
| 766 | list_del_init(&edge->list[UPPER]); | ||
| 767 | cur = edge->node[UPPER]; | ||
| 768 | goto again; | ||
| 769 | } | ||
| 770 | |||
| 771 | /* | ||
| 772 | * everything went well, connect the backref nodes and insert them | ||
| 773 | * into the cache. | ||
| 774 | */ | ||
| 775 | BUG_ON(!node->checked); | ||
| 776 | rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); | ||
| 777 | BUG_ON(rb_node); | ||
| 778 | |||
| 779 | list_for_each_entry(edge, &node->upper, list[LOWER]) | ||
| 780 | list_add_tail(&edge->list[UPPER], &list); | ||
| 781 | |||
| 782 | while (!list_empty(&list)) { | ||
| 783 | edge = list_entry(list.next, struct backref_edge, list[UPPER]); | ||
| 784 | list_del_init(&edge->list[UPPER]); | ||
| 785 | upper = edge->node[UPPER]; | ||
| 786 | |||
| 787 | if (!RB_EMPTY_NODE(&upper->rb_node)) { | ||
| 788 | if (upper->lowest) { | ||
| 789 | list_del_init(&upper->lower); | ||
| 790 | upper->lowest = 0; | ||
| 791 | } | ||
| 792 | |||
| 793 | list_add_tail(&edge->list[UPPER], &upper->lower); | ||
| 794 | continue; | ||
| 795 | } | ||
| 796 | |||
| 797 | BUG_ON(!upper->checked); | ||
| 798 | rb_node = tree_insert(&cache->rb_root, upper->bytenr, | ||
| 799 | &upper->rb_node); | ||
| 800 | BUG_ON(rb_node); | ||
| 801 | |||
| 802 | list_add_tail(&edge->list[UPPER], &upper->lower); | ||
| 803 | |||
| 804 | list_for_each_entry(edge, &upper->upper, list[LOWER]) | ||
| 805 | list_add_tail(&edge->list[UPPER], &list); | ||
| 806 | } | ||
| 807 | out: | ||
| 808 | btrfs_free_path(path1); | ||
| 809 | btrfs_free_path(path2); | ||
| 810 | if (err) { | ||
| 811 | INIT_LIST_HEAD(&list); | ||
| 812 | upper = node; | ||
| 813 | while (upper) { | ||
| 814 | if (RB_EMPTY_NODE(&upper->rb_node)) { | ||
| 815 | list_splice_tail(&upper->upper, &list); | ||
| 816 | kfree(upper); | ||
| 817 | } | ||
| 818 | |||
| 819 | if (list_empty(&list)) | ||
| 820 | break; | ||
| 821 | |||
| 822 | edge = list_entry(list.next, struct backref_edge, | ||
| 823 | list[LOWER]); | ||
| 824 | upper = edge->node[UPPER]; | ||
| 825 | kfree(edge); | ||
| 826 | } | ||
| 827 | return ERR_PTR(err); | ||
| 828 | } | ||
| 829 | return node; | ||
| 830 | } | ||
| 831 | |||
| 832 | /* | ||
| 833 | * helper to add 'address of tree root -> reloc tree' mapping | ||
| 834 | */ | ||
| 835 | static int __add_reloc_root(struct btrfs_root *root) | ||
| 836 | { | ||
| 837 | struct rb_node *rb_node; | ||
| 838 | struct mapping_node *node; | ||
| 839 | struct reloc_control *rc = root->fs_info->reloc_ctl; | ||
| 840 | |||
| 841 | node = kmalloc(sizeof(*node), GFP_NOFS); | ||
| 842 | BUG_ON(!node); | ||
| 843 | |||
| 844 | node->bytenr = root->node->start; | ||
| 845 | node->data = root; | ||
| 846 | |||
| 847 | spin_lock(&rc->reloc_root_tree.lock); | ||
| 848 | rb_node = tree_insert(&rc->reloc_root_tree.rb_root, | ||
| 849 | node->bytenr, &node->rb_node); | ||
| 850 | spin_unlock(&rc->reloc_root_tree.lock); | ||
| 851 | BUG_ON(rb_node); | ||
| 852 | |||
| 853 | list_add_tail(&root->root_list, &rc->reloc_roots); | ||
| 854 | return 0; | ||
| 855 | } | ||
| 856 | |||
| 857 | /* | ||
| 858 | * helper to update/delete the 'address of tree root -> reloc tree' | ||
| 859 | * mapping | ||
| 860 | */ | ||
| 861 | static int __update_reloc_root(struct btrfs_root *root, int del) | ||
| 862 | { | ||
| 863 | struct rb_node *rb_node; | ||
| 864 | struct mapping_node *node = NULL; | ||
| 865 | struct reloc_control *rc = root->fs_info->reloc_ctl; | ||
| 866 | |||
| 867 | spin_lock(&rc->reloc_root_tree.lock); | ||
| 868 | rb_node = tree_search(&rc->reloc_root_tree.rb_root, | ||
| 869 | root->commit_root->start); | ||
| 870 | if (rb_node) { | ||
| 871 | node = rb_entry(rb_node, struct mapping_node, rb_node); | ||
| 872 | rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); | ||
| 873 | } | ||
| 874 | spin_unlock(&rc->reloc_root_tree.lock); | ||
| 875 | |||
| 876 | BUG_ON((struct btrfs_root *)node->data != root); | ||
| 877 | |||
| 878 | if (!del) { | ||
| 879 | spin_lock(&rc->reloc_root_tree.lock); | ||
| 880 | node->bytenr = root->node->start; | ||
| 881 | rb_node = tree_insert(&rc->reloc_root_tree.rb_root, | ||
| 882 | node->bytenr, &node->rb_node); | ||
| 883 | spin_unlock(&rc->reloc_root_tree.lock); | ||
| 884 | BUG_ON(rb_node); | ||
| 885 | } else { | ||
| 886 | list_del_init(&root->root_list); | ||
| 887 | kfree(node); | ||
| 888 | } | ||
| 889 | return 0; | ||
| 890 | } | ||
| 891 | |||
| 892 | /* | ||
| 893 | * create reloc tree for a given fs tree. reloc tree is just a | ||
| 894 | * snapshot of the fs tree with special root objectid. | ||
| 895 | */ | ||
| 896 | int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, | ||
| 897 | struct btrfs_root *root) | ||
| 898 | { | ||
| 899 | struct btrfs_root *reloc_root; | ||
| 900 | struct extent_buffer *eb; | ||
| 901 | struct btrfs_root_item *root_item; | ||
| 902 | struct btrfs_key root_key; | ||
| 903 | int ret; | ||
| 904 | |||
| 905 | if (root->reloc_root) { | ||
| 906 | reloc_root = root->reloc_root; | ||
| 907 | reloc_root->last_trans = trans->transid; | ||
| 908 | return 0; | ||
| 909 | } | ||
| 910 | |||
| 911 | if (!root->fs_info->reloc_ctl || | ||
| 912 | !root->fs_info->reloc_ctl->create_reloc_root || | ||
| 913 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
| 914 | return 0; | ||
| 915 | |||
| 916 | root_item = kmalloc(sizeof(*root_item), GFP_NOFS); | ||
| 917 | BUG_ON(!root_item); | ||
| 918 | |||
| 919 | root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; | ||
| 920 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
| 921 | root_key.offset = root->root_key.objectid; | ||
| 922 | |||
| 923 | ret = btrfs_copy_root(trans, root, root->commit_root, &eb, | ||
| 924 | BTRFS_TREE_RELOC_OBJECTID); | ||
| 925 | BUG_ON(ret); | ||
| 926 | |||
| 927 | btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); | ||
| 928 | memcpy(root_item, &root->root_item, sizeof(*root_item)); | ||
| 929 | btrfs_set_root_refs(root_item, 1); | ||
| 930 | btrfs_set_root_bytenr(root_item, eb->start); | ||
| 931 | btrfs_set_root_level(root_item, btrfs_header_level(eb)); | ||
| 932 | btrfs_set_root_generation(root_item, trans->transid); | ||
| 933 | memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); | ||
| 934 | root_item->drop_level = 0; | ||
| 935 | |||
| 936 | btrfs_tree_unlock(eb); | ||
| 937 | free_extent_buffer(eb); | ||
| 938 | |||
| 939 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, | ||
| 940 | &root_key, root_item); | ||
| 941 | BUG_ON(ret); | ||
| 942 | kfree(root_item); | ||
| 943 | |||
| 944 | reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, | ||
| 945 | &root_key); | ||
| 946 | BUG_ON(IS_ERR(reloc_root)); | ||
| 947 | reloc_root->last_trans = trans->transid; | ||
| 948 | |||
| 949 | __add_reloc_root(reloc_root); | ||
| 950 | root->reloc_root = reloc_root; | ||
| 951 | return 0; | ||
| 952 | } | ||
| 953 | |||
| 954 | /* | ||
| 955 | * update root item of reloc tree | ||
| 956 | */ | ||
| 957 | int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, | ||
| 958 | struct btrfs_root *root) | ||
| 959 | { | ||
| 960 | struct btrfs_root *reloc_root; | ||
| 961 | struct btrfs_root_item *root_item; | ||
| 962 | int del = 0; | ||
| 963 | int ret; | ||
| 964 | |||
| 965 | if (!root->reloc_root) | ||
| 966 | return 0; | ||
| 967 | |||
| 968 | reloc_root = root->reloc_root; | ||
| 969 | root_item = &reloc_root->root_item; | ||
| 970 | |||
| 971 | if (btrfs_root_refs(root_item) == 0) { | ||
| 972 | root->reloc_root = NULL; | ||
| 973 | del = 1; | ||
| 974 | } | ||
| 975 | |||
| 976 | __update_reloc_root(reloc_root, del); | ||
| 977 | |||
| 978 | if (reloc_root->commit_root != reloc_root->node) { | ||
| 979 | btrfs_set_root_node(root_item, reloc_root->node); | ||
| 980 | free_extent_buffer(reloc_root->commit_root); | ||
| 981 | reloc_root->commit_root = btrfs_root_node(reloc_root); | ||
| 982 | } | ||
| 983 | |||
| 984 | ret = btrfs_update_root(trans, root->fs_info->tree_root, | ||
| 985 | &reloc_root->root_key, root_item); | ||
| 986 | BUG_ON(ret); | ||
| 987 | return 0; | ||
| 988 | } | ||
| 989 | |||
| 990 | /* | ||
| 991 | * helper to find first cached inode with inode number >= objectid | ||
| 992 | * in a subvolume | ||
| 993 | */ | ||
| 994 | static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) | ||
| 995 | { | ||
| 996 | struct rb_node *node; | ||
| 997 | struct rb_node *prev; | ||
| 998 | struct btrfs_inode *entry; | ||
| 999 | struct inode *inode; | ||
| 1000 | |||
| 1001 | spin_lock(&root->inode_lock); | ||
| 1002 | again: | ||
| 1003 | node = root->inode_tree.rb_node; | ||
| 1004 | prev = NULL; | ||
| 1005 | while (node) { | ||
| 1006 | prev = node; | ||
| 1007 | entry = rb_entry(node, struct btrfs_inode, rb_node); | ||
| 1008 | |||
| 1009 | if (objectid < entry->vfs_inode.i_ino) | ||
| 1010 | node = node->rb_left; | ||
| 1011 | else if (objectid > entry->vfs_inode.i_ino) | ||
| 1012 | node = node->rb_right; | ||
| 1013 | else | ||
| 1014 | break; | ||
| 1015 | } | ||
| 1016 | if (!node) { | ||
| 1017 | while (prev) { | ||
| 1018 | entry = rb_entry(prev, struct btrfs_inode, rb_node); | ||
| 1019 | if (objectid <= entry->vfs_inode.i_ino) { | ||
| 1020 | node = prev; | ||
| 1021 | break; | ||
| 1022 | } | ||
| 1023 | prev = rb_next(prev); | ||
| 1024 | } | ||
| 1025 | } | ||
| 1026 | while (node) { | ||
| 1027 | entry = rb_entry(node, struct btrfs_inode, rb_node); | ||
| 1028 | inode = igrab(&entry->vfs_inode); | ||
| 1029 | if (inode) { | ||
| 1030 | spin_unlock(&root->inode_lock); | ||
| 1031 | return inode; | ||
| 1032 | } | ||
| 1033 | |||
| 1034 | objectid = entry->vfs_inode.i_ino + 1; | ||
| 1035 | if (cond_resched_lock(&root->inode_lock)) | ||
| 1036 | goto again; | ||
| 1037 | |||
| 1038 | node = rb_next(node); | ||
| 1039 | } | ||
| 1040 | spin_unlock(&root->inode_lock); | ||
| 1041 | return NULL; | ||
| 1042 | } | ||
| 1043 | |||
| 1044 | static int in_block_group(u64 bytenr, | ||
| 1045 | struct btrfs_block_group_cache *block_group) | ||
| 1046 | { | ||
| 1047 | if (bytenr >= block_group->key.objectid && | ||
| 1048 | bytenr < block_group->key.objectid + block_group->key.offset) | ||
| 1049 | return 1; | ||
| 1050 | return 0; | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | /* | ||
| 1054 | * get new location of data | ||
| 1055 | */ | ||
| 1056 | static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, | ||
| 1057 | u64 bytenr, u64 num_bytes) | ||
| 1058 | { | ||
| 1059 | struct btrfs_root *root = BTRFS_I(reloc_inode)->root; | ||
| 1060 | struct btrfs_path *path; | ||
| 1061 | struct btrfs_file_extent_item *fi; | ||
| 1062 | struct extent_buffer *leaf; | ||
| 1063 | int ret; | ||
| 1064 | |||
| 1065 | path = btrfs_alloc_path(); | ||
| 1066 | if (!path) | ||
| 1067 | return -ENOMEM; | ||
| 1068 | |||
| 1069 | bytenr -= BTRFS_I(reloc_inode)->index_cnt; | ||
| 1070 | ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, | ||
| 1071 | bytenr, 0); | ||
| 1072 | if (ret < 0) | ||
| 1073 | goto out; | ||
| 1074 | if (ret > 0) { | ||
| 1075 | ret = -ENOENT; | ||
| 1076 | goto out; | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | leaf = path->nodes[0]; | ||
| 1080 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
| 1081 | struct btrfs_file_extent_item); | ||
| 1082 | |||
| 1083 | BUG_ON(btrfs_file_extent_offset(leaf, fi) || | ||
| 1084 | btrfs_file_extent_compression(leaf, fi) || | ||
| 1085 | btrfs_file_extent_encryption(leaf, fi) || | ||
| 1086 | btrfs_file_extent_other_encoding(leaf, fi)); | ||
| 1087 | |||
| 1088 | if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { | ||
| 1089 | ret = 1; | ||
| 1090 | goto out; | ||
| 1091 | } | ||
| 1092 | |||
| 1093 | if (new_bytenr) | ||
| 1094 | *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
| 1095 | ret = 0; | ||
| 1096 | out: | ||
| 1097 | btrfs_free_path(path); | ||
| 1098 | return ret; | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | /* | ||
| 1102 | * update file extent items in the tree leaf to point to | ||
| 1103 | * the new locations. | ||
| 1104 | */ | ||
| 1105 | static int replace_file_extents(struct btrfs_trans_handle *trans, | ||
| 1106 | struct reloc_control *rc, | ||
| 1107 | struct btrfs_root *root, | ||
| 1108 | struct extent_buffer *leaf, | ||
| 1109 | struct list_head *inode_list) | ||
| 1110 | { | ||
| 1111 | struct btrfs_key key; | ||
| 1112 | struct btrfs_file_extent_item *fi; | ||
| 1113 | struct inode *inode = NULL; | ||
| 1114 | struct inodevec *ivec = NULL; | ||
| 1115 | u64 parent; | ||
| 1116 | u64 bytenr; | ||
| 1117 | u64 new_bytenr; | ||
| 1118 | u64 num_bytes; | ||
| 1119 | u64 end; | ||
| 1120 | u32 nritems; | ||
| 1121 | u32 i; | ||
| 1122 | int ret; | ||
| 1123 | int first = 1; | ||
| 1124 | int dirty = 0; | ||
| 1125 | |||
| 1126 | if (rc->stage != UPDATE_DATA_PTRS) | ||
| 1127 | return 0; | ||
| 1128 | |||
| 1129 | /* reloc trees always use full backref */ | ||
| 1130 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
| 1131 | parent = leaf->start; | ||
| 1132 | else | ||
| 1133 | parent = 0; | ||
| 1134 | |||
| 1135 | nritems = btrfs_header_nritems(leaf); | ||
| 1136 | for (i = 0; i < nritems; i++) { | ||
| 1137 | cond_resched(); | ||
| 1138 | btrfs_item_key_to_cpu(leaf, &key, i); | ||
| 1139 | if (key.type != BTRFS_EXTENT_DATA_KEY) | ||
| 1140 | continue; | ||
| 1141 | fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); | ||
| 1142 | if (btrfs_file_extent_type(leaf, fi) == | ||
| 1143 | BTRFS_FILE_EXTENT_INLINE) | ||
| 1144 | continue; | ||
| 1145 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
| 1146 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); | ||
| 1147 | if (bytenr == 0) | ||
| 1148 | continue; | ||
| 1149 | if (!in_block_group(bytenr, rc->block_group)) | ||
| 1150 | continue; | ||
| 1151 | |||
| 1152 | /* | ||
| 1153 | * if we are modifying a block in the fs tree, wait for readpage | ||
| 1154 | * to complete and drop the extent cache | ||
| 1155 | */ | ||
| 1156 | if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { | ||
| 1157 | if (!ivec || ivec->nr == INODEVEC_SIZE) { | ||
| 1158 | ivec = kmalloc(sizeof(*ivec), GFP_NOFS); | ||
| 1159 | BUG_ON(!ivec); | ||
| 1160 | ivec->nr = 0; | ||
| 1161 | list_add_tail(&ivec->list, inode_list); | ||
| 1162 | } | ||
| 1163 | if (first) { | ||
| 1164 | inode = find_next_inode(root, key.objectid); | ||
| 1165 | if (inode) | ||
| 1166 | ivec->inode[ivec->nr++] = inode; | ||
| 1167 | first = 0; | ||
| 1168 | } else if (inode && inode->i_ino < key.objectid) { | ||
| 1169 | inode = find_next_inode(root, key.objectid); | ||
| 1170 | if (inode) | ||
| 1171 | ivec->inode[ivec->nr++] = inode; | ||
| 1172 | } | ||
| 1173 | if (inode && inode->i_ino == key.objectid) { | ||
| 1174 | end = key.offset + | ||
| 1175 | btrfs_file_extent_num_bytes(leaf, fi); | ||
| 1176 | WARN_ON(!IS_ALIGNED(key.offset, | ||
| 1177 | root->sectorsize)); | ||
| 1178 | WARN_ON(!IS_ALIGNED(end, root->sectorsize)); | ||
| 1179 | end--; | ||
| 1180 | ret = try_lock_extent(&BTRFS_I(inode)->io_tree, | ||
| 1181 | key.offset, end, | ||
| 1182 | GFP_NOFS); | ||
| 1183 | if (!ret) | ||
| 1184 | continue; | ||
| 1185 | |||
| 1186 | btrfs_drop_extent_cache(inode, key.offset, end, | ||
| 1187 | 1); | ||
| 1188 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
| 1189 | key.offset, end, GFP_NOFS); | ||
| 1190 | } | ||
| 1191 | } | ||
| 1192 | |||
| 1193 | ret = get_new_location(rc->data_inode, &new_bytenr, | ||
| 1194 | bytenr, num_bytes); | ||
| 1195 | if (ret > 0) | ||
| 1196 | continue; | ||
| 1197 | BUG_ON(ret < 0); | ||
| 1198 | |||
| 1199 | btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); | ||
| 1200 | dirty = 1; | ||
| 1201 | |||
| 1202 | key.offset -= btrfs_file_extent_offset(leaf, fi); | ||
| 1203 | ret = btrfs_inc_extent_ref(trans, root, new_bytenr, | ||
| 1204 | num_bytes, parent, | ||
| 1205 | btrfs_header_owner(leaf), | ||
| 1206 | key.objectid, key.offset); | ||
| 1207 | BUG_ON(ret); | ||
| 1208 | |||
| 1209 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | ||
| 1210 | parent, btrfs_header_owner(leaf), | ||
| 1211 | key.objectid, key.offset); | ||
| 1212 | BUG_ON(ret); | ||
| 1213 | } | ||
| 1214 | if (dirty) | ||
| 1215 | btrfs_mark_buffer_dirty(leaf); | ||
| 1216 | return 0; | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | static noinline_for_stack | ||
| 1220 | int memcmp_node_keys(struct extent_buffer *eb, int slot, | ||
| 1221 | struct btrfs_path *path, int level) | ||
| 1222 | { | ||
| 1223 | struct btrfs_disk_key key1; | ||
| 1224 | struct btrfs_disk_key key2; | ||
| 1225 | btrfs_node_key(eb, &key1, slot); | ||
| 1226 | btrfs_node_key(path->nodes[level], &key2, path->slots[level]); | ||
| 1227 | return memcmp(&key1, &key2, sizeof(key1)); | ||
| 1228 | } | ||
| 1229 | |||
| 1230 | /* | ||
| 1231 | * try to replace tree blocks in fs tree with the new blocks | ||
| 1232 | * in reloc tree. tree blocks that haven't been modified since the | ||
| 1233 | * reloc tree was created can be replaced. | ||
| 1234 | * | ||
| 1235 | * if a block was replaced, level of the block + 1 is returned. | ||
| 1236 | * if no block got replaced, 0 is returned. if there are other | ||
| 1237 | * errors, a negative error number is returned. | ||
| 1238 | */ | ||
| 1239 | static int replace_path(struct btrfs_trans_handle *trans, | ||
| 1240 | struct btrfs_root *dest, struct btrfs_root *src, | ||
| 1241 | struct btrfs_path *path, struct btrfs_key *next_key, | ||
| 1242 | struct extent_buffer **leaf, | ||
| 1243 | int lowest_level, int max_level) | ||
| 1244 | { | ||
| 1245 | struct extent_buffer *eb; | ||
| 1246 | struct extent_buffer *parent; | ||
| 1247 | struct btrfs_key key; | ||
| 1248 | u64 old_bytenr; | ||
| 1249 | u64 new_bytenr; | ||
| 1250 | u64 old_ptr_gen; | ||
| 1251 | u64 new_ptr_gen; | ||
| 1252 | u64 last_snapshot; | ||
| 1253 | u32 blocksize; | ||
| 1254 | int level; | ||
| 1255 | int ret; | ||
| 1256 | int slot; | ||
| 1257 | |||
| 1258 | BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); | ||
| 1259 | BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); | ||
| 1260 | BUG_ON(lowest_level > 1 && leaf); | ||
| 1261 | |||
| 1262 | last_snapshot = btrfs_root_last_snapshot(&src->root_item); | ||
| 1263 | |||
| 1264 | slot = path->slots[lowest_level]; | ||
| 1265 | btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); | ||
| 1266 | |||
| 1267 | eb = btrfs_lock_root_node(dest); | ||
| 1268 | btrfs_set_lock_blocking(eb); | ||
| 1269 | level = btrfs_header_level(eb); | ||
| 1270 | |||
| 1271 | if (level < lowest_level) { | ||
| 1272 | btrfs_tree_unlock(eb); | ||
| 1273 | free_extent_buffer(eb); | ||
| 1274 | return 0; | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); | ||
| 1278 | BUG_ON(ret); | ||
| 1279 | btrfs_set_lock_blocking(eb); | ||
| 1280 | |||
| 1281 | if (next_key) { | ||
| 1282 | next_key->objectid = (u64)-1; | ||
| 1283 | next_key->type = (u8)-1; | ||
| 1284 | next_key->offset = (u64)-1; | ||
| 1285 | } | ||
| 1286 | |||
| 1287 | parent = eb; | ||
| 1288 | while (1) { | ||
| 1289 | level = btrfs_header_level(parent); | ||
| 1290 | BUG_ON(level < lowest_level); | ||
| 1291 | |||
| 1292 | ret = btrfs_bin_search(parent, &key, level, &slot); | ||
| 1293 | if (ret && slot > 0) | ||
| 1294 | slot--; | ||
| 1295 | |||
| 1296 | if (next_key && slot + 1 < btrfs_header_nritems(parent)) | ||
| 1297 | btrfs_node_key_to_cpu(parent, next_key, slot + 1); | ||
| 1298 | |||
| 1299 | old_bytenr = btrfs_node_blockptr(parent, slot); | ||
| 1300 | blocksize = btrfs_level_size(dest, level - 1); | ||
| 1301 | old_ptr_gen = btrfs_node_ptr_generation(parent, slot); | ||
| 1302 | |||
| 1303 | if (level <= max_level) { | ||
| 1304 | eb = path->nodes[level]; | ||
| 1305 | new_bytenr = btrfs_node_blockptr(eb, | ||
| 1306 | path->slots[level]); | ||
| 1307 | new_ptr_gen = btrfs_node_ptr_generation(eb, | ||
| 1308 | path->slots[level]); | ||
| 1309 | } else { | ||
| 1310 | new_bytenr = 0; | ||
| 1311 | new_ptr_gen = 0; | ||
| 1312 | } | ||
| 1313 | |||
| 1314 | if (new_bytenr > 0 && new_bytenr == old_bytenr) { | ||
| 1315 | WARN_ON(1); | ||
| 1316 | ret = level; | ||
| 1317 | break; | ||
| 1318 | } | ||
| 1319 | |||
| 1320 | if (new_bytenr == 0 || old_ptr_gen > last_snapshot || | ||
| 1321 | memcmp_node_keys(parent, slot, path, level)) { | ||
| 1322 | if (level <= lowest_level && !leaf) { | ||
| 1323 | ret = 0; | ||
| 1324 | break; | ||
| 1325 | } | ||
| 1326 | |||
| 1327 | eb = read_tree_block(dest, old_bytenr, blocksize, | ||
| 1328 | old_ptr_gen); | ||
| 1329 | btrfs_tree_lock(eb); | ||
| 1330 | ret = btrfs_cow_block(trans, dest, eb, parent, | ||
| 1331 | slot, &eb); | ||
| 1332 | BUG_ON(ret); | ||
| 1333 | btrfs_set_lock_blocking(eb); | ||
| 1334 | |||
| 1335 | if (level <= lowest_level) { | ||
| 1336 | *leaf = eb; | ||
| 1337 | ret = 0; | ||
| 1338 | break; | ||
| 1339 | } | ||
| 1340 | |||
| 1341 | btrfs_tree_unlock(parent); | ||
| 1342 | free_extent_buffer(parent); | ||
| 1343 | |||
| 1344 | parent = eb; | ||
| 1345 | continue; | ||
| 1346 | } | ||
| 1347 | |||
| 1348 | btrfs_node_key_to_cpu(path->nodes[level], &key, | ||
| 1349 | path->slots[level]); | ||
| 1350 | btrfs_release_path(src, path); | ||
| 1351 | |||
| 1352 | path->lowest_level = level; | ||
| 1353 | ret = btrfs_search_slot(trans, src, &key, path, 0, 1); | ||
| 1354 | path->lowest_level = 0; | ||
| 1355 | BUG_ON(ret); | ||
| 1356 | |||
| 1357 | /* | ||
| 1358 | * swap blocks in fs tree and reloc tree. | ||
| 1359 | */ | ||
| 1360 | btrfs_set_node_blockptr(parent, slot, new_bytenr); | ||
| 1361 | btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); | ||
| 1362 | btrfs_mark_buffer_dirty(parent); | ||
| 1363 | |||
| 1364 | btrfs_set_node_blockptr(path->nodes[level], | ||
| 1365 | path->slots[level], old_bytenr); | ||
| 1366 | btrfs_set_node_ptr_generation(path->nodes[level], | ||
| 1367 | path->slots[level], old_ptr_gen); | ||
| 1368 | btrfs_mark_buffer_dirty(path->nodes[level]); | ||
| 1369 | |||
| 1370 | ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, | ||
| 1371 | path->nodes[level]->start, | ||
| 1372 | src->root_key.objectid, level - 1, 0); | ||
| 1373 | BUG_ON(ret); | ||
| 1374 | ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, | ||
| 1375 | 0, dest->root_key.objectid, level - 1, | ||
| 1376 | 0); | ||
| 1377 | BUG_ON(ret); | ||
| 1378 | |||
| 1379 | ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, | ||
| 1380 | path->nodes[level]->start, | ||
| 1381 | src->root_key.objectid, level - 1, 0); | ||
| 1382 | BUG_ON(ret); | ||
| 1383 | |||
| 1384 | ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, | ||
| 1385 | 0, dest->root_key.objectid, level - 1, | ||
| 1386 | 0); | ||
| 1387 | BUG_ON(ret); | ||
| 1388 | |||
| 1389 | btrfs_unlock_up_safe(path, 0); | ||
| 1390 | |||
| 1391 | ret = level; | ||
| 1392 | break; | ||
| 1393 | } | ||
| 1394 | btrfs_tree_unlock(parent); | ||
| 1395 | free_extent_buffer(parent); | ||
| 1396 | return ret; | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | /* | ||
| 1400 | * helper to find next relocated block in reloc tree | ||
| 1401 | */ | ||
| 1402 | static noinline_for_stack | ||
| 1403 | int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, | ||
| 1404 | int *level) | ||
| 1405 | { | ||
| 1406 | struct extent_buffer *eb; | ||
| 1407 | int i; | ||
| 1408 | u64 last_snapshot; | ||
| 1409 | u32 nritems; | ||
| 1410 | |||
| 1411 | last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
| 1412 | |||
| 1413 | for (i = 0; i < *level; i++) { | ||
| 1414 | free_extent_buffer(path->nodes[i]); | ||
| 1415 | path->nodes[i] = NULL; | ||
| 1416 | } | ||
| 1417 | |||
| 1418 | for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { | ||
| 1419 | eb = path->nodes[i]; | ||
| 1420 | nritems = btrfs_header_nritems(eb); | ||
| 1421 | while (path->slots[i] + 1 < nritems) { | ||
| 1422 | path->slots[i]++; | ||
| 1423 | if (btrfs_node_ptr_generation(eb, path->slots[i]) <= | ||
| 1424 | last_snapshot) | ||
| 1425 | continue; | ||
| 1426 | |||
| 1427 | *level = i; | ||
| 1428 | return 0; | ||
| 1429 | } | ||
| 1430 | free_extent_buffer(path->nodes[i]); | ||
| 1431 | path->nodes[i] = NULL; | ||
| 1432 | } | ||
| 1433 | return 1; | ||
| 1434 | } | ||
| 1435 | |||
| 1436 | /* | ||
| 1437 | * walk down reloc tree to find relocated block of lowest level | ||
| 1438 | */ | ||
| 1439 | static noinline_for_stack | ||
| 1440 | int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, | ||
| 1441 | int *level) | ||
| 1442 | { | ||
| 1443 | struct extent_buffer *eb = NULL; | ||
| 1444 | int i; | ||
| 1445 | u64 bytenr; | ||
| 1446 | u64 ptr_gen = 0; | ||
| 1447 | u64 last_snapshot; | ||
| 1448 | u32 blocksize; | ||
| 1449 | u32 nritems; | ||
| 1450 | |||
| 1451 | last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
| 1452 | |||
| 1453 | for (i = *level; i > 0; i--) { | ||
| 1454 | eb = path->nodes[i]; | ||
| 1455 | nritems = btrfs_header_nritems(eb); | ||
| 1456 | while (path->slots[i] < nritems) { | ||
| 1457 | ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); | ||
| 1458 | if (ptr_gen > last_snapshot) | ||
| 1459 | break; | ||
| 1460 | path->slots[i]++; | ||
| 1461 | } | ||
| 1462 | if (path->slots[i] >= nritems) { | ||
| 1463 | if (i == *level) | ||
| 1464 | break; | ||
| 1465 | *level = i + 1; | ||
| 1466 | return 0; | ||
| 1467 | } | ||
| 1468 | if (i == 1) { | ||
| 1469 | *level = i; | ||
| 1470 | return 0; | ||
| 1471 | } | ||
| 1472 | |||
| 1473 | bytenr = btrfs_node_blockptr(eb, path->slots[i]); | ||
| 1474 | blocksize = btrfs_level_size(root, i - 1); | ||
| 1475 | eb = read_tree_block(root, bytenr, blocksize, ptr_gen); | ||
| 1476 | BUG_ON(btrfs_header_level(eb) != i - 1); | ||
| 1477 | path->nodes[i - 1] = eb; | ||
| 1478 | path->slots[i - 1] = 0; | ||
| 1479 | } | ||
| 1480 | return 1; | ||
| 1481 | } | ||
| 1482 | |||
| 1483 | /* | ||
| 1484 | * invalidate extent cache for file extents whose key is in the range of | ||
| 1485 | * [min_key, max_key) | ||
| 1486 | */ | ||
| 1487 | static int invalidate_extent_cache(struct btrfs_root *root, | ||
| 1488 | struct btrfs_key *min_key, | ||
| 1489 | struct btrfs_key *max_key) | ||
| 1490 | { | ||
| 1491 | struct inode *inode = NULL; | ||
| 1492 | u64 objectid; | ||
| 1493 | u64 start, end; | ||
| 1494 | |||
| 1495 | objectid = min_key->objectid; | ||
| 1496 | while (1) { | ||
| 1497 | cond_resched(); | ||
| 1498 | iput(inode); | ||
| 1499 | |||
| 1500 | if (objectid > max_key->objectid) | ||
| 1501 | break; | ||
| 1502 | |||
| 1503 | inode = find_next_inode(root, objectid); | ||
| 1504 | if (!inode) | ||
| 1505 | break; | ||
| 1506 | |||
| 1507 | if (inode->i_ino > max_key->objectid) { | ||
| 1508 | iput(inode); | ||
| 1509 | break; | ||
| 1510 | } | ||
| 1511 | |||
| 1512 | objectid = inode->i_ino + 1; | ||
| 1513 | if (!S_ISREG(inode->i_mode)) | ||
| 1514 | continue; | ||
| 1515 | |||
| 1516 | if (unlikely(min_key->objectid == inode->i_ino)) { | ||
| 1517 | if (min_key->type > BTRFS_EXTENT_DATA_KEY) | ||
| 1518 | continue; | ||
| 1519 | if (min_key->type < BTRFS_EXTENT_DATA_KEY) | ||
| 1520 | start = 0; | ||
| 1521 | else { | ||
| 1522 | start = min_key->offset; | ||
| 1523 | WARN_ON(!IS_ALIGNED(start, root->sectorsize)); | ||
| 1524 | } | ||
| 1525 | } else { | ||
| 1526 | start = 0; | ||
| 1527 | } | ||
| 1528 | |||
| 1529 | if (unlikely(max_key->objectid == inode->i_ino)) { | ||
| 1530 | if (max_key->type < BTRFS_EXTENT_DATA_KEY) | ||
| 1531 | continue; | ||
| 1532 | if (max_key->type > BTRFS_EXTENT_DATA_KEY) { | ||
| 1533 | end = (u64)-1; | ||
| 1534 | } else { | ||
| 1535 | if (max_key->offset == 0) | ||
| 1536 | continue; | ||
| 1537 | end = max_key->offset; | ||
| 1538 | WARN_ON(!IS_ALIGNED(end, root->sectorsize)); | ||
| 1539 | end--; | ||
| 1540 | } | ||
| 1541 | } else { | ||
| 1542 | end = (u64)-1; | ||
| 1543 | } | ||
| 1544 | |||
| 1545 | /* the lock_extent waits for readpage to complete */ | ||
| 1546 | lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
| 1547 | btrfs_drop_extent_cache(inode, start, end, 1); | ||
| 1548 | unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
| 1549 | } | ||
| 1550 | return 0; | ||
| 1551 | } | ||
| 1552 | |||
| 1553 | static int find_next_key(struct btrfs_path *path, int level, | ||
| 1554 | struct btrfs_key *key) | ||
| 1555 | |||
| 1556 | { | ||
| 1557 | while (level < BTRFS_MAX_LEVEL) { | ||
| 1558 | if (!path->nodes[level]) | ||
| 1559 | break; | ||
| 1560 | if (path->slots[level] + 1 < | ||
| 1561 | btrfs_header_nritems(path->nodes[level])) { | ||
| 1562 | btrfs_node_key_to_cpu(path->nodes[level], key, | ||
| 1563 | path->slots[level] + 1); | ||
| 1564 | return 0; | ||
| 1565 | } | ||
| 1566 | level++; | ||
| 1567 | } | ||
| 1568 | return 1; | ||
| 1569 | } | ||
| 1570 | |||
| 1571 | /* | ||
| 1572 | * merge the relocated tree blocks in reloc tree with corresponding | ||
| 1573 | * fs tree. | ||
| 1574 | */ | ||
| 1575 | static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, | ||
| 1576 | struct btrfs_root *root) | ||
| 1577 | { | ||
| 1578 | LIST_HEAD(inode_list); | ||
| 1579 | struct btrfs_key key; | ||
| 1580 | struct btrfs_key next_key; | ||
| 1581 | struct btrfs_trans_handle *trans; | ||
| 1582 | struct btrfs_root *reloc_root; | ||
| 1583 | struct btrfs_root_item *root_item; | ||
| 1584 | struct btrfs_path *path; | ||
| 1585 | struct extent_buffer *leaf = NULL; | ||
| 1586 | unsigned long nr; | ||
| 1587 | int level; | ||
| 1588 | int max_level; | ||
| 1589 | int replaced = 0; | ||
| 1590 | int ret; | ||
| 1591 | int err = 0; | ||
| 1592 | |||
| 1593 | path = btrfs_alloc_path(); | ||
| 1594 | if (!path) | ||
| 1595 | return -ENOMEM; | ||
| 1596 | |||
| 1597 | reloc_root = root->reloc_root; | ||
| 1598 | root_item = &reloc_root->root_item; | ||
| 1599 | |||
| 1600 | if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { | ||
| 1601 | level = btrfs_root_level(root_item); | ||
| 1602 | extent_buffer_get(reloc_root->node); | ||
| 1603 | path->nodes[level] = reloc_root->node; | ||
| 1604 | path->slots[level] = 0; | ||
| 1605 | } else { | ||
| 1606 | btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); | ||
| 1607 | |||
| 1608 | level = root_item->drop_level; | ||
| 1609 | BUG_ON(level == 0); | ||
| 1610 | path->lowest_level = level; | ||
| 1611 | ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); | ||
| 1612 | if (ret < 0) { | ||
| 1613 | btrfs_free_path(path); | ||
| 1614 | return ret; | ||
| 1615 | } | ||
| 1616 | |||
| 1617 | btrfs_node_key_to_cpu(path->nodes[level], &next_key, | ||
| 1618 | path->slots[level]); | ||
| 1619 | WARN_ON(memcmp(&key, &next_key, sizeof(key))); | ||
| 1620 | |||
| 1621 | btrfs_unlock_up_safe(path, 0); | ||
| 1622 | } | ||
| 1623 | |||
| 1624 | if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { | ||
| 1625 | trans = btrfs_start_transaction(root, 1); | ||
| 1626 | |||
| 1627 | leaf = path->nodes[0]; | ||
| 1628 | btrfs_item_key_to_cpu(leaf, &key, 0); | ||
| 1629 | btrfs_release_path(reloc_root, path); | ||
| 1630 | |||
| 1631 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | ||
| 1632 | if (ret < 0) { | ||
| 1633 | err = ret; | ||
| 1634 | goto out; | ||
| 1635 | } | ||
| 1636 | |||
| 1637 | leaf = path->nodes[0]; | ||
| 1638 | btrfs_unlock_up_safe(path, 1); | ||
| 1639 | ret = replace_file_extents(trans, rc, root, leaf, | ||
| 1640 | &inode_list); | ||
| 1641 | if (ret < 0) | ||
| 1642 | err = ret; | ||
| 1643 | goto out; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | memset(&next_key, 0, sizeof(next_key)); | ||
| 1647 | |||
| 1648 | while (1) { | ||
| 1649 | leaf = NULL; | ||
| 1650 | replaced = 0; | ||
| 1651 | trans = btrfs_start_transaction(root, 1); | ||
| 1652 | max_level = level; | ||
| 1653 | |||
| 1654 | ret = walk_down_reloc_tree(reloc_root, path, &level); | ||
| 1655 | if (ret < 0) { | ||
| 1656 | err = ret; | ||
| 1657 | goto out; | ||
| 1658 | } | ||
| 1659 | if (ret > 0) | ||
| 1660 | break; | ||
| 1661 | |||
| 1662 | if (!find_next_key(path, level, &key) && | ||
| 1663 | btrfs_comp_cpu_keys(&next_key, &key) >= 0) { | ||
| 1664 | ret = 0; | ||
| 1665 | } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { | ||
| 1666 | ret = replace_path(trans, root, reloc_root, | ||
| 1667 | path, &next_key, &leaf, | ||
| 1668 | level, max_level); | ||
| 1669 | } else { | ||
| 1670 | ret = replace_path(trans, root, reloc_root, | ||
| 1671 | path, &next_key, NULL, | ||
| 1672 | level, max_level); | ||
| 1673 | } | ||
| 1674 | if (ret < 0) { | ||
| 1675 | err = ret; | ||
| 1676 | goto out; | ||
| 1677 | } | ||
| 1678 | |||
| 1679 | if (ret > 0) { | ||
| 1680 | level = ret; | ||
| 1681 | btrfs_node_key_to_cpu(path->nodes[level], &key, | ||
| 1682 | path->slots[level]); | ||
| 1683 | replaced = 1; | ||
| 1684 | } else if (leaf) { | ||
| 1685 | /* | ||
| 1686 | * no block got replaced, try replacing file extents | ||
| 1687 | */ | ||
| 1688 | btrfs_item_key_to_cpu(leaf, &key, 0); | ||
| 1689 | ret = replace_file_extents(trans, rc, root, leaf, | ||
| 1690 | &inode_list); | ||
| 1691 | btrfs_tree_unlock(leaf); | ||
| 1692 | free_extent_buffer(leaf); | ||
| 1693 | BUG_ON(ret < 0); | ||
| 1694 | } | ||
| 1695 | |||
| 1696 | ret = walk_up_reloc_tree(reloc_root, path, &level); | ||
| 1697 | if (ret > 0) | ||
| 1698 | break; | ||
| 1699 | |||
| 1700 | BUG_ON(level == 0); | ||
| 1701 | /* | ||
| 1702 | * save the merging progress in the drop_progress. | ||
| 1703 | * this is OK since root refs == 1 in this case. | ||
| 1704 | */ | ||
| 1705 | btrfs_node_key(path->nodes[level], &root_item->drop_progress, | ||
| 1706 | path->slots[level]); | ||
| 1707 | root_item->drop_level = level; | ||
| 1708 | |||
| 1709 | nr = trans->blocks_used; | ||
| 1710 | btrfs_end_transaction(trans, root); | ||
| 1711 | |||
| 1712 | btrfs_btree_balance_dirty(root, nr); | ||
| 1713 | |||
| 1714 | if (replaced && rc->stage == UPDATE_DATA_PTRS) | ||
| 1715 | invalidate_extent_cache(root, &key, &next_key); | ||
| 1716 | } | ||
| 1717 | |||
| 1718 | /* | ||
| 1719 | * handle the case where only one block in the fs tree needs to be | ||
| 1720 | * relocated and that block is the tree root. | ||
| 1721 | */ | ||
| 1722 | leaf = btrfs_lock_root_node(root); | ||
| 1723 | ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); | ||
| 1724 | btrfs_tree_unlock(leaf); | ||
| 1725 | free_extent_buffer(leaf); | ||
| 1726 | if (ret < 0) | ||
| 1727 | err = ret; | ||
| 1728 | out: | ||
| 1729 | btrfs_free_path(path); | ||
| 1730 | |||
| 1731 | if (err == 0) { | ||
| 1732 | memset(&root_item->drop_progress, 0, | ||
| 1733 | sizeof(root_item->drop_progress)); | ||
| 1734 | root_item->drop_level = 0; | ||
| 1735 | btrfs_set_root_refs(root_item, 0); | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | nr = trans->blocks_used; | ||
| 1739 | btrfs_end_transaction(trans, root); | ||
| 1740 | |||
| 1741 | btrfs_btree_balance_dirty(root, nr); | ||
| 1742 | |||
| 1743 | /* | ||
| 1744 | * put inodes while we aren't holding the tree locks | ||
| 1745 | */ | ||
| 1746 | while (!list_empty(&inode_list)) { | ||
| 1747 | struct inodevec *ivec; | ||
| 1748 | ivec = list_entry(inode_list.next, struct inodevec, list); | ||
| 1749 | list_del(&ivec->list); | ||
| 1750 | while (ivec->nr > 0) { | ||
| 1751 | ivec->nr--; | ||
| 1752 | iput(ivec->inode[ivec->nr]); | ||
| 1753 | } | ||
| 1754 | kfree(ivec); | ||
| 1755 | } | ||
| 1756 | |||
| 1757 | if (replaced && rc->stage == UPDATE_DATA_PTRS) | ||
| 1758 | invalidate_extent_cache(root, &key, &next_key); | ||
| 1759 | |||
| 1760 | return err; | ||
| 1761 | } | ||
| 1762 | |||
| 1763 | /* | ||
| 1764 | * callback for the work threads. | ||
| 1765 | * this function merges reloc tree with corresponding fs tree, | ||
| 1766 | * and then drops the reloc tree. | ||
| 1767 | */ | ||
| 1768 | static void merge_func(struct btrfs_work *work) | ||
| 1769 | { | ||
| 1770 | struct btrfs_trans_handle *trans; | ||
| 1771 | struct btrfs_root *root; | ||
| 1772 | struct btrfs_root *reloc_root; | ||
| 1773 | struct async_merge *async; | ||
| 1774 | |||
| 1775 | async = container_of(work, struct async_merge, work); | ||
| 1776 | reloc_root = async->root; | ||
| 1777 | |||
| 1778 | if (btrfs_root_refs(&reloc_root->root_item) > 0) { | ||
| 1779 | root = read_fs_root(reloc_root->fs_info, | ||
| 1780 | reloc_root->root_key.offset); | ||
| 1781 | BUG_ON(IS_ERR(root)); | ||
| 1782 | BUG_ON(root->reloc_root != reloc_root); | ||
| 1783 | |||
| 1784 | merge_reloc_root(async->rc, root); | ||
| 1785 | |||
| 1786 | trans = btrfs_start_transaction(root, 1); | ||
| 1787 | btrfs_update_reloc_root(trans, root); | ||
| 1788 | btrfs_end_transaction(trans, root); | ||
| 1789 | } | ||
| 1790 | |||
| 1791 | btrfs_drop_dead_root(reloc_root); | ||
| 1792 | |||
| 1793 | if (atomic_dec_and_test(async->num_pending)) | ||
| 1794 | complete(async->done); | ||
| 1795 | |||
| 1796 | kfree(async); | ||
| 1797 | } | ||
| 1798 | |||
| 1799 | static int merge_reloc_roots(struct reloc_control *rc) | ||
| 1800 | { | ||
| 1801 | struct async_merge *async; | ||
| 1802 | struct btrfs_root *root; | ||
| 1803 | struct completion done; | ||
| 1804 | atomic_t num_pending; | ||
| 1805 | |||
| 1806 | init_completion(&done); | ||
| 1807 | atomic_set(&num_pending, 1); | ||
| 1808 | |||
| 1809 | while (!list_empty(&rc->reloc_roots)) { | ||
| 1810 | root = list_entry(rc->reloc_roots.next, | ||
| 1811 | struct btrfs_root, root_list); | ||
| 1812 | list_del_init(&root->root_list); | ||
| 1813 | |||
| 1814 | async = kmalloc(sizeof(*async), GFP_NOFS); | ||
| 1815 | BUG_ON(!async); | ||
| 1816 | async->work.func = merge_func; | ||
| 1817 | async->work.flags = 0; | ||
| 1818 | async->rc = rc; | ||
| 1819 | async->root = root; | ||
| 1820 | async->done = &done; | ||
| 1821 | async->num_pending = &num_pending; | ||
| 1822 | atomic_inc(&num_pending); | ||
| 1823 | btrfs_queue_worker(&rc->workers, &async->work); | ||
| 1824 | } | ||
| 1825 | |||
| 1826 | if (!atomic_dec_and_test(&num_pending)) | ||
| 1827 | wait_for_completion(&done); | ||
| 1828 | |||
| 1829 | BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); | ||
| 1830 | return 0; | ||
| 1831 | } | ||
| 1832 | |||
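merge_reloc_roots() fans the per-root merge work out to the btrfs worker pool and waits using a pending counter biased by one: the counter starts at 1, gains a reference per queued work item, each worker drops one on exit, and whoever brings it to zero completes the waiter, while the initial bias keeps the queueing loop from racing the last worker. A user-space sketch of the same pattern with pthreads follows; it is purely illustrative and none of its identifiers come from btrfs:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    /* bias-of-one pending counter: the submitter holds one reference while
     * queueing; whichever thread drops the count to zero signals completion */
    static atomic_int num_pending = 1;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t done = PTHREAD_COND_INITIALIZER;

    static void put_pending(void)
    {
            if (atomic_fetch_sub(&num_pending, 1) == 1) {
                    pthread_mutex_lock(&lock);
                    pthread_cond_signal(&done);
                    pthread_mutex_unlock(&lock);
            }
    }

    static void *worker(void *arg)
    {
            printf("merging root %ld\n", (long)arg); /* stand-in for merge_func() */
            put_pending();
            return NULL;
    }

    int main(void)
    {
            pthread_t tid[4];

            for (long i = 0; i < 4; i++) {
                    atomic_fetch_add(&num_pending, 1); /* one ref per queued item */
                    pthread_create(&tid[i], NULL, worker, (void *)i);
            }

            /* drop the submitter's bias; wait only if workers still hold refs */
            pthread_mutex_lock(&lock);
            if (atomic_fetch_sub(&num_pending, 1) != 1) {
                    while (atomic_load(&num_pending) != 0)
                            pthread_cond_wait(&done, &lock);
            }
            pthread_mutex_unlock(&lock);

            for (int i = 0; i < 4; i++)
                    pthread_join(tid[i], NULL);
            return 0;
    }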
| 1833 | static void free_block_list(struct rb_root *blocks) | ||
| 1834 | { | ||
| 1835 | struct tree_block *block; | ||
| 1836 | struct rb_node *rb_node; | ||
| 1837 | while ((rb_node = rb_first(blocks))) { | ||
| 1838 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
| 1839 | rb_erase(rb_node, blocks); | ||
| 1840 | kfree(block); | ||
| 1841 | } | ||
| 1842 | } | ||
| 1843 | |||
| 1844 | static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, | ||
| 1845 | struct btrfs_root *reloc_root) | ||
| 1846 | { | ||
| 1847 | struct btrfs_root *root; | ||
| 1848 | |||
| 1849 | if (reloc_root->last_trans == trans->transid) | ||
| 1850 | return 0; | ||
| 1851 | |||
| 1852 | root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); | ||
| 1853 | BUG_ON(IS_ERR(root)); | ||
| 1854 | BUG_ON(root->reloc_root != reloc_root); | ||
| 1855 | |||
| 1856 | return btrfs_record_root_in_trans(trans, root); | ||
| 1857 | } | ||
| 1858 | |||
| 1859 | /* | ||
| 1860 | * select one tree from the trees that reference the block. | ||
| 1861 | * for blocks in reference counted trees, we prefer the reloc tree. | ||
| 1862 | * if no reloc tree is found and reloc_only is true, NULL is returned. | ||
| 1863 | */ | ||
| 1864 | static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, | ||
| 1865 | struct backref_node *node, | ||
| 1866 | struct backref_edge *edges[], | ||
| 1867 | int *nr, int reloc_only) | ||
| 1868 | { | ||
| 1869 | struct backref_node *next; | ||
| 1870 | struct btrfs_root *root; | ||
| 1871 | int index; | ||
| 1872 | int loop = 0; | ||
| 1873 | again: | ||
| 1874 | index = 0; | ||
| 1875 | next = node; | ||
| 1876 | while (1) { | ||
| 1877 | cond_resched(); | ||
| 1878 | next = walk_up_backref(next, edges, &index); | ||
| 1879 | root = next->root; | ||
| 1880 | if (!root) { | ||
| 1881 | BUG_ON(!node->old_root); | ||
| 1882 | goto skip; | ||
| 1883 | } | ||
| 1884 | |||
| 1885 | /* no other choice for non-reference counted tree */ | ||
| 1886 | if (!root->ref_cows) { | ||
| 1887 | BUG_ON(reloc_only); | ||
| 1888 | break; | ||
| 1889 | } | ||
| 1890 | |||
| 1891 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
| 1892 | record_reloc_root_in_trans(trans, root); | ||
| 1893 | break; | ||
| 1894 | } | ||
| 1895 | |||
| 1896 | if (loop) { | ||
| 1897 | btrfs_record_root_in_trans(trans, root); | ||
| 1898 | break; | ||
| 1899 | } | ||
| 1900 | |||
| 1901 | if (reloc_only || next != node) { | ||
| 1902 | if (!root->reloc_root) | ||
| 1903 | btrfs_record_root_in_trans(trans, root); | ||
| 1904 | root = root->reloc_root; | ||
| 1905 | /* | ||
| 1906 | * if the reloc tree was created in the current | ||
| 1907 | * transaction, there is no node in the backref tree | ||
| 1908 | * that corresponds to the root of the reloc tree. | ||
| 1909 | */ | ||
| 1910 | if (btrfs_root_last_snapshot(&root->root_item) == | ||
| 1911 | trans->transid - 1) | ||
| 1912 | break; | ||
| 1913 | } | ||
| 1914 | skip: | ||
| 1915 | root = NULL; | ||
| 1916 | next = walk_down_backref(edges, &index); | ||
| 1917 | if (!next || next->level <= node->level) | ||
| 1918 | break; | ||
| 1919 | } | ||
| 1920 | |||
| 1921 | if (!root && !loop && !reloc_only) { | ||
| 1922 | loop = 1; | ||
| 1923 | goto again; | ||
| 1924 | } | ||
| 1925 | |||
| 1926 | if (root) | ||
| 1927 | *nr = index; | ||
| 1928 | else | ||
| 1929 | *nr = 0; | ||
| 1930 | |||
| 1931 | return root; | ||
| 1932 | } | ||
| 1933 | |||
| 1934 | static noinline_for_stack | ||
| 1935 | struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, | ||
| 1936 | struct backref_node *node) | ||
| 1937 | { | ||
| 1938 | struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; | ||
| 1939 | int nr; | ||
| 1940 | return __select_one_root(trans, node, edges, &nr, 0); | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | static noinline_for_stack | ||
| 1944 | struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, | ||
| 1945 | struct backref_node *node, | ||
| 1946 | struct backref_edge *edges[], int *nr) | ||
| 1947 | { | ||
| 1948 | return __select_one_root(trans, node, edges, nr, 1); | ||
| 1949 | } | ||
| 1950 | |||
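| | /* | ||
| | * transfer the extent buffers (and locks) held in 'path' to the backref | ||
| | * nodes along the walked edges, recording each node's new block start | ||
| | * in the corresponding edge | ||
| | */ | ||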
| 1951 | static void grab_path_buffers(struct btrfs_path *path, | ||
| 1952 | struct backref_node *node, | ||
| 1953 | struct backref_edge *edges[], int nr) | ||
| 1954 | { | ||
| 1955 | int i = 0; | ||
| 1956 | while (1) { | ||
| 1957 | drop_node_buffer(node); | ||
| 1958 | node->eb = path->nodes[node->level]; | ||
| 1959 | BUG_ON(!node->eb); | ||
| 1960 | if (path->locks[node->level]) | ||
| 1961 | node->locked = 1; | ||
| 1962 | path->nodes[node->level] = NULL; | ||
| 1963 | path->locks[node->level] = 0; | ||
| 1964 | |||
| 1965 | if (i >= nr) | ||
| 1966 | break; | ||
| 1967 | |||
| 1968 | edges[i]->blockptr = node->eb->start; | ||
| 1969 | node = edges[i]->node[UPPER]; | ||
| 1970 | i++; | ||
| 1971 | } | ||
| 1972 | } | ||
| 1973 | |||
| 1974 | /* | ||
| 1975 | * relocate a block tree, and then update pointers in upper level | ||
| 1976 | * blocks that reference the block to point to the new location. | ||
| 1977 | * | ||
| 1978 | * if called by link_to_upper, the block has already been relocated. | ||
| 1979 | * in that case this function just updates pointers. | ||
| 1980 | */ | ||
| 1981 | static int do_relocation(struct btrfs_trans_handle *trans, | ||
| 1982 | struct backref_node *node, | ||
| 1983 | struct btrfs_key *key, | ||
| 1984 | struct btrfs_path *path, int lowest) | ||
| 1985 | { | ||
| 1986 | struct backref_node *upper; | ||
| 1987 | struct backref_edge *edge; | ||
| 1988 | struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; | ||
| 1989 | struct btrfs_root *root; | ||
| 1990 | struct extent_buffer *eb; | ||
| 1991 | u32 blocksize; | ||
| 1992 | u64 bytenr; | ||
| 1993 | u64 generation; | ||
| 1994 | int nr; | ||
| 1995 | int slot; | ||
| 1996 | int ret; | ||
| 1997 | int err = 0; | ||
| 1998 | |||
| 1999 | BUG_ON(lowest && node->eb); | ||
| 2000 | |||
| 2001 | path->lowest_level = node->level + 1; | ||
| 2002 | list_for_each_entry(edge, &node->upper, list[LOWER]) { | ||
| 2003 | cond_resched(); | ||
| 2004 | if (node->eb && node->eb->start == edge->blockptr) | ||
| 2005 | continue; | ||
| 2006 | |||
| 2007 | upper = edge->node[UPPER]; | ||
| 2008 | root = select_reloc_root(trans, upper, edges, &nr); | ||
| 2009 | if (!root) | ||
| 2010 | continue; | ||
| 2011 | |||
| 2012 | if (upper->eb && !upper->locked) | ||
| 2013 | drop_node_buffer(upper); | ||
| 2014 | |||
| 2015 | if (!upper->eb) { | ||
| 2016 | ret = btrfs_search_slot(trans, root, key, path, 0, 1); | ||
| 2017 | if (ret < 0) { | ||
| 2018 | err = ret; | ||
| 2019 | break; | ||
| 2020 | } | ||
| 2021 | BUG_ON(ret > 0); | ||
| 2022 | |||
| 2023 | slot = path->slots[upper->level]; | ||
| 2024 | |||
| 2025 | btrfs_unlock_up_safe(path, upper->level + 1); | ||
| 2026 | grab_path_buffers(path, upper, edges, nr); | ||
| 2027 | |||
| 2028 | btrfs_release_path(NULL, path); | ||
| 2029 | } else { | ||
| 2030 | ret = btrfs_bin_search(upper->eb, key, upper->level, | ||
| 2031 | &slot); | ||
| 2032 | BUG_ON(ret); | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | bytenr = btrfs_node_blockptr(upper->eb, slot); | ||
| 2036 | if (!lowest) { | ||
| 2037 | if (node->eb->start == bytenr) { | ||
| 2038 | btrfs_tree_unlock(upper->eb); | ||
| 2039 | upper->locked = 0; | ||
| 2040 | continue; | ||
| 2041 | } | ||
| 2042 | } else { | ||
| 2043 | BUG_ON(node->bytenr != bytenr); | ||
| 2044 | } | ||
| 2045 | |||
| 2046 | blocksize = btrfs_level_size(root, node->level); | ||
| 2047 | generation = btrfs_node_ptr_generation(upper->eb, slot); | ||
| 2048 | eb = read_tree_block(root, bytenr, blocksize, generation); | ||
| 2049 | btrfs_tree_lock(eb); | ||
| 2050 | btrfs_set_lock_blocking(eb); | ||
| 2051 | |||
| 2052 | if (!node->eb) { | ||
| 2053 | ret = btrfs_cow_block(trans, root, eb, upper->eb, | ||
| 2054 | slot, &eb); | ||
| 2055 | if (ret < 0) { | ||
| 2056 | err = ret; | ||
| 2057 | break; | ||
| 2058 | } | ||
| 2059 | btrfs_set_lock_blocking(eb); | ||
| 2060 | node->eb = eb; | ||
| 2061 | node->locked = 1; | ||
| 2062 | } else { | ||
| 2063 | btrfs_set_node_blockptr(upper->eb, slot, | ||
| 2064 | node->eb->start); | ||
| 2065 | btrfs_set_node_ptr_generation(upper->eb, slot, | ||
| 2066 | trans->transid); | ||
| 2067 | btrfs_mark_buffer_dirty(upper->eb); | ||
| 2068 | |||
| 2069 | ret = btrfs_inc_extent_ref(trans, root, | ||
| 2070 | node->eb->start, blocksize, | ||
| 2071 | upper->eb->start, | ||
| 2072 | btrfs_header_owner(upper->eb), | ||
| 2073 | node->level, 0); | ||
| 2074 | BUG_ON(ret); | ||
| 2075 | |||
| 2076 | ret = btrfs_drop_subtree(trans, root, eb, upper->eb); | ||
| 2077 | BUG_ON(ret); | ||
| 2078 | |||
| 2079 | btrfs_tree_unlock(eb); | ||
| 2080 | free_extent_buffer(eb); | ||
| 2081 | } | ||
| 2082 | if (!lowest) { | ||
| 2083 | btrfs_tree_unlock(upper->eb); | ||
| 2084 | upper->locked = 0; | ||
| 2085 | } | ||
| 2086 | } | ||
| 2087 | path->lowest_level = 0; | ||
| 2088 | return err; | ||
| 2089 | } | ||
| 2090 | |||
| 2091 | static int link_to_upper(struct btrfs_trans_handle *trans, | ||
| 2092 | struct backref_node *node, | ||
| 2093 | struct btrfs_path *path) | ||
| 2094 | { | ||
| 2095 | struct btrfs_key key; | ||
| 2096 | if (!node->eb || list_empty(&node->upper)) | ||
| 2097 | return 0; | ||
| 2098 | |||
| 2099 | btrfs_node_key_to_cpu(node->eb, &key, 0); | ||
| 2100 | return do_relocation(trans, node, &key, path, 0); | ||
| 2101 | } | ||
| 2102 | |||
| 2103 | static int finish_pending_nodes(struct btrfs_trans_handle *trans, | ||
| 2104 | struct backref_cache *cache, | ||
| 2105 | struct btrfs_path *path) | ||
| 2106 | { | ||
| 2107 | struct backref_node *node; | ||
| 2108 | int level; | ||
| 2109 | int ret; | ||
| 2110 | int err = 0; | ||
| 2111 | |||
| 2112 | for (level = 0; level < BTRFS_MAX_LEVEL; level++) { | ||
| 2113 | while (!list_empty(&cache->pending[level])) { | ||
| 2114 | node = list_entry(cache->pending[level].next, | ||
| 2115 | struct backref_node, lower); | ||
| 2116 | BUG_ON(node->level != level); | ||
| 2117 | |||
| 2118 | ret = link_to_upper(trans, node, path); | ||
| 2119 | if (ret < 0) | ||
| 2120 | err = ret; | ||
| 2121 | /* | ||
| 2122 | * this removes the node from the pending list and | ||
| 2123 | * may add some other nodes to the level + 1 | ||
| 2124 | * pending list | ||
| 2125 | */ | ||
| 2126 | remove_backref_node(cache, node); | ||
| 2127 | } | ||
| 2128 | } | ||
| 2129 | BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); | ||
| 2130 | return err; | ||
| 2131 | } | ||
| 2132 | |||
| 2133 | static void mark_block_processed(struct reloc_control *rc, | ||
| 2134 | struct backref_node *node) | ||
| 2135 | { | ||
| 2136 | u32 blocksize; | ||
| 2137 | if (node->level == 0 || | ||
| 2138 | in_block_group(node->bytenr, rc->block_group)) { | ||
| 2139 | blocksize = btrfs_level_size(rc->extent_root, node->level); | ||
| 2140 | set_extent_bits(&rc->processed_blocks, node->bytenr, | ||
| 2141 | node->bytenr + blocksize - 1, EXTENT_DIRTY, | ||
| 2142 | GFP_NOFS); | ||
| 2143 | } | ||
| 2144 | node->processed = 1; | ||
| 2145 | } | ||
| 2146 | |||
| 2147 | /* | ||
| 2148 | * mark a block and all blocks that directly/indirectly reference the block | ||
| 2149 | * as processed. | ||
| 2150 | */ | ||
| 2151 | static void update_processed_blocks(struct reloc_control *rc, | ||
| 2152 | struct backref_node *node) | ||
| 2153 | { | ||
| 2154 | struct backref_node *next = node; | ||
| 2155 | struct backref_edge *edge; | ||
| 2156 | struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; | ||
| 2157 | int index = 0; | ||
| 2158 | |||
| 2159 | while (next) { | ||
| 2160 | cond_resched(); | ||
| 2161 | while (1) { | ||
| 2162 | if (next->processed) | ||
| 2163 | break; | ||
| 2164 | |||
| 2165 | mark_block_processed(rc, next); | ||
| 2166 | |||
| 2167 | if (list_empty(&next->upper)) | ||
| 2168 | break; | ||
| 2169 | |||
| 2170 | edge = list_entry(next->upper.next, | ||
| 2171 | struct backref_edge, list[LOWER]); | ||
| 2172 | edges[index++] = edge; | ||
| 2173 | next = edge->node[UPPER]; | ||
| 2174 | } | ||
| 2175 | next = walk_down_backref(edges, &index); | ||
| 2176 | } | ||
| 2177 | } | ||
| 2178 | |||
| 2179 | static int tree_block_processed(u64 bytenr, u32 blocksize, | ||
| 2180 | struct reloc_control *rc) | ||
| 2181 | { | ||
| 2182 | if (test_range_bit(&rc->processed_blocks, bytenr, | ||
| 2183 | bytenr + blocksize - 1, EXTENT_DIRTY, 1)) | ||
| 2184 | return 1; | ||
| 2185 | return 0; | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * check if there are any file extent pointers in the leaf that point to | ||
| 2190 | * data which requires processing | ||
| 2191 | */ | ||
| 2192 | static int check_file_extents(struct reloc_control *rc, | ||
| 2193 | u64 bytenr, u32 blocksize, u64 ptr_gen) | ||
| 2194 | { | ||
| 2195 | struct btrfs_key found_key; | ||
| 2196 | struct btrfs_file_extent_item *fi; | ||
| 2197 | struct extent_buffer *leaf; | ||
| 2198 | u32 nritems; | ||
| 2199 | int i; | ||
| 2200 | int ret = 0; | ||
| 2201 | |||
| 2202 | leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); | ||
| 2203 | |||
| 2204 | nritems = btrfs_header_nritems(leaf); | ||
| 2205 | for (i = 0; i < nritems; i++) { | ||
| 2206 | cond_resched(); | ||
| 2207 | btrfs_item_key_to_cpu(leaf, &found_key, i); | ||
| 2208 | if (found_key.type != BTRFS_EXTENT_DATA_KEY) | ||
| 2209 | continue; | ||
| 2210 | fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); | ||
| 2211 | if (btrfs_file_extent_type(leaf, fi) == | ||
| 2212 | BTRFS_FILE_EXTENT_INLINE) | ||
| 2213 | continue; | ||
| 2214 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
| 2215 | if (bytenr == 0) | ||
| 2216 | continue; | ||
| 2217 | if (in_block_group(bytenr, rc->block_group)) { | ||
| 2218 | ret = 1; | ||
| 2219 | break; | ||
| 2220 | } | ||
| 2221 | } | ||
| 2222 | free_extent_buffer(leaf); | ||
| 2223 | return ret; | ||
| 2224 | } | ||
| 2225 | |||
| 2226 | /* | ||
| 2227 | * scan child blocks of a given block to find blocks that require processing | ||
| 2228 | */ | ||
| 2229 | static int add_child_blocks(struct btrfs_trans_handle *trans, | ||
| 2230 | struct reloc_control *rc, | ||
| 2231 | struct backref_node *node, | ||
| 2232 | struct rb_root *blocks) | ||
| 2233 | { | ||
| 2234 | struct tree_block *block; | ||
| 2235 | struct rb_node *rb_node; | ||
| 2236 | u64 bytenr; | ||
| 2237 | u64 ptr_gen; | ||
| 2238 | u32 blocksize; | ||
| 2239 | u32 nritems; | ||
| 2240 | int i; | ||
| 2241 | int err = 0; | ||
| 2242 | |||
| 2243 | nritems = btrfs_header_nritems(node->eb); | ||
| 2244 | blocksize = btrfs_level_size(rc->extent_root, node->level - 1); | ||
| 2245 | for (i = 0; i < nritems; i++) { | ||
| 2246 | cond_resched(); | ||
| 2247 | bytenr = btrfs_node_blockptr(node->eb, i); | ||
| 2248 | ptr_gen = btrfs_node_ptr_generation(node->eb, i); | ||
| 2249 | if (ptr_gen == trans->transid) | ||
| 2250 | continue; | ||
| 2251 | if (!in_block_group(bytenr, rc->block_group) && | ||
| 2252 | (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) | ||
| 2253 | continue; | ||
| 2254 | if (tree_block_processed(bytenr, blocksize, rc)) | ||
| 2255 | continue; | ||
| 2256 | |||
| 2257 | readahead_tree_block(rc->extent_root, | ||
| 2258 | bytenr, blocksize, ptr_gen); | ||
| 2259 | } | ||
| 2260 | |||
| 2261 | for (i = 0; i < nritems; i++) { | ||
| 2262 | cond_resched(); | ||
| 2263 | bytenr = btrfs_node_blockptr(node->eb, i); | ||
| 2264 | ptr_gen = btrfs_node_ptr_generation(node->eb, i); | ||
| 2265 | if (ptr_gen == trans->transid) | ||
| 2266 | continue; | ||
| 2267 | if (!in_block_group(bytenr, rc->block_group) && | ||
| 2268 | (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) | ||
| 2269 | continue; | ||
| 2270 | if (tree_block_processed(bytenr, blocksize, rc)) | ||
| 2271 | continue; | ||
| 2272 | if (!in_block_group(bytenr, rc->block_group) && | ||
| 2273 | !check_file_extents(rc, bytenr, blocksize, ptr_gen)) | ||
| 2274 | continue; | ||
| 2275 | |||
| 2276 | block = kmalloc(sizeof(*block), GFP_NOFS); | ||
| 2277 | if (!block) { | ||
| 2278 | err = -ENOMEM; | ||
| 2279 | break; | ||
| 2280 | } | ||
| 2281 | block->bytenr = bytenr; | ||
| 2282 | btrfs_node_key_to_cpu(node->eb, &block->key, i); | ||
| 2283 | block->level = node->level - 1; | ||
| 2284 | block->key_ready = 1; | ||
| 2285 | rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); | ||
| 2286 | BUG_ON(rb_node); | ||
| 2287 | } | ||
| 2288 | if (err) | ||
| 2289 | free_block_list(blocks); | ||
| 2290 | return err; | ||
| 2291 | } | ||
| 2292 | |||
| 2293 | /* | ||
| 2294 | * find adjacent blocks that require processing | ||
| 2295 | */ | ||
| 2296 | static noinline_for_stack | ||
| 2297 | int add_adjacent_blocks(struct btrfs_trans_handle *trans, | ||
| 2298 | struct reloc_control *rc, | ||
| 2299 | struct backref_cache *cache, | ||
| 2300 | struct rb_root *blocks, int level, | ||
| 2301 | struct backref_node **upper) | ||
| 2302 | { | ||
| 2303 | struct backref_node *node; | ||
| 2304 | int ret = 0; | ||
| 2305 | |||
| 2306 | WARN_ON(!list_empty(&cache->pending[level])); | ||
| 2307 | |||
| 2308 | if (list_empty(&cache->pending[level + 1])) | ||
| 2309 | return 1; | ||
| 2310 | |||
| 2311 | node = list_entry(cache->pending[level + 1].next, | ||
| 2312 | struct backref_node, lower); | ||
| 2313 | if (node->eb) | ||
| 2314 | ret = add_child_blocks(trans, rc, node, blocks); | ||
| 2315 | |||
| 2316 | *upper = node; | ||
| 2317 | return ret; | ||
| 2318 | } | ||
| 2319 | |||
| 2320 | static int get_tree_block_key(struct reloc_control *rc, | ||
| 2321 | struct tree_block *block) | ||
| 2322 | { | ||
| 2323 | struct extent_buffer *eb; | ||
| 2324 | |||
| 2325 | BUG_ON(block->key_ready); | ||
| 2326 | eb = read_tree_block(rc->extent_root, block->bytenr, | ||
| 2327 | block->key.objectid, block->key.offset); | ||
| 2328 | WARN_ON(btrfs_header_level(eb) != block->level); | ||
| 2329 | if (block->level == 0) | ||
| 2330 | btrfs_item_key_to_cpu(eb, &block->key, 0); | ||
| 2331 | else | ||
| 2332 | btrfs_node_key_to_cpu(eb, &block->key, 0); | ||
| 2333 | free_extent_buffer(eb); | ||
| 2334 | block->key_ready = 1; | ||
| 2335 | return 0; | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | static int reada_tree_block(struct reloc_control *rc, | ||
| 2339 | struct tree_block *block) | ||
| 2340 | { | ||
| 2341 | BUG_ON(block->key_ready); | ||
| 2342 | readahead_tree_block(rc->extent_root, block->bytenr, | ||
| 2343 | block->key.objectid, block->key.offset); | ||
| 2344 | return 0; | ||
| 2345 | } | ||
| 2346 | |||
| 2347 | /* | ||
| 2348 | * helper function to relocate a tree block | ||
| 2349 | */ | ||
| 2350 | static int relocate_tree_block(struct btrfs_trans_handle *trans, | ||
| 2351 | struct reloc_control *rc, | ||
| 2352 | struct backref_node *node, | ||
| 2353 | struct btrfs_key *key, | ||
| 2354 | struct btrfs_path *path) | ||
| 2355 | { | ||
| 2356 | struct btrfs_root *root; | ||
| 2357 | int ret; | ||
| 2358 | |||
| 2359 | root = select_one_root(trans, node); | ||
| 2360 | if (unlikely(!root)) { | ||
| 2361 | rc->found_old_snapshot = 1; | ||
| 2362 | update_processed_blocks(rc, node); | ||
| 2363 | return 0; | ||
| 2364 | } | ||
| 2365 | |||
| 2366 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
| 2367 | ret = do_relocation(trans, node, key, path, 1); | ||
| 2368 | if (ret < 0) | ||
| 2369 | goto out; | ||
| 2370 | if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { | ||
| 2371 | ret = replace_file_extents(trans, rc, root, | ||
| 2372 | node->eb, NULL); | ||
| 2373 | if (ret < 0) | ||
| 2374 | goto out; | ||
| 2375 | } | ||
| 2376 | drop_node_buffer(node); | ||
| 2377 | } else if (!root->ref_cows) { | ||
| 2378 | path->lowest_level = node->level; | ||
| 2379 | ret = btrfs_search_slot(trans, root, key, path, 0, 1); | ||
| 2380 | btrfs_release_path(root, path); | ||
| 2381 | if (ret < 0) | ||
| 2382 | goto out; | ||
| 2383 | } else if (root != node->root) { | ||
| 2384 | WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); | ||
| 2385 | } | ||
| 2386 | |||
| 2387 | update_processed_blocks(rc, node); | ||
| 2388 | ret = 0; | ||
| 2389 | out: | ||
| 2390 | drop_node_buffer(node); | ||
| 2391 | return ret; | ||
| 2392 | } | ||
| 2393 | |||
| 2394 | /* | ||
| 2395 | * relocate a list of blocks | ||
| 2396 | */ | ||
| 2397 | static noinline_for_stack | ||
| 2398 | int relocate_tree_blocks(struct btrfs_trans_handle *trans, | ||
| 2399 | struct reloc_control *rc, struct rb_root *blocks) | ||
| 2400 | { | ||
| 2401 | struct backref_cache *cache; | ||
| 2402 | struct backref_node *node; | ||
| 2403 | struct btrfs_path *path; | ||
| 2404 | struct tree_block *block; | ||
| 2405 | struct rb_node *rb_node; | ||
| 2406 | int level = -1; | ||
| 2407 | int ret; | ||
| 2408 | int err = 0; | ||
| 2409 | |||
| 2410 | path = btrfs_alloc_path(); | ||
| 2411 | if (!path) | ||
| 2412 | return -ENOMEM; | ||
| 2413 | |||
| 2414 | cache = kmalloc(sizeof(*cache), GFP_NOFS); | ||
| 2415 | if (!cache) { | ||
| 2416 | btrfs_free_path(path); | ||
| 2417 | return -ENOMEM; | ||
| 2418 | } | ||
| 2419 | |||
| 2420 | backref_cache_init(cache); | ||
| 2421 | |||
| 2422 | rb_node = rb_first(blocks); | ||
| 2423 | while (rb_node) { | ||
| 2424 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
| 2425 | if (level == -1) | ||
| 2426 | level = block->level; | ||
| 2427 | else | ||
| 2428 | BUG_ON(level != block->level); | ||
| 2429 | if (!block->key_ready) | ||
| 2430 | reada_tree_block(rc, block); | ||
| 2431 | rb_node = rb_next(rb_node); | ||
| 2432 | } | ||
| 2433 | |||
| 2434 | rb_node = rb_first(blocks); | ||
| 2435 | while (rb_node) { | ||
| 2436 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
| 2437 | if (!block->key_ready) | ||
| 2438 | get_tree_block_key(rc, block); | ||
| 2439 | rb_node = rb_next(rb_node); | ||
| 2440 | } | ||
| 2441 | |||
| 2442 | rb_node = rb_first(blocks); | ||
| 2443 | while (rb_node) { | ||
| 2444 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
| 2445 | |||
| 2446 | node = build_backref_tree(rc, cache, &block->key, | ||
| 2447 | block->level, block->bytenr); | ||
| 2448 | if (IS_ERR(node)) { | ||
| 2449 | err = PTR_ERR(node); | ||
| 2450 | goto out; | ||
| 2451 | } | ||
| 2452 | |||
| 2453 | ret = relocate_tree_block(trans, rc, node, &block->key, | ||
| 2454 | path); | ||
| 2455 | if (ret < 0) { | ||
| 2456 | err = ret; | ||
| 2457 | goto out; | ||
| 2458 | } | ||
| 2459 | remove_backref_node(cache, node); | ||
| 2460 | rb_node = rb_next(rb_node); | ||
| 2461 | } | ||
| 2462 | |||
| 2463 | if (level > 0) | ||
| 2464 | goto out; | ||
| 2465 | |||
| 2466 | free_block_list(blocks); | ||
| 2467 | |||
| 2468 | /* | ||
| 2469 | * now that backrefs of some upper level tree blocks have been cached, | ||
| 2470 | * try relocating blocks referenced by these upper level blocks. | ||
| 2471 | */ | ||
| 2472 | while (1) { | ||
| 2473 | struct backref_node *upper = NULL; | ||
| 2474 | if (trans->transaction->in_commit || | ||
| 2475 | trans->transaction->delayed_refs.flushing) | ||
| 2476 | break; | ||
| 2477 | |||
| 2478 | ret = add_adjacent_blocks(trans, rc, cache, blocks, level, | ||
| 2479 | &upper); | ||
| 2480 | if (ret < 0) | ||
| 2481 | err = ret; | ||
| 2482 | if (ret != 0) | ||
| 2483 | break; | ||
| 2484 | |||
| 2485 | rb_node = rb_first(blocks); | ||
| 2486 | while (rb_node) { | ||
| 2487 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
| 2488 | if (trans->transaction->in_commit || | ||
| 2489 | trans->transaction->delayed_refs.flushing) | ||
| 2490 | goto out; | ||
| 2491 | BUG_ON(!block->key_ready); | ||
| 2492 | node = build_backref_tree(rc, cache, &block->key, | ||
| 2493 | level, block->bytenr); | ||
| 2494 | if (IS_ERR(node)) { | ||
| 2495 | err = PTR_ERR(node); | ||
| 2496 | goto out; | ||
| 2497 | } | ||
| 2498 | |||
| 2499 | ret = relocate_tree_block(trans, rc, node, | ||
| 2500 | &block->key, path); | ||
| 2501 | if (ret < 0) { | ||
| 2502 | err = ret; | ||
| 2503 | goto out; | ||
| 2504 | } | ||
| 2505 | remove_backref_node(cache, node); | ||
| 2506 | rb_node = rb_next(rb_node); | ||
| 2507 | } | ||
| 2508 | free_block_list(blocks); | ||
| 2509 | |||
| 2510 | if (upper) { | ||
| 2511 | ret = link_to_upper(trans, upper, path); | ||
| 2512 | if (ret < 0) { | ||
| 2513 | err = ret; | ||
| 2514 | break; | ||
| 2515 | } | ||
| 2516 | remove_backref_node(cache, upper); | ||
| 2517 | } | ||
| 2518 | } | ||
| 2519 | out: | ||
| 2520 | free_block_list(blocks); | ||
| 2521 | |||
| 2522 | ret = finish_pending_nodes(trans, cache, path); | ||
| 2523 | if (ret < 0) | ||
| 2524 | err = ret; | ||
| 2525 | |||
| 2526 | kfree(cache); | ||
| 2527 | btrfs_free_path(path); | ||
| 2528 | return err; | ||
| 2529 | } | ||
| 2530 | |||
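| | /* | ||
| | * read in the pages covering the given range of the relocation inode and | ||
| | * mark them delalloc/dirty so that writeback copies the data to its new | ||
| | * location | ||
| | */ | ||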
| 2531 | static noinline_for_stack | ||
| 2532 | int relocate_inode_pages(struct inode *inode, u64 start, u64 len) | ||
| 2533 | { | ||
| 2534 | u64 page_start; | ||
| 2535 | u64 page_end; | ||
| 2536 | unsigned long i; | ||
| 2537 | unsigned long first_index; | ||
| 2538 | unsigned long last_index; | ||
| 2539 | unsigned int total_read = 0; | ||
| 2540 | unsigned int total_dirty = 0; | ||
| 2541 | struct page *page; | ||
| 2542 | struct file_ra_state *ra; | ||
| 2543 | struct btrfs_ordered_extent *ordered; | ||
| 2544 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
| 2545 | int ret = 0; | ||
| 2546 | |||
| 2547 | ra = kzalloc(sizeof(*ra), GFP_NOFS); | ||
| 2548 | if (!ra) | ||
| 2549 | return -ENOMEM; | ||
| 2550 | |||
| 2551 | mutex_lock(&inode->i_mutex); | ||
| 2552 | first_index = start >> PAGE_CACHE_SHIFT; | ||
| 2553 | last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; | ||
| 2554 | |||
| 2555 | /* make sure the dirty trick played by the caller works */ | ||
| 2556 | ret = invalidate_inode_pages2_range(inode->i_mapping, | ||
| 2557 | first_index, last_index); | ||
| 2558 | if (ret) | ||
| 2559 | goto out_unlock; | ||
| 2560 | |||
| 2561 | file_ra_state_init(ra, inode->i_mapping); | ||
| 2562 | |||
| 2563 | for (i = first_index ; i <= last_index; i++) { | ||
| 2564 | if (total_read % ra->ra_pages == 0) { | ||
| 2565 | btrfs_force_ra(inode->i_mapping, ra, NULL, i, | ||
| 2566 | min(last_index, ra->ra_pages + i - 1)); | ||
| 2567 | } | ||
| 2568 | total_read++; | ||
| 2569 | again: | ||
| 2570 | if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) | ||
| 2571 | BUG_ON(1); | ||
| 2572 | page = grab_cache_page(inode->i_mapping, i); | ||
| 2573 | if (!page) { | ||
| 2574 | ret = -ENOMEM; | ||
| 2575 | goto out_unlock; | ||
| 2576 | } | ||
| 2577 | if (!PageUptodate(page)) { | ||
| 2578 | btrfs_readpage(NULL, page); | ||
| 2579 | lock_page(page); | ||
| 2580 | if (!PageUptodate(page)) { | ||
| 2581 | unlock_page(page); | ||
| 2582 | page_cache_release(page); | ||
| 2583 | ret = -EIO; | ||
| 2584 | goto out_unlock; | ||
| 2585 | } | ||
| 2586 | } | ||
| 2587 | wait_on_page_writeback(page); | ||
| 2588 | |||
| 2589 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
| 2590 | page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
| 2591 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
| 2592 | |||
| 2593 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | ||
| 2594 | if (ordered) { | ||
| 2595 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
| 2596 | unlock_page(page); | ||
| 2597 | page_cache_release(page); | ||
| 2598 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
| 2599 | btrfs_put_ordered_extent(ordered); | ||
| 2600 | goto again; | ||
| 2601 | } | ||
| 2602 | set_page_extent_mapped(page); | ||
| 2603 | |||
| 2604 | if (i == first_index) | ||
| 2605 | set_extent_bits(io_tree, page_start, page_end, | ||
| 2606 | EXTENT_BOUNDARY, GFP_NOFS); | ||
| 2607 | btrfs_set_extent_delalloc(inode, page_start, page_end); | ||
| 2608 | |||
| 2609 | set_page_dirty(page); | ||
| 2610 | total_dirty++; | ||
| 2611 | |||
| 2612 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
| 2613 | unlock_page(page); | ||
| 2614 | page_cache_release(page); | ||
| 2615 | } | ||
| 2616 | out_unlock: | ||
| 2617 | mutex_unlock(&inode->i_mutex); | ||
| 2618 | kfree(ra); | ||
| 2619 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); | ||
| 2620 | return ret; | ||
| 2621 | } | ||
| 2622 | |||
| 2623 | static noinline_for_stack | ||
| 2624 | int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) | ||
| 2625 | { | ||
| 2626 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 2627 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
| 2628 | struct extent_map *em; | ||
| 2629 | u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; | ||
| 2630 | u64 end = start + extent_key->offset - 1; | ||
| 2631 | |||
| 2632 | em = alloc_extent_map(GFP_NOFS); | ||
| 2633 | em->start = start; | ||
| 2634 | em->len = extent_key->offset; | ||
| 2635 | em->block_len = extent_key->offset; | ||
| 2636 | em->block_start = extent_key->objectid; | ||
| 2637 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
| 2638 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
| 2639 | |||
| 2640 | /* setup extent map to cheat btrfs_readpage */ | ||
| 2641 | lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
| 2642 | while (1) { | ||
| 2643 | int ret; | ||
| 2644 | spin_lock(&em_tree->lock); | ||
| 2645 | ret = add_extent_mapping(em_tree, em); | ||
| 2646 | spin_unlock(&em_tree->lock); | ||
| 2647 | if (ret != -EEXIST) { | ||
| 2648 | free_extent_map(em); | ||
| 2649 | break; | ||
| 2650 | } | ||
| 2651 | btrfs_drop_extent_cache(inode, start, end, 0); | ||
| 2652 | } | ||
| 2653 | unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
| 2654 | |||
| 2655 | return relocate_inode_pages(inode, start, extent_key->offset); | ||
| 2656 | } | ||
| 2657 | |||
| 2658 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 2659 | static int get_ref_objectid_v0(struct reloc_control *rc, | ||
| 2660 | struct btrfs_path *path, | ||
| 2661 | struct btrfs_key *extent_key, | ||
| 2662 | u64 *ref_objectid, int *path_change) | ||
| 2663 | { | ||
| 2664 | struct btrfs_key key; | ||
| 2665 | struct extent_buffer *leaf; | ||
| 2666 | struct btrfs_extent_ref_v0 *ref0; | ||
| 2667 | int ret; | ||
| 2668 | int slot; | ||
| 2669 | |||
| 2670 | leaf = path->nodes[0]; | ||
| 2671 | slot = path->slots[0]; | ||
| 2672 | while (1) { | ||
| 2673 | if (slot >= btrfs_header_nritems(leaf)) { | ||
| 2674 | ret = btrfs_next_leaf(rc->extent_root, path); | ||
| 2675 | if (ret < 0) | ||
| 2676 | return ret; | ||
| 2677 | BUG_ON(ret > 0); | ||
| 2678 | leaf = path->nodes[0]; | ||
| 2679 | slot = path->slots[0]; | ||
| 2680 | if (path_change) | ||
| 2681 | *path_change = 1; | ||
| 2682 | } | ||
| 2683 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
| 2684 | if (key.objectid != extent_key->objectid) | ||
| 2685 | return -ENOENT; | ||
| 2686 | |||
| 2687 | if (key.type != BTRFS_EXTENT_REF_V0_KEY) { | ||
| 2688 | slot++; | ||
| 2689 | continue; | ||
| 2690 | } | ||
| 2691 | ref0 = btrfs_item_ptr(leaf, slot, | ||
| 2692 | struct btrfs_extent_ref_v0); | ||
| 2693 | *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); | ||
| 2694 | break; | ||
| 2695 | } | ||
| 2696 | return 0; | ||
| 2697 | } | ||
| 2698 | #endif | ||
| 2699 | |||
| 2700 | /* | ||
| 2701 | * helper to add a tree block to the list. | ||
| 2702 | * the major work is getting the generation and level of the block | ||
| 2703 | */ | ||
| 2704 | static int add_tree_block(struct reloc_control *rc, | ||
| 2705 | struct btrfs_key *extent_key, | ||
| 2706 | struct btrfs_path *path, | ||
| 2707 | struct rb_root *blocks) | ||
| 2708 | { | ||
| 2709 | struct extent_buffer *eb; | ||
| 2710 | struct btrfs_extent_item *ei; | ||
| 2711 | struct btrfs_tree_block_info *bi; | ||
| 2712 | struct tree_block *block; | ||
| 2713 | struct rb_node *rb_node; | ||
| 2714 | u32 item_size; | ||
| 2715 | int level = -1; | ||
| 2716 | int generation; | ||
| 2717 | |||
| 2718 | eb = path->nodes[0]; | ||
| 2719 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | ||
| 2720 | |||
| 2721 | if (item_size >= sizeof(*ei) + sizeof(*bi)) { | ||
| 2722 | ei = btrfs_item_ptr(eb, path->slots[0], | ||
| 2723 | struct btrfs_extent_item); | ||
| 2724 | bi = (struct btrfs_tree_block_info *)(ei + 1); | ||
| 2725 | generation = btrfs_extent_generation(eb, ei); | ||
| 2726 | level = btrfs_tree_block_level(eb, bi); | ||
| 2727 | } else { | ||
| 2728 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 2729 | u64 ref_owner; | ||
| 2730 | int ret; | ||
| 2731 | |||
| 2732 | BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); | ||
| 2733 | ret = get_ref_objectid_v0(rc, path, extent_key, | ||
| 2734 | &ref_owner, NULL); | ||
| 2735 | BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); | ||
| 2736 | level = (int)ref_owner; | ||
| 2737 | /* FIXME: get real generation */ | ||
| 2738 | generation = 0; | ||
| 2739 | #else | ||
| 2740 | BUG(); | ||
| 2741 | #endif | ||
| 2742 | } | ||
| 2743 | |||
| 2744 | btrfs_release_path(rc->extent_root, path); | ||
| 2745 | |||
| 2746 | BUG_ON(level == -1); | ||
| 2747 | |||
| 2748 | block = kmalloc(sizeof(*block), GFP_NOFS); | ||
| 2749 | if (!block) | ||
| 2750 | return -ENOMEM; | ||
| 2751 | |||
| 2752 | block->bytenr = extent_key->objectid; | ||
| 2753 | block->key.objectid = extent_key->offset; | ||
| 2754 | block->key.offset = generation; | ||
| 2755 | block->level = level; | ||
| 2756 | block->key_ready = 0; | ||
| 2757 | |||
| 2758 | rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); | ||
| 2759 | BUG_ON(rb_node); | ||
| 2760 | |||
| 2761 | return 0; | ||
| 2762 | } | ||
| 2763 | |||
| 2764 | /* | ||
| 2765 | * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY | ||
| 2766 | */ | ||
| 2767 | static int __add_tree_block(struct reloc_control *rc, | ||
| 2768 | u64 bytenr, u32 blocksize, | ||
| 2769 | struct rb_root *blocks) | ||
| 2770 | { | ||
| 2771 | struct btrfs_path *path; | ||
| 2772 | struct btrfs_key key; | ||
| 2773 | int ret; | ||
| 2774 | |||
| 2775 | if (tree_block_processed(bytenr, blocksize, rc)) | ||
| 2776 | return 0; | ||
| 2777 | |||
| 2778 | if (tree_search(blocks, bytenr)) | ||
| 2779 | return 0; | ||
| 2780 | |||
| 2781 | path = btrfs_alloc_path(); | ||
| 2782 | if (!path) | ||
| 2783 | return -ENOMEM; | ||
| 2784 | |||
| 2785 | key.objectid = bytenr; | ||
| 2786 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 2787 | key.offset = blocksize; | ||
| 2788 | |||
| 2789 | path->search_commit_root = 1; | ||
| 2790 | path->skip_locking = 1; | ||
| 2791 | ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); | ||
| 2792 | if (ret < 0) | ||
| 2793 | goto out; | ||
| 2794 | BUG_ON(ret); | ||
| 2795 | |||
| 2796 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
| 2797 | ret = add_tree_block(rc, &key, path, blocks); | ||
| 2798 | out: | ||
| 2799 | btrfs_free_path(path); | ||
| 2800 | return ret; | ||
| 2801 | } | ||
| 2802 | |||
| 2803 | /* | ||
| 2804 | * helper to check if the block uses full backrefs for the pointers in it | ||
| 2805 | */ | ||
| 2806 | static int block_use_full_backref(struct reloc_control *rc, | ||
| 2807 | struct extent_buffer *eb) | ||
| 2808 | { | ||
| 2809 | struct btrfs_path *path; | ||
| 2810 | struct btrfs_extent_item *ei; | ||
| 2811 | struct btrfs_key key; | ||
| 2812 | u64 flags; | ||
| 2813 | int ret; | ||
| 2814 | |||
| 2815 | if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || | ||
| 2816 | btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) | ||
| 2817 | return 1; | ||
| 2818 | |||
| 2819 | path = btrfs_alloc_path(); | ||
| 2820 | BUG_ON(!path); | ||
| 2821 | |||
| 2822 | key.objectid = eb->start; | ||
| 2823 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 2824 | key.offset = eb->len; | ||
| 2825 | |||
| 2826 | path->search_commit_root = 1; | ||
| 2827 | path->skip_locking = 1; | ||
| 2828 | ret = btrfs_search_slot(NULL, rc->extent_root, | ||
| 2829 | &key, path, 0, 0); | ||
| 2830 | BUG_ON(ret); | ||
| 2831 | |||
| 2832 | ei = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
| 2833 | struct btrfs_extent_item); | ||
| 2834 | flags = btrfs_extent_flags(path->nodes[0], ei); | ||
| 2835 | BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); | ||
| 2836 | if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) | ||
| 2837 | ret = 1; | ||
| 2838 | else | ||
| 2839 | ret = 0; | ||
| 2840 | btrfs_free_path(path); | ||
| 2841 | return ret; | ||
| 2842 | } | ||
| 2843 | |||
| 2844 | /* | ||
| 2845 | * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY | ||
| 2846 | * this function scans the fs tree to find blocks that reference the data extent | ||
| 2847 | */ | ||
| 2848 | static int find_data_references(struct reloc_control *rc, | ||
| 2849 | struct btrfs_key *extent_key, | ||
| 2850 | struct extent_buffer *leaf, | ||
| 2851 | struct btrfs_extent_data_ref *ref, | ||
| 2852 | struct rb_root *blocks) | ||
| 2853 | { | ||
| 2854 | struct btrfs_path *path; | ||
| 2855 | struct tree_block *block; | ||
| 2856 | struct btrfs_root *root; | ||
| 2857 | struct btrfs_file_extent_item *fi; | ||
| 2858 | struct rb_node *rb_node; | ||
| 2859 | struct btrfs_key key; | ||
| 2860 | u64 ref_root; | ||
| 2861 | u64 ref_objectid; | ||
| 2862 | u64 ref_offset; | ||
| 2863 | u32 ref_count; | ||
| 2864 | u32 nritems; | ||
| 2865 | int err = 0; | ||
| 2866 | int added = 0; | ||
| 2867 | int counted; | ||
| 2868 | int ret; | ||
| 2869 | |||
| 2870 | path = btrfs_alloc_path(); | ||
| 2871 | if (!path) | ||
| 2872 | return -ENOMEM; | ||
| 2873 | |||
| 2874 | ref_root = btrfs_extent_data_ref_root(leaf, ref); | ||
| 2875 | ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); | ||
| 2876 | ref_offset = btrfs_extent_data_ref_offset(leaf, ref); | ||
| 2877 | ref_count = btrfs_extent_data_ref_count(leaf, ref); | ||
| 2878 | |||
| 2879 | root = read_fs_root(rc->extent_root->fs_info, ref_root); | ||
| 2880 | if (IS_ERR(root)) { | ||
| 2881 | err = PTR_ERR(root); | ||
| 2882 | goto out; | ||
| 2883 | } | ||
| 2884 | |||
| 2885 | key.objectid = ref_objectid; | ||
| 2886 | key.offset = ref_offset; | ||
| 2887 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2888 | |||
| 2889 | path->search_commit_root = 1; | ||
| 2890 | path->skip_locking = 1; | ||
| 2891 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 2892 | if (ret < 0) { | ||
| 2893 | err = ret; | ||
| 2894 | goto out; | ||
| 2895 | } | ||
| 2896 | |||
| 2897 | leaf = path->nodes[0]; | ||
| 2898 | nritems = btrfs_header_nritems(leaf); | ||
| 2899 | /* | ||
| 2900 | * the references in tree blocks that use full backrefs | ||
| 2901 | * are not counted in ref_count | ||
| 2902 | */ | ||
| 2903 | if (block_use_full_backref(rc, leaf)) | ||
| 2904 | counted = 0; | ||
| 2905 | else | ||
| 2906 | counted = 1; | ||
| 2907 | rb_node = tree_search(blocks, leaf->start); | ||
| 2908 | if (rb_node) { | ||
| 2909 | if (counted) | ||
| 2910 | added = 1; | ||
| 2911 | else | ||
| 2912 | path->slots[0] = nritems; | ||
| 2913 | } | ||
| 2914 | |||
| 2915 | while (ref_count > 0) { | ||
| 2916 | while (path->slots[0] >= nritems) { | ||
| 2917 | ret = btrfs_next_leaf(root, path); | ||
| 2918 | if (ret < 0) { | ||
| 2919 | err = ret; | ||
| 2920 | goto out; | ||
| 2921 | } | ||
| 2922 | if (ret > 0) { | ||
| 2923 | WARN_ON(1); | ||
| 2924 | goto out; | ||
| 2925 | } | ||
| 2926 | |||
| 2927 | leaf = path->nodes[0]; | ||
| 2928 | nritems = btrfs_header_nritems(leaf); | ||
| 2929 | added = 0; | ||
| 2930 | |||
| 2931 | if (block_use_full_backref(rc, leaf)) | ||
| 2932 | counted = 0; | ||
| 2933 | else | ||
| 2934 | counted = 1; | ||
| 2935 | rb_node = tree_search(blocks, leaf->start); | ||
| 2936 | if (rb_node) { | ||
| 2937 | if (counted) | ||
| 2938 | added = 1; | ||
| 2939 | else | ||
| 2940 | path->slots[0] = nritems; | ||
| 2941 | } | ||
| 2942 | } | ||
| 2943 | |||
| 2944 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
| 2945 | if (key.objectid != ref_objectid || | ||
| 2946 | key.type != BTRFS_EXTENT_DATA_KEY) { | ||
| 2947 | WARN_ON(1); | ||
| 2948 | break; | ||
| 2949 | } | ||
| 2950 | |||
| 2951 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
| 2952 | struct btrfs_file_extent_item); | ||
| 2953 | |||
| 2954 | if (btrfs_file_extent_type(leaf, fi) == | ||
| 2955 | BTRFS_FILE_EXTENT_INLINE) | ||
| 2956 | goto next; | ||
| 2957 | |||
| 2958 | if (btrfs_file_extent_disk_bytenr(leaf, fi) != | ||
| 2959 | extent_key->objectid) | ||
| 2960 | goto next; | ||
| 2961 | |||
| 2962 | key.offset -= btrfs_file_extent_offset(leaf, fi); | ||
| 2963 | if (key.offset != ref_offset) | ||
| 2964 | goto next; | ||
| 2965 | |||
| 2966 | if (counted) | ||
| 2967 | ref_count--; | ||
| 2968 | if (added) | ||
| 2969 | goto next; | ||
| 2970 | |||
| 2971 | if (!tree_block_processed(leaf->start, leaf->len, rc)) { | ||
| 2972 | block = kmalloc(sizeof(*block), GFP_NOFS); | ||
| 2973 | if (!block) { | ||
| 2974 | err = -ENOMEM; | ||
| 2975 | break; | ||
| 2976 | } | ||
| 2977 | block->bytenr = leaf->start; | ||
| 2978 | btrfs_item_key_to_cpu(leaf, &block->key, 0); | ||
| 2979 | block->level = 0; | ||
| 2980 | block->key_ready = 1; | ||
| 2981 | rb_node = tree_insert(blocks, block->bytenr, | ||
| 2982 | &block->rb_node); | ||
| 2983 | BUG_ON(rb_node); | ||
| 2984 | } | ||
| 2985 | if (counted) | ||
| 2986 | added = 1; | ||
| 2987 | else | ||
| 2988 | path->slots[0] = nritems; | ||
| 2989 | next: | ||
| 2990 | path->slots[0]++; | ||
| 2991 | |||
| 2992 | } | ||
| 2993 | out: | ||
| 2994 | btrfs_free_path(path); | ||
| 2995 | return err; | ||
| 2996 | } | ||
| 2997 | |||
| 2998 | /* | ||
| 2999 | * helper to find all tree blocks that reference a given data extent | ||
| 3000 | */ | ||
| 3001 | static noinline_for_stack | ||
| 3002 | int add_data_references(struct reloc_control *rc, | ||
| 3003 | struct btrfs_key *extent_key, | ||
| 3004 | struct btrfs_path *path, | ||
| 3005 | struct rb_root *blocks) | ||
| 3006 | { | ||
| 3007 | struct btrfs_key key; | ||
| 3008 | struct extent_buffer *eb; | ||
| 3009 | struct btrfs_extent_data_ref *dref; | ||
| 3010 | struct btrfs_extent_inline_ref *iref; | ||
| 3011 | unsigned long ptr; | ||
| 3012 | unsigned long end; | ||
| 3013 | u32 blocksize; | ||
| 3014 | int ret; | ||
| 3015 | int err = 0; | ||
| 3016 | |||
| 3017 | ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, | ||
| 3018 | extent_key->offset); | ||
| 3019 | BUG_ON(ret < 0); | ||
| 3020 | if (ret > 0) { | ||
| 3021 | /* the relocated data is fragmented */ | ||
| 3022 | rc->extents_skipped++; | ||
| 3023 | btrfs_release_path(rc->extent_root, path); | ||
| 3024 | return 0; | ||
| 3025 | } | ||
| 3026 | |||
| 3027 | blocksize = btrfs_level_size(rc->extent_root, 0); | ||
| 3028 | |||
| 3029 | eb = path->nodes[0]; | ||
| 3030 | ptr = btrfs_item_ptr_offset(eb, path->slots[0]); | ||
| 3031 | end = ptr + btrfs_item_size_nr(eb, path->slots[0]); | ||
| 3032 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 3033 | if (ptr + sizeof(struct btrfs_extent_item_v0) == end) | ||
| 3034 | ptr = end; | ||
| 3035 | else | ||
| 3036 | #endif | ||
| 3037 | ptr += sizeof(struct btrfs_extent_item); | ||
| 3038 | |||
| 3039 | while (ptr < end) { | ||
| 3040 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
| 3041 | key.type = btrfs_extent_inline_ref_type(eb, iref); | ||
| 3042 | if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
| 3043 | key.offset = btrfs_extent_inline_ref_offset(eb, iref); | ||
| 3044 | ret = __add_tree_block(rc, key.offset, blocksize, | ||
| 3045 | blocks); | ||
| 3046 | } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 3047 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
| 3048 | ret = find_data_references(rc, extent_key, | ||
| 3049 | eb, dref, blocks); | ||
| 3050 | } else { | ||
| 3051 | BUG(); | ||
| 3052 | } | ||
| 3053 | ptr += btrfs_extent_inline_ref_size(key.type); | ||
| 3054 | } | ||
| 3055 | WARN_ON(ptr > end); | ||
| 3056 | |||
| 3057 | while (1) { | ||
| 3058 | cond_resched(); | ||
| 3059 | eb = path->nodes[0]; | ||
| 3060 | if (path->slots[0] >= btrfs_header_nritems(eb)) { | ||
| 3061 | ret = btrfs_next_leaf(rc->extent_root, path); | ||
| 3062 | if (ret < 0) { | ||
| 3063 | err = ret; | ||
| 3064 | break; | ||
| 3065 | } | ||
| 3066 | if (ret > 0) | ||
| 3067 | break; | ||
| 3068 | eb = path->nodes[0]; | ||
| 3069 | } | ||
| 3070 | |||
| 3071 | btrfs_item_key_to_cpu(eb, &key, path->slots[0]); | ||
| 3072 | if (key.objectid != extent_key->objectid) | ||
| 3073 | break; | ||
| 3074 | |||
| 3075 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 3076 | if (key.type == BTRFS_SHARED_DATA_REF_KEY || | ||
| 3077 | key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
| 3078 | #else | ||
| 3079 | BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); | ||
| 3080 | if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
| 3081 | #endif | ||
| 3082 | ret = __add_tree_block(rc, key.offset, blocksize, | ||
| 3083 | blocks); | ||
| 3084 | } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
| 3085 | dref = btrfs_item_ptr(eb, path->slots[0], | ||
| 3086 | struct btrfs_extent_data_ref); | ||
| 3087 | ret = find_data_references(rc, extent_key, | ||
| 3088 | eb, dref, blocks); | ||
| 3089 | } else { | ||
| 3090 | ret = 0; | ||
| 3091 | } | ||
| 3092 | if (ret) { | ||
| 3093 | err = ret; | ||
| 3094 | break; | ||
| 3095 | } | ||
| 3096 | path->slots[0]++; | ||
| 3097 | } | ||
| 3098 | btrfs_release_path(rc->extent_root, path); | ||
| 3099 | if (err) | ||
| 3100 | free_block_list(blocks); | ||
| 3101 | return err; | ||
| 3102 | } | ||
| 3103 | |||
| 3104 | /* | ||
| 3105 | * helper to find the next unprocessed extent | ||
| 3106 | */ | ||
| 3107 | static noinline_for_stack | ||
| 3108 | int find_next_extent(struct btrfs_trans_handle *trans, | ||
| 3109 | struct reloc_control *rc, struct btrfs_path *path) | ||
| 3110 | { | ||
| 3111 | struct btrfs_key key; | ||
| 3112 | struct extent_buffer *leaf; | ||
| 3113 | u64 start, end, last; | ||
| 3114 | int ret; | ||
| 3115 | |||
| 3116 | last = rc->block_group->key.objectid + rc->block_group->key.offset; | ||
| 3117 | while (1) { | ||
| 3118 | cond_resched(); | ||
| 3119 | if (rc->search_start >= last) { | ||
| 3120 | ret = 1; | ||
| 3121 | break; | ||
| 3122 | } | ||
| 3123 | |||
| 3124 | key.objectid = rc->search_start; | ||
| 3125 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 3126 | key.offset = 0; | ||
| 3127 | |||
| 3128 | path->search_commit_root = 1; | ||
| 3129 | path->skip_locking = 1; | ||
| 3130 | ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, | ||
| 3131 | 0, 0); | ||
| 3132 | if (ret < 0) | ||
| 3133 | break; | ||
| 3134 | next: | ||
| 3135 | leaf = path->nodes[0]; | ||
| 3136 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
| 3137 | ret = btrfs_next_leaf(rc->extent_root, path); | ||
| 3138 | if (ret != 0) | ||
| 3139 | break; | ||
| 3140 | leaf = path->nodes[0]; | ||
| 3141 | } | ||
| 3142 | |||
| 3143 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
| 3144 | if (key.objectid >= last) { | ||
| 3145 | ret = 1; | ||
| 3146 | break; | ||
| 3147 | } | ||
| 3148 | |||
| 3149 | if (key.type != BTRFS_EXTENT_ITEM_KEY || | ||
| 3150 | key.objectid + key.offset <= rc->search_start) { | ||
| 3151 | path->slots[0]++; | ||
| 3152 | goto next; | ||
| 3153 | } | ||
| 3154 | |||
| 3155 | ret = find_first_extent_bit(&rc->processed_blocks, | ||
| 3156 | key.objectid, &start, &end, | ||
| 3157 | EXTENT_DIRTY); | ||
| 3158 | |||
| 3159 | if (ret == 0 && start <= key.objectid) { | ||
| 3160 | btrfs_release_path(rc->extent_root, path); | ||
| 3161 | rc->search_start = end + 1; | ||
| 3162 | } else { | ||
| 3163 | rc->search_start = key.objectid + key.offset; | ||
| 3164 | return 0; | ||
| 3165 | } | ||
| 3166 | } | ||
| 3167 | btrfs_release_path(rc->extent_root, path); | ||
| 3168 | return ret; | ||
| 3169 | } | ||
| 3170 | |||
| 3171 | static void set_reloc_control(struct reloc_control *rc) | ||
| 3172 | { | ||
| 3173 | struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; | ||
| 3174 | mutex_lock(&fs_info->trans_mutex); | ||
| 3175 | fs_info->reloc_ctl = rc; | ||
| 3176 | mutex_unlock(&fs_info->trans_mutex); | ||
| 3177 | } | ||
| 3178 | |||
| 3179 | static void unset_reloc_control(struct reloc_control *rc) | ||
| 3180 | { | ||
| 3181 | struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; | ||
| 3182 | mutex_lock(&fs_info->trans_mutex); | ||
| 3183 | fs_info->reloc_ctl = NULL; | ||
| 3184 | mutex_unlock(&fs_info->trans_mutex); | ||
| 3185 | } | ||
| 3186 | |||
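| | /* returns 1 if the combination of extent flags is inconsistent */ | ||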
| 3187 | static int check_extent_flags(u64 flags) | ||
| 3188 | { | ||
| 3189 | if ((flags & BTRFS_EXTENT_FLAG_DATA) && | ||
| 3190 | (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) | ||
| 3191 | return 1; | ||
| 3192 | if (!(flags & BTRFS_EXTENT_FLAG_DATA) && | ||
| 3193 | !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) | ||
| 3194 | return 1; | ||
| 3195 | if ((flags & BTRFS_EXTENT_FLAG_DATA) && | ||
| 3196 | (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) | ||
| 3197 | return 1; | ||
| 3198 | return 0; | ||
| 3199 | } | ||
| 3200 | |||
| 3201 | static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | ||
| 3202 | { | ||
| 3203 | struct rb_root blocks = RB_ROOT; | ||
| 3204 | struct btrfs_key key; | ||
| 3205 | struct btrfs_trans_handle *trans = NULL; | ||
| 3206 | struct btrfs_path *path; | ||
| 3207 | struct btrfs_extent_item *ei; | ||
| 3208 | unsigned long nr; | ||
| 3209 | u64 flags; | ||
| 3210 | u32 item_size; | ||
| 3211 | int ret; | ||
| 3212 | int err = 0; | ||
| 3213 | |||
| 3214 | path = btrfs_alloc_path(); | ||
| 3215 | if (!path) | ||
| 3216 | return -ENOMEM; | ||
| 3217 | |||
| 3218 | rc->search_start = rc->block_group->key.objectid; | ||
| 3219 | clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, | ||
| 3220 | GFP_NOFS); | ||
| 3221 | |||
| 3222 | rc->create_reloc_root = 1; | ||
| 3223 | set_reloc_control(rc); | ||
| 3224 | |||
| 3225 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
| 3226 | btrfs_commit_transaction(trans, rc->extent_root); | ||
| 3227 | |||
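| | /* | ||
| | * main loop: walk the extents in the block group; relocate tree blocks | ||
| | * directly, and for data extents either dirty the pages of the relocation | ||
| | * inode (MOVE_DATA_EXTENTS stage) or update the tree blocks that reference | ||
| | * the data (UPDATE_DATA_PTRS stage) | ||
| | */ | ||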
| 3228 | while (1) { | ||
| 3229 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
| 3230 | |||
| 3231 | ret = find_next_extent(trans, rc, path); | ||
| 3232 | if (ret < 0) | ||
| 3233 | err = ret; | ||
| 3234 | if (ret != 0) | ||
| 3235 | break; | ||
| 3236 | |||
| 3237 | rc->extents_found++; | ||
| 3238 | |||
| 3239 | ei = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
| 3240 | struct btrfs_extent_item); | ||
| 3241 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
| 3242 | item_size = btrfs_item_size_nr(path->nodes[0], | ||
| 3243 | path->slots[0]); | ||
| 3244 | if (item_size >= sizeof(*ei)) { | ||
| 3245 | flags = btrfs_extent_flags(path->nodes[0], ei); | ||
| 3246 | ret = check_extent_flags(flags); | ||
| 3247 | BUG_ON(ret); | ||
| 3248 | |||
| 3249 | } else { | ||
| 3250 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
| 3251 | u64 ref_owner; | ||
| 3252 | int path_change = 0; | ||
| 3253 | |||
| 3254 | BUG_ON(item_size != | ||
| 3255 | sizeof(struct btrfs_extent_item_v0)); | ||
| 3256 | ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, | ||
| 3257 | &path_change); | ||
| 3258 | if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) | ||
| 3259 | flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; | ||
| 3260 | else | ||
| 3261 | flags = BTRFS_EXTENT_FLAG_DATA; | ||
| 3262 | |||
| 3263 | if (path_change) { | ||
| 3264 | btrfs_release_path(rc->extent_root, path); | ||
| 3265 | |||
| 3266 | path->search_commit_root = 1; | ||
| 3267 | path->skip_locking = 1; | ||
| 3268 | ret = btrfs_search_slot(NULL, rc->extent_root, | ||
| 3269 | &key, path, 0, 0); | ||
| 3270 | if (ret < 0) { | ||
| 3271 | err = ret; | ||
| 3272 | break; | ||
| 3273 | } | ||
| 3274 | BUG_ON(ret > 0); | ||
| 3275 | } | ||
| 3276 | #else | ||
| 3277 | BUG(); | ||
| 3278 | #endif | ||
| 3279 | } | ||
| 3280 | |||
| 3281 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
| 3282 | ret = add_tree_block(rc, &key, path, &blocks); | ||
| 3283 | } else if (rc->stage == UPDATE_DATA_PTRS && | ||
| 3284 | (flags & BTRFS_EXTENT_FLAG_DATA)) { | ||
| 3285 | ret = add_data_references(rc, &key, path, &blocks); | ||
| 3286 | } else { | ||
| 3287 | btrfs_release_path(rc->extent_root, path); | ||
| 3288 | ret = 0; | ||
| 3289 | } | ||
| 3290 | if (ret < 0) { | ||
| 3291 | err = ret; | ||
| 3292 | break; | ||
| 3293 | } | ||
| 3294 | |||
| 3295 | if (!RB_EMPTY_ROOT(&blocks)) { | ||
| 3296 | ret = relocate_tree_blocks(trans, rc, &blocks); | ||
| 3297 | if (ret < 0) { | ||
| 3298 | err = ret; | ||
| 3299 | break; | ||
| 3300 | } | ||
| 3301 | } | ||
| 3302 | |||
| 3303 | nr = trans->blocks_used; | ||
| 3304 | btrfs_end_transaction_throttle(trans, rc->extent_root); | ||
| 3305 | trans = NULL; | ||
| 3306 | btrfs_btree_balance_dirty(rc->extent_root, nr); | ||
| 3307 | |||
| 3308 | if (rc->stage == MOVE_DATA_EXTENTS && | ||
| 3309 | (flags & BTRFS_EXTENT_FLAG_DATA)) { | ||
| 3310 | rc->found_file_extent = 1; | ||
| 3311 | ret = relocate_data_extent(rc->data_inode, &key); | ||
| 3312 | if (ret < 0) { | ||
| 3313 | err = ret; | ||
| 3314 | break; | ||
| 3315 | } | ||
| 3316 | } | ||
| 3317 | } | ||
| 3318 | btrfs_free_path(path); | ||
| 3319 | |||
| 3320 | if (trans) { | ||
| 3321 | nr = trans->blocks_used; | ||
| 3322 | btrfs_end_transaction(trans, rc->extent_root); | ||
| 3323 | btrfs_btree_balance_dirty(rc->extent_root, nr); | ||
| 3324 | } | ||
| 3325 | |||
| 3326 | rc->create_reloc_root = 0; | ||
| 3327 | smp_mb(); | ||
| 3328 | |||
| 3329 | if (rc->extents_found > 0) { | ||
| 3330 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
| 3331 | btrfs_commit_transaction(trans, rc->extent_root); | ||
| 3332 | } | ||
| 3333 | |||
| 3334 | merge_reloc_roots(rc); | ||
| 3335 | |||
| 3336 | unset_reloc_control(rc); | ||
| 3337 | |||
| 3338 | /* get rid of pinned extents */ | ||
| 3339 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
| 3340 | btrfs_commit_transaction(trans, rc->extent_root); | ||
| 3341 | |||
| 3342 | return err; | ||
| 3343 | } | ||
| 3344 | |||
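| | /* | ||
| | * insert a bare inode item for the data relocation inode; the item is | ||
| | * zeroed, so the link count starts at 0 | ||
| | */ | ||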
| 3345 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, | ||
| 3346 | struct btrfs_root *root, | ||
| 3347 | u64 objectid, u64 size) | ||
| 3348 | { | ||
| 3349 | struct btrfs_path *path; | ||
| 3350 | struct btrfs_inode_item *item; | ||
| 3351 | struct extent_buffer *leaf; | ||
| 3352 | int ret; | ||
| 3353 | |||
| 3354 | path = btrfs_alloc_path(); | ||
| 3355 | if (!path) | ||
| 3356 | return -ENOMEM; | ||
| 3357 | |||
| 3358 | ret = btrfs_insert_empty_inode(trans, root, path, objectid); | ||
| 3359 | if (ret) | ||
| 3360 | goto out; | ||
| 3361 | |||
| 3362 | leaf = path->nodes[0]; | ||
| 3363 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); | ||
| 3364 | memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); | ||
| 3365 | btrfs_set_inode_generation(leaf, item, 1); | ||
| 3366 | btrfs_set_inode_size(leaf, item, size); | ||
| 3367 | btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); | ||
| 3368 | btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); | ||
| 3369 | btrfs_mark_buffer_dirty(leaf); | ||
| 3370 | btrfs_release_path(root, path); | ||
| 3371 | out: | ||
| 3372 | btrfs_free_path(path); | ||
| 3373 | return ret; | ||
| 3374 | } | ||
| 3375 | |||
| 3376 | /* | ||
| 3377 | * helper to create an inode for data relocation. | ||
| 3378 | * the inode is in the data relocation tree and its link count is 0 | ||
| 3379 | */ | ||
| 3380 | static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, | ||
| 3381 | struct btrfs_block_group_cache *group) | ||
| 3382 | { | ||
| 3383 | struct inode *inode = NULL; | ||
| 3384 | struct btrfs_trans_handle *trans; | ||
| 3385 | struct btrfs_root *root; | ||
| 3386 | struct btrfs_key key; | ||
| 3387 | unsigned long nr; | ||
| 3388 | u64 objectid = BTRFS_FIRST_FREE_OBJECTID; | ||
| 3389 | int err = 0; | ||
| 3390 | |||
| 3391 | root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); | ||
| 3392 | if (IS_ERR(root)) | ||
| 3393 | return ERR_CAST(root); | ||
| 3394 | |||
| 3395 | trans = btrfs_start_transaction(root, 1); | ||
| 3396 | BUG_ON(!trans); | ||
| 3397 | |||
| 3398 | err = btrfs_find_free_objectid(trans, root, objectid, &objectid); | ||
| 3399 | if (err) | ||
| 3400 | goto out; | ||
| 3401 | |||
| 3402 | err = __insert_orphan_inode(trans, root, objectid, group->key.offset); | ||
| 3403 | BUG_ON(err); | ||
| 3404 | |||
| 3405 | err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, | ||
| 3406 | group->key.offset, 0, group->key.offset, | ||
| 3407 | 0, 0, 0); | ||
| 3408 | BUG_ON(err); | ||
| 3409 | |||
| 3410 | key.objectid = objectid; | ||
| 3411 | key.type = BTRFS_INODE_ITEM_KEY; | ||
| 3412 | key.offset = 0; | ||
| 3413 | inode = btrfs_iget(root->fs_info->sb, &key, root); | ||
| 3414 | BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); | ||
| 3415 | BTRFS_I(inode)->index_cnt = group->key.objectid; | ||
| 3416 | |||
| 3417 | err = btrfs_orphan_add(trans, inode); | ||
| 3418 | out: | ||
| 3419 | nr = trans->blocks_used; | ||
| 3420 | btrfs_end_transaction(trans, root); | ||
| 3421 | |||
| 3422 | btrfs_btree_balance_dirty(root, nr); | ||
| 3423 | if (err) { | ||
| 3424 | if (inode) | ||
| 3425 | iput(inode); | ||
| 3426 | inode = ERR_PTR(err); | ||
| 3427 | } | ||
| 3428 | return inode; | ||
| 3429 | } | ||
| 3430 | |||
| 3431 | /* | ||
| 3432 | * function to relocate all extents in a block group. | ||
| 3433 | */ | ||
| 3434 | int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) | ||
| 3435 | { | ||
| 3436 | struct btrfs_fs_info *fs_info = extent_root->fs_info; | ||
| 3437 | struct reloc_control *rc; | ||
| 3438 | int ret; | ||
| 3439 | int err = 0; | ||
| 3440 | |||
| 3441 | rc = kzalloc(sizeof(*rc), GFP_NOFS); | ||
| 3442 | if (!rc) | ||
| 3443 | return -ENOMEM; | ||
| 3444 | |||
| 3445 | mapping_tree_init(&rc->reloc_root_tree); | ||
| 3446 | extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); | ||
| 3447 | INIT_LIST_HEAD(&rc->reloc_roots); | ||
| 3448 | |||
| 3449 | rc->block_group = btrfs_lookup_block_group(fs_info, group_start); | ||
| 3450 | BUG_ON(!rc->block_group); | ||
| 3451 | |||
| 3452 | btrfs_init_workers(&rc->workers, "relocate", | ||
| 3453 | fs_info->thread_pool_size); | ||
| 3454 | |||
| 3455 | rc->extent_root = extent_root; | ||
| 3456 | btrfs_prepare_block_group_relocation(extent_root, rc->block_group); | ||
| 3457 | |||
| 3458 | rc->data_inode = create_reloc_inode(fs_info, rc->block_group); | ||
| 3459 | if (IS_ERR(rc->data_inode)) { | ||
| 3460 | err = PTR_ERR(rc->data_inode); | ||
| 3461 | rc->data_inode = NULL; | ||
| 3462 | goto out; | ||
| 3463 | } | ||
| 3464 | |||
| 3465 | printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", | ||
| 3466 | (unsigned long long)rc->block_group->key.objectid, | ||
| 3467 | (unsigned long long)rc->block_group->flags); | ||
| 3468 | |||
| 3469 | btrfs_start_delalloc_inodes(fs_info->tree_root); | ||
| 3470 | btrfs_wait_ordered_extents(fs_info->tree_root, 0); | ||
| 3471 | |||
| 3472 | while (1) { | ||
| 3473 | mutex_lock(&fs_info->cleaner_mutex); | ||
| 3474 | btrfs_clean_old_snapshots(fs_info->tree_root); | ||
| 3475 | mutex_unlock(&fs_info->cleaner_mutex); | ||
| 3476 | |||
| 3477 | rc->extents_found = 0; | ||
| 3478 | rc->extents_skipped = 0; | ||
| 3479 | |||
| 3480 | ret = relocate_block_group(rc); | ||
| 3481 | if (ret < 0) { | ||
| 3482 | err = ret; | ||
| 3483 | break; | ||
| 3484 | } | ||
| 3485 | |||
| 3486 | if (rc->extents_found == 0) | ||
| 3487 | break; | ||
| 3488 | |||
| 3489 | printk(KERN_INFO "btrfs: found %llu extents\n", | ||
| 3490 | (unsigned long long)rc->extents_found); | ||
| 3491 | |||
| 3492 | if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { | ||
| 3493 | btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); | ||
| 3494 | invalidate_mapping_pages(rc->data_inode->i_mapping, | ||
| 3495 | 0, -1); | ||
| 3496 | rc->stage = UPDATE_DATA_PTRS; | ||
| 3497 | } else if (rc->stage == UPDATE_DATA_PTRS && | ||
| 3498 | rc->extents_skipped >= rc->extents_found) { | ||
| 3499 | iput(rc->data_inode); | ||
| 3500 | rc->data_inode = create_reloc_inode(fs_info, | ||
| 3501 | rc->block_group); | ||
| 3502 | if (IS_ERR(rc->data_inode)) { | ||
| 3503 | err = PTR_ERR(rc->data_inode); | ||
| 3504 | rc->data_inode = NULL; | ||
| 3505 | break; | ||
| 3506 | } | ||
| 3507 | rc->stage = MOVE_DATA_EXTENTS; | ||
| 3508 | rc->found_file_extent = 0; | ||
| 3509 | } | ||
| 3510 | } | ||
| 3511 | |||
| 3512 | filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, | ||
| 3513 | rc->block_group->key.objectid, | ||
| 3514 | rc->block_group->key.objectid + | ||
| 3515 | rc->block_group->key.offset - 1); | ||
| 3516 | |||
| 3517 | WARN_ON(rc->block_group->pinned > 0); | ||
| 3518 | WARN_ON(rc->block_group->reserved > 0); | ||
| 3519 | WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); | ||
| 3520 | out: | ||
| 3521 | iput(rc->data_inode); | ||
| 3522 | btrfs_stop_workers(&rc->workers); | ||
| 3523 | btrfs_put_block_group(rc->block_group); | ||
| 3524 | kfree(rc); | ||
| 3525 | return err; | ||
| 3526 | } | ||
| 3527 | |||
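btrfs_relocate_block_group() above drives relocation as a small state machine: it scans the group, moves data extents first, then switches to rewriting the pointers to them, and falls back to another move pass if too many extents had to be skipped. The standalone sketch below imitates that loop; struct reloc_sim, scan_block_group() and the pass counts are made-up stand-ins, not btrfs code.

/*
 * Minimal userspace sketch of the two-stage relocation loop above.
 * All types and helpers here (struct reloc_sim, scan_block_group(), the
 * pass counts) are hypothetical stand-ins, not the real btrfs structures.
 */
#include <stdio.h>

enum stage { MOVE_DATA_EXTENTS, UPDATE_DATA_PTRS };

struct reloc_sim {
	enum stage stage;
	int pass;                       /* number of scan passes performed */
	unsigned long extents_found;
	unsigned long extents_skipped;
	int found_file_extent;
};

/* pretend to scan the block group; returns fewer extents on each pass */
static void scan_block_group(struct reloc_sim *rc)
{
	rc->extents_found = rc->pass < 3 ? 10 - 3 * rc->pass : 0;
	rc->extents_skipped = rc->pass >= 2 ? rc->extents_found : 0;
	rc->found_file_extent = rc->extents_found > 0;
	rc->pass++;
}

int main(void)
{
	struct reloc_sim rc = { .stage = MOVE_DATA_EXTENTS };

	while (1) {
		scan_block_group(&rc);
		if (rc.extents_found == 0)
			break;          /* block group is empty, done */

		printf("pass %d: found %lu extents (stage %d)\n",
		       rc.pass, rc.extents_found, rc.stage);

		if (rc.stage == MOVE_DATA_EXTENTS && rc.found_file_extent) {
			/* data was copied; now rewrite the pointers to it */
			rc.stage = UPDATE_DATA_PTRS;
		} else if (rc.stage == UPDATE_DATA_PTRS &&
			   rc.extents_skipped >= rc.extents_found) {
			/* everything left was skipped; do another move pass */
			rc.stage = MOVE_DATA_EXTENTS;
			rc.found_file_extent = 0;
		}
	}
	return 0;
}

The loop only terminates once a scan pass finds nothing left in the group, mirroring the extents_found == 0 check in the function above.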
| 3528 | /* | ||
| 3529 | * recover relocation interrupted by system crash. | ||
| 3530 | * | ||
| 3531 | * this function resumes merging reloc trees with corresponding fs trees. | ||
| 3532 | * this is important for keeping the sharing of tree blocks | ||
| 3533 | */ | ||
| 3534 | int btrfs_recover_relocation(struct btrfs_root *root) | ||
| 3535 | { | ||
| 3536 | LIST_HEAD(reloc_roots); | ||
| 3537 | struct btrfs_key key; | ||
| 3538 | struct btrfs_root *fs_root; | ||
| 3539 | struct btrfs_root *reloc_root; | ||
| 3540 | struct btrfs_path *path; | ||
| 3541 | struct extent_buffer *leaf; | ||
| 3542 | struct reloc_control *rc = NULL; | ||
| 3543 | struct btrfs_trans_handle *trans; | ||
| 3544 | int ret; | ||
| 3545 | int err = 0; | ||
| 3546 | |||
| 3547 | path = btrfs_alloc_path(); | ||
| 3548 | if (!path) | ||
| 3549 | return -ENOMEM; | ||
| 3550 | |||
| 3551 | key.objectid = BTRFS_TREE_RELOC_OBJECTID; | ||
| 3552 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
| 3553 | key.offset = (u64)-1; | ||
| 3554 | |||
| 3555 | while (1) { | ||
| 3556 | ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, | ||
| 3557 | path, 0, 0); | ||
| 3558 | if (ret < 0) { | ||
| 3559 | err = ret; | ||
| 3560 | goto out; | ||
| 3561 | } | ||
| 3562 | if (ret > 0) { | ||
| 3563 | if (path->slots[0] == 0) | ||
| 3564 | break; | ||
| 3565 | path->slots[0]--; | ||
| 3566 | } | ||
| 3567 | leaf = path->nodes[0]; | ||
| 3568 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
| 3569 | btrfs_release_path(root->fs_info->tree_root, path); | ||
| 3570 | |||
| 3571 | if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || | ||
| 3572 | key.type != BTRFS_ROOT_ITEM_KEY) | ||
| 3573 | break; | ||
| 3574 | |||
| 3575 | reloc_root = btrfs_read_fs_root_no_radix(root, &key); | ||
| 3576 | if (IS_ERR(reloc_root)) { | ||
| 3577 | err = PTR_ERR(reloc_root); | ||
| 3578 | goto out; | ||
| 3579 | } | ||
| 3580 | |||
| 3581 | list_add(&reloc_root->root_list, &reloc_roots); | ||
| 3582 | |||
| 3583 | if (btrfs_root_refs(&reloc_root->root_item) > 0) { | ||
| 3584 | fs_root = read_fs_root(root->fs_info, | ||
| 3585 | reloc_root->root_key.offset); | ||
| 3586 | if (IS_ERR(fs_root)) { | ||
| 3587 | err = PTR_ERR(fs_root); | ||
| 3588 | goto out; | ||
| 3589 | } | ||
| 3590 | } | ||
| 3591 | |||
| 3592 | if (key.offset == 0) | ||
| 3593 | break; | ||
| 3594 | |||
| 3595 | key.offset--; | ||
| 3596 | } | ||
| 3597 | btrfs_release_path(root->fs_info->tree_root, path); | ||
| 3598 | |||
| 3599 | if (list_empty(&reloc_roots)) | ||
| 3600 | goto out; | ||
| 3601 | |||
| 3602 | rc = kzalloc(sizeof(*rc), GFP_NOFS); | ||
| 3603 | if (!rc) { | ||
| 3604 | err = -ENOMEM; | ||
| 3605 | goto out; | ||
| 3606 | } | ||
| 3607 | |||
| 3608 | mapping_tree_init(&rc->reloc_root_tree); | ||
| 3609 | INIT_LIST_HEAD(&rc->reloc_roots); | ||
| 3610 | btrfs_init_workers(&rc->workers, "relocate", | ||
| 3611 | root->fs_info->thread_pool_size); | ||
| 3612 | rc->extent_root = root->fs_info->extent_root; | ||
| 3613 | |||
| 3614 | set_reloc_control(rc); | ||
| 3615 | |||
| 3616 | while (!list_empty(&reloc_roots)) { | ||
| 3617 | reloc_root = list_entry(reloc_roots.next, | ||
| 3618 | struct btrfs_root, root_list); | ||
| 3619 | list_del(&reloc_root->root_list); | ||
| 3620 | |||
| 3621 | if (btrfs_root_refs(&reloc_root->root_item) == 0) { | ||
| 3622 | list_add_tail(&reloc_root->root_list, | ||
| 3623 | &rc->reloc_roots); | ||
| 3624 | continue; | ||
| 3625 | } | ||
| 3626 | |||
| 3627 | fs_root = read_fs_root(root->fs_info, | ||
| 3628 | reloc_root->root_key.offset); | ||
| 3629 | BUG_ON(IS_ERR(fs_root)); | ||
| 3630 | |||
| 3631 | __add_reloc_root(reloc_root); | ||
| 3632 | fs_root->reloc_root = reloc_root; | ||
| 3633 | } | ||
| 3634 | |||
| 3635 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
| 3636 | btrfs_commit_transaction(trans, rc->extent_root); | ||
| 3637 | |||
| 3638 | merge_reloc_roots(rc); | ||
| 3639 | |||
| 3640 | unset_reloc_control(rc); | ||
| 3641 | |||
| 3642 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
| 3643 | btrfs_commit_transaction(trans, rc->extent_root); | ||
| 3644 | out: | ||
| 3645 | if (rc) { | ||
| 3646 | btrfs_stop_workers(&rc->workers); | ||
| 3647 | kfree(rc); | ||
| 3648 | } | ||
| 3649 | while (!list_empty(&reloc_roots)) { | ||
| 3650 | reloc_root = list_entry(reloc_roots.next, | ||
| 3651 | struct btrfs_root, root_list); | ||
| 3652 | list_del(&reloc_root->root_list); | ||
| 3653 | free_extent_buffer(reloc_root->node); | ||
| 3654 | free_extent_buffer(reloc_root->commit_root); | ||
| 3655 | kfree(reloc_root); | ||
| 3656 | } | ||
| 3657 | btrfs_free_path(path); | ||
| 3658 | |||
| 3659 | if (err == 0) { | ||
| 3660 | /* cleanup orphan inode in data relocation tree */ | ||
| 3661 | fs_root = read_fs_root(root->fs_info, | ||
| 3662 | BTRFS_DATA_RELOC_TREE_OBJECTID); | ||
| 3663 | if (IS_ERR(fs_root)) | ||
| 3664 | err = PTR_ERR(fs_root); | ||
| 3665 | } | ||
| 3666 | return err; | ||
| 3667 | } | ||
| 3668 | |||
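btrfs_recover_relocation() above walks the root tree backwards: it searches for the largest key at or below (BTRFS_TREE_RELOC_OBJECTID, ROOT_ITEM, (u64)-1), handles the reloc root it finds, then repeats from one less than that offset. A rough userspace imitation of that reverse walk, with a sorted array standing in for the tree and made-up names, looks like this:

/*
 * Sketch of the reverse key walk above: start at offset (u64)-1, find the
 * largest existing key at or below it, process it, continue from offset - 1.
 * The sorted array stands in for the root tree; names are illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

static const uint64_t reloc_offsets[] = { 5, 42, 257, 1000 };   /* sorted */
#define NR (sizeof(reloc_offsets) / sizeof(reloc_offsets[0]))

/* return index of the largest entry <= target, or -1 if none */
static int find_le(uint64_t target)
{
	int i;

	for (i = NR - 1; i >= 0; i--)
		if (reloc_offsets[i] <= target)
			return i;
	return -1;
}

int main(void)
{
	uint64_t offset = UINT64_MAX;
	int slot;

	while (1) {
		slot = find_le(offset);
		if (slot < 0)
			break;                  /* no more reloc roots */

		printf("recovering reloc root with offset %llu\n",
		       (unsigned long long)reloc_offsets[slot]);

		if (reloc_offsets[slot] == 0)
			break;                  /* lowest possible key */
		offset = reloc_offsets[slot] - 1;
	}
	return 0;
}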
| 3669 | /* | ||
| 3670 | * helper to add ordered checksum for data relocation. | ||
| 3671 | * | ||
| 3672 | * cloning checksum properly handles the nodatasum extents. | ||
| 3673 | * it also saves CPU time to re-calculate the checksum. | ||
| 3674 | */ | ||
| 3675 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) | ||
| 3676 | { | ||
| 3677 | struct btrfs_ordered_sum *sums; | ||
| 3678 | struct btrfs_sector_sum *sector_sum; | ||
| 3679 | struct btrfs_ordered_extent *ordered; | ||
| 3680 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 3681 | size_t offset; | ||
| 3682 | int ret; | ||
| 3683 | u64 disk_bytenr; | ||
| 3684 | LIST_HEAD(list); | ||
| 3685 | |||
| 3686 | ordered = btrfs_lookup_ordered_extent(inode, file_pos); | ||
| 3687 | BUG_ON(ordered->file_offset != file_pos || ordered->len != len); | ||
| 3688 | |||
| 3689 | disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; | ||
| 3690 | ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, | ||
| 3691 | disk_bytenr + len - 1, &list); | ||
| 3692 | |||
| 3693 | while (!list_empty(&list)) { | ||
| 3694 | sums = list_entry(list.next, struct btrfs_ordered_sum, list); | ||
| 3695 | list_del_init(&sums->list); | ||
| 3696 | |||
| 3697 | sector_sum = sums->sums; | ||
| 3698 | sums->bytenr = ordered->start; | ||
| 3699 | |||
| 3700 | offset = 0; | ||
| 3701 | while (offset < sums->len) { | ||
| 3702 | sector_sum->bytenr += ordered->start - disk_bytenr; | ||
| 3703 | sector_sum++; | ||
| 3704 | offset += root->sectorsize; | ||
| 3705 | } | ||
| 3706 | |||
| 3707 | btrfs_add_ordered_sum(inode, ordered, sums); | ||
| 3708 | } | ||
| 3709 | btrfs_put_ordered_extent(ordered); | ||
| 3710 | return 0; | ||
| 3711 | } | ||
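btrfs_reloc_clone_csums() above reuses the checksums computed for the old location: it looks them up by the old disk bytenr (recovered from file_pos + index_cnt) and shifts each per-sector bytenr by the distance the data moved, without recomputing any sums. A small sketch of just that arithmetic, with simplified stand-in structures:

/*
 * Sketch of the checksum rebasing done above: each per-sector checksum keeps
 * its value, only its bytenr is shifted by (new extent start - old disk
 * bytenr).  struct sector_sum is a simplified stand-in for btrfs_sector_sum.
 */
#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE 4096ULL

struct sector_sum {
	uint64_t bytenr;        /* disk byte this checksum covers */
	uint32_t sum;           /* the checksum itself, unchanged */
};

int main(void)
{
	struct sector_sum sums[4];
	uint64_t old_disk_bytenr = 1 << 20;     /* where the data used to live */
	uint64_t new_start = 8ULL << 20;        /* where relocation wrote it */
	uint64_t len = sizeof(sums) / sizeof(sums[0]) * SECTORSIZE;
	uint64_t offset;
	unsigned int i;

	/* checksums as they were found for the old location */
	for (i = 0, offset = 0; offset < len; i++, offset += SECTORSIZE) {
		sums[i].bytenr = old_disk_bytenr + offset;
		sums[i].sum = 0xabcd0000u + i;
	}

	/* rebase them onto the new location, exactly like the loop above */
	for (i = 0, offset = 0; offset < len; i++, offset += SECTORSIZE)
		sums[i].bytenr += new_start - old_disk_bytenr;

	for (i = 0; i < len / SECTORSIZE; i++)
		printf("sector %u: bytenr %llu sum 0x%x\n", i,
		       (unsigned long long)sums[i].bytenr, sums[i].sum);
	return 0;
}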
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b48650de4472..0ddc6d61c55a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
| @@ -111,6 +111,15 @@ out: | |||
| 111 | return ret; | 111 | return ret; |
| 112 | } | 112 | } |
| 113 | 113 | ||
| 114 | int btrfs_set_root_node(struct btrfs_root_item *item, | ||
| 115 | struct extent_buffer *node) | ||
| 116 | { | ||
| 117 | btrfs_set_root_bytenr(item, node->start); | ||
| 118 | btrfs_set_root_level(item, btrfs_header_level(node)); | ||
| 119 | btrfs_set_root_generation(item, btrfs_header_generation(node)); | ||
| 120 | return 0; | ||
| 121 | } | ||
| 122 | |||
| 114 | /* | 123 | /* |
| 115 | * copy the data in 'item' into the btree | 124 | * copy the data in 'item' into the btree |
| 116 | */ | 125 | */ |
| @@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root | |||
| 164 | * offset lower than the latest root. They need to be queued for deletion to | 173 | * offset lower than the latest root. They need to be queued for deletion to |
| 165 | * finish what was happening when we crashed. | 174 | * finish what was happening when we crashed. |
| 166 | */ | 175 | */ |
| 167 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, | 176 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) |
| 168 | struct btrfs_root *latest) | ||
| 169 | { | 177 | { |
| 170 | struct btrfs_root *dead_root; | 178 | struct btrfs_root *dead_root; |
| 171 | struct btrfs_item *item; | 179 | struct btrfs_item *item; |
| @@ -227,10 +235,7 @@ again: | |||
| 227 | goto err; | 235 | goto err; |
| 228 | } | 236 | } |
| 229 | 237 | ||
| 230 | if (objectid == BTRFS_TREE_RELOC_OBJECTID) | 238 | ret = btrfs_add_dead_root(dead_root); |
| 231 | ret = btrfs_add_dead_reloc_root(dead_root); | ||
| 232 | else | ||
| 233 | ret = btrfs_add_dead_root(dead_root, latest); | ||
| 234 | if (ret) | 239 | if (ret) |
| 235 | goto err; | 240 | goto err; |
| 236 | goto again; | 241 | goto again; |
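The new btrfs_set_root_node() helper introduced in the first hunk above replaces three separate setters (bytenr, level, generation) with a single call, so a root item cannot end up describing a node inconsistently. A minimal sketch of the same idea with simplified stand-in types, not the real on-disk structures:

/*
 * Sketch of what a set_root_node()-style helper buys callers: one function
 * that keeps all three root item fields in sync with a tree node, instead
 * of three separate setters at every call site.
 */
#include <stdio.h>
#include <stdint.h>

struct tree_node {
	uint64_t start;         /* logical start of the block */
	uint64_t generation;    /* transaction that wrote it */
	int level;              /* 0 for leaves */
};

struct root_item {
	uint64_t bytenr;
	uint64_t generation;
	uint8_t level;
};

static void set_root_node(struct root_item *item, const struct tree_node *node)
{
	item->bytenr = node->start;
	item->level = (uint8_t)node->level;
	item->generation = node->generation;
}

int main(void)
{
	struct tree_node node = { .start = 30408704, .generation = 7, .level = 1 };
	struct root_item item;

	set_root_node(&item, &node);
	printf("root item: bytenr=%llu level=%u gen=%llu\n",
	       (unsigned long long)item.bytenr, (unsigned)item.level,
	       (unsigned long long)item.generation);
	return 0;
}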
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2ff7cd2db25f..708ac06b953b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
| @@ -52,7 +52,6 @@ | |||
| 52 | #include "export.h" | 52 | #include "export.h" |
| 53 | #include "compression.h" | 53 | #include "compression.h" |
| 54 | 54 | ||
| 55 | |||
| 56 | static struct super_operations btrfs_super_ops; | 55 | static struct super_operations btrfs_super_ops; |
| 57 | 56 | ||
| 58 | static void btrfs_put_super(struct super_block *sb) | 57 | static void btrfs_put_super(struct super_block *sb) |
| @@ -67,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb) | |||
| 67 | enum { | 66 | enum { |
| 68 | Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, | 67 | Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, |
| 69 | Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, | 68 | Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, |
| 70 | Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, | 69 | Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, |
| 71 | Opt_ratio, Opt_flushoncommit, Opt_err, | 70 | Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, |
| 72 | }; | 71 | }; |
| 73 | 72 | ||
| 74 | static match_table_t tokens = { | 73 | static match_table_t tokens = { |
| @@ -84,6 +83,8 @@ static match_table_t tokens = { | |||
| 84 | {Opt_thread_pool, "thread_pool=%d"}, | 83 | {Opt_thread_pool, "thread_pool=%d"}, |
| 85 | {Opt_compress, "compress"}, | 84 | {Opt_compress, "compress"}, |
| 86 | {Opt_ssd, "ssd"}, | 85 | {Opt_ssd, "ssd"}, |
| 86 | {Opt_ssd_spread, "ssd_spread"}, | ||
| 87 | {Opt_nossd, "nossd"}, | ||
| 87 | {Opt_noacl, "noacl"}, | 88 | {Opt_noacl, "noacl"}, |
| 88 | {Opt_notreelog, "notreelog"}, | 89 | {Opt_notreelog, "notreelog"}, |
| 89 | {Opt_flushoncommit, "flushoncommit"}, | 90 | {Opt_flushoncommit, "flushoncommit"}, |
| @@ -158,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
| 158 | */ | 159 | */ |
| 159 | break; | 160 | break; |
| 160 | case Opt_nodatasum: | 161 | case Opt_nodatasum: |
| 161 | printk(KERN_INFO "btrfs: setting nodatacsum\n"); | 162 | printk(KERN_INFO "btrfs: setting nodatasum\n"); |
| 162 | btrfs_set_opt(info->mount_opt, NODATASUM); | 163 | btrfs_set_opt(info->mount_opt, NODATASUM); |
| 163 | break; | 164 | break; |
| 164 | case Opt_nodatacow: | 165 | case Opt_nodatacow: |
| @@ -174,6 +175,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
| 174 | printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); | 175 | printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); |
| 175 | btrfs_set_opt(info->mount_opt, SSD); | 176 | btrfs_set_opt(info->mount_opt, SSD); |
| 176 | break; | 177 | break; |
| 178 | case Opt_ssd_spread: | ||
| 179 | printk(KERN_INFO "btrfs: use spread ssd " | ||
| 180 | "allocation scheme\n"); | ||
| 181 | btrfs_set_opt(info->mount_opt, SSD); | ||
| 182 | btrfs_set_opt(info->mount_opt, SSD_SPREAD); | ||
| 183 | break; | ||
| 184 | case Opt_nossd: | ||
| 185 | printk(KERN_INFO "btrfs: not using ssd allocation " | ||
| 186 | "scheme\n"); | ||
| 187 | btrfs_set_opt(info->mount_opt, NOSSD); | ||
| 188 | btrfs_clear_opt(info->mount_opt, SSD); | ||
| 189 | btrfs_clear_opt(info->mount_opt, SSD_SPREAD); | ||
| 190 | break; | ||
| 177 | case Opt_nobarrier: | 191 | case Opt_nobarrier: |
| 178 | printk(KERN_INFO "btrfs: turning off barriers\n"); | 192 | printk(KERN_INFO "btrfs: turning off barriers\n"); |
| 179 | btrfs_set_opt(info->mount_opt, NOBARRIER); | 193 | btrfs_set_opt(info->mount_opt, NOBARRIER); |
| @@ -322,7 +336,7 @@ static int btrfs_fill_super(struct super_block *sb, | |||
| 322 | struct dentry *root_dentry; | 336 | struct dentry *root_dentry; |
| 323 | struct btrfs_super_block *disk_super; | 337 | struct btrfs_super_block *disk_super; |
| 324 | struct btrfs_root *tree_root; | 338 | struct btrfs_root *tree_root; |
| 325 | struct btrfs_inode *bi; | 339 | struct btrfs_key key; |
| 326 | int err; | 340 | int err; |
| 327 | 341 | ||
| 328 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 342 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
| @@ -341,23 +355,15 @@ static int btrfs_fill_super(struct super_block *sb, | |||
| 341 | } | 355 | } |
| 342 | sb->s_fs_info = tree_root; | 356 | sb->s_fs_info = tree_root; |
| 343 | disk_super = &tree_root->fs_info->super_copy; | 357 | disk_super = &tree_root->fs_info->super_copy; |
| 344 | inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, | ||
| 345 | tree_root->fs_info->fs_root); | ||
| 346 | bi = BTRFS_I(inode); | ||
| 347 | bi->location.objectid = inode->i_ino; | ||
| 348 | bi->location.offset = 0; | ||
| 349 | bi->root = tree_root->fs_info->fs_root; | ||
| 350 | 358 | ||
| 351 | btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); | 359 | key.objectid = BTRFS_FIRST_FREE_OBJECTID; |
| 352 | 360 | key.type = BTRFS_INODE_ITEM_KEY; | |
| 353 | if (!inode) { | 361 | key.offset = 0; |
| 354 | err = -ENOMEM; | 362 | inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); |
| 363 | if (IS_ERR(inode)) { | ||
| 364 | err = PTR_ERR(inode); | ||
| 355 | goto fail_close; | 365 | goto fail_close; |
| 356 | } | 366 | } |
| 357 | if (inode->i_state & I_NEW) { | ||
| 358 | btrfs_read_locked_inode(inode); | ||
| 359 | unlock_new_inode(inode); | ||
| 360 | } | ||
| 361 | 367 | ||
| 362 | root_dentry = d_alloc_root(inode); | 368 | root_dentry = d_alloc_root(inode); |
| 363 | if (!root_dentry) { | 369 | if (!root_dentry) { |
| @@ -433,7 +439,11 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
| 433 | seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); | 439 | seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); |
| 434 | if (btrfs_test_opt(root, COMPRESS)) | 440 | if (btrfs_test_opt(root, COMPRESS)) |
| 435 | seq_puts(seq, ",compress"); | 441 | seq_puts(seq, ",compress"); |
| 436 | if (btrfs_test_opt(root, SSD)) | 442 | if (btrfs_test_opt(root, NOSSD)) |
| 443 | seq_puts(seq, ",nossd"); | ||
| 444 | if (btrfs_test_opt(root, SSD_SPREAD)) | ||
| 445 | seq_puts(seq, ",ssd_spread"); | ||
| 446 | else if (btrfs_test_opt(root, SSD)) | ||
| 437 | seq_puts(seq, ",ssd"); | 447 | seq_puts(seq, ",ssd"); |
| 438 | if (btrfs_test_opt(root, NOTREELOG)) | 448 | if (btrfs_test_opt(root, NOTREELOG)) |
| 439 | seq_puts(seq, ",notreelog"); | 449 | seq_puts(seq, ",notreelog"); |
| @@ -584,7 +594,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
| 584 | if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) | 594 | if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) |
| 585 | return -EINVAL; | 595 | return -EINVAL; |
| 586 | 596 | ||
| 587 | ret = btrfs_cleanup_reloc_trees(root); | 597 | /* recover relocation */ |
| 598 | ret = btrfs_recover_relocation(root); | ||
| 588 | WARN_ON(ret); | 599 | WARN_ON(ret); |
| 589 | 600 | ||
| 590 | ret = btrfs_cleanup_fs_roots(root->fs_info); | 601 | ret = btrfs_cleanup_fs_roots(root->fs_info); |
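The super.c hunks above add two mount options: ssd_spread, which also enables the plain ssd allocator, and nossd, which records that the user opted out and clears both ssd bits. A toy parser showing the same set/clear logic follows; the flag names and bit values are illustrative only, not the real btrfs mount_opt bits.

/*
 * Sketch of the ssd / ssd_spread / nossd handling added above, reduced to
 * plain bit flags and strcmp().
 */
#include <stdio.h>
#include <string.h>

#define OPT_SSD        (1u << 0)
#define OPT_SSD_SPREAD (1u << 1)
#define OPT_NOSSD      (1u << 2)

static unsigned int parse_one(unsigned int opts, const char *tok)
{
	if (!strcmp(tok, "ssd")) {
		opts |= OPT_SSD;
	} else if (!strcmp(tok, "ssd_spread")) {
		/* spread implies the basic ssd allocator as well */
		opts |= OPT_SSD | OPT_SSD_SPREAD;
	} else if (!strcmp(tok, "nossd")) {
		/* remember the opt-out and drop both ssd modes */
		opts |= OPT_NOSSD;
		opts &= ~(OPT_SSD | OPT_SSD_SPREAD);
	}
	return opts;
}

int main(void)
{
	const char *tokens[] = { "ssd_spread", "nossd", "ssd" };
	unsigned int opts = 0;
	unsigned int i;

	for (i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++) {
		opts = parse_one(opts, tokens[i]);
		printf("after '%s': opts=0x%x\n", tokens[i], opts);
	}
	return 0;
}

The show_options hunk keeps the matching precedence when printing: nossd first, then ssd_spread, and plain ssd only when spread is not set.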
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01b143605ec1..2e177d7f4bb9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
| @@ -25,7 +25,6 @@ | |||
| 25 | #include "disk-io.h" | 25 | #include "disk-io.h" |
| 26 | #include "transaction.h" | 26 | #include "transaction.h" |
| 27 | #include "locking.h" | 27 | #include "locking.h" |
| 28 | #include "ref-cache.h" | ||
| 29 | #include "tree-log.h" | 28 | #include "tree-log.h" |
| 30 | 29 | ||
| 31 | #define BTRFS_ROOT_TRANS_TAG 0 | 30 | #define BTRFS_ROOT_TRANS_TAG 0 |
| @@ -94,45 +93,37 @@ static noinline int join_transaction(struct btrfs_root *root) | |||
| 94 | * to make sure the old root from before we joined the transaction is deleted | 93 | * to make sure the old root from before we joined the transaction is deleted |
| 95 | * when the transaction commits | 94 | * when the transaction commits |
| 96 | */ | 95 | */ |
| 97 | noinline int btrfs_record_root_in_trans(struct btrfs_root *root) | 96 | static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, |
| 97 | struct btrfs_root *root) | ||
| 98 | { | 98 | { |
| 99 | struct btrfs_dirty_root *dirty; | 99 | if (root->ref_cows && root->last_trans < trans->transid) { |
| 100 | u64 running_trans_id = root->fs_info->running_transaction->transid; | ||
| 101 | if (root->ref_cows && root->last_trans < running_trans_id) { | ||
| 102 | WARN_ON(root == root->fs_info->extent_root); | 100 | WARN_ON(root == root->fs_info->extent_root); |
| 103 | if (root->root_item.refs != 0) { | 101 | WARN_ON(root->root_item.refs == 0); |
| 104 | radix_tree_tag_set(&root->fs_info->fs_roots_radix, | 102 | WARN_ON(root->commit_root != root->node); |
| 105 | (unsigned long)root->root_key.objectid, | 103 | |
| 106 | BTRFS_ROOT_TRANS_TAG); | 104 | radix_tree_tag_set(&root->fs_info->fs_roots_radix, |
| 107 | 105 | (unsigned long)root->root_key.objectid, | |
| 108 | dirty = kmalloc(sizeof(*dirty), GFP_NOFS); | 106 | BTRFS_ROOT_TRANS_TAG); |
| 109 | BUG_ON(!dirty); | 107 | root->last_trans = trans->transid; |
| 110 | dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); | 108 | btrfs_init_reloc_root(trans, root); |
| 111 | BUG_ON(!dirty->root); | 109 | } |
| 112 | dirty->latest_root = root; | 110 | return 0; |
| 113 | INIT_LIST_HEAD(&dirty->list); | 111 | } |
| 114 | |||
| 115 | root->commit_root = btrfs_root_node(root); | ||
| 116 | |||
| 117 | memcpy(dirty->root, root, sizeof(*root)); | ||
| 118 | spin_lock_init(&dirty->root->node_lock); | ||
| 119 | spin_lock_init(&dirty->root->list_lock); | ||
| 120 | mutex_init(&dirty->root->objectid_mutex); | ||
| 121 | mutex_init(&dirty->root->log_mutex); | ||
| 122 | INIT_LIST_HEAD(&dirty->root->dead_list); | ||
| 123 | dirty->root->node = root->commit_root; | ||
| 124 | dirty->root->commit_root = NULL; | ||
| 125 | 112 | ||
| 126 | spin_lock(&root->list_lock); | 113 | int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, |
| 127 | list_add(&dirty->root->dead_list, &root->dead_list); | 114 | struct btrfs_root *root) |
| 128 | spin_unlock(&root->list_lock); | 115 | { |
| 116 | if (!root->ref_cows) | ||
| 117 | return 0; | ||
| 129 | 118 | ||
| 130 | root->dirty_root = dirty; | 119 | mutex_lock(&root->fs_info->trans_mutex); |
| 131 | } else { | 120 | if (root->last_trans == trans->transid) { |
| 132 | WARN_ON(1); | 121 | mutex_unlock(&root->fs_info->trans_mutex); |
| 133 | } | 122 | return 0; |
| 134 | root->last_trans = running_trans_id; | ||
| 135 | } | 123 | } |
| 124 | |||
| 125 | record_root_in_trans(trans, root); | ||
| 126 | mutex_unlock(&root->fs_info->trans_mutex); | ||
| 136 | return 0; | 127 | return 0; |
| 137 | } | 128 | } |
| 138 | 129 | ||
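The rewritten btrfs_record_root_in_trans() above is a check-then-record pattern: take trans_mutex, return immediately if the root was already recorded for the running transaction, otherwise hand off to record_root_in_trans(), which also tags the radix tree and initializes the reloc root. A userspace sketch of that pattern, with a pthread mutex standing in for trans_mutex and deliberately simplified side effects:

/*
 * Userspace imitation of the check-then-record pattern above.  The pthread
 * mutex and struct sim_root are stand-ins; the real code also tags the
 * fs_roots radix tree and sets up the reloc root.
 */
#include <pthread.h>
#include <stdio.h>

struct sim_root {
	int ref_cows;               /* only COW'd (snapshot-able) roots matter */
	unsigned long last_trans;   /* last transaction that recorded this root */
};

static pthread_mutex_t trans_mutex = PTHREAD_MUTEX_INITIALIZER;

static void record_root_locked(struct sim_root *root, unsigned long transid)
{
	if (root->ref_cows && root->last_trans < transid) {
		root->last_trans = transid;
		printf("recorded root for transaction %lu\n", transid);
	}
}

static void record_root_in_trans(struct sim_root *root, unsigned long transid)
{
	if (!root->ref_cows)
		return;

	pthread_mutex_lock(&trans_mutex);
	if (root->last_trans == transid) {
		pthread_mutex_unlock(&trans_mutex);
		return;             /* already done for this transaction */
	}
	record_root_locked(root, transid);
	pthread_mutex_unlock(&trans_mutex);
}

int main(void)
{
	struct sim_root root = { .ref_cows = 1, .last_trans = 0 };

	record_root_in_trans(&root, 5);   /* records the root */
	record_root_in_trans(&root, 5);   /* early exit, already recorded */
	return 0;
}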
| @@ -181,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
| 181 | ret = join_transaction(root); | 172 | ret = join_transaction(root); |
| 182 | BUG_ON(ret); | 173 | BUG_ON(ret); |
| 183 | 174 | ||
| 184 | btrfs_record_root_in_trans(root); | ||
| 185 | h->transid = root->fs_info->running_transaction->transid; | 175 | h->transid = root->fs_info->running_transaction->transid; |
| 186 | h->transaction = root->fs_info->running_transaction; | 176 | h->transaction = root->fs_info->running_transaction; |
| 187 | h->blocks_reserved = num_blocks; | 177 | h->blocks_reserved = num_blocks; |
| @@ -192,6 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
| 192 | h->delayed_ref_updates = 0; | 182 | h->delayed_ref_updates = 0; |
| 193 | 183 | ||
| 194 | root->fs_info->running_transaction->use_count++; | 184 | root->fs_info->running_transaction->use_count++; |
| 185 | record_root_in_trans(h, root); | ||
| 195 | mutex_unlock(&root->fs_info->trans_mutex); | 186 | mutex_unlock(&root->fs_info->trans_mutex); |
| 196 | return h; | 187 | return h; |
| 197 | } | 188 | } |
| @@ -233,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root, | |||
| 233 | return 0; | 224 | return 0; |
| 234 | } | 225 | } |
| 235 | 226 | ||
| 227 | #if 0 | ||
| 236 | /* | 228 | /* |
| 237 | * rate limit against the drop_snapshot code. This helps to slow down new | 229 | * rate limit against the drop_snapshot code. This helps to slow down new |
| 238 | * operations if the drop_snapshot code isn't able to keep up. | 230 | * operations if the drop_snapshot code isn't able to keep up. |
| @@ -273,6 +265,7 @@ harder: | |||
| 273 | goto harder; | 265 | goto harder; |
| 274 | } | 266 | } |
| 275 | } | 267 | } |
| 268 | #endif | ||
| 276 | 269 | ||
| 277 | void btrfs_throttle(struct btrfs_root *root) | 270 | void btrfs_throttle(struct btrfs_root *root) |
| 278 | { | 271 | { |
| @@ -280,7 +273,6 @@ void btrfs_throttle(struct btrfs_root *root) | |||
| 280 | if (!root->fs_info->open_ioctl_trans) | 273 | if (!root->fs_info->open_ioctl_trans) |
| 281 | wait_current_trans(root); | 274 | wait_current_trans(root); |
| 282 | mutex_unlock(&root->fs_info->trans_mutex); | 275 | mutex_unlock(&root->fs_info->trans_mutex); |
| 283 | throttle_on_drops(root); | ||
| 284 | } | 276 | } |
| 285 | 277 | ||
| 286 | static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | 278 | static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, |
| @@ -323,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
| 323 | memset(trans, 0, sizeof(*trans)); | 315 | memset(trans, 0, sizeof(*trans)); |
| 324 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 316 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
| 325 | 317 | ||
| 326 | if (throttle) | ||
| 327 | throttle_on_drops(root); | ||
| 328 | |||
| 329 | return 0; | 318 | return 0; |
| 330 | } | 319 | } |
| 331 | 320 | ||
| @@ -462,12 +451,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, | |||
| 462 | old_root_bytenr = btrfs_root_bytenr(&root->root_item); | 451 | old_root_bytenr = btrfs_root_bytenr(&root->root_item); |
| 463 | if (old_root_bytenr == root->node->start) | 452 | if (old_root_bytenr == root->node->start) |
| 464 | break; | 453 | break; |
| 465 | btrfs_set_root_bytenr(&root->root_item, | ||
| 466 | root->node->start); | ||
| 467 | btrfs_set_root_level(&root->root_item, | ||
| 468 | btrfs_header_level(root->node)); | ||
| 469 | btrfs_set_root_generation(&root->root_item, trans->transid); | ||
| 470 | 454 | ||
| 455 | btrfs_set_root_node(&root->root_item, root->node); | ||
| 471 | ret = btrfs_update_root(trans, tree_root, | 456 | ret = btrfs_update_root(trans, tree_root, |
| 472 | &root->root_key, | 457 | &root->root_key, |
| 473 | &root->root_item); | 458 | &root->root_item); |
| @@ -477,14 +462,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, | |||
| 477 | ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); | 462 | ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); |
| 478 | BUG_ON(ret); | 463 | BUG_ON(ret); |
| 479 | } | 464 | } |
| 465 | free_extent_buffer(root->commit_root); | ||
| 466 | root->commit_root = btrfs_root_node(root); | ||
| 480 | return 0; | 467 | return 0; |
| 481 | } | 468 | } |
| 482 | 469 | ||
| 483 | /* | 470 | /* |
| 484 | * update all the cowonly tree roots on disk | 471 | * update all the cowonly tree roots on disk |
| 485 | */ | 472 | */ |
| 486 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | 473 | static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, |
| 487 | struct btrfs_root *root) | 474 | struct btrfs_root *root) |
| 488 | { | 475 | { |
| 489 | struct btrfs_fs_info *fs_info = root->fs_info; | 476 | struct btrfs_fs_info *fs_info = root->fs_info; |
| 490 | struct list_head *next; | 477 | struct list_head *next; |
| @@ -520,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | |||
| 520 | * a dirty root struct and adds it into the list of dead roots that need to | 507 | * a dirty root struct and adds it into the list of dead roots that need to |
| 521 | * be deleted | 508 | * be deleted |
| 522 | */ | 509 | */ |
| 523 | int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) | 510 | int btrfs_add_dead_root(struct btrfs_root *root) |
| 524 | { | 511 | { |
| 525 | struct btrfs_dirty_root *dirty; | ||
| 526 | |||
| 527 | dirty = kmalloc(sizeof(*dirty), GFP_NOFS); | ||
| 528 | if (!dirty) | ||
| 529 | return -ENOMEM; | ||
| 530 | dirty->root = root; | ||
| 531 | dirty->latest_root = latest; | ||
| 532 | |||
| 533 | mutex_lock(&root->fs_info->trans_mutex); | 512 | mutex_lock(&root->fs_info->trans_mutex); |
| 534 | list_add(&dirty->list, &latest->fs_info->dead_roots); | 513 | list_add(&root->root_list, &root->fs_info->dead_roots); |
| 535 | mutex_unlock(&root->fs_info->trans_mutex); | 514 | mutex_unlock(&root->fs_info->trans_mutex); |
| 536 | return 0; | 515 | return 0; |
| 537 | } | 516 | } |
| 538 | 517 | ||
| 539 | /* | 518 | /* |
| 540 | * at transaction commit time we need to schedule the old roots for | 519 | * update all the cowonly tree roots on disk |
| 541 | * deletion via btrfs_drop_snapshot. This runs through all the | ||
| 542 | * reference counted roots that were modified in the current | ||
| 543 | * transaction and puts them into the drop list | ||
| 544 | */ | 520 | */ |
| 545 | static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, | 521 | static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, |
| 546 | struct radix_tree_root *radix, | 522 | struct btrfs_root *root) |
| 547 | struct list_head *list) | ||
| 548 | { | 523 | { |
| 549 | struct btrfs_dirty_root *dirty; | ||
| 550 | struct btrfs_root *gang[8]; | 524 | struct btrfs_root *gang[8]; |
| 551 | struct btrfs_root *root; | 525 | struct btrfs_fs_info *fs_info = root->fs_info; |
| 552 | int i; | 526 | int i; |
| 553 | int ret; | 527 | int ret; |
| 554 | int err = 0; | 528 | int err = 0; |
| 555 | u32 refs; | ||
| 556 | 529 | ||
| 557 | while (1) { | 530 | while (1) { |
| 558 | ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, | 531 | ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, |
| 532 | (void **)gang, 0, | ||
| 559 | ARRAY_SIZE(gang), | 533 | ARRAY_SIZE(gang), |
| 560 | BTRFS_ROOT_TRANS_TAG); | 534 | BTRFS_ROOT_TRANS_TAG); |
| 561 | if (ret == 0) | 535 | if (ret == 0) |
| 562 | break; | 536 | break; |
| 563 | for (i = 0; i < ret; i++) { | 537 | for (i = 0; i < ret; i++) { |
| 564 | root = gang[i]; | 538 | root = gang[i]; |
| 565 | radix_tree_tag_clear(radix, | 539 | radix_tree_tag_clear(&fs_info->fs_roots_radix, |
| 566 | (unsigned long)root->root_key.objectid, | 540 | (unsigned long)root->root_key.objectid, |
| 567 | BTRFS_ROOT_TRANS_TAG); | 541 | BTRFS_ROOT_TRANS_TAG); |
| 568 | |||
| 569 | BUG_ON(!root->ref_tree); | ||
| 570 | dirty = root->dirty_root; | ||
| 571 | 542 | ||
| 572 | btrfs_free_log(trans, root); | 543 | btrfs_free_log(trans, root); |
| 573 | btrfs_free_reloc_root(trans, root); | 544 | btrfs_update_reloc_root(trans, root); |
| 574 | |||
| 575 | if (root->commit_root == root->node) { | ||
| 576 | WARN_ON(root->node->start != | ||
| 577 | btrfs_root_bytenr(&root->root_item)); | ||
| 578 | |||
| 579 | free_extent_buffer(root->commit_root); | ||
| 580 | root->commit_root = NULL; | ||
| 581 | root->dirty_root = NULL; | ||
| 582 | |||
| 583 | spin_lock(&root->list_lock); | ||
| 584 | list_del_init(&dirty->root->dead_list); | ||
| 585 | spin_unlock(&root->list_lock); | ||
| 586 | 545 | ||
| 587 | kfree(dirty->root); | 546 | if (root->commit_root == root->node) |
| 588 | kfree(dirty); | ||
| 589 | |||
| 590 | /* make sure to update the root on disk | ||
| 591 | * so we get any updates to the block used | ||
| 592 | * counts | ||
| 593 | */ | ||
| 594 | err = btrfs_update_root(trans, | ||
| 595 | root->fs_info->tree_root, | ||
| 596 | &root->root_key, | ||
| 597 | &root->root_item); | ||
| 598 | continue; | 547 | continue; |
| 599 | } | ||
| 600 | 548 | ||
| 601 | memset(&root->root_item.drop_progress, 0, | 549 | free_extent_buffer(root->commit_root); |
| 602 | sizeof(struct btrfs_disk_key)); | 550 | root->commit_root = btrfs_root_node(root); |
| 603 | root->root_item.drop_level = 0; | 551 | |
| 604 | root->commit_root = NULL; | 552 | btrfs_set_root_node(&root->root_item, root->node); |
| 605 | root->dirty_root = NULL; | 553 | err = btrfs_update_root(trans, fs_info->tree_root, |
| 606 | root->root_key.offset = root->fs_info->generation; | ||
| 607 | btrfs_set_root_bytenr(&root->root_item, | ||
| 608 | root->node->start); | ||
| 609 | btrfs_set_root_level(&root->root_item, | ||
| 610 | btrfs_header_level(root->node)); | ||
| 611 | btrfs_set_root_generation(&root->root_item, | ||
| 612 | root->root_key.offset); | ||
| 613 | |||
| 614 | err = btrfs_insert_root(trans, root->fs_info->tree_root, | ||
| 615 | &root->root_key, | 554 | &root->root_key, |
| 616 | &root->root_item); | 555 | &root->root_item); |
| 617 | if (err) | 556 | if (err) |
| 618 | break; | 557 | break; |
| 619 | |||
| 620 | refs = btrfs_root_refs(&dirty->root->root_item); | ||
| 621 | btrfs_set_root_refs(&dirty->root->root_item, refs - 1); | ||
| 622 | err = btrfs_update_root(trans, root->fs_info->tree_root, | ||
| 623 | &dirty->root->root_key, | ||
| 624 | &dirty->root->root_item); | ||
| 625 | |||
| 626 | BUG_ON(err); | ||
| 627 | if (refs == 1) { | ||
| 628 | list_add(&dirty->list, list); | ||
| 629 | } else { | ||
| 630 | WARN_ON(1); | ||
| 631 | free_extent_buffer(dirty->root->node); | ||
| 632 | kfree(dirty->root); | ||
| 633 | kfree(dirty); | ||
| 634 | } | ||
| 635 | } | 558 | } |
| 636 | } | 559 | } |
| 637 | return err; | 560 | return err; |
| @@ -688,12 +611,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) | |||
| 688 | TASK_UNINTERRUPTIBLE); | 611 | TASK_UNINTERRUPTIBLE); |
| 689 | mutex_unlock(&info->trans_mutex); | 612 | mutex_unlock(&info->trans_mutex); |
| 690 | 613 | ||
| 691 | atomic_dec(&info->throttles); | ||
| 692 | wake_up(&info->transaction_throttle); | ||
| 693 | |||
| 694 | schedule(); | 614 | schedule(); |
| 695 | 615 | ||
| 696 | atomic_inc(&info->throttles); | ||
| 697 | mutex_lock(&info->trans_mutex); | 616 | mutex_lock(&info->trans_mutex); |
| 698 | finish_wait(&info->transaction_wait, &wait); | 617 | finish_wait(&info->transaction_wait, &wait); |
| 699 | } | 618 | } |
| @@ -705,111 +624,61 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) | |||
| 705 | * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on | 624 | * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on |
| 706 | * all of them | 625 | * all of them |
| 707 | */ | 626 | */ |
| 708 | static noinline int drop_dirty_roots(struct btrfs_root *tree_root, | 627 | int btrfs_drop_dead_root(struct btrfs_root *root) |
| 709 | struct list_head *list) | ||
| 710 | { | 628 | { |
| 711 | struct btrfs_dirty_root *dirty; | ||
| 712 | struct btrfs_trans_handle *trans; | 629 | struct btrfs_trans_handle *trans; |
| 630 | struct btrfs_root *tree_root = root->fs_info->tree_root; | ||
| 713 | unsigned long nr; | 631 | unsigned long nr; |
| 714 | u64 num_bytes; | 632 | int ret; |
| 715 | u64 bytes_used; | ||
| 716 | u64 max_useless; | ||
| 717 | int ret = 0; | ||
| 718 | int err; | ||
| 719 | |||
| 720 | while (!list_empty(list)) { | ||
| 721 | struct btrfs_root *root; | ||
| 722 | |||
| 723 | dirty = list_entry(list->prev, struct btrfs_dirty_root, list); | ||
| 724 | list_del_init(&dirty->list); | ||
| 725 | |||
| 726 | num_bytes = btrfs_root_used(&dirty->root->root_item); | ||
| 727 | root = dirty->latest_root; | ||
| 728 | atomic_inc(&root->fs_info->throttles); | ||
| 729 | |||
| 730 | while (1) { | ||
| 731 | /* | ||
| 732 | * we don't want to jump in and create a bunch of | ||
| 733 | * delayed refs if the transaction is starting to close | ||
| 734 | */ | ||
| 735 | wait_transaction_pre_flush(tree_root->fs_info); | ||
| 736 | trans = btrfs_start_transaction(tree_root, 1); | ||
| 737 | |||
| 738 | /* | ||
| 739 | * we've joined a transaction, make sure it isn't | ||
| 740 | * closing right now | ||
| 741 | */ | ||
| 742 | if (trans->transaction->delayed_refs.flushing) { | ||
| 743 | btrfs_end_transaction(trans, tree_root); | ||
| 744 | continue; | ||
| 745 | } | ||
| 746 | |||
| 747 | mutex_lock(&root->fs_info->drop_mutex); | ||
| 748 | ret = btrfs_drop_snapshot(trans, dirty->root); | ||
| 749 | if (ret != -EAGAIN) | ||
| 750 | break; | ||
| 751 | mutex_unlock(&root->fs_info->drop_mutex); | ||
| 752 | 633 | ||
| 753 | err = btrfs_update_root(trans, | 634 | while (1) { |
| 754 | tree_root, | 635 | /* |
| 755 | &dirty->root->root_key, | 636 | * we don't want to jump in and create a bunch of |
| 756 | &dirty->root->root_item); | 637 | * delayed refs if the transaction is starting to close |
| 757 | if (err) | 638 | */ |
| 758 | ret = err; | 639 | wait_transaction_pre_flush(tree_root->fs_info); |
| 759 | nr = trans->blocks_used; | 640 | trans = btrfs_start_transaction(tree_root, 1); |
| 760 | ret = btrfs_end_transaction(trans, tree_root); | ||
| 761 | BUG_ON(ret); | ||
| 762 | 641 | ||
| 763 | btrfs_btree_balance_dirty(tree_root, nr); | 642 | /* |
| 764 | cond_resched(); | 643 | * we've joined a transaction, make sure it isn't |
| 644 | * closing right now | ||
| 645 | */ | ||
| 646 | if (trans->transaction->delayed_refs.flushing) { | ||
| 647 | btrfs_end_transaction(trans, tree_root); | ||
| 648 | continue; | ||
| 765 | } | 649 | } |
| 766 | BUG_ON(ret); | ||
| 767 | atomic_dec(&root->fs_info->throttles); | ||
| 768 | wake_up(&root->fs_info->transaction_throttle); | ||
| 769 | 650 | ||
| 770 | num_bytes -= btrfs_root_used(&dirty->root->root_item); | 651 | ret = btrfs_drop_snapshot(trans, root); |
| 771 | bytes_used = btrfs_root_used(&root->root_item); | 652 | if (ret != -EAGAIN) |
| 772 | if (num_bytes) { | 653 | break; |
| 773 | mutex_lock(&root->fs_info->trans_mutex); | ||
| 774 | btrfs_record_root_in_trans(root); | ||
| 775 | mutex_unlock(&root->fs_info->trans_mutex); | ||
| 776 | btrfs_set_root_used(&root->root_item, | ||
| 777 | bytes_used - num_bytes); | ||
| 778 | } | ||
| 779 | 654 | ||
| 780 | ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); | 655 | ret = btrfs_update_root(trans, tree_root, |
| 781 | if (ret) { | 656 | &root->root_key, |
| 782 | BUG(); | 657 | &root->root_item); |
| 658 | if (ret) | ||
| 783 | break; | 659 | break; |
| 784 | } | ||
| 785 | mutex_unlock(&root->fs_info->drop_mutex); | ||
| 786 | |||
| 787 | spin_lock(&root->list_lock); | ||
| 788 | list_del_init(&dirty->root->dead_list); | ||
| 789 | if (!list_empty(&root->dead_list)) { | ||
| 790 | struct btrfs_root *oldest; | ||
| 791 | oldest = list_entry(root->dead_list.prev, | ||
| 792 | struct btrfs_root, dead_list); | ||
| 793 | max_useless = oldest->root_key.offset - 1; | ||
| 794 | } else { | ||
| 795 | max_useless = root->root_key.offset - 1; | ||
| 796 | } | ||
| 797 | spin_unlock(&root->list_lock); | ||
| 798 | 660 | ||
| 799 | nr = trans->blocks_used; | 661 | nr = trans->blocks_used; |
| 800 | ret = btrfs_end_transaction(trans, tree_root); | 662 | ret = btrfs_end_transaction(trans, tree_root); |
| 801 | BUG_ON(ret); | 663 | BUG_ON(ret); |
| 802 | 664 | ||
| 803 | ret = btrfs_remove_leaf_refs(root, max_useless, 0); | ||
| 804 | BUG_ON(ret); | ||
| 805 | |||
| 806 | free_extent_buffer(dirty->root->node); | ||
| 807 | kfree(dirty->root); | ||
| 808 | kfree(dirty); | ||
| 809 | |||
| 810 | btrfs_btree_balance_dirty(tree_root, nr); | 665 | btrfs_btree_balance_dirty(tree_root, nr); |
| 811 | cond_resched(); | 666 | cond_resched(); |
| 812 | } | 667 | } |
| 668 | BUG_ON(ret); | ||
| 669 | |||
| 670 | ret = btrfs_del_root(trans, tree_root, &root->root_key); | ||
| 671 | BUG_ON(ret); | ||
| 672 | |||
| 673 | nr = trans->blocks_used; | ||
| 674 | ret = btrfs_end_transaction(trans, tree_root); | ||
| 675 | BUG_ON(ret); | ||
| 676 | |||
| 677 | free_extent_buffer(root->node); | ||
| 678 | free_extent_buffer(root->commit_root); | ||
| 679 | kfree(root); | ||
| 680 | |||
| 681 | btrfs_btree_balance_dirty(tree_root, nr); | ||
| 813 | return ret; | 682 | return ret; |
| 814 | } | 683 | } |
| 815 | 684 | ||
| @@ -839,24 +708,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 839 | if (ret) | 708 | if (ret) |
| 840 | goto fail; | 709 | goto fail; |
| 841 | 710 | ||
| 842 | btrfs_record_root_in_trans(root); | 711 | record_root_in_trans(trans, root); |
| 843 | btrfs_set_root_last_snapshot(&root->root_item, trans->transid); | 712 | btrfs_set_root_last_snapshot(&root->root_item, trans->transid); |
| 844 | memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); | 713 | memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); |
| 845 | 714 | ||
| 846 | key.objectid = objectid; | 715 | key.objectid = objectid; |
| 847 | key.offset = trans->transid; | 716 | key.offset = 0; |
| 848 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | 717 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); |
| 849 | 718 | ||
| 850 | old = btrfs_lock_root_node(root); | 719 | old = btrfs_lock_root_node(root); |
| 851 | btrfs_cow_block(trans, root, old, NULL, 0, &old); | 720 | btrfs_cow_block(trans, root, old, NULL, 0, &old); |
| 721 | btrfs_set_lock_blocking(old); | ||
| 852 | 722 | ||
| 853 | btrfs_copy_root(trans, root, old, &tmp, objectid); | 723 | btrfs_copy_root(trans, root, old, &tmp, objectid); |
| 854 | btrfs_tree_unlock(old); | 724 | btrfs_tree_unlock(old); |
| 855 | free_extent_buffer(old); | 725 | free_extent_buffer(old); |
| 856 | 726 | ||
| 857 | btrfs_set_root_bytenr(new_root_item, tmp->start); | 727 | btrfs_set_root_node(new_root_item, tmp); |
| 858 | btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); | ||
| 859 | btrfs_set_root_generation(new_root_item, trans->transid); | ||
| 860 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, | 728 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, |
| 861 | new_root_item); | 729 | new_root_item); |
| 862 | btrfs_tree_unlock(tmp); | 730 | btrfs_tree_unlock(tmp); |
| @@ -964,6 +832,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, | |||
| 964 | return 0; | 832 | return 0; |
| 965 | } | 833 | } |
| 966 | 834 | ||
| 835 | static void update_super_roots(struct btrfs_root *root) | ||
| 836 | { | ||
| 837 | struct btrfs_root_item *root_item; | ||
| 838 | struct btrfs_super_block *super; | ||
| 839 | |||
| 840 | super = &root->fs_info->super_copy; | ||
| 841 | |||
| 842 | root_item = &root->fs_info->chunk_root->root_item; | ||
| 843 | super->chunk_root = root_item->bytenr; | ||
| 844 | super->chunk_root_generation = root_item->generation; | ||
| 845 | super->chunk_root_level = root_item->level; | ||
| 846 | |||
| 847 | root_item = &root->fs_info->tree_root->root_item; | ||
| 848 | super->root = root_item->bytenr; | ||
| 849 | super->generation = root_item->generation; | ||
| 850 | super->root_level = root_item->level; | ||
| 851 | } | ||
| 852 | |||
| 967 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 853 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
| 968 | struct btrfs_root *root) | 854 | struct btrfs_root *root) |
| 969 | { | 855 | { |
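The new update_super_roots() above copies the already-committed bytenr, generation and level of the tree root and chunk root from their root items into the in-memory superblock, replacing the six individual btrfs_set_super_*() calls removed further down in this file. A simplified stand-alone sketch of that copy, using stand-in structs rather than the on-disk layout:

/*
 * Sketch of update_super_roots(): the superblock just mirrors fields that
 * were already written into the two root items during the commit.
 */
#include <stdio.h>
#include <stdint.h>

struct root_item {
	uint64_t bytenr;
	uint64_t generation;
	uint8_t level;
};

struct super_copy {
	uint64_t root;
	uint64_t generation;
	uint8_t root_level;
	uint64_t chunk_root;
	uint64_t chunk_root_generation;
	uint8_t chunk_root_level;
};

static void update_super_roots(struct super_copy *super,
			       const struct root_item *tree_root,
			       const struct root_item *chunk_root)
{
	super->chunk_root = chunk_root->bytenr;
	super->chunk_root_generation = chunk_root->generation;
	super->chunk_root_level = chunk_root->level;

	super->root = tree_root->bytenr;
	super->generation = tree_root->generation;
	super->root_level = tree_root->level;
}

int main(void)
{
	struct root_item tree_root = { 131072, 42, 1 };
	struct root_item chunk_root = { 262144, 42, 0 };
	struct super_copy super = { 0 };

	update_super_roots(&super, &tree_root, &chunk_root);
	printf("super: root=%llu gen=%llu chunk_root=%llu\n",
	       (unsigned long long)super.root,
	       (unsigned long long)super.generation,
	       (unsigned long long)super.chunk_root);
	return 0;
}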
| @@ -971,8 +857,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 971 | unsigned long timeout = 1; | 857 | unsigned long timeout = 1; |
| 972 | struct btrfs_transaction *cur_trans; | 858 | struct btrfs_transaction *cur_trans; |
| 973 | struct btrfs_transaction *prev_trans = NULL; | 859 | struct btrfs_transaction *prev_trans = NULL; |
| 974 | struct btrfs_root *chunk_root = root->fs_info->chunk_root; | ||
| 975 | struct list_head dirty_fs_roots; | ||
| 976 | struct extent_io_tree *pinned_copy; | 860 | struct extent_io_tree *pinned_copy; |
| 977 | DEFINE_WAIT(wait); | 861 | DEFINE_WAIT(wait); |
| 978 | int ret; | 862 | int ret; |
| @@ -999,7 +883,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 999 | BUG_ON(ret); | 883 | BUG_ON(ret); |
| 1000 | 884 | ||
| 1001 | mutex_lock(&root->fs_info->trans_mutex); | 885 | mutex_lock(&root->fs_info->trans_mutex); |
| 1002 | INIT_LIST_HEAD(&dirty_fs_roots); | ||
| 1003 | if (cur_trans->in_commit) { | 886 | if (cur_trans->in_commit) { |
| 1004 | cur_trans->use_count++; | 887 | cur_trans->use_count++; |
| 1005 | mutex_unlock(&root->fs_info->trans_mutex); | 888 | mutex_unlock(&root->fs_info->trans_mutex); |
| @@ -1105,41 +988,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1105 | * with the tree-log code. | 988 | * with the tree-log code. |
| 1106 | */ | 989 | */ |
| 1107 | mutex_lock(&root->fs_info->tree_log_mutex); | 990 | mutex_lock(&root->fs_info->tree_log_mutex); |
| 1108 | /* | ||
| 1109 | * keep tree reloc code from adding new reloc trees | ||
| 1110 | */ | ||
| 1111 | mutex_lock(&root->fs_info->tree_reloc_mutex); | ||
| 1112 | |||
| 1113 | 991 | ||
| 1114 | ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, | 992 | ret = commit_fs_roots(trans, root); |
| 1115 | &dirty_fs_roots); | ||
| 1116 | BUG_ON(ret); | 993 | BUG_ON(ret); |
| 1117 | 994 | ||
| 1118 | /* add_dirty_roots gets rid of all the tree log roots, it is now | 995 | /* commit_fs_roots gets rid of all the tree log roots, it is now |
| 1119 | * safe to free the root of tree log roots | 996 | * safe to free the root of tree log roots |
| 1120 | */ | 997 | */ |
| 1121 | btrfs_free_log_root_tree(trans, root->fs_info); | 998 | btrfs_free_log_root_tree(trans, root->fs_info); |
| 1122 | 999 | ||
| 1123 | ret = btrfs_commit_tree_roots(trans, root); | 1000 | ret = commit_cowonly_roots(trans, root); |
| 1124 | BUG_ON(ret); | 1001 | BUG_ON(ret); |
| 1125 | 1002 | ||
| 1126 | cur_trans = root->fs_info->running_transaction; | 1003 | cur_trans = root->fs_info->running_transaction; |
| 1127 | spin_lock(&root->fs_info->new_trans_lock); | 1004 | spin_lock(&root->fs_info->new_trans_lock); |
| 1128 | root->fs_info->running_transaction = NULL; | 1005 | root->fs_info->running_transaction = NULL; |
| 1129 | spin_unlock(&root->fs_info->new_trans_lock); | 1006 | spin_unlock(&root->fs_info->new_trans_lock); |
| 1130 | btrfs_set_super_generation(&root->fs_info->super_copy, | 1007 | |
| 1131 | cur_trans->transid); | 1008 | btrfs_set_root_node(&root->fs_info->tree_root->root_item, |
| 1132 | btrfs_set_super_root(&root->fs_info->super_copy, | 1009 | root->fs_info->tree_root->node); |
| 1133 | root->fs_info->tree_root->node->start); | 1010 | free_extent_buffer(root->fs_info->tree_root->commit_root); |
| 1134 | btrfs_set_super_root_level(&root->fs_info->super_copy, | 1011 | root->fs_info->tree_root->commit_root = |
| 1135 | btrfs_header_level(root->fs_info->tree_root->node)); | 1012 | btrfs_root_node(root->fs_info->tree_root); |
| 1136 | 1013 | ||
| 1137 | btrfs_set_super_chunk_root(&root->fs_info->super_copy, | 1014 | btrfs_set_root_node(&root->fs_info->chunk_root->root_item, |
| 1138 | chunk_root->node->start); | 1015 | root->fs_info->chunk_root->node); |
| 1139 | btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, | 1016 | free_extent_buffer(root->fs_info->chunk_root->commit_root); |
| 1140 | btrfs_header_level(chunk_root->node)); | 1017 | root->fs_info->chunk_root->commit_root = |
| 1141 | btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, | 1018 | btrfs_root_node(root->fs_info->chunk_root); |
| 1142 | btrfs_header_generation(chunk_root->node)); | 1019 | |
| 1020 | update_super_roots(root); | ||
| 1143 | 1021 | ||
| 1144 | if (!root->fs_info->log_root_recovering) { | 1022 | if (!root->fs_info->log_root_recovering) { |
| 1145 | btrfs_set_super_log_root(&root->fs_info->super_copy, 0); | 1023 | btrfs_set_super_log_root(&root->fs_info->super_copy, 0); |
| @@ -1153,7 +1031,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1153 | 1031 | ||
| 1154 | trans->transaction->blocked = 0; | 1032 | trans->transaction->blocked = 0; |
| 1155 | 1033 | ||
| 1156 | wake_up(&root->fs_info->transaction_throttle); | ||
| 1157 | wake_up(&root->fs_info->transaction_wait); | 1034 | wake_up(&root->fs_info->transaction_wait); |
| 1158 | 1035 | ||
| 1159 | mutex_unlock(&root->fs_info->trans_mutex); | 1036 | mutex_unlock(&root->fs_info->trans_mutex); |
| @@ -1170,9 +1047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1170 | btrfs_finish_extent_commit(trans, root, pinned_copy); | 1047 | btrfs_finish_extent_commit(trans, root, pinned_copy); |
| 1171 | kfree(pinned_copy); | 1048 | kfree(pinned_copy); |
| 1172 | 1049 | ||
| 1173 | btrfs_drop_dead_reloc_roots(root); | ||
| 1174 | mutex_unlock(&root->fs_info->tree_reloc_mutex); | ||
| 1175 | |||
| 1176 | /* do the directory inserts of any pending snapshot creations */ | 1050 | /* do the directory inserts of any pending snapshot creations */ |
| 1177 | finish_pending_snapshots(trans, root->fs_info); | 1051 | finish_pending_snapshots(trans, root->fs_info); |
| 1178 | 1052 | ||
| @@ -1186,16 +1060,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1186 | put_transaction(cur_trans); | 1060 | put_transaction(cur_trans); |
| 1187 | put_transaction(cur_trans); | 1061 | put_transaction(cur_trans); |
| 1188 | 1062 | ||
| 1189 | list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); | ||
| 1190 | if (root->fs_info->closing) | ||
| 1191 | list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); | ||
| 1192 | |||
| 1193 | mutex_unlock(&root->fs_info->trans_mutex); | 1063 | mutex_unlock(&root->fs_info->trans_mutex); |
| 1194 | 1064 | ||
| 1195 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 1065 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
| 1196 | |||
| 1197 | if (root->fs_info->closing) | ||
| 1198 | drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); | ||
| 1199 | return ret; | 1066 | return ret; |
| 1200 | } | 1067 | } |
| 1201 | 1068 | ||
| @@ -1204,16 +1071,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1204 | */ | 1071 | */ |
| 1205 | int btrfs_clean_old_snapshots(struct btrfs_root *root) | 1072 | int btrfs_clean_old_snapshots(struct btrfs_root *root) |
| 1206 | { | 1073 | { |
| 1207 | struct list_head dirty_roots; | 1074 | LIST_HEAD(list); |
| 1208 | INIT_LIST_HEAD(&dirty_roots); | 1075 | struct btrfs_fs_info *fs_info = root->fs_info; |
| 1209 | again: | 1076 | |
| 1210 | mutex_lock(&root->fs_info->trans_mutex); | 1077 | mutex_lock(&fs_info->trans_mutex); |
| 1211 | list_splice_init(&root->fs_info->dead_roots, &dirty_roots); | 1078 | list_splice_init(&fs_info->dead_roots, &list); |
| 1212 | mutex_unlock(&root->fs_info->trans_mutex); | 1079 | mutex_unlock(&fs_info->trans_mutex); |
| 1213 | 1080 | ||
| 1214 | if (!list_empty(&dirty_roots)) { | 1081 | while (!list_empty(&list)) { |
| 1215 | drop_dirty_roots(root, &dirty_roots); | 1082 | root = list_entry(list.next, struct btrfs_root, root_list); |
| 1216 | goto again; | 1083 | list_del_init(&root->root_list); |
| 1084 | btrfs_drop_dead_root(root); | ||
| 1217 | } | 1085 | } |
| 1218 | return 0; | 1086 | return 0; |
| 1219 | } | 1087 | } |
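The rewritten btrfs_clean_old_snapshots() above uses a splice-and-drain pattern: the whole dead_roots list is moved onto a private list while trans_mutex is held, and each root is then dropped with the lock released. The sketch below imitates that with a tiny hand-rolled intrusive list and a pthread mutex; it is an illustration only, not the kernel list API.

/*
 * Splice-and-drain sketch: grab the whole list under the lock, then process
 * entries without it.  list_splice_init() here assumes dst starts empty.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct list_node { struct list_node *next, *prev; };

struct dead_root {
	int id;
	struct list_node list;
};

static void list_init(struct list_node *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_node *n, struct list_node *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* move everything from src onto the empty list dst and leave src empty */
static void list_splice_init(struct list_node *src, struct list_node *dst)
{
	if (src->next == src)
		return;
	dst->next = src->next;
	dst->prev = src->prev;
	src->next->prev = dst;
	src->prev->next = dst;
	list_init(src);
}

static pthread_mutex_t trans_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct list_node dead_roots;

int main(void)
{
	struct list_node local;
	int i;

	list_init(&dead_roots);
	list_init(&local);

	for (i = 0; i < 3; i++) {
		struct dead_root *r = malloc(sizeof(*r));
		r->id = i;
		list_add_tail(&r->list, &dead_roots);
	}

	/* grab the whole list under the lock, then work without it */
	pthread_mutex_lock(&trans_mutex);
	list_splice_init(&dead_roots, &local);
	pthread_mutex_unlock(&trans_mutex);

	while (local.next != &local) {
		struct dead_root *r = (struct dead_root *)
			((char *)local.next - offsetof(struct dead_root, list));
		list_del(&r->list);
		printf("dropping dead root %d\n", r->id);
		free(r);
	}
	return 0;
}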
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94f5bde2b58d..961c3ee5a2e1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
| @@ -62,12 +62,6 @@ struct btrfs_pending_snapshot { | |||
| 62 | struct list_head list; | 62 | struct list_head list; |
| 63 | }; | 63 | }; |
| 64 | 64 | ||
| 65 | struct btrfs_dirty_root { | ||
| 66 | struct list_head list; | ||
| 67 | struct btrfs_root *root; | ||
| 68 | struct btrfs_root *latest_root; | ||
| 69 | }; | ||
| 70 | |||
| 71 | static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, | 65 | static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, |
| 72 | struct inode *inode) | 66 | struct inode *inode) |
| 73 | { | 67 | { |
| @@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | |||
| 100 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | 94 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, |
| 101 | struct btrfs_root *root); | 95 | struct btrfs_root *root); |
| 102 | 96 | ||
| 103 | int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); | 97 | int btrfs_add_dead_root(struct btrfs_root *root); |
| 98 | int btrfs_drop_dead_root(struct btrfs_root *root); | ||
| 104 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); | 99 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); |
| 105 | int btrfs_clean_old_snapshots(struct btrfs_root *root); | 100 | int btrfs_clean_old_snapshots(struct btrfs_root *root); |
| 106 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 101 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
| @@ -108,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 108 | int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, | 103 | int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, |
| 109 | struct btrfs_root *root); | 104 | struct btrfs_root *root); |
| 110 | void btrfs_throttle(struct btrfs_root *root); | 105 | void btrfs_throttle(struct btrfs_root *root); |
| 111 | int btrfs_record_root_in_trans(struct btrfs_root *root); | 106 | int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, |
| 107 | struct btrfs_root *root); | ||
| 112 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | 108 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, |
| 113 | struct extent_io_tree *dirty_pages); | 109 | struct extent_io_tree *dirty_pages); |
| 114 | #endif | 110 | #endif |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index db5e212e8445..c13922206d1b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
| @@ -430,18 +430,16 @@ no_copy: | |||
| 430 | static noinline struct inode *read_one_inode(struct btrfs_root *root, | 430 | static noinline struct inode *read_one_inode(struct btrfs_root *root, |
| 431 | u64 objectid) | 431 | u64 objectid) |
| 432 | { | 432 | { |
| 433 | struct btrfs_key key; | ||
| 433 | struct inode *inode; | 434 | struct inode *inode; |
| 434 | inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); | ||
| 435 | if (inode->i_state & I_NEW) { | ||
| 436 | BTRFS_I(inode)->root = root; | ||
| 437 | BTRFS_I(inode)->location.objectid = objectid; | ||
| 438 | BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; | ||
| 439 | BTRFS_I(inode)->location.offset = 0; | ||
| 440 | btrfs_read_locked_inode(inode); | ||
| 441 | unlock_new_inode(inode); | ||
| 442 | 435 | ||
| 443 | } | 436 | key.objectid = objectid; |
| 444 | if (is_bad_inode(inode)) { | 437 | key.type = BTRFS_INODE_ITEM_KEY; |
| 438 | key.offset = 0; | ||
| 439 | inode = btrfs_iget(root->fs_info->sb, &key, root); | ||
| 440 | if (IS_ERR(inode)) { | ||
| 441 | inode = NULL; | ||
| 442 | } else if (is_bad_inode(inode)) { | ||
| 445 | iput(inode); | 443 | iput(inode); |
| 446 | inode = NULL; | 444 | inode = NULL; |
| 447 | } | 445 | } |
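The new read_one_inode() above relies on btrfs_iget() returning an ERR_PTR-encoded pointer, so the caller only has to distinguish an error pointer from a bad inode and map both cases to NULL. A userspace imitation of that convention; the ERR_PTR/IS_ERR helpers, struct sim_inode and sim_iget() below are stand-ins, not the kernel's err.h or btrfs code.

/*
 * Sketch of the errno-in-pointer convention: an error is encoded in the
 * pointer value itself, so one return covers both "here is the inode" and
 * "here is -ENOENT".
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct sim_inode {
	unsigned long ino;
	int bad;                /* stands in for is_bad_inode() */
};

static struct sim_inode *sim_iget(unsigned long objectid)
{
	struct sim_inode *inode;

	if (objectid == 0)
		return ERR_PTR(-ENOENT);        /* no such object */

	inode = calloc(1, sizeof(*inode));
	if (!inode)
		return ERR_PTR(-ENOMEM);
	inode->ino = objectid;
	inode->bad = (objectid == 13);          /* pretend this one is corrupt */
	return inode;
}

/* mirrors the error handling shape of the rewritten read_one_inode() */
static struct sim_inode *read_one_inode(unsigned long objectid)
{
	struct sim_inode *inode = sim_iget(objectid);

	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (inode->bad) {
		free(inode);                    /* iput() in the real code */
		inode = NULL;
	}
	return inode;
}

int main(void)
{
	printf("objectid 7:  %p\n", (void *)read_one_inode(7));
	printf("objectid 13: %p\n", (void *)read_one_inode(13));
	printf("objectid 0:  %p\n", (void *)read_one_inode(0));
	return 0;
}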
| @@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 541 | 539 | ||
| 542 | if (found_type == BTRFS_FILE_EXTENT_REG || | 540 | if (found_type == BTRFS_FILE_EXTENT_REG || |
| 543 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | 541 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { |
| 542 | u64 offset; | ||
| 544 | unsigned long dest_offset; | 543 | unsigned long dest_offset; |
| 545 | struct btrfs_key ins; | 544 | struct btrfs_key ins; |
| 546 | 545 | ||
| @@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 555 | ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); | 554 | ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); |
| 556 | ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); | 555 | ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); |
| 557 | ins.type = BTRFS_EXTENT_ITEM_KEY; | 556 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
| 557 | offset = key->offset - btrfs_file_extent_offset(eb, item); | ||
| 558 | 558 | ||
| 559 | if (ins.objectid > 0) { | 559 | if (ins.objectid > 0) { |
| 560 | u64 csum_start; | 560 | u64 csum_start; |
| @@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 569 | if (ret == 0) { | 569 | if (ret == 0) { |
| 570 | ret = btrfs_inc_extent_ref(trans, root, | 570 | ret = btrfs_inc_extent_ref(trans, root, |
| 571 | ins.objectid, ins.offset, | 571 | ins.objectid, ins.offset, |
| 572 | path->nodes[0]->start, | 572 | 0, root->root_key.objectid, |
| 573 | root->root_key.objectid, | 573 | key->objectid, offset); |
| 574 | trans->transid, key->objectid); | ||
| 575 | } else { | 574 | } else { |
| 576 | /* | 575 | /* |
| 577 | * insert the extent pointer in the extent | 576 | * insert the extent pointer in the extent |
| 578 | * allocation tree | 577 | * allocation tree |
| 579 | */ | 578 | */ |
| 580 | ret = btrfs_alloc_logged_extent(trans, root, | 579 | ret = btrfs_alloc_logged_file_extent(trans, |
| 581 | path->nodes[0]->start, | 580 | root, root->root_key.objectid, |
| 582 | root->root_key.objectid, | 581 | key->objectid, offset, &ins); |
| 583 | trans->transid, key->objectid, | ||
| 584 | &ins); | ||
| 585 | BUG_ON(ret); | 582 | BUG_ON(ret); |
| 586 | } | 583 | } |
| 587 | btrfs_release_path(root, path); | 584 | btrfs_release_path(root, path); |
| @@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | |||
| 1706 | btrfs_wait_tree_block_writeback(next); | 1703 | btrfs_wait_tree_block_writeback(next); |
| 1707 | btrfs_tree_unlock(next); | 1704 | btrfs_tree_unlock(next); |
| 1708 | 1705 | ||
| 1709 | ret = btrfs_drop_leaf_ref(trans, root, next); | ||
| 1710 | BUG_ON(ret); | ||
| 1711 | |||
| 1712 | WARN_ON(root_owner != | 1706 | WARN_ON(root_owner != |
| 1713 | BTRFS_TREE_LOG_OBJECTID); | 1707 | BTRFS_TREE_LOG_OBJECTID); |
| 1714 | ret = btrfs_free_reserved_extent(root, | 1708 | ret = btrfs_free_reserved_extent(root, |
| @@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | |||
| 1753 | btrfs_wait_tree_block_writeback(next); | 1747 | btrfs_wait_tree_block_writeback(next); |
| 1754 | btrfs_tree_unlock(next); | 1748 | btrfs_tree_unlock(next); |
| 1755 | 1749 | ||
| 1756 | if (*level == 0) { | ||
| 1757 | ret = btrfs_drop_leaf_ref(trans, root, next); | ||
| 1758 | BUG_ON(ret); | ||
| 1759 | } | ||
| 1760 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); | 1750 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); |
| 1761 | ret = btrfs_free_reserved_extent(root, bytenr, blocksize); | 1751 | ret = btrfs_free_reserved_extent(root, bytenr, blocksize); |
| 1762 | BUG_ON(ret); | 1752 | BUG_ON(ret); |
| @@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, | |||
| 1811 | btrfs_wait_tree_block_writeback(next); | 1801 | btrfs_wait_tree_block_writeback(next); |
| 1812 | btrfs_tree_unlock(next); | 1802 | btrfs_tree_unlock(next); |
| 1813 | 1803 | ||
| 1814 | if (*level == 0) { | ||
| 1815 | ret = btrfs_drop_leaf_ref(trans, root, | ||
| 1816 | next); | ||
| 1817 | BUG_ON(ret); | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); | 1804 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); |
| 1821 | ret = btrfs_free_reserved_extent(root, | 1805 | ret = btrfs_free_reserved_extent(root, |
| 1822 | path->nodes[*level]->start, | 1806 | path->nodes[*level]->start, |
| @@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, | |||
| 1884 | btrfs_wait_tree_block_writeback(next); | 1868 | btrfs_wait_tree_block_writeback(next); |
| 1885 | btrfs_tree_unlock(next); | 1869 | btrfs_tree_unlock(next); |
| 1886 | 1870 | ||
| 1887 | if (orig_level == 0) { | ||
| 1888 | ret = btrfs_drop_leaf_ref(trans, log, | ||
| 1889 | next); | ||
| 1890 | BUG_ON(ret); | ||
| 1891 | } | ||
| 1892 | WARN_ON(log->root_key.objectid != | 1871 | WARN_ON(log->root_key.objectid != |
| 1893 | BTRFS_TREE_LOG_OBJECTID); | 1872 | BTRFS_TREE_LOG_OBJECTID); |
| 1894 | ret = btrfs_free_reserved_extent(log, next->start, | 1873 | ret = btrfs_free_reserved_extent(log, next->start, |
| @@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2027 | ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); | 2006 | ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); |
| 2028 | BUG_ON(ret); | 2007 | BUG_ON(ret); |
| 2029 | 2008 | ||
| 2030 | btrfs_set_root_bytenr(&log->root_item, log->node->start); | 2009 | btrfs_set_root_node(&log->root_item, log->node); |
| 2031 | btrfs_set_root_generation(&log->root_item, trans->transid); | ||
| 2032 | btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); | ||
| 2033 | 2010 | ||
| 2034 | root->log_batch = 0; | 2011 | root->log_batch = 0; |
| 2035 | root->log_transid++; | 2012 | root->log_transid++; |
| @@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, | |||
| 2581 | ins_keys, ins_sizes, nr); | 2558 | ins_keys, ins_sizes, nr); |
| 2582 | BUG_ON(ret); | 2559 | BUG_ON(ret); |
| 2583 | 2560 | ||
| 2584 | for (i = 0; i < nr; i++) { | 2561 | for (i = 0; i < nr; i++, dst_path->slots[0]++) { |
| 2585 | dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], | 2562 | dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], |
| 2586 | dst_path->slots[0]); | 2563 | dst_path->slots[0]); |
| 2587 | 2564 | ||
| @@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, | |||
| 2617 | found_type = btrfs_file_extent_type(src, extent); | 2594 | found_type = btrfs_file_extent_type(src, extent); |
| 2618 | if (found_type == BTRFS_FILE_EXTENT_REG || | 2595 | if (found_type == BTRFS_FILE_EXTENT_REG || |
| 2619 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | 2596 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { |
| 2620 | u64 ds = btrfs_file_extent_disk_bytenr(src, | 2597 | u64 ds, dl, cs, cl; |
| 2621 | extent); | 2598 | ds = btrfs_file_extent_disk_bytenr(src, |
| 2622 | u64 dl = btrfs_file_extent_disk_num_bytes(src, | 2599 | extent); |
| 2623 | extent); | 2600 | /* ds == 0 is a hole */ |
| 2624 | u64 cs = btrfs_file_extent_offset(src, extent); | 2601 | if (ds == 0) |
| 2625 | u64 cl = btrfs_file_extent_num_bytes(src, | 2602 | continue; |
| 2626 | extent);; | 2603 | |
| 2604 | dl = btrfs_file_extent_disk_num_bytes(src, | ||
| 2605 | extent); | ||
| 2606 | cs = btrfs_file_extent_offset(src, extent); | ||
| 2607 | cl = btrfs_file_extent_num_bytes(src, | ||
| 2608 | extent);; | ||
| 2627 | if (btrfs_file_extent_compression(src, | 2609 | if (btrfs_file_extent_compression(src, |
| 2628 | extent)) { | 2610 | extent)) { |
| 2629 | cs = 0; | 2611 | cs = 0; |
| 2630 | cl = dl; | 2612 | cl = dl; |
| 2631 | } | 2613 | } |
| 2632 | /* ds == 0 is a hole */ | 2614 | |
| 2633 | if (ds != 0) { | 2615 | ret = btrfs_lookup_csums_range( |
| 2634 | ret = btrfs_inc_extent_ref(trans, log, | 2616 | log->fs_info->csum_root, |
| 2635 | ds, dl, | 2617 | ds + cs, ds + cs + cl - 1, |
| 2636 | dst_path->nodes[0]->start, | 2618 | &ordered_sums); |
| 2637 | BTRFS_TREE_LOG_OBJECTID, | 2619 | BUG_ON(ret); |
| 2638 | trans->transid, | ||
| 2639 | ins_keys[i].objectid); | ||
| 2640 | BUG_ON(ret); | ||
| 2641 | ret = btrfs_lookup_csums_range( | ||
| 2642 | log->fs_info->csum_root, | ||
| 2643 | ds + cs, ds + cs + cl - 1, | ||
| 2644 | &ordered_sums); | ||
| 2645 | BUG_ON(ret); | ||
| 2646 | } | ||
| 2647 | } | 2620 | } |
| 2648 | } | 2621 | } |
| 2649 | dst_path->slots[0]++; | ||
| 2650 | } | 2622 | } |
| 2651 | 2623 | ||
| 2652 | btrfs_mark_buffer_dirty(dst_path->nodes[0]); | 2624 | btrfs_mark_buffer_dirty(dst_path->nodes[0]); |
| @@ -3029,9 +3001,7 @@ again: | |||
| 3029 | BUG_ON(!wc.replay_dest); | 3001 | BUG_ON(!wc.replay_dest); |
| 3030 | 3002 | ||
| 3031 | wc.replay_dest->log_root = log; | 3003 | wc.replay_dest->log_root = log; |
| 3032 | mutex_lock(&fs_info->trans_mutex); | 3004 | btrfs_record_root_in_trans(trans, wc.replay_dest); |
| 3033 | btrfs_record_root_in_trans(wc.replay_dest); | ||
| 3034 | mutex_unlock(&fs_info->trans_mutex); | ||
| 3035 | ret = walk_log_tree(trans, log, &wc); | 3005 | ret = walk_log_tree(trans, log, &wc); |
| 3036 | BUG_ON(ret); | 3006 | BUG_ON(ret); |
| 3037 | 3007 | ||
| @@ -3049,6 +3019,7 @@ again: | |||
| 3049 | key.offset = found_key.offset - 1; | 3019 | key.offset = found_key.offset - 1; |
| 3050 | wc.replay_dest->log_root = NULL; | 3020 | wc.replay_dest->log_root = NULL; |
| 3051 | free_extent_buffer(log->node); | 3021 | free_extent_buffer(log->node); |
| 3022 | free_extent_buffer(log->commit_root); | ||
| 3052 | kfree(log); | 3023 | kfree(log); |
| 3053 | 3024 | ||
| 3054 | if (found_key.offset == 0) | 3025 | if (found_key.offset == 0) |
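Note: the replay code above now resolves inodes with a keyed btrfs_iget() instead of filling in BTRFS_I(inode)->location and calling btrfs_read_locked_inode() by hand. Pulled out of the side-by-side columns, the new lookup reads roughly like this (a sketch only; the wrapper name is made up, the calls are the ones shown in the hunk):

	/* Illustrative condensation of the btrfs_iget()-based lookup above. */
	static struct inode *log_replay_iget(struct btrfs_root *root, u64 objectid)
	{
		struct btrfs_key key;
		struct inode *inode;

		key.objectid = objectid;
		key.type = BTRFS_INODE_ITEM_KEY;
		key.offset = 0;

		inode = btrfs_iget(root->fs_info->sb, &key, root);
		if (IS_ERR(inode))
			return NULL;		/* lookup failed outright */
		if (is_bad_inode(inode)) {	/* read succeeded, inode unusable */
			iput(inode);
			return NULL;
		}
		return inode;
	}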
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a6d35b0054ca..3ab80e9cd767 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
| @@ -161,8 +161,10 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) | |||
| 161 | int again = 0; | 161 | int again = 0; |
| 162 | unsigned long num_run; | 162 | unsigned long num_run; |
| 163 | unsigned long num_sync_run; | 163 | unsigned long num_sync_run; |
| 164 | unsigned long batch_run = 0; | ||
| 164 | unsigned long limit; | 165 | unsigned long limit; |
| 165 | unsigned long last_waited = 0; | 166 | unsigned long last_waited = 0; |
| 167 | int force_reg = 0; | ||
| 166 | 168 | ||
| 167 | bdi = blk_get_backing_dev_info(device->bdev); | 169 | bdi = blk_get_backing_dev_info(device->bdev); |
| 168 | fs_info = device->dev_root->fs_info; | 170 | fs_info = device->dev_root->fs_info; |
| @@ -176,19 +178,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) | |||
| 176 | 178 | ||
| 177 | loop: | 179 | loop: |
| 178 | spin_lock(&device->io_lock); | 180 | spin_lock(&device->io_lock); |
| 179 | num_run = 0; | ||
| 180 | 181 | ||
| 181 | loop_lock: | 182 | loop_lock: |
| 183 | num_run = 0; | ||
| 182 | 184 | ||
| 183 | /* take all the bios off the list at once and process them | 185 | /* take all the bios off the list at once and process them |
| 184 | * later on (without the lock held). But, remember the | 186 | * later on (without the lock held). But, remember the |
| 185 | * tail and other pointers so the bios can be properly reinserted | 187 | * tail and other pointers so the bios can be properly reinserted |
| 186 | * into the list if we hit congestion | 188 | * into the list if we hit congestion |
| 187 | */ | 189 | */ |
| 188 | if (device->pending_sync_bios.head) | 190 | if (!force_reg && device->pending_sync_bios.head) { |
| 189 | pending_bios = &device->pending_sync_bios; | 191 | pending_bios = &device->pending_sync_bios; |
| 190 | else | 192 | force_reg = 1; |
| 193 | } else { | ||
| 191 | pending_bios = &device->pending_bios; | 194 | pending_bios = &device->pending_bios; |
| 195 | force_reg = 0; | ||
| 196 | } | ||
| 192 | 197 | ||
| 193 | pending = pending_bios->head; | 198 | pending = pending_bios->head; |
| 194 | tail = pending_bios->tail; | 199 | tail = pending_bios->tail; |
| @@ -228,10 +233,14 @@ loop_lock: | |||
| 228 | while (pending) { | 233 | while (pending) { |
| 229 | 234 | ||
| 230 | rmb(); | 235 | rmb(); |
| 231 | if (pending_bios != &device->pending_sync_bios && | 236 | /* we want to work on both lists, but do more bios on the |
| 232 | device->pending_sync_bios.head && | 237 | * sync list than the regular list |
| 233 | num_run > 16) { | 238 | */ |
| 234 | cond_resched(); | 239 | if ((num_run > 32 && |
| 240 | pending_bios != &device->pending_sync_bios && | ||
| 241 | device->pending_sync_bios.head) || | ||
| 242 | (num_run > 64 && pending_bios == &device->pending_sync_bios && | ||
| 243 | device->pending_bios.head)) { | ||
| 235 | spin_lock(&device->io_lock); | 244 | spin_lock(&device->io_lock); |
| 236 | requeue_list(pending_bios, pending, tail); | 245 | requeue_list(pending_bios, pending, tail); |
| 237 | goto loop_lock; | 246 | goto loop_lock; |
| @@ -249,6 +258,8 @@ loop_lock: | |||
| 249 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); | 258 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); |
| 250 | submit_bio(cur->bi_rw, cur); | 259 | submit_bio(cur->bi_rw, cur); |
| 251 | num_run++; | 260 | num_run++; |
| 261 | batch_run++; | ||
| 262 | |||
| 252 | if (bio_sync(cur)) | 263 | if (bio_sync(cur)) |
| 253 | num_sync_run++; | 264 | num_sync_run++; |
| 254 | 265 | ||
| @@ -265,7 +276,7 @@ loop_lock: | |||
| 265 | * is now congested. Back off and let other work structs | 276 | * is now congested. Back off and let other work structs |
| 266 | * run instead | 277 | * run instead |
| 267 | */ | 278 | */ |
| 268 | if (pending && bdi_write_congested(bdi) && num_run > 16 && | 279 | if (pending && bdi_write_congested(bdi) && batch_run > 32 && |
| 269 | fs_info->fs_devices->open_devices > 1) { | 280 | fs_info->fs_devices->open_devices > 1) { |
| 270 | struct io_context *ioc; | 281 | struct io_context *ioc; |
| 271 | 282 | ||
| @@ -366,6 +377,7 @@ static noinline int device_list_add(const char *path, | |||
| 366 | memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); | 377 | memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); |
| 367 | fs_devices->latest_devid = devid; | 378 | fs_devices->latest_devid = devid; |
| 368 | fs_devices->latest_trans = found_transid; | 379 | fs_devices->latest_trans = found_transid; |
| 380 | mutex_init(&fs_devices->device_list_mutex); | ||
| 369 | device = NULL; | 381 | device = NULL; |
| 370 | } else { | 382 | } else { |
| 371 | device = __find_device(&fs_devices->devices, devid, | 383 | device = __find_device(&fs_devices->devices, devid, |
| @@ -392,7 +404,11 @@ static noinline int device_list_add(const char *path, | |||
| 392 | return -ENOMEM; | 404 | return -ENOMEM; |
| 393 | } | 405 | } |
| 394 | INIT_LIST_HEAD(&device->dev_alloc_list); | 406 | INIT_LIST_HEAD(&device->dev_alloc_list); |
| 407 | |||
| 408 | mutex_lock(&fs_devices->device_list_mutex); | ||
| 395 | list_add(&device->dev_list, &fs_devices->devices); | 409 | list_add(&device->dev_list, &fs_devices->devices); |
| 410 | mutex_unlock(&fs_devices->device_list_mutex); | ||
| 411 | |||
| 396 | device->fs_devices = fs_devices; | 412 | device->fs_devices = fs_devices; |
| 397 | fs_devices->num_devices++; | 413 | fs_devices->num_devices++; |
| 398 | } | 414 | } |
| @@ -418,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | |||
| 418 | INIT_LIST_HEAD(&fs_devices->devices); | 434 | INIT_LIST_HEAD(&fs_devices->devices); |
| 419 | INIT_LIST_HEAD(&fs_devices->alloc_list); | 435 | INIT_LIST_HEAD(&fs_devices->alloc_list); |
| 420 | INIT_LIST_HEAD(&fs_devices->list); | 436 | INIT_LIST_HEAD(&fs_devices->list); |
| 437 | mutex_init(&fs_devices->device_list_mutex); | ||
| 421 | fs_devices->latest_devid = orig->latest_devid; | 438 | fs_devices->latest_devid = orig->latest_devid; |
| 422 | fs_devices->latest_trans = orig->latest_trans; | 439 | fs_devices->latest_trans = orig->latest_trans; |
| 423 | memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); | 440 | memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); |
| 424 | 441 | ||
| 442 | mutex_lock(&orig->device_list_mutex); | ||
| 425 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { | 443 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { |
| 426 | device = kzalloc(sizeof(*device), GFP_NOFS); | 444 | device = kzalloc(sizeof(*device), GFP_NOFS); |
| 427 | if (!device) | 445 | if (!device) |
| @@ -443,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | |||
| 443 | device->fs_devices = fs_devices; | 461 | device->fs_devices = fs_devices; |
| 444 | fs_devices->num_devices++; | 462 | fs_devices->num_devices++; |
| 445 | } | 463 | } |
| 464 | mutex_unlock(&orig->device_list_mutex); | ||
| 446 | return fs_devices; | 465 | return fs_devices; |
| 447 | error: | 466 | error: |
| 467 | mutex_unlock(&orig->device_list_mutex); | ||
| 448 | free_fs_devices(fs_devices); | 468 | free_fs_devices(fs_devices); |
| 449 | return ERR_PTR(-ENOMEM); | 469 | return ERR_PTR(-ENOMEM); |
| 450 | } | 470 | } |
| @@ -455,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) | |||
| 455 | 475 | ||
| 456 | mutex_lock(&uuid_mutex); | 476 | mutex_lock(&uuid_mutex); |
| 457 | again: | 477 | again: |
| 478 | mutex_lock(&fs_devices->device_list_mutex); | ||
| 458 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { | 479 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { |
| 459 | if (device->in_fs_metadata) | 480 | if (device->in_fs_metadata) |
| 460 | continue; | 481 | continue; |
| @@ -474,6 +495,7 @@ again: | |||
| 474 | kfree(device->name); | 495 | kfree(device->name); |
| 475 | kfree(device); | 496 | kfree(device); |
| 476 | } | 497 | } |
| 498 | mutex_unlock(&fs_devices->device_list_mutex); | ||
| 477 | 499 | ||
| 478 | if (fs_devices->seed) { | 500 | if (fs_devices->seed) { |
| 479 | fs_devices = fs_devices->seed; | 501 | fs_devices = fs_devices->seed; |
| @@ -594,6 +616,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
| 594 | device->in_fs_metadata = 0; | 616 | device->in_fs_metadata = 0; |
| 595 | device->mode = flags; | 617 | device->mode = flags; |
| 596 | 618 | ||
| 619 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) | ||
| 620 | fs_devices->rotating = 1; | ||
| 621 | |||
| 597 | fs_devices->open_devices++; | 622 | fs_devices->open_devices++; |
| 598 | if (device->writeable) { | 623 | if (device->writeable) { |
| 599 | fs_devices->rw_devices++; | 624 | fs_devices->rw_devices++; |
| @@ -1121,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1121 | 1146 | ||
| 1122 | device = NULL; | 1147 | device = NULL; |
| 1123 | devices = &root->fs_info->fs_devices->devices; | 1148 | devices = &root->fs_info->fs_devices->devices; |
| 1149 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 1124 | list_for_each_entry(tmp, devices, dev_list) { | 1150 | list_for_each_entry(tmp, devices, dev_list) { |
| 1125 | if (tmp->in_fs_metadata && !tmp->bdev) { | 1151 | if (tmp->in_fs_metadata && !tmp->bdev) { |
| 1126 | device = tmp; | 1152 | device = tmp; |
| 1127 | break; | 1153 | break; |
| 1128 | } | 1154 | } |
| 1129 | } | 1155 | } |
| 1156 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 1130 | bdev = NULL; | 1157 | bdev = NULL; |
| 1131 | bh = NULL; | 1158 | bh = NULL; |
| 1132 | disk_super = NULL; | 1159 | disk_super = NULL; |
| @@ -1181,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1181 | goto error_brelse; | 1208 | goto error_brelse; |
| 1182 | 1209 | ||
| 1183 | device->in_fs_metadata = 0; | 1210 | device->in_fs_metadata = 0; |
| 1211 | |||
| 1212 | /* | ||
| 1213 | * the device list mutex makes sure that we don't change | ||
| 1214 | * the device list while someone else is writing out all | ||
| 1215 | * the device supers. | ||
| 1216 | */ | ||
| 1217 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 1184 | list_del_init(&device->dev_list); | 1218 | list_del_init(&device->dev_list); |
| 1219 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 1220 | |||
| 1185 | device->fs_devices->num_devices--; | 1221 | device->fs_devices->num_devices--; |
| 1186 | 1222 | ||
| 1187 | next_device = list_entry(root->fs_info->fs_devices->devices.next, | 1223 | next_device = list_entry(root->fs_info->fs_devices->devices.next, |
| @@ -1275,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, | |||
| 1275 | seed_devices->opened = 1; | 1311 | seed_devices->opened = 1; |
| 1276 | INIT_LIST_HEAD(&seed_devices->devices); | 1312 | INIT_LIST_HEAD(&seed_devices->devices); |
| 1277 | INIT_LIST_HEAD(&seed_devices->alloc_list); | 1313 | INIT_LIST_HEAD(&seed_devices->alloc_list); |
| 1314 | mutex_init(&seed_devices->device_list_mutex); | ||
| 1278 | list_splice_init(&fs_devices->devices, &seed_devices->devices); | 1315 | list_splice_init(&fs_devices->devices, &seed_devices->devices); |
| 1279 | list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); | 1316 | list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); |
| 1280 | list_for_each_entry(device, &seed_devices->devices, dev_list) { | 1317 | list_for_each_entry(device, &seed_devices->devices, dev_list) { |
| @@ -1400,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
| 1400 | mutex_lock(&root->fs_info->volume_mutex); | 1437 | mutex_lock(&root->fs_info->volume_mutex); |
| 1401 | 1438 | ||
| 1402 | devices = &root->fs_info->fs_devices->devices; | 1439 | devices = &root->fs_info->fs_devices->devices; |
| 1440 | /* | ||
| 1441 | * we have the volume lock, so we don't need the extra | ||
| 1442 | * device list mutex while reading the list here. | ||
| 1443 | */ | ||
| 1403 | list_for_each_entry(device, devices, dev_list) { | 1444 | list_for_each_entry(device, devices, dev_list) { |
| 1404 | if (device->bdev == bdev) { | 1445 | if (device->bdev == bdev) { |
| 1405 | ret = -EEXIST; | 1446 | ret = -EEXIST; |
| @@ -1454,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
| 1454 | } | 1495 | } |
| 1455 | 1496 | ||
| 1456 | device->fs_devices = root->fs_info->fs_devices; | 1497 | device->fs_devices = root->fs_info->fs_devices; |
| 1498 | |||
| 1499 | /* | ||
| 1500 | * we don't want write_supers to jump in here with our device | ||
| 1501 | * half setup | ||
| 1502 | */ | ||
| 1503 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 1457 | list_add(&device->dev_list, &root->fs_info->fs_devices->devices); | 1504 | list_add(&device->dev_list, &root->fs_info->fs_devices->devices); |
| 1458 | list_add(&device->dev_alloc_list, | 1505 | list_add(&device->dev_alloc_list, |
| 1459 | &root->fs_info->fs_devices->alloc_list); | 1506 | &root->fs_info->fs_devices->alloc_list); |
| @@ -1462,6 +1509,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
| 1462 | root->fs_info->fs_devices->rw_devices++; | 1509 | root->fs_info->fs_devices->rw_devices++; |
| 1463 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; | 1510 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; |
| 1464 | 1511 | ||
| 1512 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) | ||
| 1513 | root->fs_info->fs_devices->rotating = 1; | ||
| 1514 | |||
| 1465 | total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); | 1515 | total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); |
| 1466 | btrfs_set_super_total_bytes(&root->fs_info->super_copy, | 1516 | btrfs_set_super_total_bytes(&root->fs_info->super_copy, |
| 1467 | total_bytes + device->total_bytes); | 1517 | total_bytes + device->total_bytes); |
| @@ -1469,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
| 1469 | total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); | 1519 | total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); |
| 1470 | btrfs_set_super_num_devices(&root->fs_info->super_copy, | 1520 | btrfs_set_super_num_devices(&root->fs_info->super_copy, |
| 1471 | total_bytes + 1); | 1521 | total_bytes + 1); |
| 1522 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
| 1472 | 1523 | ||
| 1473 | if (seeding_dev) { | 1524 | if (seeding_dev) { |
| 1474 | ret = init_first_rw_device(trans, root, device); | 1525 | ret = init_first_rw_device(trans, root, device); |
| @@ -1671,8 +1722,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, | |||
| 1671 | int ret; | 1722 | int ret; |
| 1672 | int i; | 1723 | int i; |
| 1673 | 1724 | ||
| 1674 | printk(KERN_INFO "btrfs relocating chunk %llu\n", | ||
| 1675 | (unsigned long long)chunk_offset); | ||
| 1676 | root = root->fs_info->chunk_root; | 1725 | root = root->fs_info->chunk_root; |
| 1677 | extent_root = root->fs_info->extent_root; | 1726 | extent_root = root->fs_info->extent_root; |
| 1678 | em_tree = &root->fs_info->mapping_tree.map_tree; | 1727 | em_tree = &root->fs_info->mapping_tree.map_tree; |
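Note: the run_scheduled_bios() changes are spread across three hunks above; read together, the new scheduling policy is roughly the following (a paraphrase of the diff, not literal code):

	/*
	 * Each pass picks one of the two per-device lists:
	 *   - take pending_sync_bios if it has work and the previous pass did
	 *     not already come from it (force_reg), otherwise take pending_bios;
	 *   - force_reg flips each time, so the lists alternate instead of the
	 *     sync list starving the regular one.
	 *
	 * While submitting, switch lists early if the other one is waiting:
	 *   - after more than 32 bios from the regular list while sync bios are
	 *     queued, or more than 64 from the sync list while regular bios are
	 *     queued, the remainder is requeued and a new list is picked.
	 *
	 * batch_run counts submissions across both lists, so the congestion
	 * back-off (bdi_write_congested() && batch_run > 32) now triggers on the
	 * combined rate rather than the per-list num_run.
	 */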
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5c3ff6d02fd7..5139a833f721 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
| @@ -96,7 +96,12 @@ struct btrfs_fs_devices { | |||
| 96 | u64 rw_devices; | 96 | u64 rw_devices; |
| 97 | u64 total_rw_bytes; | 97 | u64 total_rw_bytes; |
| 98 | struct block_device *latest_bdev; | 98 | struct block_device *latest_bdev; |
| 99 | /* all of the devices in the FS */ | 99 | |
| 100 | /* all of the devices in the FS, protected by a mutex | ||
| 101 | * so we can safely walk it to write out the supers without | ||
| 102 | * worrying about add/remove by the multi-device code | ||
| 103 | */ | ||
| 104 | struct mutex device_list_mutex; | ||
| 100 | struct list_head devices; | 105 | struct list_head devices; |
| 101 | 106 | ||
| 102 | /* devices not currently being allocated */ | 107 | /* devices not currently being allocated */ |
| @@ -107,6 +112,11 @@ struct btrfs_fs_devices { | |||
| 107 | int seeding; | 112 | int seeding; |
| 108 | 113 | ||
| 109 | int opened; | 114 | int opened; |
| 115 | |||
| 116 | /* set when we find or add a device that doesn't have the | ||
| 117 | * nonrot flag set | ||
| 118 | */ | ||
| 119 | int rotating; | ||
| 110 | }; | 120 | }; |
| 111 | 121 | ||
| 112 | struct btrfs_bio_stripe { | 122 | struct btrfs_bio_stripe { |
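Note: volumes.h now spells out that the devices list is only safe to walk under device_list_mutex. The super-writing walker itself is not part of this section, but under the new rule it would look roughly like this (hypothetical sketch; only the field and helper names come from the diff):

	struct btrfs_device *device;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev || !device->in_fs_metadata)
			continue;	/* skip missing/removed devices */
		/* ... write this device's superblock ... */
	}
	mutex_unlock(&fs_devices->device_list_mutex);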
diff --git a/fs/buffer.c b/fs/buffer.c
index 49106127a4aa..a3ef091a45bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
| @@ -1085,12 +1085,12 @@ static struct buffer_head * | |||
| 1085 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 1085 | __getblk_slow(struct block_device *bdev, sector_t block, int size) |
| 1086 | { | 1086 | { |
| 1087 | /* Size must be multiple of hard sectorsize */ | 1087 | /* Size must be multiple of hard sectorsize */ |
| 1088 | if (unlikely(size & (bdev_hardsect_size(bdev)-1) || | 1088 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || |
| 1089 | (size < 512 || size > PAGE_SIZE))) { | 1089 | (size < 512 || size > PAGE_SIZE))) { |
| 1090 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", | 1090 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", |
| 1091 | size); | 1091 | size); |
| 1092 | printk(KERN_ERR "hardsect size: %d\n", | 1092 | printk(KERN_ERR "logical block size: %d\n", |
| 1093 | bdev_hardsect_size(bdev)); | 1093 | bdev_logical_block_size(bdev)); |
| 1094 | 1094 | ||
| 1095 | dump_stack(); | 1095 | dump_stack(); |
| 1096 | return NULL; | 1096 | return NULL; |
| @@ -2935,6 +2935,8 @@ int submit_bh(int rw, struct buffer_head * bh) | |||
| 2935 | BUG_ON(!buffer_locked(bh)); | 2935 | BUG_ON(!buffer_locked(bh)); |
| 2936 | BUG_ON(!buffer_mapped(bh)); | 2936 | BUG_ON(!buffer_mapped(bh)); |
| 2937 | BUG_ON(!bh->b_end_io); | 2937 | BUG_ON(!bh->b_end_io); |
| 2938 | BUG_ON(buffer_delay(bh)); | ||
| 2939 | BUG_ON(buffer_unwritten(bh)); | ||
| 2938 | 2940 | ||
| 2939 | /* | 2941 | /* |
| 2940 | * Mask in barrier bit for a write (could be either a WRITE or a | 2942 | * Mask in barrier bit for a write (could be either a WRITE or a |
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f20c4069c220..b48689839428 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
| @@ -1,3 +1,12 @@ | |||
| 1 | Version 1.59 | ||
| 2 | ------------ | ||
| 3 | Client uses server inode numbers (which are persistent) rather than | ||
| 4 | client generated ones by default (mount option "serverino" turned | ||
| 5 | on by default if server supports it). Add forceuid and forcegid | ||
| 6 | mount options (so that when negotiating unix extensions specifying | ||
| 7 | which uid mounted does not immediately force the server's reported | ||
| 8 | uids to be overridden). | ||
| 9 | |||
| 1 | Version 1.58 | 10 | Version 1.58 |
| 2 | ------------ | 11 | ------------ |
| 3 | Guard against buffer overruns in various UCS-2 to UTF-8 string conversions | 12 | Guard against buffer overruns in various UCS-2 to UTF-8 string conversions |
| @@ -10,6 +19,8 @@ we converted from). Fix endianness of the vcnum field used during | |||
| 10 | session setup to distinguish multiple mounts to same server from different | 19 | session setup to distinguish multiple mounts to same server from different |
| 11 | userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental | 20 | userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental |
| 12 | flag to be set to 2, and mount must enable krb5 to turn on extended security). | 21 | flag to be set to 2, and mount must enable krb5 to turn on extended security). |
| 22 | Performance of file create to Samba improved (posix create on lookup | ||
| 23 | removes 1 of 2 network requests sent on file create) | ||
| 13 | 24 | ||
| 14 | Version 1.57 | 25 | Version 1.57 |
| 15 | ------------ | 26 | ------------ |
diff --git a/fs/cifs/README b/fs/cifs/README
index db208ddb9899..ad92921dbde4 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
| @@ -262,7 +262,8 @@ A partial list of the supported mount options follows: | |||
| 262 | mount. | 262 | mount. |
| 263 | domain Set the SMB/CIFS workgroup name prepended to the | 263 | domain Set the SMB/CIFS workgroup name prepended to the |
| 264 | username during CIFS session establishment | 264 | username during CIFS session establishment |
| 265 | uid Set the default uid for inodes. For mounts to servers | 265 | forceuid Set the default uid for inodes based on the uid |
| 266 | passed in. For mounts to servers | ||
| 266 | which do support the CIFS Unix extensions, such as a | 267 | which do support the CIFS Unix extensions, such as a |
| 267 | properly configured Samba server, the server provides | 268 | properly configured Samba server, the server provides |
| 268 | the uid, gid and mode so this parameter should not be | 269 | the uid, gid and mode so this parameter should not be |
| @@ -292,6 +293,12 @@ A partial list of the supported mount options follows: | |||
| 292 | the client. Note that the mount.cifs helper must be | 293 | the client. Note that the mount.cifs helper must be |
| 293 | at version 1.10 or higher to support specifying the uid | 294 | at version 1.10 or higher to support specifying the uid |
| 294 | (or gid) in non-numeric form. | 295 | (or gid) in non-numeric form. |
| 296 | forcegid (similar to above but for the groupid instead of uid) | ||
| 297 | uid Set the default uid for inodes, and indicate to the | ||
| 298 | cifs kernel driver which local user mounted . If the server | ||
| 299 | supports the unix extensions the default uid is | ||
| 300 | not used to fill in the owner fields of inodes (files) | ||
| 301 | unless the "forceuid" parameter is specified. | ||
| 295 | gid Set the default gid for inodes (similar to above). | 302 | gid Set the default gid for inodes (similar to above). |
| 296 | file_mode If CIFS Unix extensions are not supported by the server | 303 | file_mode If CIFS Unix extensions are not supported by the server |
| 297 | this overrides the default mode for file inodes. | 304 | this overrides the default mode for file inodes. |
| @@ -388,8 +395,13 @@ A partial list of the supported mount options follows: | |||
| 388 | or the CIFS Unix Extensions equivalent and for those | 395 | or the CIFS Unix Extensions equivalent and for those |
| 389 | this mount option will have no effect. Exporting cifs mounts | 396 | this mount option will have no effect. Exporting cifs mounts |
| 390 | under nfsd requires this mount option on the cifs mount. | 397 | under nfsd requires this mount option on the cifs mount. |
| 398 | This is now the default if server supports the | ||
| 399 | required network operation. | ||
| 391 | noserverino Client generates inode numbers (rather than using the actual one | 400 | noserverino Client generates inode numbers (rather than using the actual one |
| 392 | from the server) by default. | 401 | from the server). These inode numbers will vary after |
| 402 | unmount or reboot which can confuse some applications, | ||
| 403 | but not all server filesystems support unique inode | ||
| 404 | numbers. | ||
| 393 | setuids If the CIFS Unix extensions are negotiated with the server | 405 | setuids If the CIFS Unix extensions are negotiated with the server |
| 394 | the client will attempt to set the effective uid and gid of | 406 | the client will attempt to set the effective uid and gid of |
| 395 | the local process on newly created files, directories, and | 407 | the local process on newly created files, directories, and |
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 67bf93a40d2e..4a4581cb2b5e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/string.h> | 23 | #include <linux/string.h> |
| 24 | #include <keys/user-type.h> | 24 | #include <keys/user-type.h> |
| 25 | #include <linux/key-type.h> | 25 | #include <linux/key-type.h> |
| 26 | #include <linux/inet.h> | ||
| 26 | #include "cifsglob.h" | 27 | #include "cifsglob.h" |
| 27 | #include "cifs_spnego.h" | 28 | #include "cifs_spnego.h" |
| 28 | #include "cifs_debug.h" | 29 | #include "cifs_debug.h" |
| @@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = { | |||
| 73 | * strlen(";sec=ntlmsspi") */ | 74 | * strlen(";sec=ntlmsspi") */ |
| 74 | #define MAX_MECH_STR_LEN 13 | 75 | #define MAX_MECH_STR_LEN 13 |
| 75 | 76 | ||
| 76 | /* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */ | ||
| 77 | #define MAX_IPV6_ADDR_LEN 43 | ||
| 78 | |||
| 79 | /* strlen of "host=" */ | 77 | /* strlen of "host=" */ |
| 80 | #define HOST_KEY_LEN 5 | 78 | #define HOST_KEY_LEN 5 |
| 81 | 79 | ||
| @@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) | |||
| 102 | host=hostname sec=mechanism uid=0xFF user=username */ | 100 | host=hostname sec=mechanism uid=0xFF user=username */ |
| 103 | desc_len = MAX_VER_STR_LEN + | 101 | desc_len = MAX_VER_STR_LEN + |
| 104 | HOST_KEY_LEN + strlen(hostname) + | 102 | HOST_KEY_LEN + strlen(hostname) + |
| 105 | IP_KEY_LEN + MAX_IPV6_ADDR_LEN + | 103 | IP_KEY_LEN + INET6_ADDRSTRLEN + |
| 106 | MAX_MECH_STR_LEN + | 104 | MAX_MECH_STR_LEN + |
| 107 | UID_KEY_LEN + (sizeof(uid_t) * 2) + | 105 | UID_KEY_LEN + (sizeof(uid_t) * 2) + |
| 108 | USER_KEY_LEN + strlen(sesInfo->userName) + 1; | 106 | USER_KEY_LEN + strlen(sesInfo->userName) + 1; |
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 57ecdc83c26f..1403b5d86a73 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
| @@ -552,130 +552,138 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, | |||
| 552 | return rc; | 552 | return rc; |
| 553 | } | 553 | } |
| 554 | 554 | ||
| 555 | 555 | static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, | |
| 556 | /* Retrieve an ACL from the server */ | 556 | __u16 fid, u32 *pacllen) |
| 557 | static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, | ||
| 558 | const char *path, const __u16 *pfid) | ||
| 559 | { | 557 | { |
| 560 | struct cifsFileInfo *open_file = NULL; | ||
| 561 | bool unlock_file = false; | ||
| 562 | int xid; | ||
| 563 | int rc = -EIO; | ||
| 564 | __u16 fid; | ||
| 565 | struct super_block *sb; | ||
| 566 | struct cifs_sb_info *cifs_sb; | ||
| 567 | struct cifs_ntsd *pntsd = NULL; | 558 | struct cifs_ntsd *pntsd = NULL; |
| 559 | int xid, rc; | ||
| 560 | |||
| 561 | xid = GetXid(); | ||
| 562 | rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); | ||
| 563 | FreeXid(xid); | ||
| 568 | 564 | ||
| 569 | cFYI(1, ("get mode from ACL for %s", path)); | ||
| 570 | 565 | ||
| 571 | if (inode == NULL) | 566 | cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); |
| 572 | return NULL; | 567 | return pntsd; |
| 568 | } | ||
| 569 | |||
| 570 | static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, | ||
| 571 | const char *path, u32 *pacllen) | ||
| 572 | { | ||
| 573 | struct cifs_ntsd *pntsd = NULL; | ||
| 574 | int oplock = 0; | ||
| 575 | int xid, rc; | ||
| 576 | __u16 fid; | ||
| 573 | 577 | ||
| 574 | xid = GetXid(); | 578 | xid = GetXid(); |
| 575 | if (pfid == NULL) | ||
| 576 | open_file = find_readable_file(CIFS_I(inode)); | ||
| 577 | else | ||
| 578 | fid = *pfid; | ||
| 579 | 579 | ||
| 580 | sb = inode->i_sb; | 580 | rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, |
| 581 | if (sb == NULL) { | 581 | &fid, &oplock, NULL, cifs_sb->local_nls, |
| 582 | FreeXid(xid); | 582 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
| 583 | return NULL; | 583 | if (rc) { |
| 584 | } | 584 | cERROR(1, ("Unable to open file to get ACL")); |
| 585 | cifs_sb = CIFS_SB(sb); | 585 | goto out; |
| 586 | |||
| 587 | if (open_file) { | ||
| 588 | unlock_file = true; | ||
| 589 | fid = open_file->netfid; | ||
| 590 | } else if (pfid == NULL) { | ||
| 591 | int oplock = 0; | ||
| 592 | /* open file */ | ||
| 593 | rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, | ||
| 594 | READ_CONTROL, 0, &fid, &oplock, NULL, | ||
| 595 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & | ||
| 596 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 597 | if (rc != 0) { | ||
| 598 | cERROR(1, ("Unable to open file to get ACL")); | ||
| 599 | FreeXid(xid); | ||
| 600 | return NULL; | ||
| 601 | } | ||
| 602 | } | 586 | } |
| 603 | 587 | ||
| 604 | rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); | 588 | rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); |
| 605 | cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); | 589 | cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); |
| 606 | if (unlock_file == true) /* find_readable_file increments ref count */ | ||
| 607 | atomic_dec(&open_file->wrtPending); | ||
| 608 | else if (pfid == NULL) /* if opened above we have to close the handle */ | ||
| 609 | CIFSSMBClose(xid, cifs_sb->tcon, fid); | ||
| 610 | /* else handle was passed in by caller */ | ||
| 611 | 590 | ||
| 591 | CIFSSMBClose(xid, cifs_sb->tcon, fid); | ||
| 592 | out: | ||
| 612 | FreeXid(xid); | 593 | FreeXid(xid); |
| 613 | return pntsd; | 594 | return pntsd; |
| 614 | } | 595 | } |
| 615 | 596 | ||
| 616 | /* Set an ACL on the server */ | 597 | /* Retrieve an ACL from the server */ |
| 617 | static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, | 598 | static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, |
| 618 | struct inode *inode, const char *path) | 599 | struct inode *inode, const char *path, |
| 600 | u32 *pacllen) | ||
| 619 | { | 601 | { |
| 620 | struct cifsFileInfo *open_file; | 602 | struct cifs_ntsd *pntsd = NULL; |
| 621 | bool unlock_file = false; | 603 | struct cifsFileInfo *open_file = NULL; |
| 622 | int xid; | ||
| 623 | int rc = -EIO; | ||
| 624 | __u16 fid; | ||
| 625 | struct super_block *sb; | ||
| 626 | struct cifs_sb_info *cifs_sb; | ||
| 627 | 604 | ||
| 628 | cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); | 605 | if (inode) |
| 606 | open_file = find_readable_file(CIFS_I(inode)); | ||
| 607 | if (!open_file) | ||
| 608 | return get_cifs_acl_by_path(cifs_sb, path, pacllen); | ||
| 629 | 609 | ||
| 630 | if (!inode) | 610 | pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); |
| 631 | return rc; | 611 | atomic_dec(&open_file->wrtPending); |
| 612 | return pntsd; | ||
| 613 | } | ||
| 632 | 614 | ||
| 633 | sb = inode->i_sb; | 615 | static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, |
| 634 | if (sb == NULL) | 616 | struct cifs_ntsd *pnntsd, u32 acllen) |
| 635 | return rc; | 617 | { |
| 618 | int xid, rc; | ||
| 636 | 619 | ||
| 637 | cifs_sb = CIFS_SB(sb); | ||
| 638 | xid = GetXid(); | 620 | xid = GetXid(); |
| 621 | rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); | ||
| 622 | FreeXid(xid); | ||
| 639 | 623 | ||
| 640 | open_file = find_readable_file(CIFS_I(inode)); | 624 | cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); |
| 641 | if (open_file) { | 625 | return rc; |
| 642 | unlock_file = true; | 626 | } |
| 643 | fid = open_file->netfid; | 627 | |
| 644 | } else { | 628 | static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, |
| 645 | int oplock = 0; | 629 | struct cifs_ntsd *pnntsd, u32 acllen) |
| 646 | /* open file */ | 630 | { |
| 647 | rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, | 631 | int oplock = 0; |
| 648 | WRITE_DAC, 0, &fid, &oplock, NULL, | 632 | int xid, rc; |
| 649 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & | 633 | __u16 fid; |
| 650 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 634 | |
| 651 | if (rc != 0) { | 635 | xid = GetXid(); |
| 652 | cERROR(1, ("Unable to open file to set ACL")); | 636 | |
| 653 | FreeXid(xid); | 637 | rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, |
| 654 | return rc; | 638 | &fid, &oplock, NULL, cifs_sb->local_nls, |
| 655 | } | 639 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
| 640 | if (rc) { | ||
| 641 | cERROR(1, ("Unable to open file to set ACL")); | ||
| 642 | goto out; | ||
| 656 | } | 643 | } |
| 657 | 644 | ||
| 658 | rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); | 645 | rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); |
| 659 | cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); | 646 | cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); |
| 660 | if (unlock_file) | ||
| 661 | atomic_dec(&open_file->wrtPending); | ||
| 662 | else | ||
| 663 | CIFSSMBClose(xid, cifs_sb->tcon, fid); | ||
| 664 | 647 | ||
| 648 | CIFSSMBClose(xid, cifs_sb->tcon, fid); | ||
| 649 | out: | ||
| 665 | FreeXid(xid); | 650 | FreeXid(xid); |
| 651 | return rc; | ||
| 652 | } | ||
| 666 | 653 | ||
| 654 | /* Set an ACL on the server */ | ||
| 655 | static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, | ||
| 656 | struct inode *inode, const char *path) | ||
| 657 | { | ||
| 658 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
| 659 | struct cifsFileInfo *open_file; | ||
| 660 | int rc; | ||
| 661 | |||
| 662 | cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); | ||
| 663 | |||
| 664 | open_file = find_readable_file(CIFS_I(inode)); | ||
| 665 | if (!open_file) | ||
| 666 | return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); | ||
| 667 | |||
| 668 | rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); | ||
| 669 | atomic_dec(&open_file->wrtPending); | ||
| 667 | return rc; | 670 | return rc; |
| 668 | } | 671 | } |
| 669 | 672 | ||
| 670 | /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ | 673 | /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ |
| 671 | void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid) | 674 | void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, |
| 675 | const char *path, const __u16 *pfid) | ||
| 672 | { | 676 | { |
| 673 | struct cifs_ntsd *pntsd = NULL; | 677 | struct cifs_ntsd *pntsd = NULL; |
| 674 | u32 acllen = 0; | 678 | u32 acllen = 0; |
| 675 | int rc = 0; | 679 | int rc = 0; |
| 676 | 680 | ||
| 677 | cFYI(DBG2, ("converting ACL to mode for %s", path)); | 681 | cFYI(DBG2, ("converting ACL to mode for %s", path)); |
| 678 | pntsd = get_cifs_acl(&acllen, inode, path, pfid); | 682 | |
| 683 | if (pfid) | ||
| 684 | pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); | ||
| 685 | else | ||
| 686 | pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); | ||
| 679 | 687 | ||
| 680 | /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ | 688 | /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ |
| 681 | if (pntsd) | 689 | if (pntsd) |
| @@ -698,7 +706,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) | |||
| 698 | cFYI(DBG2, ("set ACL from mode for %s", path)); | 706 | cFYI(DBG2, ("set ACL from mode for %s", path)); |
| 699 | 707 | ||
| 700 | /* Get the security descriptor */ | 708 | /* Get the security descriptor */ |
| 701 | pntsd = get_cifs_acl(&secdesclen, inode, path, NULL); | 709 | pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); |
| 702 | 710 | ||
| 703 | /* Add three ACEs for owner, group, everyone getting rid of | 711 | /* Add three ACEs for owner, group, everyone getting rid of |
| 704 | other ACEs as chmod disables ACEs and set the security descriptor */ | 712 | other ACEs as chmod disables ACEs and set the security descriptor */ |
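Note: the cifsacl.c rework above splits the single get/set paths into _by_fid and _by_path helpers. Condensed from the hunks, the resulting call structure is (comment sketch, no new code implied):

	/*
	 *   acl_to_uid_mode(cifs_sb, inode, path, pfid)
	 *       pfid set   -> get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen)
	 *       pfid NULL  -> get_cifs_acl(cifs_sb, inode, path, &acllen)
	 *                         open handle found -> get_cifs_acl_by_fid()
	 *                         otherwise         -> get_cifs_acl_by_path()
	 *
	 *   set_cifs_acl(pnntsd, acllen, inode, path)
	 *       open handle found -> set_cifs_acl_by_fid()
	 *       otherwise         -> set_cifs_acl_by_path()
	 *
	 * The by-path variants open the file for READ_CONTROL or WRITE_DAC,
	 * issue the ACL call, and close the handle themselves.
	 */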
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e6d35804d73..0a10a59b6392 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
| @@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data, | |||
| 146 | #endif | 146 | #endif |
| 147 | sb->s_blocksize = CIFS_MAX_MSGSIZE; | 147 | sb->s_blocksize = CIFS_MAX_MSGSIZE; |
| 148 | sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ | 148 | sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ |
| 149 | inode = cifs_iget(sb, ROOT_I); | 149 | inode = cifs_root_iget(sb, ROOT_I); |
| 150 | 150 | ||
| 151 | if (IS_ERR(inode)) { | 151 | if (IS_ERR(inode)) { |
| 152 | rc = PTR_ERR(inode); | 152 | rc = PTR_ERR(inode); |
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 051b71cfdea9..9570a0e8023f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
| @@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *); | |||
| 36 | 36 | ||
| 37 | /* Functions related to inodes */ | 37 | /* Functions related to inodes */ |
| 38 | extern const struct inode_operations cifs_dir_inode_ops; | 38 | extern const struct inode_operations cifs_dir_inode_ops; |
| 39 | extern struct inode *cifs_iget(struct super_block *, unsigned long); | 39 | extern struct inode *cifs_root_iget(struct super_block *, unsigned long); |
| 40 | extern int cifs_create(struct inode *, struct dentry *, int, | 40 | extern int cifs_create(struct inode *, struct dentry *, int, |
| 41 | struct nameidata *); | 41 | struct nameidata *); |
| 42 | extern struct dentry *cifs_lookup(struct inode *, struct dentry *, | 42 | extern struct dentry *cifs_lookup(struct inode *, struct dentry *, |
| @@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); | |||
| 100 | extern const struct export_operations cifs_export_ops; | 100 | extern const struct export_operations cifs_export_ops; |
| 101 | #endif /* EXPERIMENTAL */ | 101 | #endif /* EXPERIMENTAL */ |
| 102 | 102 | ||
| 103 | #define CIFS_VERSION "1.58" | 103 | #define CIFS_VERSION "1.59" |
| 104 | #endif /* _CIFSFS_H */ | 104 | #endif /* _CIFSFS_H */ |
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fae083930eee..f9452329bcce 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
| @@ -90,10 +90,10 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16, | |||
| 90 | struct cifsTconInfo *); | 90 | struct cifsTconInfo *); |
| 91 | extern void DeleteOplockQEntry(struct oplock_q_entry *); | 91 | extern void DeleteOplockQEntry(struct oplock_q_entry *); |
| 92 | extern void DeleteTconOplockQEntries(struct cifsTconInfo *); | 92 | extern void DeleteTconOplockQEntries(struct cifsTconInfo *); |
| 93 | extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601); | 93 | extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); |
| 94 | extern u64 cifs_UnixTimeToNT(struct timespec); | 94 | extern u64 cifs_UnixTimeToNT(struct timespec); |
| 95 | extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); | 95 | extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, |
| 96 | extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); | 96 | int offset); |
| 97 | 97 | ||
| 98 | extern int cifs_posix_open(char *full_path, struct inode **pinode, | 98 | extern int cifs_posix_open(char *full_path, struct inode **pinode, |
| 99 | struct super_block *sb, int mode, int oflags, | 99 | struct super_block *sb, int mode, int oflags, |
| @@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode, | |||
| 108 | extern int cifs_get_inode_info_unix(struct inode **pinode, | 108 | extern int cifs_get_inode_info_unix(struct inode **pinode, |
| 109 | const unsigned char *search_path, | 109 | const unsigned char *search_path, |
| 110 | struct super_block *sb, int xid); | 110 | struct super_block *sb, int xid); |
| 111 | extern void acl_to_uid_mode(struct inode *inode, const char *path, | 111 | extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, |
| 112 | const __u16 *pfid); | 112 | const char *path, const __u16 *pfid); |
| 113 | extern int mode_to_acl(struct inode *inode, const char *path, __u64); | 113 | extern int mode_to_acl(struct inode *inode, const char *path, __u64); |
| 114 | 114 | ||
| 115 | extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, | 115 | extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, |
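Note: the prototype change above moves the endian conversion into cifs_NTtimeToUnix() itself, which now takes the wire-format __le64 (the netmisc.c hunk at the end of this section shows the le64_to_cpu() call). For reference, the conversion it performs is sketched below; the helper name and the plain 64-bit division are illustrative (a 32-bit kernel build would need do_div()):

	/* NT time is 100 ns ticks since 1601-01-01; NTFS_TIME_OFFSET is the
	 * 1601 -> 1970 epoch difference ((369*365 + 89) days) in those ticks. */
	#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)

	static struct timespec nt_to_unix_sketch(__le64 ntutc)
	{
		u64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
		struct timespec ts;

		ts.tv_nsec = (t % 10000000) * 100;	/* leftover ticks -> ns */
		ts.tv_sec  = t / 10000000;		/* whole seconds since 1970 */
		return ts;
	}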
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d06260251c30..b84c61d5bca4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
| @@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) | |||
| 524 | int val, seconds, remain, result; | 524 | int val, seconds, remain, result; |
| 525 | struct timespec ts, utc; | 525 | struct timespec ts, utc; |
| 526 | utc = CURRENT_TIME; | 526 | utc = CURRENT_TIME; |
| 527 | ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date), | 527 | ts = cnvrtDosUnixTm(rsp->SrvTime.Date, |
| 528 | le16_to_cpu(rsp->SrvTime.Time)); | 528 | rsp->SrvTime.Time, 0); |
| 529 | cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", | 529 | cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", |
| 530 | (int)ts.tv_sec, (int)utc.tv_sec, | 530 | (int)ts.tv_sec, (int)utc.tv_sec, |
| 531 | (int)(utc.tv_sec - ts.tv_sec))); | 531 | (int)(utc.tv_sec - ts.tv_sec))); |
| @@ -2427,8 +2427,7 @@ querySymLinkRetry: | |||
| 2427 | params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; | 2427 | params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; |
| 2428 | pSMB->TotalDataCount = 0; | 2428 | pSMB->TotalDataCount = 0; |
| 2429 | pSMB->MaxParameterCount = cpu_to_le16(2); | 2429 | pSMB->MaxParameterCount = cpu_to_le16(2); |
| 2430 | /* BB find exact max data count below from sess structure BB */ | 2430 | pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); |
| 2431 | pSMB->MaxDataCount = cpu_to_le16(4000); | ||
| 2432 | pSMB->MaxSetupCount = 0; | 2431 | pSMB->MaxSetupCount = 0; |
| 2433 | pSMB->Reserved = 0; | 2432 | pSMB->Reserved = 0; |
| 2434 | pSMB->Flags = 0; | 2433 | pSMB->Flags = 0; |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4aa81a507b74..97f4311b9a8e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/namei.h> | 35 | #include <linux/namei.h> |
| 36 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
| 37 | #include <asm/processor.h> | 37 | #include <asm/processor.h> |
| 38 | #include <linux/inet.h> | ||
| 38 | #include <net/ipv6.h> | 39 | #include <net/ipv6.h> |
| 39 | #include "cifspdu.h" | 40 | #include "cifspdu.h" |
| 40 | #include "cifsglob.h" | 41 | #include "cifsglob.h" |
| @@ -61,7 +62,6 @@ struct smb_vol { | |||
| 61 | char *domainname; | 62 | char *domainname; |
| 62 | char *UNC; | 63 | char *UNC; |
| 63 | char *UNCip; | 64 | char *UNCip; |
| 64 | char *in6_addr; /* ipv6 address as human readable form of in6_addr */ | ||
| 65 | char *iocharset; /* local code page for mapping to and from Unicode */ | 65 | char *iocharset; /* local code page for mapping to and from Unicode */ |
| 66 | char source_rfc1001_name[16]; /* netbios name of client */ | 66 | char source_rfc1001_name[16]; /* netbios name of client */ |
| 67 | char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ | 67 | char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ |
| @@ -827,14 +827,16 @@ cifs_parse_mount_options(char *options, const char *devname, | |||
| 827 | vol->target_rfc1001_name[0] = 0; | 827 | vol->target_rfc1001_name[0] = 0; |
| 828 | vol->linux_uid = current_uid(); /* use current_euid() instead? */ | 828 | vol->linux_uid = current_uid(); /* use current_euid() instead? */ |
| 829 | vol->linux_gid = current_gid(); | 829 | vol->linux_gid = current_gid(); |
| 830 | vol->dir_mode = S_IRWXUGO; | 830 | |
| 831 | /* 2767 perms indicate mandatory locking support */ | 831 | /* default to only allowing write access to owner of the mount */ |
| 832 | vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP); | 832 | vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; |
| 833 | 833 | ||
| 834 | /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ | 834 | /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ |
| 835 | vol->rw = true; | 835 | vol->rw = true; |
| 836 | /* default is always to request posix paths. */ | 836 | /* default is always to request posix paths. */ |
| 837 | vol->posix_paths = 1; | 837 | vol->posix_paths = 1; |
| 838 | /* default to using server inode numbers where available */ | ||
| 839 | vol->server_ino = 1; | ||
| 838 | 840 | ||
| 839 | if (!options) | 841 | if (!options) |
| 840 | return 1; | 842 | return 1; |
| @@ -955,10 +957,12 @@ cifs_parse_mount_options(char *options, const char *devname, | |||
| 955 | } | 957 | } |
| 956 | strcpy(vol->password, value); | 958 | strcpy(vol->password, value); |
| 957 | } | 959 | } |
| 958 | } else if (strnicmp(data, "ip", 2) == 0) { | 960 | } else if (!strnicmp(data, "ip", 2) || |
| 961 | !strnicmp(data, "addr", 4)) { | ||
| 959 | if (!value || !*value) { | 962 | if (!value || !*value) { |
| 960 | vol->UNCip = NULL; | 963 | vol->UNCip = NULL; |
| 961 | } else if (strnlen(value, 35) < 35) { | 964 | } else if (strnlen(value, INET6_ADDRSTRLEN) < |
| 965 | INET6_ADDRSTRLEN) { | ||
| 962 | vol->UNCip = value; | 966 | vol->UNCip = value; |
| 963 | } else { | 967 | } else { |
| 964 | printk(KERN_WARNING "CIFS: ip address " | 968 | printk(KERN_WARNING "CIFS: ip address " |
| @@ -1092,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname, | |||
| 1092 | return 1; | 1096 | return 1; |
| 1093 | } | 1097 | } |
| 1094 | } else if (strnicmp(data, "uid", 3) == 0) { | 1098 | } else if (strnicmp(data, "uid", 3) == 0) { |
| 1095 | if (value && *value) { | 1099 | if (value && *value) |
| 1096 | vol->linux_uid = | 1100 | vol->linux_uid = |
| 1097 | simple_strtoul(value, &value, 0); | 1101 | simple_strtoul(value, &value, 0); |
| 1102 | } else if (strnicmp(data, "forceuid", 8) == 0) { | ||
| 1098 | vol->override_uid = 1; | 1103 | vol->override_uid = 1; |
| 1099 | } | ||
| 1100 | } else if (strnicmp(data, "gid", 3) == 0) { | 1104 | } else if (strnicmp(data, "gid", 3) == 0) { |
| 1101 | if (value && *value) { | 1105 | if (value && *value) |
| 1102 | vol->linux_gid = | 1106 | vol->linux_gid = |
| 1103 | simple_strtoul(value, &value, 0); | 1107 | simple_strtoul(value, &value, 0); |
| 1108 | } else if (strnicmp(data, "forcegid", 8) == 0) { | ||
| 1104 | vol->override_gid = 1; | 1109 | vol->override_gid = 1; |
| 1105 | } | ||
| 1106 | } else if (strnicmp(data, "file_mode", 4) == 0) { | 1110 | } else if (strnicmp(data, "file_mode", 4) == 0) { |
| 1107 | if (value && *value) { | 1111 | if (value && *value) { |
| 1108 | vol->file_mode = | 1112 | vol->file_mode = |
| @@ -1315,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname, | |||
| 1315 | vol->direct_io = 1; | 1319 | vol->direct_io = 1; |
| 1316 | } else if (strnicmp(data, "forcedirectio", 13) == 0) { | 1320 | } else if (strnicmp(data, "forcedirectio", 13) == 0) { |
| 1317 | vol->direct_io = 1; | 1321 | vol->direct_io = 1; |
| 1318 | } else if (strnicmp(data, "in6_addr", 8) == 0) { | ||
| 1319 | if (!value || !*value) { | ||
| 1320 | vol->in6_addr = NULL; | ||
| 1321 | } else if (strnlen(value, 49) == 48) { | ||
| 1322 | vol->in6_addr = value; | ||
| 1323 | } else { | ||
| 1324 | printk(KERN_WARNING "CIFS: ip v6 address not " | ||
| 1325 | "48 characters long\n"); | ||
| 1326 | return 1; | ||
| 1327 | } | ||
| 1328 | } else if (strnicmp(data, "noac", 4) == 0) { | 1322 | } else if (strnicmp(data, "noac", 4) == 0) { |
| 1329 | printk(KERN_WARNING "CIFS: Mount option noac not " | 1323 | printk(KERN_WARNING "CIFS: Mount option noac not " |
| 1330 | "supported. Instead set " | 1324 | "supported. Instead set " |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 302ea15f02e6..06866841b97f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
| @@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, | |||
| 241 | /* BB need same check in cifs_create too? */ | 241 | /* BB need same check in cifs_create too? */ |
| 242 | /* if not oplocked, invalidate inode pages if mtime or file | 242 | /* if not oplocked, invalidate inode pages if mtime or file |
| 243 | size changed */ | 243 | size changed */ |
| 244 | temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); | 244 | temp = cifs_NTtimeToUnix(buf->LastWriteTime); |
| 245 | if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && | 245 | if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && |
| 246 | (file->f_path.dentry->d_inode->i_size == | 246 | (file->f_path.dentry->d_inode->i_size == |
| 247 | (loff_t)le64_to_cpu(buf->EndOfFile))) { | 247 | (loff_t)le64_to_cpu(buf->EndOfFile))) { |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c869a6dcba1..fad882b075ba 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
| @@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode, | |||
| 85 | __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes); | 85 | __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes); |
| 86 | __u64 end_of_file = le64_to_cpu(info->EndOfFile); | 86 | __u64 end_of_file = le64_to_cpu(info->EndOfFile); |
| 87 | 87 | ||
| 88 | inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime)); | 88 | inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime); |
| 89 | inode->i_mtime = | 89 | inode->i_mtime = |
| 90 | cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime)); | 90 | cifs_NTtimeToUnix(info->LastModificationTime); |
| 91 | inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange)); | 91 | inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange); |
| 92 | inode->i_mode = le64_to_cpu(info->Permissions); | 92 | inode->i_mode = le64_to_cpu(info->Permissions); |
| 93 | 93 | ||
| 94 | /* | 94 | /* |
| @@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode, | |||
| 554 | 554 | ||
| 555 | /* Linux can not store file creation time so ignore it */ | 555 | /* Linux can not store file creation time so ignore it */ |
| 556 | if (pfindData->LastAccessTime) | 556 | if (pfindData->LastAccessTime) |
| 557 | inode->i_atime = cifs_NTtimeToUnix | 557 | inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime); |
| 558 | (le64_to_cpu(pfindData->LastAccessTime)); | ||
| 559 | else /* do not need to use current_fs_time - time not stored */ | 558 | else /* do not need to use current_fs_time - time not stored */ |
| 560 | inode->i_atime = CURRENT_TIME; | 559 | inode->i_atime = CURRENT_TIME; |
| 561 | inode->i_mtime = | 560 | inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime); |
| 562 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); | 561 | inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime); |
| 563 | inode->i_ctime = | ||
| 564 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); | ||
| 565 | cFYI(DBG2, ("Attributes came in as 0x%x", attr)); | 562 | cFYI(DBG2, ("Attributes came in as 0x%x", attr)); |
| 566 | if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { | 563 | if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { |
| 567 | inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; | 564 | inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; |
| @@ -629,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode, | |||
| 629 | /* fill in 0777 bits from ACL */ | 626 | /* fill in 0777 bits from ACL */ |
| 630 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { | 627 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { |
| 631 | cFYI(1, ("Getting mode bits from ACL")); | 628 | cFYI(1, ("Getting mode bits from ACL")); |
| 632 | acl_to_uid_mode(inode, full_path, pfid); | 629 | acl_to_uid_mode(cifs_sb, inode, full_path, pfid); |
| 633 | } | 630 | } |
| 634 | #endif | 631 | #endif |
| 635 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { | 632 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { |
| @@ -699,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) | |||
| 699 | } | 696 | } |
| 700 | 697 | ||
| 701 | /* gets root inode */ | 698 | /* gets root inode */ |
| 702 | struct inode *cifs_iget(struct super_block *sb, unsigned long ino) | 699 | struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) |
| 703 | { | 700 | { |
| 704 | int xid; | 701 | int xid; |
| 705 | struct cifs_sb_info *cifs_sb; | 702 | struct cifs_sb_info *cifs_sb; |
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index e2fe998989a3..32d6baa0a54f 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c | |||
| @@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr) | |||
| 853 | 853 | ||
| 854 | #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) | 854 | #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) |
| 855 | 855 | ||
| 856 | /* | 856 | /* |
| 857 | * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) | 857 | * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) |
| 858 | * into Unix UTC (based 1970-01-01, in seconds). | 858 | * into Unix UTC (based 1970-01-01, in seconds). |
| 859 | */ | 859 | */ |
| 860 | struct timespec | 860 | struct timespec |
| 861 | cifs_NTtimeToUnix(u64 ntutc) | 861 | cifs_NTtimeToUnix(__le64 ntutc) |
| 862 | { | 862 | { |
| 863 | struct timespec ts; | 863 | struct timespec ts; |
| 864 | /* BB what about the timezone? BB */ | 864 | /* BB what about the timezone? BB */ |
| @@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc) | |||
| 866 | /* Subtract the NTFS time offset, then convert to 1s intervals. */ | 866 | /* Subtract the NTFS time offset, then convert to 1s intervals. */ |
| 867 | u64 t; | 867 | u64 t; |
| 868 | 868 | ||
| 869 | t = ntutc - NTFS_TIME_OFFSET; | 869 | t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; |
| 870 | ts.tv_nsec = do_div(t, 10000000) * 100; | 870 | ts.tv_nsec = do_div(t, 10000000) * 100; |
| 871 | ts.tv_sec = t; | 871 | ts.tv_sec = t; |
| 872 | return ts; | 872 | return ts; |
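The helper above is just fixed-point arithmetic: subtract the 1601-to-1970 offset, then split the remaining 100-nanosecond units into whole seconds and a nanosecond remainder. A minimal user-space sketch of the same conversion (standard C only; the kernel's do_div and timespec types are not assumed) is:

#include <stdint.h>
#include <stdio.h>

/* 100-ns intervals between 1601-01-01 and 1970-01-01, as in netmisc.c */
#define NTFS_TIME_OFFSET ((uint64_t)(369 * 365 + 89) * 24 * 3600 * 10000000ULL)

struct unix_ts { int64_t tv_sec; long tv_nsec; };

static struct unix_ts nt_to_unix(uint64_t ntutc)	/* ntutc already in CPU byte order */
{
	struct unix_ts ts;
	uint64_t t = ntutc - NTFS_TIME_OFFSET;

	ts.tv_nsec = (long)(t % 10000000) * 100;	/* leftover 100-ns units */
	ts.tv_sec  = (int64_t)(t / 10000000);		/* whole seconds since 1970 */
	return ts;
}

int main(void)
{
	/* the NT timestamp of the Unix epoch itself maps to 0.000000000 */
	struct unix_ts ts = nt_to_unix(NTFS_TIME_OFFSET);

	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}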
| @@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t) | |||
| 883 | static int total_days_of_prev_months[] = | 883 | static int total_days_of_prev_months[] = |
| 884 | {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; | 884 | {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; |
| 885 | 885 | ||
| 886 | 886 | struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) | |
| 887 | __le64 cnvrtDosCifsTm(__u16 date, __u16 time) | ||
| 888 | { | ||
| 889 | return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time))); | ||
| 890 | } | ||
| 891 | |||
| 892 | struct timespec cnvrtDosUnixTm(__u16 date, __u16 time) | ||
| 893 | { | 887 | { |
| 894 | struct timespec ts; | 888 | struct timespec ts; |
| 895 | int sec, min, days, month, year; | 889 | int sec, min, days, month, year; |
| 890 | u16 date = le16_to_cpu(le_date); | ||
| 891 | u16 time = le16_to_cpu(le_time); | ||
| 896 | SMB_TIME *st = (SMB_TIME *)&time; | 892 | SMB_TIME *st = (SMB_TIME *)&time; |
| 897 | SMB_DATE *sd = (SMB_DATE *)&date; | 893 | SMB_DATE *sd = (SMB_DATE *)&date; |
| 898 | 894 | ||
| @@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time) | |||
| 933 | days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); | 929 | days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); |
| 934 | sec += 24 * 60 * 60 * days; | 930 | sec += 24 * 60 * 60 * days; |
| 935 | 931 | ||
| 936 | ts.tv_sec = sec; | 932 | ts.tv_sec = sec + offset; |
| 937 | 933 | ||
| 938 | /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ | 934 | /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ |
| 939 | 935 | ||
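In the legacy DOS/OS2 path the date and time arrive as two packed 16-bit words, which the function decodes through the SMB_DATE/SMB_TIME bitfield structs before adding the per-server timeAdj offset. A stand-alone sketch of that decode using explicit shifts (assuming the conventional DOS layout: 2-second units, minutes, hours, and day/month/years-since-1980) looks like:

#include <stdint.h>
#include <stdio.h>

struct dos_tm {
	int sec, min, hour;	/* time word: bits 0-4 (x2), 5-10, 11-15 */
	int day, month, year;	/* date word: bits 0-4, 5-8, 9-15 (+1980) */
};

static struct dos_tm decode_dos(uint16_t date, uint16_t time)
{
	struct dos_tm t;

	t.sec   = (time & 0x1f) * 2;
	t.min   = (time >> 5) & 0x3f;
	t.hour  = (time >> 11) & 0x1f;
	t.day   = date & 0x1f;
	t.month = (date >> 5) & 0x0f;
	t.year  = ((date >> 9) & 0x7f) + 1980;
	return t;
}

int main(void)
{
	/* arbitrary example words; only the bit layout matters here */
	struct dos_tm t = decode_dos(0x3a21, 0x6c2e);

	printf("%04d-%02d-%02d %02d:%02d:%02d\n",
	       t.year, t.month, t.day, t.hour, t.min, t.sec);
	return 0;
}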
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 964e097c8203..86d0055dc529 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
| @@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file, | |||
| 115 | return rc; | 115 | return rc; |
| 116 | } | 116 | } |
| 117 | 117 | ||
| 118 | static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode) | ||
| 119 | { | ||
| 120 | if ((tcon) && (tcon->ses) && (tcon->ses->server)) { | ||
| 121 | inode->i_ctime.tv_sec += tcon->ses->server->timeAdj; | ||
| 122 | inode->i_mtime.tv_sec += tcon->ses->server->timeAdj; | ||
| 123 | inode->i_atime.tv_sec += tcon->ses->server->timeAdj; | ||
| 124 | } | ||
| 125 | return; | ||
| 126 | } | ||
| 127 | |||
| 128 | |||
| 129 | static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, | 118 | static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, |
| 130 | char *buf, unsigned int *pobject_type, int isNewInode) | 119 | char *buf, unsigned int *pobject_type, int isNewInode) |
| 131 | { | 120 | { |
| @@ -150,26 +139,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, | |||
| 150 | allocation_size = le64_to_cpu(pfindData->AllocationSize); | 139 | allocation_size = le64_to_cpu(pfindData->AllocationSize); |
| 151 | end_of_file = le64_to_cpu(pfindData->EndOfFile); | 140 | end_of_file = le64_to_cpu(pfindData->EndOfFile); |
| 152 | tmp_inode->i_atime = | 141 | tmp_inode->i_atime = |
| 153 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); | 142 | cifs_NTtimeToUnix(pfindData->LastAccessTime); |
| 154 | tmp_inode->i_mtime = | 143 | tmp_inode->i_mtime = |
| 155 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); | 144 | cifs_NTtimeToUnix(pfindData->LastWriteTime); |
| 156 | tmp_inode->i_ctime = | 145 | tmp_inode->i_ctime = |
| 157 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); | 146 | cifs_NTtimeToUnix(pfindData->ChangeTime); |
| 158 | } else { /* legacy, OS2 and DOS style */ | 147 | } else { /* legacy, OS2 and DOS style */ |
| 159 | /* struct timespec ts;*/ | 148 | int offset = cifs_sb->tcon->ses->server->timeAdj; |
| 160 | FIND_FILE_STANDARD_INFO *pfindData = | 149 | FIND_FILE_STANDARD_INFO *pfindData = |
| 161 | (FIND_FILE_STANDARD_INFO *)buf; | 150 | (FIND_FILE_STANDARD_INFO *)buf; |
| 162 | 151 | ||
| 163 | tmp_inode->i_mtime = cnvrtDosUnixTm( | 152 | tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate, |
| 164 | le16_to_cpu(pfindData->LastWriteDate), | 153 | pfindData->LastWriteTime, |
| 165 | le16_to_cpu(pfindData->LastWriteTime)); | 154 | offset); |
| 166 | tmp_inode->i_atime = cnvrtDosUnixTm( | 155 | tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate, |
| 167 | le16_to_cpu(pfindData->LastAccessDate), | 156 | pfindData->LastAccessTime, |
| 168 | le16_to_cpu(pfindData->LastAccessTime)); | 157 | offset); |
| 169 | tmp_inode->i_ctime = cnvrtDosUnixTm( | 158 | tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate, |
| 170 | le16_to_cpu(pfindData->LastWriteDate), | 159 | pfindData->LastWriteTime, |
| 171 | le16_to_cpu(pfindData->LastWriteTime)); | 160 | offset); |
| 172 | AdjustForTZ(cifs_sb->tcon, tmp_inode); | ||
| 173 | attr = le16_to_cpu(pfindData->Attributes); | 161 | attr = le16_to_cpu(pfindData->Attributes); |
| 174 | allocation_size = le32_to_cpu(pfindData->AllocationSize); | 162 | allocation_size = le32_to_cpu(pfindData->AllocationSize); |
| 175 | end_of_file = le32_to_cpu(pfindData->DataSize); | 163 | end_of_file = le32_to_cpu(pfindData->DataSize); |
| @@ -331,11 +319,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode, | |||
| 331 | local_size = tmp_inode->i_size; | 319 | local_size = tmp_inode->i_size; |
| 332 | 320 | ||
| 333 | tmp_inode->i_atime = | 321 | tmp_inode->i_atime = |
| 334 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); | 322 | cifs_NTtimeToUnix(pfindData->LastAccessTime); |
| 335 | tmp_inode->i_mtime = | 323 | tmp_inode->i_mtime = |
| 336 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime)); | 324 | cifs_NTtimeToUnix(pfindData->LastModificationTime); |
| 337 | tmp_inode->i_ctime = | 325 | tmp_inode->i_ctime = |
| 338 | cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange)); | 326 | cifs_NTtimeToUnix(pfindData->LastStatusChange); |
| 339 | 327 | ||
| 340 | tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions); | 328 | tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions); |
| 341 | /* since we set the inode type below we need to mask off type | 329 | /* since we set the inode type below we need to mask off type |
diff --git a/fs/coda/file.c b/fs/coda/file.c index 6a347fbc998a..ffd42815fda1 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c | |||
| @@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, | |||
| 47 | struct pipe_inode_info *pipe, size_t count, | 47 | struct pipe_inode_info *pipe, size_t count, |
| 48 | unsigned int flags) | 48 | unsigned int flags) |
| 49 | { | 49 | { |
| 50 | ssize_t (*splice_read)(struct file *, loff_t *, | ||
| 51 | struct pipe_inode_info *, size_t, unsigned int); | ||
| 50 | struct coda_file_info *cfi; | 52 | struct coda_file_info *cfi; |
| 51 | struct file *host_file; | 53 | struct file *host_file; |
| 52 | 54 | ||
| @@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, | |||
| 54 | BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); | 56 | BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); |
| 55 | host_file = cfi->cfi_container; | 57 | host_file = cfi->cfi_container; |
| 56 | 58 | ||
| 57 | if (!host_file->f_op || !host_file->f_op->splice_read) | 59 | splice_read = host_file->f_op->splice_read; |
| 58 | return -EINVAL; | 60 | if (!splice_read) |
| 61 | splice_read = default_file_splice_read; | ||
| 59 | 62 | ||
| 60 | return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); | 63 | return splice_read(host_file, ppos, pipe, count, flags); |
| 61 | } | 64 | } |
| 62 | 65 | ||
| 63 | static ssize_t | 66 | static ssize_t |
diff --git a/fs/compat.c b/fs/compat.c index 681ed81e6be0..bb2a9b2e8173 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
| @@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename, | |||
| 1488 | if (!bprm) | 1488 | if (!bprm) |
| 1489 | goto out_files; | 1489 | goto out_files; |
| 1490 | 1490 | ||
| 1491 | retval = mutex_lock_interruptible(¤t->cred_exec_mutex); | 1491 | retval = mutex_lock_interruptible(¤t->cred_guard_mutex); |
| 1492 | if (retval < 0) | 1492 | if (retval < 0) |
| 1493 | goto out_free; | 1493 | goto out_free; |
| 1494 | current->in_execve = 1; | 1494 | current->in_execve = 1; |
| @@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename, | |||
| 1550 | /* execve succeeded */ | 1550 | /* execve succeeded */ |
| 1551 | current->fs->in_exec = 0; | 1551 | current->fs->in_exec = 0; |
| 1552 | current->in_execve = 0; | 1552 | current->in_execve = 0; |
| 1553 | mutex_unlock(¤t->cred_exec_mutex); | 1553 | mutex_unlock(¤t->cred_guard_mutex); |
| 1554 | acct_update_integrals(current); | 1554 | acct_update_integrals(current); |
| 1555 | free_bprm(bprm); | 1555 | free_bprm(bprm); |
| 1556 | if (displaced) | 1556 | if (displaced) |
| @@ -1573,7 +1573,7 @@ out_unmark: | |||
| 1573 | 1573 | ||
| 1574 | out_unlock: | 1574 | out_unlock: |
| 1575 | current->in_execve = 0; | 1575 | current->in_execve = 0; |
| 1576 | mutex_unlock(¤t->cred_exec_mutex); | 1576 | mutex_unlock(¤t->cred_guard_mutex); |
| 1577 | 1577 | ||
| 1578 | out_free: | 1578 | out_free: |
| 1579 | free_bprm(bprm); | 1579 | free_bprm(bprm); |
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c68edb969441..9b1d285f9fe6 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c | |||
| @@ -557,8 +557,10 @@ static int __init init_devpts_fs(void) | |||
| 557 | int err = register_filesystem(&devpts_fs_type); | 557 | int err = register_filesystem(&devpts_fs_type); |
| 558 | if (!err) { | 558 | if (!err) { |
| 559 | devpts_mnt = kern_mount(&devpts_fs_type); | 559 | devpts_mnt = kern_mount(&devpts_fs_type); |
| 560 | if (IS_ERR(devpts_mnt)) | 560 | if (IS_ERR(devpts_mnt)) { |
| 561 | err = PTR_ERR(devpts_mnt); | 561 | err = PTR_ERR(devpts_mnt); |
| 562 | unregister_filesystem(&devpts_fs_type); | ||
| 563 | } | ||
| 562 | } | 564 | } |
| 563 | return err; | 565 | return err; |
| 564 | } | 566 | } |
diff --git a/fs/direct-io.c b/fs/direct-io.c index 05763bbc2050..8b10b87dc01a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
| @@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
| 1127 | rw = WRITE_ODIRECT; | 1127 | rw = WRITE_ODIRECT; |
| 1128 | 1128 | ||
| 1129 | if (bdev) | 1129 | if (bdev) |
| 1130 | bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); | 1130 | bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); |
| 1131 | 1131 | ||
| 1132 | if (offset & blocksize_mask) { | 1132 | if (offset & blocksize_mask) { |
| 1133 | if (bdev) | 1133 | if (bdev) |
| @@ -33,6 +33,7 @@ | |||
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/string.h> | 33 | #include <linux/string.h> |
| 34 | #include <linux/init.h> | 34 | #include <linux/init.h> |
| 35 | #include <linux/pagemap.h> | 35 | #include <linux/pagemap.h> |
| 36 | #include <linux/perf_counter.h> | ||
| 36 | #include <linux/highmem.h> | 37 | #include <linux/highmem.h> |
| 37 | #include <linux/spinlock.h> | 38 | #include <linux/spinlock.h> |
| 38 | #include <linux/key.h> | 39 | #include <linux/key.h> |
| @@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) | |||
| 922 | task_lock(tsk); | 923 | task_lock(tsk); |
| 923 | strlcpy(tsk->comm, buf, sizeof(tsk->comm)); | 924 | strlcpy(tsk->comm, buf, sizeof(tsk->comm)); |
| 924 | task_unlock(tsk); | 925 | task_unlock(tsk); |
| 926 | perf_counter_comm(tsk); | ||
| 925 | } | 927 | } |
| 926 | 928 | ||
| 927 | int flush_old_exec(struct linux_binprm * bprm) | 929 | int flush_old_exec(struct linux_binprm * bprm) |
| @@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm) | |||
| 990 | 992 | ||
| 991 | current->personality &= ~bprm->per_clear; | 993 | current->personality &= ~bprm->per_clear; |
| 992 | 994 | ||
| 995 | /* | ||
| 996 | * Flush performance counters when crossing a | ||
| 997 | * security domain: | ||
| 998 | */ | ||
| 999 | if (!get_dumpable(current->mm)) | ||
| 1000 | perf_counter_exit_task(current); | ||
| 1001 | |||
| 993 | /* An exec changes our domain. We are no longer part of the thread | 1002 | /* An exec changes our domain. We are no longer part of the thread |
| 994 | group */ | 1003 | group */ |
| 995 | 1004 | ||
| @@ -1016,7 +1025,7 @@ void install_exec_creds(struct linux_binprm *bprm) | |||
| 1016 | commit_creds(bprm->cred); | 1025 | commit_creds(bprm->cred); |
| 1017 | bprm->cred = NULL; | 1026 | bprm->cred = NULL; |
| 1018 | 1027 | ||
| 1019 | /* cred_exec_mutex must be held at least to this point to prevent | 1028 | /* cred_guard_mutex must be held at least to this point to prevent |
| 1020 | * ptrace_attach() from altering our determination of the task's | 1029 | * ptrace_attach() from altering our determination of the task's |
| 1021 | * credentials; any time after this it may be unlocked */ | 1030 | * credentials; any time after this it may be unlocked */ |
| 1022 | 1031 | ||
| @@ -1026,7 +1035,7 @@ EXPORT_SYMBOL(install_exec_creds); | |||
| 1026 | 1035 | ||
| 1027 | /* | 1036 | /* |
| 1028 | * determine how safe it is to execute the proposed program | 1037 | * determine how safe it is to execute the proposed program |
| 1029 | * - the caller must hold current->cred_exec_mutex to protect against | 1038 | * - the caller must hold current->cred_guard_mutex to protect against |
| 1030 | * PTRACE_ATTACH | 1039 | * PTRACE_ATTACH |
| 1031 | */ | 1040 | */ |
| 1032 | int check_unsafe_exec(struct linux_binprm *bprm) | 1041 | int check_unsafe_exec(struct linux_binprm *bprm) |
| @@ -1268,7 +1277,7 @@ int do_execve(char * filename, | |||
| 1268 | if (!bprm) | 1277 | if (!bprm) |
| 1269 | goto out_files; | 1278 | goto out_files; |
| 1270 | 1279 | ||
| 1271 | retval = mutex_lock_interruptible(¤t->cred_exec_mutex); | 1280 | retval = mutex_lock_interruptible(¤t->cred_guard_mutex); |
| 1272 | if (retval < 0) | 1281 | if (retval < 0) |
| 1273 | goto out_free; | 1282 | goto out_free; |
| 1274 | current->in_execve = 1; | 1283 | current->in_execve = 1; |
| @@ -1331,7 +1340,7 @@ int do_execve(char * filename, | |||
| 1331 | /* execve succeeded */ | 1340 | /* execve succeeded */ |
| 1332 | current->fs->in_exec = 0; | 1341 | current->fs->in_exec = 0; |
| 1333 | current->in_execve = 0; | 1342 | current->in_execve = 0; |
| 1334 | mutex_unlock(¤t->cred_exec_mutex); | 1343 | mutex_unlock(¤t->cred_guard_mutex); |
| 1335 | acct_update_integrals(current); | 1344 | acct_update_integrals(current); |
| 1336 | free_bprm(bprm); | 1345 | free_bprm(bprm); |
| 1337 | if (displaced) | 1346 | if (displaced) |
| @@ -1354,7 +1363,7 @@ out_unmark: | |||
| 1354 | 1363 | ||
| 1355 | out_unlock: | 1364 | out_unlock: |
| 1356 | current->in_execve = 0; | 1365 | current->in_execve = 0; |
| 1357 | mutex_unlock(¤t->cred_exec_mutex); | 1366 | mutex_unlock(¤t->cred_guard_mutex); |
| 1358 | 1367 | ||
| 1359 | out_free: | 1368 | out_free: |
| 1360 | free_bprm(bprm); | 1369 | free_bprm(bprm); |
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c index b249ae97fb15..06ca92672eb5 100644 --- a/fs/exofs/osd.c +++ b/fs/exofs/osd.c | |||
| @@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) | |||
| 50 | 50 | ||
| 51 | /* FIXME: should be included in osd_sense_info */ | 51 | /* FIXME: should be included in osd_sense_info */ |
| 52 | if (in_resid) | 52 | if (in_resid) |
| 53 | *in_resid = or->in.req ? or->in.req->data_len : 0; | 53 | *in_resid = or->in.req ? or->in.req->resid_len : 0; |
| 54 | 54 | ||
| 55 | if (out_resid) | 55 | if (out_resid) |
| 56 | *out_resid = or->out.req ? or->out.req->data_len : 0; | 56 | *out_resid = or->out.req ? or->out.req->resid_len : 0; |
| 57 | 57 | ||
| 58 | return ret; | 58 | return ret; |
| 59 | } | 59 | } |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5c4afe652245..e3c748faf2db 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c | |||
| @@ -1093,6 +1093,7 @@ failed_mount: | |||
| 1093 | brelse(bh); | 1093 | brelse(bh); |
| 1094 | failed_sbi: | 1094 | failed_sbi: |
| 1095 | sb->s_fs_info = NULL; | 1095 | sb->s_fs_info = NULL; |
| 1096 | kfree(sbi->s_blockgroup_lock); | ||
| 1096 | kfree(sbi); | 1097 | kfree(sbi); |
| 1097 | return ret; | 1098 | return ret; |
| 1098 | } | 1099 | } |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe504c3..3c70d52afb10 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
| @@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
| 1696 | goto failed_mount; | 1696 | goto failed_mount; |
| 1697 | } | 1697 | } |
| 1698 | 1698 | ||
| 1699 | hblock = bdev_hardsect_size(sb->s_bdev); | 1699 | hblock = bdev_logical_block_size(sb->s_bdev); |
| 1700 | if (sb->s_blocksize != blocksize) { | 1700 | if (sb->s_blocksize != blocksize) { |
| 1701 | /* | 1701 | /* |
| 1702 | * Make sure the blocksize for the filesystem is larger | 1702 | * Make sure the blocksize for the filesystem is larger |
| @@ -2021,6 +2021,7 @@ failed_mount: | |||
| 2021 | brelse(bh); | 2021 | brelse(bh); |
| 2022 | out_fail: | 2022 | out_fail: |
| 2023 | sb->s_fs_info = NULL; | 2023 | sb->s_fs_info = NULL; |
| 2024 | kfree(sbi->s_blockgroup_lock); | ||
| 2024 | kfree(sbi); | 2025 | kfree(sbi); |
| 2025 | lock_kernel(); | 2026 | lock_kernel(); |
| 2026 | return ret; | 2027 | return ret; |
| @@ -2119,7 +2120,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, | |||
| 2119 | } | 2120 | } |
| 2120 | 2121 | ||
| 2121 | blocksize = sb->s_blocksize; | 2122 | blocksize = sb->s_blocksize; |
| 2122 | hblock = bdev_hardsect_size(bdev); | 2123 | hblock = bdev_logical_block_size(bdev); |
| 2123 | if (blocksize < hblock) { | 2124 | if (blocksize < hblock) { |
| 2124 | printk(KERN_ERR | 2125 | printk(KERN_ERR |
| 2125 | "EXT3-fs: blocksize too small for journal device.\n"); | 2126 | "EXT3-fs: blocksize too small for journal device.\n"); |
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index a8ff003a00f7..8a34710ecf40 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile | |||
| @@ -5,8 +5,8 @@ | |||
| 5 | obj-$(CONFIG_EXT4_FS) += ext4.o | 5 | obj-$(CONFIG_EXT4_FS) += ext4.o |
| 6 | 6 | ||
| 7 | ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ | 7 | ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ |
| 8 | ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ | 8 | ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ |
| 9 | ext4_jbd2.o migrate.o mballoc.o | 9 | ext4_jbd2.o migrate.o mballoc.o block_validity.o |
| 10 | 10 | ||
| 11 | ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o | 11 | ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o |
| 12 | ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o | 12 | ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 53c72ad85877..e2126d70dff5 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
| @@ -19,7 +19,6 @@ | |||
| 19 | #include <linux/buffer_head.h> | 19 | #include <linux/buffer_head.h> |
| 20 | #include "ext4.h" | 20 | #include "ext4.h" |
| 21 | #include "ext4_jbd2.h" | 21 | #include "ext4_jbd2.h" |
| 22 | #include "group.h" | ||
| 23 | #include "mballoc.h" | 22 | #include "mballoc.h" |
| 24 | 23 | ||
| 25 | /* | 24 | /* |
| @@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
| 88 | ext4_group_t block_group, struct ext4_group_desc *gdp) | 87 | ext4_group_t block_group, struct ext4_group_desc *gdp) |
| 89 | { | 88 | { |
| 90 | int bit, bit_max; | 89 | int bit, bit_max; |
| 90 | ext4_group_t ngroups = ext4_get_groups_count(sb); | ||
| 91 | unsigned free_blocks, group_blocks; | 91 | unsigned free_blocks, group_blocks; |
| 92 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 92 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 93 | 93 | ||
| @@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
| 123 | bit_max += ext4_bg_num_gdb(sb, block_group); | 123 | bit_max += ext4_bg_num_gdb(sb, block_group); |
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | if (block_group == sbi->s_groups_count - 1) { | 126 | if (block_group == ngroups - 1) { |
| 127 | /* | 127 | /* |
| 128 | * Even though mke2fs always initialize first and last group | 128 | * Even though mke2fs always initialize first and last group |
| 129 | * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need | 129 | * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need |
| @@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
| 131 | */ | 131 | */ |
| 132 | group_blocks = ext4_blocks_count(sbi->s_es) - | 132 | group_blocks = ext4_blocks_count(sbi->s_es) - |
| 133 | le32_to_cpu(sbi->s_es->s_first_data_block) - | 133 | le32_to_cpu(sbi->s_es->s_first_data_block) - |
| 134 | (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); | 134 | (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); |
| 135 | } else { | 135 | } else { |
| 136 | group_blocks = EXT4_BLOCKS_PER_GROUP(sb); | 136 | group_blocks = EXT4_BLOCKS_PER_GROUP(sb); |
| 137 | } | 137 | } |
| @@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, | |||
| 205 | { | 205 | { |
| 206 | unsigned int group_desc; | 206 | unsigned int group_desc; |
| 207 | unsigned int offset; | 207 | unsigned int offset; |
| 208 | ext4_group_t ngroups = ext4_get_groups_count(sb); | ||
| 208 | struct ext4_group_desc *desc; | 209 | struct ext4_group_desc *desc; |
| 209 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 210 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 210 | 211 | ||
| 211 | if (block_group >= sbi->s_groups_count) { | 212 | if (block_group >= ngroups) { |
| 212 | ext4_error(sb, "ext4_get_group_desc", | 213 | ext4_error(sb, "ext4_get_group_desc", |
| 213 | "block_group >= groups_count - " | 214 | "block_group >= groups_count - " |
| 214 | "block_group = %u, groups_count = %u", | 215 | "block_group = %u, groups_count = %u", |
| 215 | block_group, sbi->s_groups_count); | 216 | block_group, ngroups); |
| 216 | 217 | ||
| 217 | return NULL; | 218 | return NULL; |
| 218 | } | 219 | } |
| 219 | smp_rmb(); | ||
| 220 | 220 | ||
| 221 | group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); | 221 | group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); |
| 222 | offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); | 222 | offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); |
| @@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
| 326 | unlock_buffer(bh); | 326 | unlock_buffer(bh); |
| 327 | return bh; | 327 | return bh; |
| 328 | } | 328 | } |
| 329 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 329 | ext4_lock_group(sb, block_group); |
| 330 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 330 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 331 | ext4_init_block_bitmap(sb, bh, block_group, desc); | 331 | ext4_init_block_bitmap(sb, bh, block_group, desc); |
| 332 | set_bitmap_uptodate(bh); | 332 | set_bitmap_uptodate(bh); |
| 333 | set_buffer_uptodate(bh); | 333 | set_buffer_uptodate(bh); |
| 334 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 334 | ext4_unlock_group(sb, block_group); |
| 335 | unlock_buffer(bh); | 335 | unlock_buffer(bh); |
| 336 | return bh; | 336 | return bh; |
| 337 | } | 337 | } |
| 338 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 338 | ext4_unlock_group(sb, block_group); |
| 339 | if (buffer_uptodate(bh)) { | 339 | if (buffer_uptodate(bh)) { |
| 340 | /* | 340 | /* |
| 341 | * if not uninit if bh is uptodate, | 341 | * if not uninit if bh is uptodate, |
| @@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
| 451 | down_write(&grp->alloc_sem); | 451 | down_write(&grp->alloc_sem); |
| 452 | for (i = 0, blocks_freed = 0; i < count; i++) { | 452 | for (i = 0, blocks_freed = 0; i < count; i++) { |
| 453 | BUFFER_TRACE(bitmap_bh, "clear bit"); | 453 | BUFFER_TRACE(bitmap_bh, "clear bit"); |
| 454 | if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), | 454 | if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), |
| 455 | bit + i, bitmap_bh->b_data)) { | 455 | bit + i, bitmap_bh->b_data)) { |
| 456 | ext4_error(sb, __func__, | 456 | ext4_error(sb, __func__, |
| 457 | "bit already cleared for block %llu", | 457 | "bit already cleared for block %llu", |
| @@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
| 461 | blocks_freed++; | 461 | blocks_freed++; |
| 462 | } | 462 | } |
| 463 | } | 463 | } |
| 464 | spin_lock(sb_bgl_lock(sbi, block_group)); | 464 | ext4_lock_group(sb, block_group); |
| 465 | blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); | 465 | blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); |
| 466 | ext4_free_blks_set(sb, desc, blk_free_count); | 466 | ext4_free_blks_set(sb, desc, blk_free_count); |
| 467 | desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); | 467 | desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); |
| 468 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 468 | ext4_unlock_group(sb, block_group); |
| 469 | percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); | 469 | percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); |
| 470 | 470 | ||
| 471 | if (sbi->s_log_groups_per_flex) { | 471 | if (sbi->s_log_groups_per_flex) { |
| @@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
| 665 | ext4_fsblk_t desc_count; | 665 | ext4_fsblk_t desc_count; |
| 666 | struct ext4_group_desc *gdp; | 666 | struct ext4_group_desc *gdp; |
| 667 | ext4_group_t i; | 667 | ext4_group_t i; |
| 668 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 668 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
| 669 | #ifdef EXT4FS_DEBUG | 669 | #ifdef EXT4FS_DEBUG |
| 670 | struct ext4_super_block *es; | 670 | struct ext4_super_block *es; |
| 671 | ext4_fsblk_t bitmap_count; | 671 | ext4_fsblk_t bitmap_count; |
| @@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
| 677 | bitmap_count = 0; | 677 | bitmap_count = 0; |
| 678 | gdp = NULL; | 678 | gdp = NULL; |
| 679 | 679 | ||
| 680 | smp_rmb(); | ||
| 681 | for (i = 0; i < ngroups; i++) { | 680 | for (i = 0; i < ngroups; i++) { |
| 682 | gdp = ext4_get_group_desc(sb, i, NULL); | 681 | gdp = ext4_get_group_desc(sb, i, NULL); |
| 683 | if (!gdp) | 682 | if (!gdp) |
| @@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
| 700 | return bitmap_count; | 699 | return bitmap_count; |
| 701 | #else | 700 | #else |
| 702 | desc_count = 0; | 701 | desc_count = 0; |
| 703 | smp_rmb(); | ||
| 704 | for (i = 0; i < ngroups; i++) { | 702 | for (i = 0; i < ngroups; i++) { |
| 705 | gdp = ext4_get_group_desc(sb, i, NULL); | 703 | gdp = ext4_get_group_desc(sb, i, NULL); |
| 706 | if (!gdp) | 704 | if (!gdp) |
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 000000000000..50784ef07563 --- /dev/null +++ b/fs/ext4/block_validity.c | |||
| @@ -0,0 +1,244 @@ | |||
| 1 | /* | ||
| 2 | * linux/fs/ext4/block_validity.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2009 | ||
| 5 | * Theodore Ts'o (tytso@mit.edu) | ||
| 6 | * | ||
| 7 | * Track which blocks in the filesystem are metadata blocks that | ||
| 8 | * should never be used as data blocks by files or directories. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/time.h> | ||
| 12 | #include <linux/fs.h> | ||
| 13 | #include <linux/namei.h> | ||
| 14 | #include <linux/quotaops.h> | ||
| 15 | #include <linux/buffer_head.h> | ||
| 16 | #include <linux/module.h> | ||
| 17 | #include <linux/swap.h> | ||
| 18 | #include <linux/pagemap.h> | ||
| 19 | #include <linux/version.h> | ||
| 20 | #include <linux/blkdev.h> | ||
| 21 | #include <linux/mutex.h> | ||
| 22 | #include "ext4.h" | ||
| 23 | |||
| 24 | struct ext4_system_zone { | ||
| 25 | struct rb_node node; | ||
| 26 | ext4_fsblk_t start_blk; | ||
| 27 | unsigned int count; | ||
| 28 | }; | ||
| 29 | |||
| 30 | static struct kmem_cache *ext4_system_zone_cachep; | ||
| 31 | |||
| 32 | int __init init_ext4_system_zone(void) | ||
| 33 | { | ||
| 34 | ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, | ||
| 35 | SLAB_RECLAIM_ACCOUNT); | ||
| 36 | if (ext4_system_zone_cachep == NULL) | ||
| 37 | return -ENOMEM; | ||
| 38 | return 0; | ||
| 39 | } | ||
| 40 | |||
| 41 | void exit_ext4_system_zone(void) | ||
| 42 | { | ||
| 43 | kmem_cache_destroy(ext4_system_zone_cachep); | ||
| 44 | } | ||
| 45 | |||
| 46 | static inline int can_merge(struct ext4_system_zone *entry1, | ||
| 47 | struct ext4_system_zone *entry2) | ||
| 48 | { | ||
| 49 | if ((entry1->start_blk + entry1->count) == entry2->start_blk) | ||
| 50 | return 1; | ||
| 51 | return 0; | ||
| 52 | } | ||
| 53 | |||
| 54 | /* | ||
| 55 | * Mark a range of blocks as belonging to the "system zone" --- that | ||
| 56 | * is, filesystem metadata blocks which should never be used by | ||
| 57 | * inodes. | ||
| 58 | */ | ||
| 59 | static int add_system_zone(struct ext4_sb_info *sbi, | ||
| 60 | ext4_fsblk_t start_blk, | ||
| 61 | unsigned int count) | ||
| 62 | { | ||
| 63 | struct ext4_system_zone *new_entry = NULL, *entry; | ||
| 64 | struct rb_node **n = &sbi->system_blks.rb_node, *node; | ||
| 65 | struct rb_node *parent = NULL, *new_node = NULL; | ||
| 66 | |||
| 67 | while (*n) { | ||
| 68 | parent = *n; | ||
| 69 | entry = rb_entry(parent, struct ext4_system_zone, node); | ||
| 70 | if (start_blk < entry->start_blk) | ||
| 71 | n = &(*n)->rb_left; | ||
| 72 | else if (start_blk >= (entry->start_blk + entry->count)) | ||
| 73 | n = &(*n)->rb_right; | ||
| 74 | else { | ||
| 75 | if (start_blk + count > (entry->start_blk + | ||
| 76 | entry->count)) | ||
| 77 | entry->count = (start_blk + count - | ||
| 78 | entry->start_blk); | ||
| 79 | new_node = *n; | ||
| 80 | new_entry = rb_entry(new_node, struct ext4_system_zone, | ||
| 81 | node); | ||
| 82 | break; | ||
| 83 | } | ||
| 84 | } | ||
| 85 | |||
| 86 | if (!new_entry) { | ||
| 87 | new_entry = kmem_cache_alloc(ext4_system_zone_cachep, | ||
| 88 | GFP_KERNEL); | ||
| 89 | if (!new_entry) | ||
| 90 | return -ENOMEM; | ||
| 91 | new_entry->start_blk = start_blk; | ||
| 92 | new_entry->count = count; | ||
| 93 | new_node = &new_entry->node; | ||
| 94 | |||
| 95 | rb_link_node(new_node, parent, n); | ||
| 96 | rb_insert_color(new_node, &sbi->system_blks); | ||
| 97 | } | ||
| 98 | |||
| 99 | /* Can we merge to the left? */ | ||
| 100 | node = rb_prev(new_node); | ||
| 101 | if (node) { | ||
| 102 | entry = rb_entry(node, struct ext4_system_zone, node); | ||
| 103 | if (can_merge(entry, new_entry)) { | ||
| 104 | new_entry->start_blk = entry->start_blk; | ||
| 105 | new_entry->count += entry->count; | ||
| 106 | rb_erase(node, &sbi->system_blks); | ||
| 107 | kmem_cache_free(ext4_system_zone_cachep, entry); | ||
| 108 | } | ||
| 109 | } | ||
| 110 | |||
| 111 | /* Can we merge to the right? */ | ||
| 112 | node = rb_next(new_node); | ||
| 113 | if (node) { | ||
| 114 | entry = rb_entry(node, struct ext4_system_zone, node); | ||
| 115 | if (can_merge(new_entry, entry)) { | ||
| 116 | new_entry->count += entry->count; | ||
| 117 | rb_erase(node, &sbi->system_blks); | ||
| 118 | kmem_cache_free(ext4_system_zone_cachep, entry); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | return 0; | ||
| 122 | } | ||
| 123 | |||
| 124 | static void debug_print_tree(struct ext4_sb_info *sbi) | ||
| 125 | { | ||
| 126 | struct rb_node *node; | ||
| 127 | struct ext4_system_zone *entry; | ||
| 128 | int first = 1; | ||
| 129 | |||
| 130 | printk(KERN_INFO "System zones: "); | ||
| 131 | node = rb_first(&sbi->system_blks); | ||
| 132 | while (node) { | ||
| 133 | entry = rb_entry(node, struct ext4_system_zone, node); | ||
| 134 | printk("%s%llu-%llu", first ? "" : ", ", | ||
| 135 | entry->start_blk, entry->start_blk + entry->count - 1); | ||
| 136 | first = 0; | ||
| 137 | node = rb_next(node); | ||
| 138 | } | ||
| 139 | printk("\n"); | ||
| 140 | } | ||
| 141 | |||
| 142 | int ext4_setup_system_zone(struct super_block *sb) | ||
| 143 | { | ||
| 144 | ext4_group_t ngroups = ext4_get_groups_count(sb); | ||
| 145 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 146 | struct ext4_group_desc *gdp; | ||
| 147 | ext4_group_t i; | ||
| 148 | int flex_size = ext4_flex_bg_size(sbi); | ||
| 149 | int ret; | ||
| 150 | |||
| 151 | if (!test_opt(sb, BLOCK_VALIDITY)) { | ||
| 152 | if (EXT4_SB(sb)->system_blks.rb_node) | ||
| 153 | ext4_release_system_zone(sb); | ||
| 154 | return 0; | ||
| 155 | } | ||
| 156 | if (EXT4_SB(sb)->system_blks.rb_node) | ||
| 157 | return 0; | ||
| 158 | |||
| 159 | for (i=0; i < ngroups; i++) { | ||
| 160 | if (ext4_bg_has_super(sb, i) && | ||
| 161 | ((i < 5) || ((i % flex_size) == 0))) | ||
| 162 | add_system_zone(sbi, ext4_group_first_block_no(sb, i), | ||
| 163 | sbi->s_gdb_count + 1); | ||
| 164 | gdp = ext4_get_group_desc(sb, i, NULL); | ||
| 165 | ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); | ||
| 166 | if (ret) | ||
| 167 | return ret; | ||
| 168 | ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); | ||
| 169 | if (ret) | ||
| 170 | return ret; | ||
| 171 | ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), | ||
| 172 | sbi->s_itb_per_group); | ||
| 173 | if (ret) | ||
| 174 | return ret; | ||
| 175 | } | ||
| 176 | |||
| 177 | if (test_opt(sb, DEBUG)) | ||
| 178 | debug_print_tree(EXT4_SB(sb)); | ||
| 179 | return 0; | ||
| 180 | } | ||
| 181 | |||
| 182 | /* Called when the filesystem is unmounted */ | ||
| 183 | void ext4_release_system_zone(struct super_block *sb) | ||
| 184 | { | ||
| 185 | struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; | ||
| 186 | struct rb_node *parent; | ||
| 187 | struct ext4_system_zone *entry; | ||
| 188 | |||
| 189 | while (n) { | ||
| 190 | /* Do the node's children first */ | ||
| 191 | if (n->rb_left) { | ||
| 192 | n = n->rb_left; | ||
| 193 | continue; | ||
| 194 | } | ||
| 195 | if (n->rb_right) { | ||
| 196 | n = n->rb_right; | ||
| 197 | continue; | ||
| 198 | } | ||
| 199 | /* | ||
| 200 | * The node has no children; free it, and then zero | ||
| 201 | * out parent's link to it. Finally go to the | ||
| 202 | * beginning of the loop and try to free the parent | ||
| 203 | * node. | ||
| 204 | */ | ||
| 205 | parent = rb_parent(n); | ||
| 206 | entry = rb_entry(n, struct ext4_system_zone, node); | ||
| 207 | kmem_cache_free(ext4_system_zone_cachep, entry); | ||
| 208 | if (!parent) | ||
| 209 | EXT4_SB(sb)->system_blks.rb_node = NULL; | ||
| 210 | else if (parent->rb_left == n) | ||
| 211 | parent->rb_left = NULL; | ||
| 212 | else if (parent->rb_right == n) | ||
| 213 | parent->rb_right = NULL; | ||
| 214 | n = parent; | ||
| 215 | } | ||
| 216 | EXT4_SB(sb)->system_blks.rb_node = NULL; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* | ||
| 220 | * Returns 1 if the passed-in block region (start_blk, | ||
| 221 | * start_blk+count) is valid; 0 if some part of the block region | ||
| 222 | * overlaps with filesystem metadata blocks. | ||
| 223 | */ | ||
| 224 | int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, | ||
| 225 | unsigned int count) | ||
| 226 | { | ||
| 227 | struct ext4_system_zone *entry; | ||
| 228 | struct rb_node *n = sbi->system_blks.rb_node; | ||
| 229 | |||
| 230 | if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || | ||
| 231 | (start_blk + count > ext4_blocks_count(sbi->s_es))) | ||
| 232 | return 0; | ||
| 233 | while (n) { | ||
| 234 | entry = rb_entry(n, struct ext4_system_zone, node); | ||
| 235 | if (start_blk + count - 1 < entry->start_blk) | ||
| 236 | n = n->rb_left; | ||
| 237 | else if (start_blk >= (entry->start_blk + entry->count)) | ||
| 238 | n = n->rb_right; | ||
| 239 | else | ||
| 240 | return 0; | ||
| 241 | } | ||
| 242 | return 1; | ||
| 243 | } | ||
| 244 | |||
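The new block_validity.c keeps the "system zone" as an rbtree of non-overlapping [start, start+count) ranges, merging neighbours on insert, and ext4_data_block_valid() simply descends that tree. A simplified stand-alone sketch of the same membership test over a sorted array (the zone ranges below are hypothetical, not the real on-disk layout) shows the idea:

#include <stdint.h>
#include <stdio.h>

typedef unsigned long long fsblk_t;

struct zone { fsblk_t start; unsigned int count; };

/* hypothetical "system zone" ranges, kept sorted and non-overlapping */
static const struct zone zones[] = {
	{ 1,    512 },	/* e.g. superblock plus group descriptors */
	{ 1025, 16  },	/* e.g. bitmaps plus an inode table */
};

/* 1 if [start, start+count) touches no system zone, 0 otherwise */
static int data_block_valid(fsblk_t start, unsigned int count)
{
	size_t i;

	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
		if (start + count - 1 < zones[i].start)
			break;				/* entirely before this zone */
		if (start < zones[i].start + zones[i].count)
			return 0;			/* overlap with metadata */
	}
	return 1;
}

int main(void)
{
	/* prints "1 0": the first range is free, the second hits a zone */
	printf("%d %d\n", data_block_valid(600, 8), data_block_valid(1030, 4));
	return 0;
}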
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b64789929a65..9dc93168e262 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
| @@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp, | |||
| 131 | struct buffer_head *bh = NULL; | 131 | struct buffer_head *bh = NULL; |
| 132 | 132 | ||
| 133 | map_bh.b_state = 0; | 133 | map_bh.b_state = 0; |
| 134 | err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, | 134 | err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); |
| 135 | 0, 0, 0); | ||
| 136 | if (err > 0) { | 135 | if (err > 0) { |
| 137 | pgoff_t index = map_bh.b_blocknr >> | 136 | pgoff_t index = map_bh.b_blocknr >> |
| 138 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | 137 | (PAGE_CACHE_SHIFT - inode->i_blkbits); |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d0f15ef56de1..cc7d5edc38c9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -21,7 +21,14 @@ | |||
| 21 | #include <linux/magic.h> | 21 | #include <linux/magic.h> |
| 22 | #include <linux/jbd2.h> | 22 | #include <linux/jbd2.h> |
| 23 | #include <linux/quota.h> | 23 | #include <linux/quota.h> |
| 24 | #include "ext4_i.h" | 24 | #include <linux/rwsem.h> |
| 25 | #include <linux/rbtree.h> | ||
| 26 | #include <linux/seqlock.h> | ||
| 27 | #include <linux/mutex.h> | ||
| 28 | #include <linux/timer.h> | ||
| 29 | #include <linux/wait.h> | ||
| 30 | #include <linux/blockgroup_lock.h> | ||
| 31 | #include <linux/percpu_counter.h> | ||
| 25 | 32 | ||
| 26 | /* | 33 | /* |
| 27 | * The fourth extended filesystem constants/structures | 34 | * The fourth extended filesystem constants/structures |
| @@ -46,6 +53,19 @@ | |||
| 46 | #define ext4_debug(f, a...) do {} while (0) | 53 | #define ext4_debug(f, a...) do {} while (0) |
| 47 | #endif | 54 | #endif |
| 48 | 55 | ||
| 56 | /* data type for block offset of block group */ | ||
| 57 | typedef int ext4_grpblk_t; | ||
| 58 | |||
| 59 | /* data type for filesystem-wide blocks number */ | ||
| 60 | typedef unsigned long long ext4_fsblk_t; | ||
| 61 | |||
| 62 | /* data type for file logical block number */ | ||
| 63 | typedef __u32 ext4_lblk_t; | ||
| 64 | |||
| 65 | /* data type for block group number */ | ||
| 66 | typedef unsigned int ext4_group_t; | ||
| 67 | |||
| 68 | |||
| 49 | /* prefer goal again. length */ | 69 | /* prefer goal again. length */ |
| 50 | #define EXT4_MB_HINT_MERGE 1 | 70 | #define EXT4_MB_HINT_MERGE 1 |
| 51 | /* blocks already reserved */ | 71 | /* blocks already reserved */ |
| @@ -179,9 +199,6 @@ struct flex_groups { | |||
| 179 | #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ | 199 | #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ |
| 180 | #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ | 200 | #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ |
| 181 | 201 | ||
| 182 | #ifdef __KERNEL__ | ||
| 183 | #include "ext4_sb.h" | ||
| 184 | #endif | ||
| 185 | /* | 202 | /* |
| 186 | * Macro-instructions used to manage group descriptors | 203 | * Macro-instructions used to manage group descriptors |
| 187 | */ | 204 | */ |
| @@ -297,10 +314,23 @@ struct ext4_new_group_data { | |||
| 297 | }; | 314 | }; |
| 298 | 315 | ||
| 299 | /* | 316 | /* |
| 300 | * Following is used by preallocation code to tell get_blocks() that we | 317 | * Flags used by ext4_get_blocks() |
| 301 | * want uninitialzed extents. | ||
| 302 | */ | 318 | */ |
| 303 | #define EXT4_CREATE_UNINITIALIZED_EXT 2 | 319 | /* Allocate any needed blocks and/or convert an uninitialized |
| 320 | extent to be an initialized extent */ | ||
| 321 | #define EXT4_GET_BLOCKS_CREATE 0x0001 | ||
| 322 | /* Request the creation of an uninitialized extent */ | ||
| 323 | #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 | ||
| 324 | #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ | ||
| 325 | EXT4_GET_BLOCKS_CREATE) | ||
| 326 | /* Caller is from the delayed allocation writeout path, | ||
| 327 | so set the magic i_delalloc_reserve_flag after taking the | ||
| 328 | inode allocation semaphore */ | ||
| 329 | #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 | ||
| 330 | /* Call ext4_da_update_reserve_space() after successfully | ||
| 331 | allocating the blocks */ | ||
| 332 | #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 | ||
| 333 | |||
| 304 | 334 | ||
| 305 | /* | 335 | /* |
| 306 | * ioctl commands | 336 | * ioctl commands |
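These flag bits replace the old positional create/extend arguments of ext4_get_blocks_wrap() with a single mask that callers compose. A trivial sketch of how the bits combine (the delalloc-writeout combination shown is illustrative only, not a quoted call site):

#include <stdio.h>

/* values as defined in ext4.h above */
#define EXT4_GET_BLOCKS_CREATE			0x0001
#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT | \
						 EXT4_GET_BLOCKS_CREATE)
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008

int main(void)
{
	/* a hypothetical delalloc-writeout request: create uninitialized
	 * extents, flag the delalloc reserve, and update reserved space */
	unsigned int flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
			     EXT4_GET_BLOCKS_DELALLOC_RESERVE |
			     EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;

	printf("flags = 0x%04x\n", flags);	/* prints flags = 0x000f */
	return 0;
}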
| @@ -516,6 +546,110 @@ do { \ | |||
| 516 | #endif /* defined(__KERNEL__) || defined(__linux__) */ | 546 | #endif /* defined(__KERNEL__) || defined(__linux__) */ |
| 517 | 547 | ||
| 518 | /* | 548 | /* |
| 549 | * storage for cached extent | ||
| 550 | */ | ||
| 551 | struct ext4_ext_cache { | ||
| 552 | ext4_fsblk_t ec_start; | ||
| 553 | ext4_lblk_t ec_block; | ||
| 554 | __u32 ec_len; /* must be 32bit to return holes */ | ||
| 555 | __u32 ec_type; | ||
| 556 | }; | ||
| 557 | |||
| 558 | /* | ||
| 559 | * fourth extended file system inode data in memory | ||
| 560 | */ | ||
| 561 | struct ext4_inode_info { | ||
| 562 | __le32 i_data[15]; /* unconverted */ | ||
| 563 | __u32 i_flags; | ||
| 564 | ext4_fsblk_t i_file_acl; | ||
| 565 | __u32 i_dtime; | ||
| 566 | |||
| 567 | /* | ||
| 568 | * i_block_group is the number of the block group which contains | ||
| 569 | * this file's inode. Constant across the lifetime of the inode, | ||
| 570 | * it is used for making block allocation decisions - we try to | ||
| 571 | * place a file's data blocks near its inode block, and new inodes | ||
| 572 | * near to their parent directory's inode. | ||
| 573 | */ | ||
| 574 | ext4_group_t i_block_group; | ||
| 575 | __u32 i_state; /* Dynamic state flags for ext4 */ | ||
| 576 | |||
| 577 | ext4_lblk_t i_dir_start_lookup; | ||
| 578 | #ifdef CONFIG_EXT4_FS_XATTR | ||
| 579 | /* | ||
| 580 | * Extended attributes can be read independently of the main file | ||
| 581 | * data. Taking i_mutex even when reading would cause contention | ||
| 582 | * between readers of EAs and writers of regular file data, so | ||
| 583 | * instead we synchronize on xattr_sem when reading or changing | ||
| 584 | * EAs. | ||
| 585 | */ | ||
| 586 | struct rw_semaphore xattr_sem; | ||
| 587 | #endif | ||
| 588 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | ||
| 589 | struct posix_acl *i_acl; | ||
| 590 | struct posix_acl *i_default_acl; | ||
| 591 | #endif | ||
| 592 | |||
| 593 | struct list_head i_orphan; /* unlinked but open inodes */ | ||
| 594 | |||
| 595 | /* | ||
| 596 | * i_disksize keeps track of what the inode size is ON DISK, not | ||
| 597 | * in memory. During truncate, i_size is set to the new size by | ||
| 598 | * the VFS prior to calling ext4_truncate(), but the filesystem won't | ||
| 599 | * set i_disksize to 0 until the truncate is actually under way. | ||
| 600 | * | ||
| 601 | * The intent is that i_disksize always represents the blocks which | ||
| 602 | * are used by this file. This allows recovery to restart truncate | ||
| 603 | * on orphans if we crash during truncate. We actually write i_disksize | ||
| 604 | * into the on-disk inode when writing inodes out, instead of i_size. | ||
| 605 | * | ||
| 606 | * The only time when i_disksize and i_size may be different is when | ||
| 607 | * a truncate is in progress. The only things which change i_disksize | ||
| 608 | * are ext4_get_block (growth) and ext4_truncate (shrinkth). | ||
| 609 | */ | ||
| 610 | loff_t i_disksize; | ||
| 611 | |||
| 612 | /* | ||
| 613 | * i_data_sem is for serialising ext4_truncate() against | ||
| 614 | * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's | ||
| 615 | * data tree are chopped off during truncate. We can't do that in | ||
| 616 | * ext4 because whenever we perform intermediate commits during | ||
| 617 | * truncate, the inode and all the metadata blocks *must* be in a | ||
| 618 | * consistent state which allows truncation of the orphans to restart | ||
| 619 | * during recovery. Hence we must fix the get_block-vs-truncate race | ||
| 620 | * by other means, so we have i_data_sem. | ||
| 621 | */ | ||
| 622 | struct rw_semaphore i_data_sem; | ||
| 623 | struct inode vfs_inode; | ||
| 624 | struct jbd2_inode jinode; | ||
| 625 | |||
| 626 | struct ext4_ext_cache i_cached_extent; | ||
| 627 | /* | ||
| 628 | * File creation time. Its function is the same as that of | ||
| 629 | * struct timespec i_{a,c,m}time in the generic inode. | ||
| 630 | */ | ||
| 631 | struct timespec i_crtime; | ||
| 632 | |||
| 633 | /* mballoc */ | ||
| 634 | struct list_head i_prealloc_list; | ||
| 635 | spinlock_t i_prealloc_lock; | ||
| 636 | |||
| 637 | /* ialloc */ | ||
| 638 | ext4_group_t i_last_alloc_group; | ||
| 639 | |||
| 640 | /* allocation reservation info for delalloc */ | ||
| 641 | unsigned int i_reserved_data_blocks; | ||
| 642 | unsigned int i_reserved_meta_blocks; | ||
| 643 | unsigned int i_allocated_meta_blocks; | ||
| 644 | unsigned short i_delalloc_reserved_flag; | ||
| 645 | |||
| 646 | /* on-disk additional length */ | ||
| 647 | __u16 i_extra_isize; | ||
| 648 | |||
| 649 | spinlock_t i_block_reservation_lock; | ||
| 650 | }; | ||
| 651 | |||
| 652 | /* | ||
| 519 | * File system states | 653 | * File system states |
| 520 | */ | 654 | */ |
| 521 | #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ | 655 | #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ |
| @@ -560,6 +694,7 @@ do { \ | |||
| 560 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | 694 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ |
| 561 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | 695 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ |
| 562 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ | 696 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ |
| 697 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ | ||
| 563 | 698 | ||
| 564 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ | 699 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ |
| 565 | #ifndef _LINUX_EXT2_FS_H | 700 | #ifndef _LINUX_EXT2_FS_H |
| @@ -689,6 +824,137 @@ struct ext4_super_block { | |||
| 689 | }; | 824 | }; |
| 690 | 825 | ||
| 691 | #ifdef __KERNEL__ | 826 | #ifdef __KERNEL__ |
| 827 | /* | ||
| 828 | * fourth extended-fs super-block data in memory | ||
| 829 | */ | ||
| 830 | struct ext4_sb_info { | ||
| 831 | unsigned long s_desc_size; /* Size of a group descriptor in bytes */ | ||
| 832 | unsigned long s_inodes_per_block;/* Number of inodes per block */ | ||
| 833 | unsigned long s_blocks_per_group;/* Number of blocks in a group */ | ||
| 834 | unsigned long s_inodes_per_group;/* Number of inodes in a group */ | ||
| 835 | unsigned long s_itb_per_group; /* Number of inode table blocks per group */ | ||
| 836 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ | ||
| 837 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ | ||
| 838 | ext4_group_t s_groups_count; /* Number of groups in the fs */ | ||
| 839 | unsigned long s_overhead_last; /* Last calculated overhead */ | ||
| 840 | unsigned long s_blocks_last; /* Last seen block count */ | ||
| 841 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ | ||
| 842 | struct buffer_head * s_sbh; /* Buffer containing the super block */ | ||
| 843 | struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ | ||
| 844 | struct buffer_head **s_group_desc; | ||
| 845 | unsigned long s_mount_opt; | ||
| 846 | ext4_fsblk_t s_sb_block; | ||
| 847 | uid_t s_resuid; | ||
| 848 | gid_t s_resgid; | ||
| 849 | unsigned short s_mount_state; | ||
| 850 | unsigned short s_pad; | ||
| 851 | int s_addr_per_block_bits; | ||
| 852 | int s_desc_per_block_bits; | ||
| 853 | int s_inode_size; | ||
| 854 | int s_first_ino; | ||
| 855 | unsigned int s_inode_readahead_blks; | ||
| 856 | spinlock_t s_next_gen_lock; | ||
| 857 | u32 s_next_generation; | ||
| 858 | u32 s_hash_seed[4]; | ||
| 859 | int s_def_hash_version; | ||
| 860 | int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ | ||
| 861 | struct percpu_counter s_freeblocks_counter; | ||
| 862 | struct percpu_counter s_freeinodes_counter; | ||
| 863 | struct percpu_counter s_dirs_counter; | ||
| 864 | struct percpu_counter s_dirtyblocks_counter; | ||
| 865 | struct blockgroup_lock *s_blockgroup_lock; | ||
| 866 | struct proc_dir_entry *s_proc; | ||
| 867 | struct kobject s_kobj; | ||
| 868 | struct completion s_kobj_unregister; | ||
| 869 | |||
| 870 | /* Journaling */ | ||
| 871 | struct inode *s_journal_inode; | ||
| 872 | struct journal_s *s_journal; | ||
| 873 | struct list_head s_orphan; | ||
| 874 | struct mutex s_orphan_lock; | ||
| 875 | struct mutex s_resize_lock; | ||
| 876 | unsigned long s_commit_interval; | ||
| 877 | u32 s_max_batch_time; | ||
| 878 | u32 s_min_batch_time; | ||
| 879 | struct block_device *journal_bdev; | ||
| 880 | #ifdef CONFIG_JBD2_DEBUG | ||
| 881 | struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ | ||
| 882 | wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ | ||
| 883 | #endif | ||
| 884 | #ifdef CONFIG_QUOTA | ||
| 885 | char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ | ||
| 886 | int s_jquota_fmt; /* Format of quota to use */ | ||
| 887 | #endif | ||
| 888 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ | ||
| 889 | struct rb_root system_blks; | ||
| 890 | |||
| 891 | #ifdef EXTENTS_STATS | ||
| 892 | /* ext4 extents stats */ | ||
| 893 | unsigned long s_ext_min; | ||
| 894 | unsigned long s_ext_max; | ||
| 895 | unsigned long s_depth_max; | ||
| 896 | spinlock_t s_ext_stats_lock; | ||
| 897 | unsigned long s_ext_blocks; | ||
| 898 | unsigned long s_ext_extents; | ||
| 899 | #endif | ||
| 900 | |||
| 901 | /* for buddy allocator */ | ||
| 902 | struct ext4_group_info ***s_group_info; | ||
| 903 | struct inode *s_buddy_cache; | ||
| 904 | long s_blocks_reserved; | ||
| 905 | spinlock_t s_reserve_lock; | ||
| 906 | spinlock_t s_md_lock; | ||
| 907 | tid_t s_last_transaction; | ||
| 908 | unsigned short *s_mb_offsets; | ||
| 909 | unsigned int *s_mb_maxs; | ||
| 910 | |||
| 911 | /* tunables */ | ||
| 912 | unsigned long s_stripe; | ||
| 913 | unsigned int s_mb_stream_request; | ||
| 914 | unsigned int s_mb_max_to_scan; | ||
| 915 | unsigned int s_mb_min_to_scan; | ||
| 916 | unsigned int s_mb_stats; | ||
| 917 | unsigned int s_mb_order2_reqs; | ||
| 918 | unsigned int s_mb_group_prealloc; | ||
| 919 | /* where last allocation was done - for stream allocation */ | ||
| 920 | unsigned long s_mb_last_group; | ||
| 921 | unsigned long s_mb_last_start; | ||
| 922 | |||
| 923 | /* history to debug policy */ | ||
| 924 | struct ext4_mb_history *s_mb_history; | ||
| 925 | int s_mb_history_cur; | ||
| 926 | int s_mb_history_max; | ||
| 927 | int s_mb_history_num; | ||
| 928 | spinlock_t s_mb_history_lock; | ||
| 929 | int s_mb_history_filter; | ||
| 930 | |||
| 931 | /* stats for buddy allocator */ | ||
| 932 | spinlock_t s_mb_pa_lock; | ||
| 933 | atomic_t s_bal_reqs; /* number of reqs with len > 1 */ | ||
| 934 | atomic_t s_bal_success; /* we found long enough chunks */ | ||
| 935 | atomic_t s_bal_allocated; /* in blocks */ | ||
| 936 | atomic_t s_bal_ex_scanned; /* total extents scanned */ | ||
| 937 | atomic_t s_bal_goals; /* goal hits */ | ||
| 938 | atomic_t s_bal_breaks; /* too long searches */ | ||
| 939 | atomic_t s_bal_2orders; /* 2^order hits */ | ||
| 940 | spinlock_t s_bal_lock; | ||
| 941 | unsigned long s_mb_buddies_generated; | ||
| 942 | unsigned long long s_mb_generation_time; | ||
| 943 | atomic_t s_mb_lost_chunks; | ||
| 944 | atomic_t s_mb_preallocated; | ||
| 945 | atomic_t s_mb_discarded; | ||
| 946 | |||
| 947 | /* locality groups */ | ||
| 948 | struct ext4_locality_group *s_locality_groups; | ||
| 949 | |||
| 950 | /* for write statistics */ | ||
| 951 | unsigned long s_sectors_written_start; | ||
| 952 | u64 s_kbytes_written; | ||
| 953 | |||
| 954 | unsigned int s_log_groups_per_flex; | ||
| 955 | struct flex_groups *s_flex_groups; | ||
| 956 | }; | ||
| 957 | |||
| 692 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) | 958 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) |
| 693 | { | 959 | { |
| 694 | return sb->s_fs_info; | 960 | return sb->s_fs_info; |
| @@ -704,7 +970,6 @@ static inline struct timespec ext4_current_time(struct inode *inode) | |||
| 704 | current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; | 970 | current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; |
| 705 | } | 971 | } |
| 706 | 972 | ||
| 707 | |||
| 708 | static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | 973 | static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) |
| 709 | { | 974 | { |
| 710 | return ino == EXT4_ROOT_INO || | 975 | return ino == EXT4_ROOT_INO || |
| @@ -1014,6 +1279,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | |||
| 1014 | ext4_group_t block_group, | 1279 | ext4_group_t block_group, |
| 1015 | struct buffer_head ** bh); | 1280 | struct buffer_head ** bh); |
| 1016 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); | 1281 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); |
| 1282 | struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, | ||
| 1283 | ext4_group_t block_group); | ||
| 1284 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, | ||
| 1285 | struct buffer_head *bh, | ||
| 1286 | ext4_group_t group, | ||
| 1287 | struct ext4_group_desc *desc); | ||
| 1288 | #define ext4_free_blocks_after_init(sb, group, desc) \ | ||
| 1289 | ext4_init_block_bitmap(sb, NULL, group, desc) | ||
| 1017 | 1290 | ||
| 1018 | /* dir.c */ | 1291 | /* dir.c */ |
| 1019 | extern int ext4_check_dir_entry(const char *, struct inode *, | 1292 | extern int ext4_check_dir_entry(const char *, struct inode *, |
| @@ -1038,6 +1311,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); | |||
| 1038 | extern unsigned long ext4_count_free_inodes(struct super_block *); | 1311 | extern unsigned long ext4_count_free_inodes(struct super_block *); |
| 1039 | extern unsigned long ext4_count_dirs(struct super_block *); | 1312 | extern unsigned long ext4_count_dirs(struct super_block *); |
| 1040 | extern void ext4_check_inodes_bitmap(struct super_block *); | 1313 | extern void ext4_check_inodes_bitmap(struct super_block *); |
| 1314 | extern unsigned ext4_init_inode_bitmap(struct super_block *sb, | ||
| 1315 | struct buffer_head *bh, | ||
| 1316 | ext4_group_t group, | ||
| 1317 | struct ext4_group_desc *desc); | ||
| 1318 | extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); | ||
| 1041 | 1319 | ||
| 1042 | /* mballoc.c */ | 1320 | /* mballoc.c */ |
| 1043 | extern long ext4_mb_stats; | 1321 | extern long ext4_mb_stats; |
| @@ -1123,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...) | |||
| 1123 | __attribute__ ((format (printf, 3, 4))); | 1401 | __attribute__ ((format (printf, 3, 4))); |
| 1124 | extern void ext4_warning(struct super_block *, const char *, const char *, ...) | 1402 | extern void ext4_warning(struct super_block *, const char *, const char *, ...) |
| 1125 | __attribute__ ((format (printf, 3, 4))); | 1403 | __attribute__ ((format (printf, 3, 4))); |
| 1404 | extern void ext4_msg(struct super_block *, const char *, const char *, ...) | ||
| 1405 | __attribute__ ((format (printf, 3, 4))); | ||
| 1126 | extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, | 1406 | extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, |
| 1127 | const char *, const char *, ...) | 1407 | const char *, const char *, ...) |
| 1128 | __attribute__ ((format (printf, 4, 5))); | 1408 | __attribute__ ((format (printf, 4, 5))); |
| @@ -1161,6 +1441,10 @@ extern void ext4_used_dirs_set(struct super_block *sb, | |||
| 1161 | struct ext4_group_desc *bg, __u32 count); | 1441 | struct ext4_group_desc *bg, __u32 count); |
| 1162 | extern void ext4_itable_unused_set(struct super_block *sb, | 1442 | extern void ext4_itable_unused_set(struct super_block *sb, |
| 1163 | struct ext4_group_desc *bg, __u32 count); | 1443 | struct ext4_group_desc *bg, __u32 count); |
| 1444 | extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, | ||
| 1445 | struct ext4_group_desc *gdp); | ||
| 1446 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, | ||
| 1447 | struct ext4_group_desc *gdp); | ||
| 1164 | 1448 | ||
| 1165 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) | 1449 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) |
| 1166 | { | 1450 | { |
| @@ -1228,6 +1512,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, | |||
| 1228 | return grp_info[indexv][indexh]; | 1512 | return grp_info[indexv][indexh]; |
| 1229 | } | 1513 | } |
| 1230 | 1514 | ||
| 1515 | /* | ||
| 1516 | * Reading s_groups_count requires using smp_rmb() afterwards. See | ||
| 1517 | * the locking protocol documented in the comments of ext4_group_add() | ||
| 1518 | * in resize.c | ||
| 1519 | */ | ||
| 1520 | static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) | ||
| 1521 | { | ||
| 1522 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | ||
| 1523 | |||
| 1524 | smp_rmb(); | ||
| 1525 | return ngroups; | ||
| 1526 | } | ||
| 1231 | 1527 | ||
| 1232 | static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, | 1528 | static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, |
| 1233 | ext4_group_t block_group) | 1529 | ext4_group_t block_group) |
| @@ -1283,33 +1579,25 @@ struct ext4_group_info { | |||
| 1283 | }; | 1579 | }; |
| 1284 | 1580 | ||
| 1285 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | 1581 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 |
| 1286 | #define EXT4_GROUP_INFO_LOCKED_BIT 1 | ||
| 1287 | 1582 | ||
| 1288 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | 1583 | #define EXT4_MB_GRP_NEED_INIT(grp) \ |
| 1289 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | 1584 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) |
| 1290 | 1585 | ||
| 1291 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | 1586 | static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, |
| 1587 | ext4_group_t group) | ||
| 1292 | { | 1588 | { |
| 1293 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | 1589 | return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); |
| 1294 | |||
| 1295 | bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
| 1296 | } | 1590 | } |
| 1297 | 1591 | ||
| 1298 | static inline void ext4_unlock_group(struct super_block *sb, | 1592 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) |
| 1299 | ext4_group_t group) | ||
| 1300 | { | 1593 | { |
| 1301 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | 1594 | spin_lock(ext4_group_lock_ptr(sb, group)); |
| 1302 | |||
| 1303 | bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
| 1304 | } | 1595 | } |
| 1305 | 1596 | ||
| 1306 | static inline int ext4_is_group_locked(struct super_block *sb, | 1597 | static inline void ext4_unlock_group(struct super_block *sb, |
| 1307 | ext4_group_t group) | 1598 | ext4_group_t group) |
| 1308 | { | 1599 | { |
| 1309 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | 1600 | spin_unlock(ext4_group_lock_ptr(sb, group)); |
| 1310 | |||
| 1311 | return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, | ||
| 1312 | &(grinfo->bb_state)); | ||
| 1313 | } | 1601 | } |
| 1314 | 1602 | ||
| 1315 | /* | 1603 | /* |
| @@ -1326,11 +1614,21 @@ extern const struct file_operations ext4_file_operations; | |||
| 1326 | /* namei.c */ | 1614 | /* namei.c */ |
| 1327 | extern const struct inode_operations ext4_dir_inode_operations; | 1615 | extern const struct inode_operations ext4_dir_inode_operations; |
| 1328 | extern const struct inode_operations ext4_special_inode_operations; | 1616 | extern const struct inode_operations ext4_special_inode_operations; |
| 1617 | extern struct dentry *ext4_get_parent(struct dentry *child); | ||
| 1329 | 1618 | ||
| 1330 | /* symlink.c */ | 1619 | /* symlink.c */ |
| 1331 | extern const struct inode_operations ext4_symlink_inode_operations; | 1620 | extern const struct inode_operations ext4_symlink_inode_operations; |
| 1332 | extern const struct inode_operations ext4_fast_symlink_inode_operations; | 1621 | extern const struct inode_operations ext4_fast_symlink_inode_operations; |
| 1333 | 1622 | ||
| 1623 | /* block_validity */ | ||
| 1624 | extern void ext4_release_system_zone(struct super_block *sb); | ||
| 1625 | extern int ext4_setup_system_zone(struct super_block *sb); | ||
| 1626 | extern int __init init_ext4_system_zone(void); | ||
| 1627 | extern void exit_ext4_system_zone(void); | ||
| 1628 | extern int ext4_data_block_valid(struct ext4_sb_info *sbi, | ||
| 1629 | ext4_fsblk_t start_blk, | ||
| 1630 | unsigned int count); | ||
| 1631 | |||
| 1334 | /* extents.c */ | 1632 | /* extents.c */ |
| 1335 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); | 1633 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); |
| 1336 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); | 1634 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); |
| @@ -1338,17 +1636,15 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, | |||
| 1338 | int chunk); | 1636 | int chunk); |
| 1339 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 1637 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
| 1340 | ext4_lblk_t iblock, unsigned int max_blocks, | 1638 | ext4_lblk_t iblock, unsigned int max_blocks, |
| 1341 | struct buffer_head *bh_result, | 1639 | struct buffer_head *bh_result, int flags); |
| 1342 | int create, int extend_disksize); | ||
| 1343 | extern void ext4_ext_truncate(struct inode *); | 1640 | extern void ext4_ext_truncate(struct inode *); |
| 1344 | extern void ext4_ext_init(struct super_block *); | 1641 | extern void ext4_ext_init(struct super_block *); |
| 1345 | extern void ext4_ext_release(struct super_block *); | 1642 | extern void ext4_ext_release(struct super_block *); |
| 1346 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | 1643 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, |
| 1347 | loff_t len); | 1644 | loff_t len); |
| 1348 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, | 1645 | extern int ext4_get_blocks(handle_t *handle, struct inode *inode, |
| 1349 | sector_t block, unsigned int max_blocks, | 1646 | sector_t block, unsigned int max_blocks, |
| 1350 | struct buffer_head *bh, int create, | 1647 | struct buffer_head *bh, int flags); |
| 1351 | int extend_disksize, int flag); | ||
| 1352 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 1648 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
| 1353 | __u64 start, __u64 len); | 1649 | __u64 start, __u64 len); |
| 1354 | 1650 | ||
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h deleted file mode 100644 index 4ce2187123aa..000000000000 --- a/fs/ext4/ext4_i.h +++ /dev/null | |||
| @@ -1,140 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * ext4_i.h | ||
| 3 | * | ||
| 4 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
| 5 | * Remy Card (card@masi.ibp.fr) | ||
| 6 | * Laboratoire MASI - Institut Blaise Pascal | ||
| 7 | * Universite Pierre et Marie Curie (Paris VI) | ||
| 8 | * | ||
| 9 | * from | ||
| 10 | * | ||
| 11 | * linux/include/linux/minix_fs_i.h | ||
| 12 | * | ||
| 13 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
| 14 | */ | ||
| 15 | |||
| 16 | #ifndef _EXT4_I | ||
| 17 | #define _EXT4_I | ||
| 18 | |||
| 19 | #include <linux/rwsem.h> | ||
| 20 | #include <linux/rbtree.h> | ||
| 21 | #include <linux/seqlock.h> | ||
| 22 | #include <linux/mutex.h> | ||
| 23 | |||
| 24 | /* data type for block offset of block group */ | ||
| 25 | typedef int ext4_grpblk_t; | ||
| 26 | |||
| 27 | /* data type for filesystem-wide blocks number */ | ||
| 28 | typedef unsigned long long ext4_fsblk_t; | ||
| 29 | |||
| 30 | /* data type for file logical block number */ | ||
| 31 | typedef __u32 ext4_lblk_t; | ||
| 32 | |||
| 33 | /* data type for block group number */ | ||
| 34 | typedef unsigned int ext4_group_t; | ||
| 35 | |||
| 36 | /* | ||
| 37 | * storage for cached extent | ||
| 38 | */ | ||
| 39 | struct ext4_ext_cache { | ||
| 40 | ext4_fsblk_t ec_start; | ||
| 41 | ext4_lblk_t ec_block; | ||
| 42 | __u32 ec_len; /* must be 32bit to return holes */ | ||
| 43 | __u32 ec_type; | ||
| 44 | }; | ||
| 45 | |||
| 46 | /* | ||
| 47 | * fourth extended file system inode data in memory | ||
| 48 | */ | ||
| 49 | struct ext4_inode_info { | ||
| 50 | __le32 i_data[15]; /* unconverted */ | ||
| 51 | __u32 i_flags; | ||
| 52 | ext4_fsblk_t i_file_acl; | ||
| 53 | __u32 i_dtime; | ||
| 54 | |||
| 55 | /* | ||
| 56 | * i_block_group is the number of the block group which contains | ||
| 57 | * this file's inode. Constant across the lifetime of the inode, | ||
| 58 | * it is used for making block allocation decisions - we try to | ||
| 59 | * place a file's data blocks near its inode block, and new inodes | ||
| 60 | * near to their parent directory's inode. | ||
| 61 | */ | ||
| 62 | ext4_group_t i_block_group; | ||
| 63 | __u32 i_state; /* Dynamic state flags for ext4 */ | ||
| 64 | |||
| 65 | ext4_lblk_t i_dir_start_lookup; | ||
| 66 | #ifdef CONFIG_EXT4_FS_XATTR | ||
| 67 | /* | ||
| 68 | * Extended attributes can be read independently of the main file | ||
| 69 | * data. Taking i_mutex even when reading would cause contention | ||
| 70 | * between readers of EAs and writers of regular file data, so | ||
| 71 | * instead we synchronize on xattr_sem when reading or changing | ||
| 72 | * EAs. | ||
| 73 | */ | ||
| 74 | struct rw_semaphore xattr_sem; | ||
| 75 | #endif | ||
| 76 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | ||
| 77 | struct posix_acl *i_acl; | ||
| 78 | struct posix_acl *i_default_acl; | ||
| 79 | #endif | ||
| 80 | |||
| 81 | struct list_head i_orphan; /* unlinked but open inodes */ | ||
| 82 | |||
| 83 | /* | ||
| 84 | * i_disksize keeps track of what the inode size is ON DISK, not | ||
| 85 | * in memory. During truncate, i_size is set to the new size by | ||
| 86 | * the VFS prior to calling ext4_truncate(), but the filesystem won't | ||
| 87 | * set i_disksize to 0 until the truncate is actually under way. | ||
| 88 | * | ||
| 89 | * The intent is that i_disksize always represents the blocks which | ||
| 90 | * are used by this file. This allows recovery to restart truncate | ||
| 91 | * on orphans if we crash during truncate. We actually write i_disksize | ||
| 92 | * into the on-disk inode when writing inodes out, instead of i_size. | ||
| 93 | * | ||
| 94 | * The only time when i_disksize and i_size may be different is when | ||
| 95 | * a truncate is in progress. The only things which change i_disksize | ||
| 96 | * are ext4_get_block (growth) and ext4_truncate (shrinkth). | ||
| 97 | */ | ||
| 98 | loff_t i_disksize; | ||
| 99 | |||
| 100 | /* | ||
| 101 | * i_data_sem is for serialising ext4_truncate() against | ||
| 102 | * ext4_get_block(). In the 2.4 ext2 design, great chunks of inode's | ||
| 103 | * data tree are chopped off during truncate. We can't do that in | ||
| 104 | * ext4 because whenever we perform intermediate commits during | ||
| 105 | * truncate, the inode and all the metadata blocks *must* be in a | ||
| 106 | * consistent state which allows truncation of the orphans to restart | ||
| 107 | * during recovery. Hence we must fix the get_block-vs-truncate race | ||
| 108 | * by other means, so we have i_data_sem. | ||
| 109 | */ | ||
| 110 | struct rw_semaphore i_data_sem; | ||
| 111 | struct inode vfs_inode; | ||
| 112 | struct jbd2_inode jinode; | ||
| 113 | |||
| 114 | struct ext4_ext_cache i_cached_extent; | ||
| 115 | /* | ||
| 116 | * File creation time. Its function is same as that of | ||
| 117 | * struct timespec i_{a,c,m}time in the generic inode. | ||
| 118 | */ | ||
| 119 | struct timespec i_crtime; | ||
| 120 | |||
| 121 | /* mballoc */ | ||
| 122 | struct list_head i_prealloc_list; | ||
| 123 | spinlock_t i_prealloc_lock; | ||
| 124 | |||
| 125 | /* ialloc */ | ||
| 126 | ext4_group_t i_last_alloc_group; | ||
| 127 | |||
| 128 | /* allocation reservation info for delalloc */ | ||
| 129 | unsigned int i_reserved_data_blocks; | ||
| 130 | unsigned int i_reserved_meta_blocks; | ||
| 131 | unsigned int i_allocated_meta_blocks; | ||
| 132 | unsigned short i_delalloc_reserved_flag; | ||
| 133 | |||
| 134 | /* on-disk additional length */ | ||
| 135 | __u16 i_extra_isize; | ||
| 136 | |||
| 137 | spinlock_t i_block_reservation_lock; | ||
| 138 | }; | ||
| 139 | |||
| 140 | #endif /* _EXT4_I */ | ||
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h deleted file mode 100644 index 57b71fefbccf..000000000000 --- a/fs/ext4/ext4_sb.h +++ /dev/null | |||
| @@ -1,161 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * ext4_sb.h | ||
| 3 | * | ||
| 4 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
| 5 | * Remy Card (card@masi.ibp.fr) | ||
| 6 | * Laboratoire MASI - Institut Blaise Pascal | ||
| 7 | * Universite Pierre et Marie Curie (Paris VI) | ||
| 8 | * | ||
| 9 | * from | ||
| 10 | * | ||
| 11 | * linux/include/linux/minix_fs_sb.h | ||
| 12 | * | ||
| 13 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
| 14 | */ | ||
| 15 | |||
| 16 | #ifndef _EXT4_SB | ||
| 17 | #define _EXT4_SB | ||
| 18 | |||
| 19 | #ifdef __KERNEL__ | ||
| 20 | #include <linux/timer.h> | ||
| 21 | #include <linux/wait.h> | ||
| 22 | #include <linux/blockgroup_lock.h> | ||
| 23 | #include <linux/percpu_counter.h> | ||
| 24 | #endif | ||
| 25 | #include <linux/rbtree.h> | ||
| 26 | |||
| 27 | /* | ||
| 28 | * fourth extended-fs super-block data in memory | ||
| 29 | */ | ||
| 30 | struct ext4_sb_info { | ||
| 31 | unsigned long s_desc_size; /* Size of a group descriptor in bytes */ | ||
| 32 | unsigned long s_inodes_per_block;/* Number of inodes per block */ | ||
| 33 | unsigned long s_blocks_per_group;/* Number of blocks in a group */ | ||
| 34 | unsigned long s_inodes_per_group;/* Number of inodes in a group */ | ||
| 35 | unsigned long s_itb_per_group; /* Number of inode table blocks per group */ | ||
| 36 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ | ||
| 37 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ | ||
| 38 | ext4_group_t s_groups_count; /* Number of groups in the fs */ | ||
| 39 | unsigned long s_overhead_last; /* Last calculated overhead */ | ||
| 40 | unsigned long s_blocks_last; /* Last seen block count */ | ||
| 41 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ | ||
| 42 | struct buffer_head * s_sbh; /* Buffer containing the super block */ | ||
| 43 | struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ | ||
| 44 | struct buffer_head **s_group_desc; | ||
| 45 | unsigned long s_mount_opt; | ||
| 46 | ext4_fsblk_t s_sb_block; | ||
| 47 | uid_t s_resuid; | ||
| 48 | gid_t s_resgid; | ||
| 49 | unsigned short s_mount_state; | ||
| 50 | unsigned short s_pad; | ||
| 51 | int s_addr_per_block_bits; | ||
| 52 | int s_desc_per_block_bits; | ||
| 53 | int s_inode_size; | ||
| 54 | int s_first_ino; | ||
| 55 | unsigned int s_inode_readahead_blks; | ||
| 56 | spinlock_t s_next_gen_lock; | ||
| 57 | u32 s_next_generation; | ||
| 58 | u32 s_hash_seed[4]; | ||
| 59 | int s_def_hash_version; | ||
| 60 | int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ | ||
| 61 | struct percpu_counter s_freeblocks_counter; | ||
| 62 | struct percpu_counter s_freeinodes_counter; | ||
| 63 | struct percpu_counter s_dirs_counter; | ||
| 64 | struct percpu_counter s_dirtyblocks_counter; | ||
| 65 | struct blockgroup_lock *s_blockgroup_lock; | ||
| 66 | struct proc_dir_entry *s_proc; | ||
| 67 | struct kobject s_kobj; | ||
| 68 | struct completion s_kobj_unregister; | ||
| 69 | |||
| 70 | /* Journaling */ | ||
| 71 | struct inode *s_journal_inode; | ||
| 72 | struct journal_s *s_journal; | ||
| 73 | struct list_head s_orphan; | ||
| 74 | unsigned long s_commit_interval; | ||
| 75 | u32 s_max_batch_time; | ||
| 76 | u32 s_min_batch_time; | ||
| 77 | struct block_device *journal_bdev; | ||
| 78 | #ifdef CONFIG_JBD2_DEBUG | ||
| 79 | struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ | ||
| 80 | wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ | ||
| 81 | #endif | ||
| 82 | #ifdef CONFIG_QUOTA | ||
| 83 | char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ | ||
| 84 | int s_jquota_fmt; /* Format of quota to use */ | ||
| 85 | #endif | ||
| 86 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ | ||
| 87 | |||
| 88 | #ifdef EXTENTS_STATS | ||
| 89 | /* ext4 extents stats */ | ||
| 90 | unsigned long s_ext_min; | ||
| 91 | unsigned long s_ext_max; | ||
| 92 | unsigned long s_depth_max; | ||
| 93 | spinlock_t s_ext_stats_lock; | ||
| 94 | unsigned long s_ext_blocks; | ||
| 95 | unsigned long s_ext_extents; | ||
| 96 | #endif | ||
| 97 | |||
| 98 | /* for buddy allocator */ | ||
| 99 | struct ext4_group_info ***s_group_info; | ||
| 100 | struct inode *s_buddy_cache; | ||
| 101 | long s_blocks_reserved; | ||
| 102 | spinlock_t s_reserve_lock; | ||
| 103 | spinlock_t s_md_lock; | ||
| 104 | tid_t s_last_transaction; | ||
| 105 | unsigned short *s_mb_offsets; | ||
| 106 | unsigned int *s_mb_maxs; | ||
| 107 | |||
| 108 | /* tunables */ | ||
| 109 | unsigned long s_stripe; | ||
| 110 | unsigned int s_mb_stream_request; | ||
| 111 | unsigned int s_mb_max_to_scan; | ||
| 112 | unsigned int s_mb_min_to_scan; | ||
| 113 | unsigned int s_mb_stats; | ||
| 114 | unsigned int s_mb_order2_reqs; | ||
| 115 | unsigned int s_mb_group_prealloc; | ||
| 116 | /* where last allocation was done - for stream allocation */ | ||
| 117 | unsigned long s_mb_last_group; | ||
| 118 | unsigned long s_mb_last_start; | ||
| 119 | |||
| 120 | /* history to debug policy */ | ||
| 121 | struct ext4_mb_history *s_mb_history; | ||
| 122 | int s_mb_history_cur; | ||
| 123 | int s_mb_history_max; | ||
| 124 | int s_mb_history_num; | ||
| 125 | spinlock_t s_mb_history_lock; | ||
| 126 | int s_mb_history_filter; | ||
| 127 | |||
| 128 | /* stats for buddy allocator */ | ||
| 129 | spinlock_t s_mb_pa_lock; | ||
| 130 | atomic_t s_bal_reqs; /* number of reqs with len > 1 */ | ||
| 131 | atomic_t s_bal_success; /* we found long enough chunks */ | ||
| 132 | atomic_t s_bal_allocated; /* in blocks */ | ||
| 133 | atomic_t s_bal_ex_scanned; /* total extents scanned */ | ||
| 134 | atomic_t s_bal_goals; /* goal hits */ | ||
| 135 | atomic_t s_bal_breaks; /* too long searches */ | ||
| 136 | atomic_t s_bal_2orders; /* 2^order hits */ | ||
| 137 | spinlock_t s_bal_lock; | ||
| 138 | unsigned long s_mb_buddies_generated; | ||
| 139 | unsigned long long s_mb_generation_time; | ||
| 140 | atomic_t s_mb_lost_chunks; | ||
| 141 | atomic_t s_mb_preallocated; | ||
| 142 | atomic_t s_mb_discarded; | ||
| 143 | |||
| 144 | /* locality groups */ | ||
| 145 | struct ext4_locality_group *s_locality_groups; | ||
| 146 | |||
| 147 | /* for write statistics */ | ||
| 148 | unsigned long s_sectors_written_start; | ||
| 149 | u64 s_kbytes_written; | ||
| 150 | |||
| 151 | unsigned int s_log_groups_per_flex; | ||
| 152 | struct flex_groups *s_flex_groups; | ||
| 153 | }; | ||
| 154 | |||
| 155 | static inline spinlock_t * | ||
| 156 | sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) | ||
| 157 | { | ||
| 158 | return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); | ||
| 159 | } | ||
| 160 | |||
| 161 | #endif /* _EXT4_SB */ | ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e3a55eb8b26a..2593f748c3a4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth) | |||
| 326 | 326 | ||
| 327 | static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) | 327 | static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) |
| 328 | { | 328 | { |
| 329 | ext4_fsblk_t block = ext_pblock(ext), valid_block; | 329 | ext4_fsblk_t block = ext_pblock(ext); |
| 330 | int len = ext4_ext_get_actual_len(ext); | 330 | int len = ext4_ext_get_actual_len(ext); |
| 331 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
| 332 | 331 | ||
| 333 | valid_block = le32_to_cpu(es->s_first_data_block) + | 332 | return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); |
| 334 | EXT4_SB(inode->i_sb)->s_gdb_count; | ||
| 335 | if (unlikely(block <= valid_block || | ||
| 336 | ((block + len) > ext4_blocks_count(es)))) | ||
| 337 | return 0; | ||
| 338 | else | ||
| 339 | return 1; | ||
| 340 | } | 333 | } |
| 341 | 334 | ||
| 342 | static int ext4_valid_extent_idx(struct inode *inode, | 335 | static int ext4_valid_extent_idx(struct inode *inode, |
| 343 | struct ext4_extent_idx *ext_idx) | 336 | struct ext4_extent_idx *ext_idx) |
| 344 | { | 337 | { |
| 345 | ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; | 338 | ext4_fsblk_t block = idx_pblock(ext_idx); |
| 346 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
| 347 | 339 | ||
| 348 | valid_block = le32_to_cpu(es->s_first_data_block) + | 340 | return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); |
| 349 | EXT4_SB(inode->i_sb)->s_gdb_count; | ||
| 350 | if (unlikely(block <= valid_block || | ||
| 351 | (block >= ext4_blocks_count(es)))) | ||
| 352 | return 0; | ||
| 353 | else | ||
| 354 | return 1; | ||
| 355 | } | 341 | } |
| 356 | 342 | ||
| 357 | static int ext4_valid_extent_entries(struct inode *inode, | 343 | static int ext4_valid_extent_entries(struct inode *inode, |
| @@ -2097,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2097 | ex = EXT_LAST_EXTENT(eh); | 2083 | ex = EXT_LAST_EXTENT(eh); |
| 2098 | 2084 | ||
| 2099 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2085 | ex_ee_block = le32_to_cpu(ex->ee_block); |
| 2100 | if (ext4_ext_is_uninitialized(ex)) | ||
| 2101 | uninitialized = 1; | ||
| 2102 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2086 | ex_ee_len = ext4_ext_get_actual_len(ex); |
| 2103 | 2087 | ||
| 2104 | while (ex >= EXT_FIRST_EXTENT(eh) && | 2088 | while (ex >= EXT_FIRST_EXTENT(eh) && |
| 2105 | ex_ee_block + ex_ee_len > start) { | 2089 | ex_ee_block + ex_ee_len > start) { |
| 2090 | |||
| 2091 | if (ext4_ext_is_uninitialized(ex)) | ||
| 2092 | uninitialized = 1; | ||
| 2093 | else | ||
| 2094 | uninitialized = 0; | ||
| 2095 | |||
| 2106 | ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); | 2096 | ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); |
| 2107 | path[depth].p_ext = ex; | 2097 | path[depth].p_ext = ex; |
| 2108 | 2098 | ||
| @@ -2784,7 +2774,7 @@ fix_extent_len: | |||
| 2784 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 2774 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
| 2785 | ext4_lblk_t iblock, | 2775 | ext4_lblk_t iblock, |
| 2786 | unsigned int max_blocks, struct buffer_head *bh_result, | 2776 | unsigned int max_blocks, struct buffer_head *bh_result, |
| 2787 | int create, int extend_disksize) | 2777 | int flags) |
| 2788 | { | 2778 | { |
| 2789 | struct ext4_ext_path *path = NULL; | 2779 | struct ext4_ext_path *path = NULL; |
| 2790 | struct ext4_extent_header *eh; | 2780 | struct ext4_extent_header *eh; |
| @@ -2793,7 +2783,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2793 | int err = 0, depth, ret, cache_type; | 2783 | int err = 0, depth, ret, cache_type; |
| 2794 | unsigned int allocated = 0; | 2784 | unsigned int allocated = 0; |
| 2795 | struct ext4_allocation_request ar; | 2785 | struct ext4_allocation_request ar; |
| 2796 | loff_t disksize; | ||
| 2797 | 2786 | ||
| 2798 | __clear_bit(BH_New, &bh_result->b_state); | 2787 | __clear_bit(BH_New, &bh_result->b_state); |
| 2799 | ext_debug("blocks %u/%u requested for inode %u\n", | 2788 | ext_debug("blocks %u/%u requested for inode %u\n", |
| @@ -2803,7 +2792,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2803 | cache_type = ext4_ext_in_cache(inode, iblock, &newex); | 2792 | cache_type = ext4_ext_in_cache(inode, iblock, &newex); |
| 2804 | if (cache_type) { | 2793 | if (cache_type) { |
| 2805 | if (cache_type == EXT4_EXT_CACHE_GAP) { | 2794 | if (cache_type == EXT4_EXT_CACHE_GAP) { |
| 2806 | if (!create) { | 2795 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
| 2807 | /* | 2796 | /* |
| 2808 | * block isn't allocated yet and | 2797 | * block isn't allocated yet and |
| 2809 | * user doesn't want to allocate it | 2798 | * user doesn't want to allocate it |
| @@ -2869,9 +2858,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2869 | EXT4_EXT_CACHE_EXTENT); | 2858 | EXT4_EXT_CACHE_EXTENT); |
| 2870 | goto out; | 2859 | goto out; |
| 2871 | } | 2860 | } |
| 2872 | if (create == EXT4_CREATE_UNINITIALIZED_EXT) | 2861 | if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) |
| 2873 | goto out; | 2862 | goto out; |
| 2874 | if (!create) { | 2863 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
| 2864 | if (allocated > max_blocks) | ||
| 2865 | allocated = max_blocks; | ||
| 2875 | /* | 2866 | /* |
| 2876 | * We have blocks reserved already. We | 2867 | * We have blocks reserved already. We |
| 2877 | * return allocated blocks so that delalloc | 2868 | * return allocated blocks so that delalloc |
| @@ -2879,8 +2870,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2879 | * the buffer head will be unmapped so that | 2870 | * the buffer head will be unmapped so that |
| 2880 | * a read from the block returns 0s. | 2871 | * a read from the block returns 0s. |
| 2881 | */ | 2872 | */ |
| 2882 | if (allocated > max_blocks) | ||
| 2883 | allocated = max_blocks; | ||
| 2884 | set_buffer_unwritten(bh_result); | 2873 | set_buffer_unwritten(bh_result); |
| 2885 | bh_result->b_bdev = inode->i_sb->s_bdev; | 2874 | bh_result->b_bdev = inode->i_sb->s_bdev; |
| 2886 | bh_result->b_blocknr = newblock; | 2875 | bh_result->b_blocknr = newblock; |
| @@ -2903,7 +2892,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2903 | * requested block isn't allocated yet; | 2892 | * requested block isn't allocated yet; |
| 2904 | * we couldn't try to create block if create flag is zero | 2893 | * we couldn't try to create block if create flag is zero |
| 2905 | */ | 2894 | */ |
| 2906 | if (!create) { | 2895 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
| 2907 | /* | 2896 | /* |
| 2908 | * put just found gap into cache to speed up | 2897 | * put just found gap into cache to speed up |
| 2909 | * subsequent requests | 2898 | * subsequent requests |
| @@ -2932,10 +2921,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2932 | * EXT_UNINIT_MAX_LEN. | 2921 | * EXT_UNINIT_MAX_LEN. |
| 2933 | */ | 2922 | */ |
| 2934 | if (max_blocks > EXT_INIT_MAX_LEN && | 2923 | if (max_blocks > EXT_INIT_MAX_LEN && |
| 2935 | create != EXT4_CREATE_UNINITIALIZED_EXT) | 2924 | !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) |
| 2936 | max_blocks = EXT_INIT_MAX_LEN; | 2925 | max_blocks = EXT_INIT_MAX_LEN; |
| 2937 | else if (max_blocks > EXT_UNINIT_MAX_LEN && | 2926 | else if (max_blocks > EXT_UNINIT_MAX_LEN && |
| 2938 | create == EXT4_CREATE_UNINITIALIZED_EXT) | 2927 | (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) |
| 2939 | max_blocks = EXT_UNINIT_MAX_LEN; | 2928 | max_blocks = EXT_UNINIT_MAX_LEN; |
| 2940 | 2929 | ||
| 2941 | /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ | 2930 | /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ |
| @@ -2966,7 +2955,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2966 | /* try to insert new extent into found leaf and return */ | 2955 | /* try to insert new extent into found leaf and return */ |
| 2967 | ext4_ext_store_pblock(&newex, newblock); | 2956 | ext4_ext_store_pblock(&newex, newblock); |
| 2968 | newex.ee_len = cpu_to_le16(ar.len); | 2957 | newex.ee_len = cpu_to_le16(ar.len); |
| 2969 | if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ | 2958 | if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ |
| 2970 | ext4_ext_mark_uninitialized(&newex); | 2959 | ext4_ext_mark_uninitialized(&newex); |
| 2971 | err = ext4_ext_insert_extent(handle, inode, path, &newex); | 2960 | err = ext4_ext_insert_extent(handle, inode, path, &newex); |
| 2972 | if (err) { | 2961 | if (err) { |
| @@ -2983,18 +2972,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2983 | newblock = ext_pblock(&newex); | 2972 | newblock = ext_pblock(&newex); |
| 2984 | allocated = ext4_ext_get_actual_len(&newex); | 2973 | allocated = ext4_ext_get_actual_len(&newex); |
| 2985 | outnew: | 2974 | outnew: |
| 2986 | if (extend_disksize) { | ||
| 2987 | disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; | ||
| 2988 | if (disksize > i_size_read(inode)) | ||
| 2989 | disksize = i_size_read(inode); | ||
| 2990 | if (disksize > EXT4_I(inode)->i_disksize) | ||
| 2991 | EXT4_I(inode)->i_disksize = disksize; | ||
| 2992 | } | ||
| 2993 | |||
| 2994 | set_buffer_new(bh_result); | 2975 | set_buffer_new(bh_result); |
| 2995 | 2976 | ||
| 2996 | /* Cache only when it is _not_ an uninitialized extent */ | 2977 | /* Cache only when it is _not_ an uninitialized extent */ |
| 2997 | if (create != EXT4_CREATE_UNINITIALIZED_EXT) | 2978 | if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) |
| 2998 | ext4_ext_put_in_cache(inode, iblock, allocated, newblock, | 2979 | ext4_ext_put_in_cache(inode, iblock, allocated, newblock, |
| 2999 | EXT4_EXT_CACHE_EXTENT); | 2980 | EXT4_EXT_CACHE_EXTENT); |
| 3000 | out: | 2981 | out: |
| @@ -3150,9 +3131,10 @@ retry: | |||
| 3150 | ret = PTR_ERR(handle); | 3131 | ret = PTR_ERR(handle); |
| 3151 | break; | 3132 | break; |
| 3152 | } | 3133 | } |
| 3153 | ret = ext4_get_blocks_wrap(handle, inode, block, | 3134 | map_bh.b_state = 0; |
| 3154 | max_blocks, &map_bh, | 3135 | ret = ext4_get_blocks(handle, inode, block, |
| 3155 | EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); | 3136 | max_blocks, &map_bh, |
| 3137 | EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); | ||
| 3156 | if (ret <= 0) { | 3138 | if (ret <= 0) { |
| 3157 | #ifdef EXT4FS_DEBUG | 3139 | #ifdef EXT4FS_DEBUG |
| 3158 | WARN_ON(ret <= 0); | 3140 | WARN_ON(ret <= 0); |
| @@ -3195,7 +3177,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, | |||
| 3195 | void *data) | 3177 | void *data) |
| 3196 | { | 3178 | { |
| 3197 | struct fiemap_extent_info *fieinfo = data; | 3179 | struct fiemap_extent_info *fieinfo = data; |
| 3198 | unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; | 3180 | unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; |
| 3199 | __u64 logical; | 3181 | __u64 logical; |
| 3200 | __u64 physical; | 3182 | __u64 physical; |
| 3201 | __u64 length; | 3183 | __u64 length; |
| @@ -3242,9 +3224,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, | |||
| 3242 | * | 3224 | * |
| 3243 | * XXX this might miss a single-block extent at EXT_MAX_BLOCK | 3225 | * XXX this might miss a single-block extent at EXT_MAX_BLOCK |
| 3244 | */ | 3226 | */ |
| 3245 | if (logical + length - 1 == EXT_MAX_BLOCK || | 3227 | if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || |
| 3246 | ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) | 3228 | newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { |
| 3229 | loff_t size = i_size_read(inode); | ||
| 3230 | loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); | ||
| 3231 | |||
| 3247 | flags |= FIEMAP_EXTENT_LAST; | 3232 | flags |= FIEMAP_EXTENT_LAST; |
| 3233 | if ((flags & FIEMAP_EXTENT_DELALLOC) && | ||
| 3234 | logical+length > size) | ||
| 3235 | length = (size - logical + bs - 1) & ~(bs-1); | ||
| 3236 | } | ||
| 3248 | 3237 | ||
| 3249 | error = fiemap_fill_next_extent(fieinfo, logical, physical, | 3238 | error = fiemap_fill_next_extent(fieinfo, logical, physical, |
| 3250 | length, flags); | 3239 | length, flags); |
| @@ -3318,10 +3307,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
| 3318 | * Walk the extent tree gathering extent information. | 3307 | * Walk the extent tree gathering extent information. |
| 3319 | * ext4_ext_fiemap_cb will push extents back to user. | 3308 | * ext4_ext_fiemap_cb will push extents back to user. |
| 3320 | */ | 3309 | */ |
| 3321 | down_write(&EXT4_I(inode)->i_data_sem); | 3310 | down_read(&EXT4_I(inode)->i_data_sem); |
| 3322 | error = ext4_ext_walk_space(inode, start_blk, len_blks, | 3311 | error = ext4_ext_walk_space(inode, start_blk, len_blks, |
| 3323 | ext4_ext_fiemap_cb, fieinfo); | 3312 | ext4_ext_fiemap_cb, fieinfo); |
| 3324 | up_write(&EXT4_I(inode)->i_data_sem); | 3313 | up_read(&EXT4_I(inode)->i_data_sem); |
| 3325 | } | 3314 | } |
| 3326 | 3315 | ||
| 3327 | return error; | 3316 | return error; |
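The extents.c hunks above fold the old create/extend_disksize arguments into a single flags word (EXT4_GET_BLOCKS_CREATE, EXT4_GET_BLOCKS_UNINIT_EXT, ...) and rename the wrapper to ext4_get_blocks(). A hedged sketch of a caller on the new interface follows; example_map_one_block() and its error handling are illustrative, while the b_state reset and the ext4_get_blocks() call itself follow the fallocate retry loop in this patch.

static int example_map_one_block(handle_t *handle, struct inode *inode,
                                 sector_t block, struct buffer_head *bh)
{
        int ret;

        /* reset buffer state before the call, as the fallocate path does */
        bh->b_state = 0;
        ret = ext4_get_blocks(handle, inode, block, 1, bh,
                              EXT4_GET_BLOCKS_CREATE);
        if (ret <= 0)
                return ret < 0 ? ret : -EIO;    /* 0 means nothing was mapped */
        return 0;
}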
diff --git a/fs/ext4/group.h b/fs/ext4/group.h deleted file mode 100644 index c2c0a8d06d0e..000000000000 --- a/fs/ext4/group.h +++ /dev/null | |||
| @@ -1,29 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/fs/ext4/group.h | ||
| 3 | * | ||
| 4 | * Copyright (C) 2007 Cluster File Systems, Inc | ||
| 5 | * | ||
| 6 | * Author: Andreas Dilger <adilger@clusterfs.com> | ||
| 7 | */ | ||
| 8 | |||
| 9 | #ifndef _LINUX_EXT4_GROUP_H | ||
| 10 | #define _LINUX_EXT4_GROUP_H | ||
| 11 | |||
| 12 | extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, | ||
| 13 | struct ext4_group_desc *gdp); | ||
| 14 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, | ||
| 15 | struct ext4_group_desc *gdp); | ||
| 16 | struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, | ||
| 17 | ext4_group_t block_group); | ||
| 18 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, | ||
| 19 | struct buffer_head *bh, | ||
| 20 | ext4_group_t group, | ||
| 21 | struct ext4_group_desc *desc); | ||
| 22 | #define ext4_free_blocks_after_init(sb, group, desc) \ | ||
| 23 | ext4_init_block_bitmap(sb, NULL, group, desc) | ||
| 24 | extern unsigned ext4_init_inode_bitmap(struct super_block *sb, | ||
| 25 | struct buffer_head *bh, | ||
| 26 | ext4_group_t group, | ||
| 27 | struct ext4_group_desc *desc); | ||
| 28 | extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); | ||
| 29 | #endif /* _LINUX_EXT4_GROUP_H */ | ||
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18e0a08a6b5..3743bd849bce 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
| @@ -27,7 +27,6 @@ | |||
| 27 | #include "ext4_jbd2.h" | 27 | #include "ext4_jbd2.h" |
| 28 | #include "xattr.h" | 28 | #include "xattr.h" |
| 29 | #include "acl.h" | 29 | #include "acl.h" |
| 30 | #include "group.h" | ||
| 31 | 30 | ||
| 32 | /* | 31 | /* |
| 33 | * ialloc.c contains the inodes allocation and deallocation routines | 32 | * ialloc.c contains the inodes allocation and deallocation routines |
| @@ -123,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
| 123 | unlock_buffer(bh); | 122 | unlock_buffer(bh); |
| 124 | return bh; | 123 | return bh; |
| 125 | } | 124 | } |
| 126 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 125 | ext4_lock_group(sb, block_group); |
| 127 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | 126 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { |
| 128 | ext4_init_inode_bitmap(sb, bh, block_group, desc); | 127 | ext4_init_inode_bitmap(sb, bh, block_group, desc); |
| 129 | set_bitmap_uptodate(bh); | 128 | set_bitmap_uptodate(bh); |
| 130 | set_buffer_uptodate(bh); | 129 | set_buffer_uptodate(bh); |
| 131 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 130 | ext4_unlock_group(sb, block_group); |
| 132 | unlock_buffer(bh); | 131 | unlock_buffer(bh); |
| 133 | return bh; | 132 | return bh; |
| 134 | } | 133 | } |
| 135 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 134 | ext4_unlock_group(sb, block_group); |
| 136 | if (buffer_uptodate(bh)) { | 135 | if (buffer_uptodate(bh)) { |
| 137 | /* | 136 | /* |
| 138 | * if not uninit if bh is uptodate, | 137 | * if not uninit if bh is uptodate, |
| @@ -247,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
| 247 | goto error_return; | 246 | goto error_return; |
| 248 | 247 | ||
| 249 | /* Ok, now we can actually update the inode bitmaps.. */ | 248 | /* Ok, now we can actually update the inode bitmaps.. */ |
| 250 | spin_lock(sb_bgl_lock(sbi, block_group)); | 249 | cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), |
| 251 | cleared = ext4_clear_bit(bit, bitmap_bh->b_data); | 250 | bit, bitmap_bh->b_data); |
| 252 | spin_unlock(sb_bgl_lock(sbi, block_group)); | ||
| 253 | if (!cleared) | 251 | if (!cleared) |
| 254 | ext4_error(sb, "ext4_free_inode", | 252 | ext4_error(sb, "ext4_free_inode", |
| 255 | "bit already cleared for inode %lu", ino); | 253 | "bit already cleared for inode %lu", ino); |
| @@ -261,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
| 261 | if (fatal) goto error_return; | 259 | if (fatal) goto error_return; |
| 262 | 260 | ||
| 263 | if (gdp) { | 261 | if (gdp) { |
| 264 | spin_lock(sb_bgl_lock(sbi, block_group)); | 262 | ext4_lock_group(sb, block_group); |
| 265 | count = ext4_free_inodes_count(sb, gdp) + 1; | 263 | count = ext4_free_inodes_count(sb, gdp) + 1; |
| 266 | ext4_free_inodes_set(sb, gdp, count); | 264 | ext4_free_inodes_set(sb, gdp, count); |
| 267 | if (is_directory) { | 265 | if (is_directory) { |
| @@ -277,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
| 277 | } | 275 | } |
| 278 | gdp->bg_checksum = ext4_group_desc_csum(sbi, | 276 | gdp->bg_checksum = ext4_group_desc_csum(sbi, |
| 279 | block_group, gdp); | 277 | block_group, gdp); |
| 280 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 278 | ext4_unlock_group(sb, block_group); |
| 281 | percpu_counter_inc(&sbi->s_freeinodes_counter); | 279 | percpu_counter_inc(&sbi->s_freeinodes_counter); |
| 282 | if (is_directory) | 280 | if (is_directory) |
| 283 | percpu_counter_dec(&sbi->s_dirs_counter); | 281 | percpu_counter_dec(&sbi->s_dirs_counter); |
| @@ -316,7 +314,7 @@ error_return: | |||
| 316 | static int find_group_dir(struct super_block *sb, struct inode *parent, | 314 | static int find_group_dir(struct super_block *sb, struct inode *parent, |
| 317 | ext4_group_t *best_group) | 315 | ext4_group_t *best_group) |
| 318 | { | 316 | { |
| 319 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 317 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
| 320 | unsigned int freei, avefreei; | 318 | unsigned int freei, avefreei; |
| 321 | struct ext4_group_desc *desc, *best_desc = NULL; | 319 | struct ext4_group_desc *desc, *best_desc = NULL; |
| 322 | ext4_group_t group; | 320 | ext4_group_t group; |
| @@ -349,11 +347,10 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, | |||
| 349 | { | 347 | { |
| 350 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 348 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 351 | struct ext4_group_desc *desc; | 349 | struct ext4_group_desc *desc; |
| 352 | struct buffer_head *bh; | ||
| 353 | struct flex_groups *flex_group = sbi->s_flex_groups; | 350 | struct flex_groups *flex_group = sbi->s_flex_groups; |
| 354 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 351 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
| 355 | ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); | 352 | ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); |
| 356 | ext4_group_t ngroups = sbi->s_groups_count; | 353 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
| 357 | int flex_size = ext4_flex_bg_size(sbi); | 354 | int flex_size = ext4_flex_bg_size(sbi); |
| 358 | ext4_group_t best_flex = parent_fbg_group; | 355 | ext4_group_t best_flex = parent_fbg_group; |
| 359 | int blocks_per_flex = sbi->s_blocks_per_group * flex_size; | 356 | int blocks_per_flex = sbi->s_blocks_per_group * flex_size; |
| @@ -362,7 +359,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, | |||
| 362 | ext4_group_t n_fbg_groups; | 359 | ext4_group_t n_fbg_groups; |
| 363 | ext4_group_t i; | 360 | ext4_group_t i; |
| 364 | 361 | ||
| 365 | n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> | 362 | n_fbg_groups = (ngroups + flex_size - 1) >> |
| 366 | sbi->s_log_groups_per_flex; | 363 | sbi->s_log_groups_per_flex; |
| 367 | 364 | ||
| 368 | find_close_to_parent: | 365 | find_close_to_parent: |
| @@ -404,7 +401,7 @@ find_close_to_parent: | |||
| 404 | found_flexbg: | 401 | found_flexbg: |
| 405 | for (i = best_flex * flex_size; i < ngroups && | 402 | for (i = best_flex * flex_size; i < ngroups && |
| 406 | i < (best_flex + 1) * flex_size; i++) { | 403 | i < (best_flex + 1) * flex_size; i++) { |
| 407 | desc = ext4_get_group_desc(sb, i, &bh); | 404 | desc = ext4_get_group_desc(sb, i, NULL); |
| 408 | if (ext4_free_inodes_count(sb, desc)) { | 405 | if (ext4_free_inodes_count(sb, desc)) { |
| 409 | *best_group = i; | 406 | *best_group = i; |
| 410 | goto out; | 407 | goto out; |
| @@ -478,20 +475,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
| 478 | { | 475 | { |
| 479 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 476 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
| 480 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 477 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 481 | ext4_group_t ngroups = sbi->s_groups_count; | 478 | ext4_group_t real_ngroups = ext4_get_groups_count(sb); |
| 482 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); | 479 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); |
| 483 | unsigned int freei, avefreei; | 480 | unsigned int freei, avefreei; |
| 484 | ext4_fsblk_t freeb, avefreeb; | 481 | ext4_fsblk_t freeb, avefreeb; |
| 485 | unsigned int ndirs; | 482 | unsigned int ndirs; |
| 486 | int max_dirs, min_inodes; | 483 | int max_dirs, min_inodes; |
| 487 | ext4_grpblk_t min_blocks; | 484 | ext4_grpblk_t min_blocks; |
| 488 | ext4_group_t i, grp, g; | 485 | ext4_group_t i, grp, g, ngroups; |
| 489 | struct ext4_group_desc *desc; | 486 | struct ext4_group_desc *desc; |
| 490 | struct orlov_stats stats; | 487 | struct orlov_stats stats; |
| 491 | int flex_size = ext4_flex_bg_size(sbi); | 488 | int flex_size = ext4_flex_bg_size(sbi); |
| 492 | 489 | ||
| 490 | ngroups = real_ngroups; | ||
| 493 | if (flex_size > 1) { | 491 | if (flex_size > 1) { |
| 494 | ngroups = (ngroups + flex_size - 1) >> | 492 | ngroups = (real_ngroups + flex_size - 1) >> |
| 495 | sbi->s_log_groups_per_flex; | 493 | sbi->s_log_groups_per_flex; |
| 496 | parent_group >>= sbi->s_log_groups_per_flex; | 494 | parent_group >>= sbi->s_log_groups_per_flex; |
| 497 | } | 495 | } |
| @@ -543,7 +541,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
| 543 | */ | 541 | */ |
| 544 | grp *= flex_size; | 542 | grp *= flex_size; |
| 545 | for (i = 0; i < flex_size; i++) { | 543 | for (i = 0; i < flex_size; i++) { |
| 546 | if (grp+i >= sbi->s_groups_count) | 544 | if (grp+i >= real_ngroups) |
| 547 | break; | 545 | break; |
| 548 | desc = ext4_get_group_desc(sb, grp+i, NULL); | 546 | desc = ext4_get_group_desc(sb, grp+i, NULL); |
| 549 | if (desc && ext4_free_inodes_count(sb, desc)) { | 547 | if (desc && ext4_free_inodes_count(sb, desc)) { |
| @@ -583,7 +581,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
| 583 | } | 581 | } |
| 584 | 582 | ||
| 585 | fallback: | 583 | fallback: |
| 586 | ngroups = sbi->s_groups_count; | 584 | ngroups = real_ngroups; |
| 587 | avefreei = freei / ngroups; | 585 | avefreei = freei / ngroups; |
| 588 | fallback_retry: | 586 | fallback_retry: |
| 589 | parent_group = EXT4_I(parent)->i_block_group; | 587 | parent_group = EXT4_I(parent)->i_block_group; |
| @@ -613,9 +611,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
| 613 | ext4_group_t *group, int mode) | 611 | ext4_group_t *group, int mode) |
| 614 | { | 612 | { |
| 615 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 613 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
| 616 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 614 | ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); |
| 617 | struct ext4_group_desc *desc; | 615 | struct ext4_group_desc *desc; |
| 618 | ext4_group_t i, last; | ||
| 619 | int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); | 616 | int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); |
| 620 | 617 | ||
| 621 | /* | 618 | /* |
| @@ -708,10 +705,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
| 708 | 705 | ||
| 709 | /* | 706 | /* |
| 710 | * claim the inode from the inode bitmap. If the group | 707 | * claim the inode from the inode bitmap. If the group |
| 711 | * is uninit we need to take the group's sb_bgl_lock | 708 | * is uninit we need to take the group's ext4_group_lock
| 712 | * and clear the uninit flag. The inode bitmap update | 709 | * and clear the uninit flag. The inode bitmap update |
| 713 | * and group desc uninit flag clear should be done | 710 | * and group desc uninit flag clear should be done |
| 714 | * after holding sb_bgl_lock so that ext4_read_inode_bitmap | 711 | * after holding ext4_group_lock so that ext4_read_inode_bitmap |
| 715 | * doesn't race with the ext4_claim_inode | 712 | * doesn't race with the ext4_claim_inode |
| 716 | */ | 713 | */ |
| 717 | static int ext4_claim_inode(struct super_block *sb, | 714 | static int ext4_claim_inode(struct super_block *sb, |
| @@ -722,7 +719,7 @@ static int ext4_claim_inode(struct super_block *sb, | |||
| 722 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 719 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 723 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); | 720 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); |
| 724 | 721 | ||
| 725 | spin_lock(sb_bgl_lock(sbi, group)); | 722 | ext4_lock_group(sb, group); |
| 726 | if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { | 723 | if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { |
| 727 | /* not a free inode */ | 724 | /* not a free inode */ |
| 728 | retval = 1; | 725 | retval = 1; |
| @@ -731,7 +728,7 @@ static int ext4_claim_inode(struct super_block *sb, | |||
| 731 | ino++; | 728 | ino++; |
| 732 | if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | 729 | if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || |
| 733 | ino > EXT4_INODES_PER_GROUP(sb)) { | 730 | ino > EXT4_INODES_PER_GROUP(sb)) { |
| 734 | spin_unlock(sb_bgl_lock(sbi, group)); | 731 | ext4_unlock_group(sb, group); |
| 735 | ext4_error(sb, __func__, | 732 | ext4_error(sb, __func__, |
| 736 | "reserved inode or inode > inodes count - " | 733 | "reserved inode or inode > inodes count - " |
| 737 | "block_group = %u, inode=%lu", group, | 734 | "block_group = %u, inode=%lu", group, |
| @@ -780,7 +777,7 @@ static int ext4_claim_inode(struct super_block *sb, | |||
| 780 | } | 777 | } |
| 781 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | 778 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); |
| 782 | err_ret: | 779 | err_ret: |
| 783 | spin_unlock(sb_bgl_lock(sbi, group)); | 780 | ext4_unlock_group(sb, group); |
| 784 | return retval; | 781 | return retval; |
| 785 | } | 782 | } |
| 786 | 783 | ||
| @@ -799,11 +796,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
| 799 | struct super_block *sb; | 796 | struct super_block *sb; |
| 800 | struct buffer_head *inode_bitmap_bh = NULL; | 797 | struct buffer_head *inode_bitmap_bh = NULL; |
| 801 | struct buffer_head *group_desc_bh; | 798 | struct buffer_head *group_desc_bh; |
| 802 | ext4_group_t group = 0; | 799 | ext4_group_t ngroups, group = 0; |
| 803 | unsigned long ino = 0; | 800 | unsigned long ino = 0; |
| 804 | struct inode *inode; | 801 | struct inode *inode; |
| 805 | struct ext4_group_desc *gdp = NULL; | 802 | struct ext4_group_desc *gdp = NULL; |
| 806 | struct ext4_super_block *es; | ||
| 807 | struct ext4_inode_info *ei; | 803 | struct ext4_inode_info *ei; |
| 808 | struct ext4_sb_info *sbi; | 804 | struct ext4_sb_info *sbi; |
| 809 | int ret2, err = 0; | 805 | int ret2, err = 0; |
| @@ -818,15 +814,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
| 818 | return ERR_PTR(-EPERM); | 814 | return ERR_PTR(-EPERM); |
| 819 | 815 | ||
| 820 | sb = dir->i_sb; | 816 | sb = dir->i_sb; |
| 817 | ngroups = ext4_get_groups_count(sb); | ||
| 821 | trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, | 818 | trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, |
| 822 | dir->i_ino, mode); | 819 | dir->i_ino, mode); |
| 823 | inode = new_inode(sb); | 820 | inode = new_inode(sb); |
| 824 | if (!inode) | 821 | if (!inode) |
| 825 | return ERR_PTR(-ENOMEM); | 822 | return ERR_PTR(-ENOMEM); |
| 826 | ei = EXT4_I(inode); | 823 | ei = EXT4_I(inode); |
| 827 | |||
| 828 | sbi = EXT4_SB(sb); | 824 | sbi = EXT4_SB(sb); |
| 829 | es = sbi->s_es; | ||
| 830 | 825 | ||
| 831 | if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { | 826 | if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { |
| 832 | ret2 = find_group_flex(sb, dir, &group); | 827 | ret2 = find_group_flex(sb, dir, &group); |
| @@ -856,7 +851,7 @@ got_group: | |||
| 856 | if (ret2 == -1) | 851 | if (ret2 == -1) |
| 857 | goto out; | 852 | goto out; |
| 858 | 853 | ||
| 859 | for (i = 0; i < sbi->s_groups_count; i++) { | 854 | for (i = 0; i < ngroups; i++) { |
| 860 | err = -EIO; | 855 | err = -EIO; |
| 861 | 856 | ||
| 862 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); | 857 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); |
| @@ -917,7 +912,7 @@ repeat_in_this_group: | |||
| 917 | * group descriptor metadata has not yet been updated. | 912 | * group descriptor metadata has not yet been updated. |
| 918 | * So we just go onto the next blockgroup. | 913 | * So we just go onto the next blockgroup. |
| 919 | */ | 914 | */ |
| 920 | if (++group == sbi->s_groups_count) | 915 | if (++group == ngroups) |
| 921 | group = 0; | 916 | group = 0; |
| 922 | } | 917 | } |
| 923 | err = -ENOSPC; | 918 | err = -ENOSPC; |
| @@ -938,7 +933,7 @@ got: | |||
| 938 | } | 933 | } |
| 939 | 934 | ||
| 940 | free = 0; | 935 | free = 0; |
| 941 | spin_lock(sb_bgl_lock(sbi, group)); | 936 | ext4_lock_group(sb, group); |
| 942 | /* recheck and clear flag under lock if we still need to */ | 937 | /* recheck and clear flag under lock if we still need to */ |
| 943 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 938 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 944 | free = ext4_free_blocks_after_init(sb, group, gdp); | 939 | free = ext4_free_blocks_after_init(sb, group, gdp); |
| @@ -947,7 +942,7 @@ got: | |||
| 947 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, | 942 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, |
| 948 | gdp); | 943 | gdp); |
| 949 | } | 944 | } |
| 950 | spin_unlock(sb_bgl_lock(sbi, group)); | 945 | ext4_unlock_group(sb, group); |
| 951 | 946 | ||
| 952 | /* Don't need to dirty bitmap block if we didn't change it */ | 947 | /* Don't need to dirty bitmap block if we didn't change it */ |
| 953 | if (free) { | 948 | if (free) { |
| @@ -1158,7 +1153,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
| 1158 | { | 1153 | { |
| 1159 | unsigned long desc_count; | 1154 | unsigned long desc_count; |
| 1160 | struct ext4_group_desc *gdp; | 1155 | struct ext4_group_desc *gdp; |
| 1161 | ext4_group_t i; | 1156 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
| 1162 | #ifdef EXT4FS_DEBUG | 1157 | #ifdef EXT4FS_DEBUG |
| 1163 | struct ext4_super_block *es; | 1158 | struct ext4_super_block *es; |
| 1164 | unsigned long bitmap_count, x; | 1159 | unsigned long bitmap_count, x; |
| @@ -1168,7 +1163,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
| 1168 | desc_count = 0; | 1163 | desc_count = 0; |
| 1169 | bitmap_count = 0; | 1164 | bitmap_count = 0; |
| 1170 | gdp = NULL; | 1165 | gdp = NULL; |
| 1171 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 1166 | for (i = 0; i < ngroups; i++) { |
| 1172 | gdp = ext4_get_group_desc(sb, i, NULL); | 1167 | gdp = ext4_get_group_desc(sb, i, NULL); |
| 1173 | if (!gdp) | 1168 | if (!gdp) |
| 1174 | continue; | 1169 | continue; |
| @@ -1190,7 +1185,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
| 1190 | return desc_count; | 1185 | return desc_count; |
| 1191 | #else | 1186 | #else |
| 1192 | desc_count = 0; | 1187 | desc_count = 0; |
| 1193 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 1188 | for (i = 0; i < ngroups; i++) { |
| 1194 | gdp = ext4_get_group_desc(sb, i, NULL); | 1189 | gdp = ext4_get_group_desc(sb, i, NULL); |
| 1195 | if (!gdp) | 1190 | if (!gdp) |
| 1196 | continue; | 1191 | continue; |
| @@ -1205,9 +1200,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
| 1205 | unsigned long ext4_count_dirs(struct super_block * sb) | 1200 | unsigned long ext4_count_dirs(struct super_block * sb) |
| 1206 | { | 1201 | { |
| 1207 | unsigned long count = 0; | 1202 | unsigned long count = 0; |
| 1208 | ext4_group_t i; | 1203 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
| 1209 | 1204 | ||
| 1210 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 1205 | for (i = 0; i < ngroups; i++) { |
| 1211 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); | 1206 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); |
| 1212 | if (!gdp) | 1207 | if (!gdp) |
| 1213 | continue; | 1208 | continue; |
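The ialloc.c hunks above replace raw reads of EXT4_SB(sb)->s_groups_count with ext4_get_groups_count() and swap sb_bgl_lock() spinlocks for ext4_lock_group()/ext4_unlock_group(). A minimal sketch of what the group-count helper could look like — its placement in ext4.h and the memory barrier are assumptions, not taken from this diff:

    /* Sketch only: snapshot the group count once so that loops stay
     * consistent even if online resize grows s_groups_count underneath us. */
    static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
    {
            ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;

            smp_rmb();      /* assumed pairing with the resize publish path */
            return ngroups;
    }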
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd528dd1..875db944b22f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c

| @@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode, | |||
| 372 | } | 372 | } |
| 373 | 373 | ||
| 374 | static int __ext4_check_blockref(const char *function, struct inode *inode, | 374 | static int __ext4_check_blockref(const char *function, struct inode *inode, |
| 375 | __le32 *p, unsigned int max) { | 375 | __le32 *p, unsigned int max) |
| 376 | 376 | { | |
| 377 | unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); | ||
| 378 | __le32 *bref = p; | 377 | __le32 *bref = p; |
| 378 | unsigned int blk; | ||
| 379 | |||
| 379 | while (bref < p+max) { | 380 | while (bref < p+max) { |
| 380 | if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { | 381 | blk = le32_to_cpu(*bref++); |
| 382 | if (blk && | ||
| 383 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
| 384 | blk, 1))) { | ||
| 381 | ext4_error(inode->i_sb, function, | 385 | ext4_error(inode->i_sb, function, |
| 382 | "block reference %u >= max (%u) " | 386 | "invalid block reference %u " |
| 383 | "in inode #%lu, offset=%d", | 387 | "in inode #%lu", blk, inode->i_ino); |
| 384 | le32_to_cpu(*bref), maxblocks, | ||
| 385 | inode->i_ino, (int)(bref-p)); | ||
| 386 | return -EIO; | 388 | return -EIO; |
| 387 | } | 389 | } |
| 388 | bref++; | ||
| 389 | } | 390 | } |
| 390 | return 0; | 391 | return 0; |
| 391 | } | 392 | } |
| @@ -892,6 +893,10 @@ err_out: | |||
| 892 | } | 893 | } |
| 893 | 894 | ||
| 894 | /* | 895 | /* |
| 896 | * The ext4_ind_get_blocks() function handles non-extents inodes | ||
| 897 | * (i.e., using the traditional indirect/double-indirect i_blocks | ||
| 898 | * scheme) for ext4_get_blocks(). | ||
| 899 | * | ||
| 895 | * Allocation strategy is simple: if we have to allocate something, we will | 900 | * Allocation strategy is simple: if we have to allocate something, we will |
| 896 | * have to go the whole way to leaf. So let's do it before attaching anything | 901 | * have to go the whole way to leaf. So let's do it before attaching anything |
| 897 | * to tree, set linkage between the newborn blocks, write them if sync is | 902 | * to tree, set linkage between the newborn blocks, write them if sync is |
| @@ -909,15 +914,16 @@ err_out: | |||
| 909 | * return = 0, if plain lookup failed. | 914 | * return = 0, if plain lookup failed. |
| 910 | * return < 0, error case. | 915 | * return < 0, error case. |
| 911 | * | 916 | * |
| 912 | * | 917 | * The ext4_ind_get_blocks() function should be called with |
| 913 | * Need to be called with | 918 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem |
| 914 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block | 919 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or |
| 915 | * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) | 920 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system |
| 921 | * blocks. | ||
| 916 | */ | 922 | */ |
| 917 | static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | 923 | static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, |
| 918 | ext4_lblk_t iblock, unsigned int maxblocks, | 924 | ext4_lblk_t iblock, unsigned int maxblocks, |
| 919 | struct buffer_head *bh_result, | 925 | struct buffer_head *bh_result, |
| 920 | int create, int extend_disksize) | 926 | int flags) |
| 921 | { | 927 | { |
| 922 | int err = -EIO; | 928 | int err = -EIO; |
| 923 | ext4_lblk_t offsets[4]; | 929 | ext4_lblk_t offsets[4]; |
| @@ -927,14 +933,11 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
| 927 | int indirect_blks; | 933 | int indirect_blks; |
| 928 | int blocks_to_boundary = 0; | 934 | int blocks_to_boundary = 0; |
| 929 | int depth; | 935 | int depth; |
| 930 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
| 931 | int count = 0; | 936 | int count = 0; |
| 932 | ext4_fsblk_t first_block = 0; | 937 | ext4_fsblk_t first_block = 0; |
| 933 | loff_t disksize; | ||
| 934 | |||
| 935 | 938 | ||
| 936 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); | 939 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); |
| 937 | J_ASSERT(handle != NULL || create == 0); | 940 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); |
| 938 | depth = ext4_block_to_path(inode, iblock, offsets, | 941 | depth = ext4_block_to_path(inode, iblock, offsets, |
| 939 | &blocks_to_boundary); | 942 | &blocks_to_boundary); |
| 940 | 943 | ||
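To make the indirect/double-indirect scheme mentioned in the new comment concrete: assuming 4 KiB blocks (so one indirect block holds 1024 block numbers) and the usual 12 direct slots in i_data, ext4_block_to_path() resolves logical blocks 0-11 directly, blocks 12-1035 through the single indirect block, blocks 1036-1049611 through the double indirect tree, and anything beyond that through the triple indirect tree; the returned depth is 1, 2, 3 or 4 accordingly.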
| @@ -963,7 +966,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
| 963 | } | 966 | } |
| 964 | 967 | ||
| 965 | /* Next simple case - plain lookup or failed read of indirect block */ | 968 | /* Next simple case - plain lookup or failed read of indirect block */ |
| 966 | if (!create || err == -EIO) | 969 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) |
| 967 | goto cleanup; | 970 | goto cleanup; |
| 968 | 971 | ||
| 969 | /* | 972 | /* |
| @@ -997,19 +1000,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
| 997 | if (!err) | 1000 | if (!err) |
| 998 | err = ext4_splice_branch(handle, inode, iblock, | 1001 | err = ext4_splice_branch(handle, inode, iblock, |
| 999 | partial, indirect_blks, count); | 1002 | partial, indirect_blks, count); |
| 1000 | /* | 1003 | else |
| 1001 | * i_disksize growing is protected by i_data_sem. Don't forget to | ||
| 1002 | * protect it if you're about to implement concurrent | ||
| 1003 | * ext4_get_block() -bzzz | ||
| 1004 | */ | ||
| 1005 | if (!err && extend_disksize) { | ||
| 1006 | disksize = ((loff_t) iblock + count) << inode->i_blkbits; | ||
| 1007 | if (disksize > i_size_read(inode)) | ||
| 1008 | disksize = i_size_read(inode); | ||
| 1009 | if (disksize > ei->i_disksize) | ||
| 1010 | ei->i_disksize = disksize; | ||
| 1011 | } | ||
| 1012 | if (err) | ||
| 1013 | goto cleanup; | 1004 | goto cleanup; |
| 1014 | 1005 | ||
| 1015 | set_buffer_new(bh_result); | 1006 | set_buffer_new(bh_result); |
| @@ -1120,8 +1111,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | |||
| 1120 | ext4_discard_preallocations(inode); | 1111 | ext4_discard_preallocations(inode); |
| 1121 | } | 1112 | } |
| 1122 | 1113 | ||
| 1114 | static int check_block_validity(struct inode *inode, sector_t logical, | ||
| 1115 | sector_t phys, int len) | ||
| 1116 | { | ||
| 1117 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { | ||
| 1118 | ext4_error(inode->i_sb, "check_block_validity", | ||
| 1119 | "inode #%lu logical block %llu mapped to %llu " | ||
| 1120 | "(size %d)", inode->i_ino, | ||
| 1121 | (unsigned long long) logical, | ||
| 1122 | (unsigned long long) phys, len); | ||
| 1123 | WARN_ON(1); | ||
| 1124 | return -EIO; | ||
| 1125 | } | ||
| 1126 | return 0; | ||
| 1127 | } | ||
| 1128 | |||
| 1123 | /* | 1129 | /* |
| 1124 | * The ext4_get_blocks_wrap() function try to look up the requested blocks, | 1130 | * The ext4_get_blocks() function tries to look up the requested blocks, |
| 1125 | * and returns if the blocks are already mapped. | 1131 | * and returns if the blocks are already mapped. |
| 1126 | * | 1132 | * |
| 1127 | * Otherwise it takes the write lock of the i_data_sem and allocate blocks | 1133 | * Otherwise it takes the write lock of the i_data_sem and allocate blocks |
| @@ -1129,7 +1135,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | |||
| 1129 | * mapped. | 1135 | * mapped. |
| 1130 | * | 1136 | * |
| 1131 | * If file type is extents based, it will call ext4_ext_get_blocks(), | 1137 | * If file type is extents based, it will call ext4_ext_get_blocks(), |
| 1132 | * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping | 1138 | * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping |
| 1133 | * based files | 1139 | * based files |
| 1134 | * | 1140 | * |
| 1135 | * On success, it returns the number of blocks being mapped or allocate. | 1141 | * On success, it returns the number of blocks being mapped or allocate. |
| @@ -1142,9 +1148,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | |||
| 1142 | * | 1148 | * |
| 1143 | * It returns the error in case of allocation failure. | 1149 | * It returns the error in case of allocation failure. |
| 1144 | */ | 1150 | */ |
| 1145 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | 1151 | int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, |
| 1146 | unsigned int max_blocks, struct buffer_head *bh, | 1152 | unsigned int max_blocks, struct buffer_head *bh, |
| 1147 | int create, int extend_disksize, int flag) | 1153 | int flags) |
| 1148 | { | 1154 | { |
| 1149 | int retval; | 1155 | int retval; |
| 1150 | 1156 | ||
| @@ -1152,21 +1158,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
| 1152 | clear_buffer_unwritten(bh); | 1158 | clear_buffer_unwritten(bh); |
| 1153 | 1159 | ||
| 1154 | /* | 1160 | /* |
| 1155 | * Try to see if we can get the block without requesting | 1161 | * Try to see if we can get the block without requesting a new |
| 1156 | * for new file system block. | 1162 | * file system block. |
| 1157 | */ | 1163 | */ |
| 1158 | down_read((&EXT4_I(inode)->i_data_sem)); | 1164 | down_read((&EXT4_I(inode)->i_data_sem)); |
| 1159 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 1165 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
| 1160 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, | 1166 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, |
| 1161 | bh, 0, 0); | 1167 | bh, 0); |
| 1162 | } else { | 1168 | } else { |
| 1163 | retval = ext4_get_blocks_handle(handle, | 1169 | retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, |
| 1164 | inode, block, max_blocks, bh, 0, 0); | 1170 | bh, 0); |
| 1165 | } | 1171 | } |
| 1166 | up_read((&EXT4_I(inode)->i_data_sem)); | 1172 | up_read((&EXT4_I(inode)->i_data_sem)); |
| 1167 | 1173 | ||
| 1174 | if (retval > 0 && buffer_mapped(bh)) { | ||
| 1175 | int ret = check_block_validity(inode, block, | ||
| 1176 | bh->b_blocknr, retval); | ||
| 1177 | if (ret != 0) | ||
| 1178 | return ret; | ||
| 1179 | } | ||
| 1180 | |||
| 1168 | /* If it is only a block(s) look up */ | 1181 | /* If it is only a block(s) look up */ |
| 1169 | if (!create) | 1182 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) |
| 1170 | return retval; | 1183 | return retval; |
| 1171 | 1184 | ||
| 1172 | /* | 1185 | /* |
| @@ -1205,7 +1218,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
| 1205 | * let the underlying get_block() function know to | 1218 | * let the underlying get_block() function know to |
| 1206 | * avoid double accounting | 1219 | * avoid double accounting |
| 1207 | */ | 1220 | */ |
| 1208 | if (flag) | 1221 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
| 1209 | EXT4_I(inode)->i_delalloc_reserved_flag = 1; | 1222 | EXT4_I(inode)->i_delalloc_reserved_flag = 1; |
| 1210 | /* | 1223 | /* |
| 1211 | * We need to check for EXT4 here because migrate | 1224 | * We need to check for EXT4 here because migrate |
| @@ -1213,10 +1226,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
| 1213 | */ | 1226 | */ |
| 1214 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 1227 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
| 1215 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, | 1228 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, |
| 1216 | bh, create, extend_disksize); | 1229 | bh, flags); |
| 1217 | } else { | 1230 | } else { |
| 1218 | retval = ext4_get_blocks_handle(handle, inode, block, | 1231 | retval = ext4_ind_get_blocks(handle, inode, block, |
| 1219 | max_blocks, bh, create, extend_disksize); | 1232 | max_blocks, bh, flags); |
| 1220 | 1233 | ||
| 1221 | if (retval > 0 && buffer_new(bh)) { | 1234 | if (retval > 0 && buffer_new(bh)) { |
| 1222 | /* | 1235 | /* |
| @@ -1229,18 +1242,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
| 1229 | } | 1242 | } |
| 1230 | } | 1243 | } |
| 1231 | 1244 | ||
| 1232 | if (flag) { | 1245 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
| 1233 | EXT4_I(inode)->i_delalloc_reserved_flag = 0; | 1246 | EXT4_I(inode)->i_delalloc_reserved_flag = 0; |
| 1234 | /* | 1247 | |
| 1235 | * Update reserved blocks/metadata blocks | 1248 | /* |
| 1236 | * after successful block allocation | 1249 | * Update reserved blocks/metadata blocks after successful |
| 1237 | * which were deferred till now | 1250 | * block allocation which had been deferred till now. |
| 1238 | */ | 1251 | */ |
| 1239 | if ((retval > 0) && buffer_delay(bh)) | 1252 | if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) |
| 1240 | ext4_da_update_reserve_space(inode, retval); | 1253 | ext4_da_update_reserve_space(inode, retval); |
| 1241 | } | ||
| 1242 | 1254 | ||
| 1243 | up_write((&EXT4_I(inode)->i_data_sem)); | 1255 | up_write((&EXT4_I(inode)->i_data_sem)); |
| 1256 | if (retval > 0 && buffer_mapped(bh)) { | ||
| 1257 | int ret = check_block_validity(inode, block, | ||
| 1258 | bh->b_blocknr, retval); | ||
| 1259 | if (ret != 0) | ||
| 1260 | return ret; | ||
| 1261 | } | ||
| 1244 | return retval; | 1262 | return retval; |
| 1245 | } | 1263 | } |
| 1246 | 1264 | ||
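With the rename, callers collapse the old create/extend_disksize/flag arguments into a single flags word. A hedged sketch of the two calling patterns visible in this patch (variable names are illustrative, the return-value convention is the one documented in the ext4_getblk hunk below):

    struct buffer_head bh = { .b_state = 0, .b_blocknr = -1000 };
    int ret;

    /* Pure lookup: never allocates, only takes i_data_sem for read. */
    ret = ext4_get_blocks(NULL, inode, lblk, 1, &bh, 0);

    /* Allocating lookup: needs a handle and EXT4_GET_BLOCKS_CREATE. */
    ret = ext4_get_blocks(handle, inode, lblk, 1, &bh,
                          EXT4_GET_BLOCKS_CREATE);
    /* ret > 0: number of blocks mapped; 0: hole; < 0: error. */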
| @@ -1268,8 +1286,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock, | |||
| 1268 | started = 1; | 1286 | started = 1; |
| 1269 | } | 1287 | } |
| 1270 | 1288 | ||
| 1271 | ret = ext4_get_blocks_wrap(handle, inode, iblock, | 1289 | ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, |
| 1272 | max_blocks, bh_result, create, 0, 0); | 1290 | create ? EXT4_GET_BLOCKS_CREATE : 0); |
| 1273 | if (ret > 0) { | 1291 | if (ret > 0) { |
| 1274 | bh_result->b_size = (ret << inode->i_blkbits); | 1292 | bh_result->b_size = (ret << inode->i_blkbits); |
| 1275 | ret = 0; | 1293 | ret = 0; |
| @@ -1288,17 +1306,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | |||
| 1288 | { | 1306 | { |
| 1289 | struct buffer_head dummy; | 1307 | struct buffer_head dummy; |
| 1290 | int fatal = 0, err; | 1308 | int fatal = 0, err; |
| 1309 | int flags = 0; | ||
| 1291 | 1310 | ||
| 1292 | J_ASSERT(handle != NULL || create == 0); | 1311 | J_ASSERT(handle != NULL || create == 0); |
| 1293 | 1312 | ||
| 1294 | dummy.b_state = 0; | 1313 | dummy.b_state = 0; |
| 1295 | dummy.b_blocknr = -1000; | 1314 | dummy.b_blocknr = -1000; |
| 1296 | buffer_trace_init(&dummy.b_history); | 1315 | buffer_trace_init(&dummy.b_history); |
| 1297 | err = ext4_get_blocks_wrap(handle, inode, block, 1, | 1316 | if (create) |
| 1298 | &dummy, create, 1, 0); | 1317 | flags |= EXT4_GET_BLOCKS_CREATE; |
| 1318 | err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); | ||
| 1299 | /* | 1319 | /* |
| 1300 | * ext4_get_blocks_handle() returns number of blocks | 1320 | * ext4_get_blocks() returns number of blocks mapped. 0 in |
| 1301 | * mapped. 0 in case of a HOLE. | 1321 | * case of a HOLE. |
| 1302 | */ | 1322 | */ |
| 1303 | if (err > 0) { | 1323 | if (err > 0) { |
| 1304 | if (err > 1) | 1324 | if (err > 1) |
| @@ -1439,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, | |||
| 1439 | struct page **pagep, void **fsdata) | 1459 | struct page **pagep, void **fsdata) |
| 1440 | { | 1460 | { |
| 1441 | struct inode *inode = mapping->host; | 1461 | struct inode *inode = mapping->host; |
| 1442 | int ret, needed_blocks = ext4_writepage_trans_blocks(inode); | 1462 | int ret, needed_blocks; |
| 1443 | handle_t *handle; | 1463 | handle_t *handle; |
| 1444 | int retries = 0; | 1464 | int retries = 0; |
| 1445 | struct page *page; | 1465 | struct page *page; |
| @@ -1450,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, | |||
| 1450 | "dev %s ino %lu pos %llu len %u flags %u", | 1470 | "dev %s ino %lu pos %llu len %u flags %u", |
| 1451 | inode->i_sb->s_id, inode->i_ino, | 1471 | inode->i_sb->s_id, inode->i_ino, |
| 1452 | (unsigned long long) pos, len, flags); | 1472 | (unsigned long long) pos, len, flags); |
| 1473 | /* | ||
| 1474 | * Reserve one block more for addition to orphan list in case | ||
| 1475 | * we allocate blocks but write fails for some reason | ||
| 1476 | */ | ||
| 1477 | needed_blocks = ext4_writepage_trans_blocks(inode) + 1; | ||
| 1453 | index = pos >> PAGE_CACHE_SHIFT; | 1478 | index = pos >> PAGE_CACHE_SHIFT; |
| 1454 | from = pos & (PAGE_CACHE_SIZE - 1); | 1479 | from = pos & (PAGE_CACHE_SIZE - 1); |
| 1455 | to = from + len; | 1480 | to = from + len; |
| @@ -1483,15 +1508,30 @@ retry: | |||
| 1483 | 1508 | ||
| 1484 | if (ret) { | 1509 | if (ret) { |
| 1485 | unlock_page(page); | 1510 | unlock_page(page); |
| 1486 | ext4_journal_stop(handle); | ||
| 1487 | page_cache_release(page); | 1511 | page_cache_release(page); |
| 1488 | /* | 1512 | /* |
| 1489 | * block_write_begin may have instantiated a few blocks | 1513 | * block_write_begin may have instantiated a few blocks |
| 1490 | * outside i_size. Trim these off again. Don't need | 1514 | * outside i_size. Trim these off again. Don't need |
| 1491 | * i_size_read because we hold i_mutex. | 1515 | * i_size_read because we hold i_mutex. |
| 1516 | * | ||
| 1517 | * Add inode to orphan list in case we crash before | ||
| 1518 | * truncate finishes | ||
| 1492 | */ | 1519 | */ |
| 1493 | if (pos + len > inode->i_size) | 1520 | if (pos + len > inode->i_size) |
| 1521 | ext4_orphan_add(handle, inode); | ||
| 1522 | |||
| 1523 | ext4_journal_stop(handle); | ||
| 1524 | if (pos + len > inode->i_size) { | ||
| 1494 | vmtruncate(inode, inode->i_size); | 1525 | vmtruncate(inode, inode->i_size); |
| 1526 | /* | ||
| 1527 | * If vmtruncate failed early the inode might | ||
| 1528 | * still be on the orphan list; we need to | ||
| 1529 | * make sure the inode is removed from the | ||
| 1530 | * orphan list in that case. | ||
| 1531 | */ | ||
| 1532 | if (inode->i_nlink) | ||
| 1533 | ext4_orphan_del(NULL, inode); | ||
| 1534 | } | ||
| 1495 | } | 1535 | } |
| 1496 | 1536 | ||
| 1497 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 1537 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
| @@ -1509,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) | |||
| 1509 | return ext4_handle_dirty_metadata(handle, NULL, bh); | 1549 | return ext4_handle_dirty_metadata(handle, NULL, bh); |
| 1510 | } | 1550 | } |
| 1511 | 1551 | ||
| 1552 | static int ext4_generic_write_end(struct file *file, | ||
| 1553 | struct address_space *mapping, | ||
| 1554 | loff_t pos, unsigned len, unsigned copied, | ||
| 1555 | struct page *page, void *fsdata) | ||
| 1556 | { | ||
| 1557 | int i_size_changed = 0; | ||
| 1558 | struct inode *inode = mapping->host; | ||
| 1559 | handle_t *handle = ext4_journal_current_handle(); | ||
| 1560 | |||
| 1561 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | ||
| 1562 | |||
| 1563 | /* | ||
| 1564 | * No need to use i_size_read() here, the i_size | ||
| 1565 | * cannot change under us because we hold i_mutex. | ||
| 1566 | * | ||
| 1567 | * But it's important to update i_size while still holding page lock: | ||
| 1568 | * page writeout could otherwise come in and zero beyond i_size. | ||
| 1569 | */ | ||
| 1570 | if (pos + copied > inode->i_size) { | ||
| 1571 | i_size_write(inode, pos + copied); | ||
| 1572 | i_size_changed = 1; | ||
| 1573 | } | ||
| 1574 | |||
| 1575 | if (pos + copied > EXT4_I(inode)->i_disksize) { | ||
| 1576 | /* We need to mark inode dirty even if | ||
| 1577 | * new_i_size is less than inode->i_size | ||
| 1578 | * but greater than i_disksize. (hint: delalloc) | ||
| 1579 | */ | ||
| 1580 | ext4_update_i_disksize(inode, (pos + copied)); | ||
| 1581 | i_size_changed = 1; | ||
| 1582 | } | ||
| 1583 | unlock_page(page); | ||
| 1584 | page_cache_release(page); | ||
| 1585 | |||
| 1586 | /* | ||
| 1587 | * Don't mark the inode dirty under page lock. First, it unnecessarily | ||
| 1588 | * makes the holding time of page lock longer. Second, it forces lock | ||
| 1589 | * ordering of page lock and transaction start for journaling | ||
| 1590 | * filesystems. | ||
| 1591 | */ | ||
| 1592 | if (i_size_changed) | ||
| 1593 | ext4_mark_inode_dirty(handle, inode); | ||
| 1594 | |||
| 1595 | return copied; | ||
| 1596 | } | ||
| 1597 | |||
| 1512 | /* | 1598 | /* |
| 1513 | * We need to pick up the new inode size which generic_commit_write gave us | 1599 | * We need to pick up the new inode size which generic_commit_write gave us |
| 1514 | * `file' can be NULL - eg, when called from page_symlink(). | 1600 | * `file' can be NULL - eg, when called from page_symlink(). |
| @@ -1532,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file, | |||
| 1532 | ret = ext4_jbd2_file_inode(handle, inode); | 1618 | ret = ext4_jbd2_file_inode(handle, inode); |
| 1533 | 1619 | ||
| 1534 | if (ret == 0) { | 1620 | if (ret == 0) { |
| 1535 | loff_t new_i_size; | 1621 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, |
| 1536 | |||
| 1537 | new_i_size = pos + copied; | ||
| 1538 | if (new_i_size > EXT4_I(inode)->i_disksize) { | ||
| 1539 | ext4_update_i_disksize(inode, new_i_size); | ||
| 1540 | /* We need to mark inode dirty even if | ||
| 1541 | * new_i_size is less that inode->i_size | ||
| 1542 | * bu greater than i_disksize.(hint delalloc) | ||
| 1543 | */ | ||
| 1544 | ext4_mark_inode_dirty(handle, inode); | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | ret2 = generic_write_end(file, mapping, pos, len, copied, | ||
| 1548 | page, fsdata); | 1622 | page, fsdata); |
| 1549 | copied = ret2; | 1623 | copied = ret2; |
| 1624 | if (pos + len > inode->i_size) | ||
| 1625 | /* if we have allocated more blocks and copied | ||
| 1626 | * less. We will have blocks allocated outside | ||
| 1627 | * inode->i_size. So truncate them | ||
| 1628 | */ | ||
| 1629 | ext4_orphan_add(handle, inode); | ||
| 1550 | if (ret2 < 0) | 1630 | if (ret2 < 0) |
| 1551 | ret = ret2; | 1631 | ret = ret2; |
| 1552 | } | 1632 | } |
| @@ -1554,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file, | |||
| 1554 | if (!ret) | 1634 | if (!ret) |
| 1555 | ret = ret2; | 1635 | ret = ret2; |
| 1556 | 1636 | ||
| 1637 | if (pos + len > inode->i_size) { | ||
| 1638 | vmtruncate(inode, inode->i_size); | ||
| 1639 | /* | ||
| 1640 | * If vmtruncate failed early the inode might still be | ||
| 1641 | * on the orphan list; we need to make sure the inode | ||
| 1642 | * is removed from the orphan list in that case. | ||
| 1643 | */ | ||
| 1644 | if (inode->i_nlink) | ||
| 1645 | ext4_orphan_del(NULL, inode); | ||
| 1646 | } | ||
| 1647 | |||
| 1648 | |||
| 1557 | return ret ? ret : copied; | 1649 | return ret ? ret : copied; |
| 1558 | } | 1650 | } |
| 1559 | 1651 | ||
| @@ -1565,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file, | |||
| 1565 | handle_t *handle = ext4_journal_current_handle(); | 1657 | handle_t *handle = ext4_journal_current_handle(); |
| 1566 | struct inode *inode = mapping->host; | 1658 | struct inode *inode = mapping->host; |
| 1567 | int ret = 0, ret2; | 1659 | int ret = 0, ret2; |
| 1568 | loff_t new_i_size; | ||
| 1569 | 1660 | ||
| 1570 | trace_mark(ext4_writeback_write_end, | 1661 | trace_mark(ext4_writeback_write_end, |
| 1571 | "dev %s ino %lu pos %llu len %u copied %u", | 1662 | "dev %s ino %lu pos %llu len %u copied %u", |
| 1572 | inode->i_sb->s_id, inode->i_ino, | 1663 | inode->i_sb->s_id, inode->i_ino, |
| 1573 | (unsigned long long) pos, len, copied); | 1664 | (unsigned long long) pos, len, copied); |
| 1574 | new_i_size = pos + copied; | 1665 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, |
| 1575 | if (new_i_size > EXT4_I(inode)->i_disksize) { | ||
| 1576 | ext4_update_i_disksize(inode, new_i_size); | ||
| 1577 | /* We need to mark inode dirty even if | ||
| 1578 | * new_i_size is less that inode->i_size | ||
| 1579 | * bu greater than i_disksize.(hint delalloc) | ||
| 1580 | */ | ||
| 1581 | ext4_mark_inode_dirty(handle, inode); | ||
| 1582 | } | ||
| 1583 | |||
| 1584 | ret2 = generic_write_end(file, mapping, pos, len, copied, | ||
| 1585 | page, fsdata); | 1666 | page, fsdata); |
| 1586 | copied = ret2; | 1667 | copied = ret2; |
| 1668 | if (pos + len > inode->i_size) | ||
| 1669 | /* if we have allocated more blocks and copied | ||
| 1670 | * less. We will have blocks allocated outside | ||
| 1671 | * inode->i_size. So truncate them | ||
| 1672 | */ | ||
| 1673 | ext4_orphan_add(handle, inode); | ||
| 1674 | |||
| 1587 | if (ret2 < 0) | 1675 | if (ret2 < 0) |
| 1588 | ret = ret2; | 1676 | ret = ret2; |
| 1589 | 1677 | ||
| @@ -1591,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file, | |||
| 1591 | if (!ret) | 1679 | if (!ret) |
| 1592 | ret = ret2; | 1680 | ret = ret2; |
| 1593 | 1681 | ||
| 1682 | if (pos + len > inode->i_size) { | ||
| 1683 | vmtruncate(inode, inode->i_size); | ||
| 1684 | /* | ||
| 1685 | * If vmtruncate failed early the inode might still be | ||
| 1686 | * on the orphan list; we need to make sure the inode | ||
| 1687 | * is removed from the orphan list in that case. | ||
| 1688 | */ | ||
| 1689 | if (inode->i_nlink) | ||
| 1690 | ext4_orphan_del(NULL, inode); | ||
| 1691 | } | ||
| 1692 | |||
| 1594 | return ret ? ret : copied; | 1693 | return ret ? ret : copied; |
| 1595 | } | 1694 | } |
| 1596 | 1695 | ||
| @@ -1635,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file, | |||
| 1635 | } | 1734 | } |
| 1636 | 1735 | ||
| 1637 | unlock_page(page); | 1736 | unlock_page(page); |
| 1737 | page_cache_release(page); | ||
| 1738 | if (pos + len > inode->i_size) | ||
| 1739 | /* if we have allocated more blocks and copied | ||
| 1740 | * less. We will have blocks allocated outside | ||
| 1741 | * inode->i_size. So truncate them | ||
| 1742 | */ | ||
| 1743 | ext4_orphan_add(handle, inode); | ||
| 1744 | |||
| 1638 | ret2 = ext4_journal_stop(handle); | 1745 | ret2 = ext4_journal_stop(handle); |
| 1639 | if (!ret) | 1746 | if (!ret) |
| 1640 | ret = ret2; | 1747 | ret = ret2; |
| 1641 | page_cache_release(page); | 1748 | if (pos + len > inode->i_size) { |
| 1749 | vmtruncate(inode, inode->i_size); | ||
| 1750 | /* | ||
| 1751 | * If vmtruncate failed early the inode might still be | ||
| 1752 | * on the orphan list; we need to make sure the inode | ||
| 1753 | * is removed from the orphan list in that case. | ||
| 1754 | */ | ||
| 1755 | if (inode->i_nlink) | ||
| 1756 | ext4_orphan_del(NULL, inode); | ||
| 1757 | } | ||
| 1642 | 1758 | ||
| 1643 | return ret ? ret : copied; | 1759 | return ret ? ret : copied; |
| 1644 | } | 1760 | } |
| @@ -1852,7 +1968,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) | |||
| 1852 | * @logical - first logical block to start assignment with | 1968 | * @logical - first logical block to start assignment with |
| 1853 | * | 1969 | * |
| 1854 | * the function goes through all passed space and put actual disk | 1970 | * the function goes through all passed space and put actual disk |
| 1855 | * block numbers into buffer heads, dropping BH_Delay | 1971 | * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten |
| 1856 | */ | 1972 | */ |
| 1857 | static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | 1973 | static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, |
| 1858 | struct buffer_head *exbh) | 1974 | struct buffer_head *exbh) |
| @@ -1902,16 +2018,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | |||
| 1902 | do { | 2018 | do { |
| 1903 | if (cur_logical >= logical + blocks) | 2019 | if (cur_logical >= logical + blocks) |
| 1904 | break; | 2020 | break; |
| 1905 | if (buffer_delay(bh)) { | 2021 | |
| 1906 | bh->b_blocknr = pblock; | 2022 | if (buffer_delay(bh) || |
| 1907 | clear_buffer_delay(bh); | 2023 | buffer_unwritten(bh)) { |
| 1908 | bh->b_bdev = inode->i_sb->s_bdev; | 2024 | |
| 1909 | } else if (buffer_unwritten(bh)) { | 2025 | BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); |
| 1910 | bh->b_blocknr = pblock; | 2026 | |
| 1911 | clear_buffer_unwritten(bh); | 2027 | if (buffer_delay(bh)) { |
| 1912 | set_buffer_mapped(bh); | 2028 | clear_buffer_delay(bh); |
| 1913 | set_buffer_new(bh); | 2029 | bh->b_blocknr = pblock; |
| 1914 | bh->b_bdev = inode->i_sb->s_bdev; | 2030 | } else { |
| 2031 | /* | ||
| 2032 | * unwritten already should have | ||
| 2033 | * blocknr assigned. Verify that | ||
| 2034 | */ | ||
| 2035 | clear_buffer_unwritten(bh); | ||
| 2036 | BUG_ON(bh->b_blocknr != pblock); | ||
| 2037 | } | ||
| 2038 | |||
| 1915 | } else if (buffer_mapped(bh)) | 2039 | } else if (buffer_mapped(bh)) |
| 1916 | BUG_ON(bh->b_blocknr != pblock); | 2040 | BUG_ON(bh->b_blocknr != pblock); |
| 1917 | 2041 | ||
| @@ -1990,51 +2114,6 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
| 1990 | return; | 2114 | return; |
| 1991 | } | 2115 | } |
| 1992 | 2116 | ||
| 1993 | #define EXT4_DELALLOC_RSVED 1 | ||
| 1994 | static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, | ||
| 1995 | struct buffer_head *bh_result, int create) | ||
| 1996 | { | ||
| 1997 | int ret; | ||
| 1998 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
| 1999 | loff_t disksize = EXT4_I(inode)->i_disksize; | ||
| 2000 | handle_t *handle = NULL; | ||
| 2001 | |||
| 2002 | handle = ext4_journal_current_handle(); | ||
| 2003 | BUG_ON(!handle); | ||
| 2004 | ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | ||
| 2005 | bh_result, create, 0, EXT4_DELALLOC_RSVED); | ||
| 2006 | if (ret <= 0) | ||
| 2007 | return ret; | ||
| 2008 | |||
| 2009 | bh_result->b_size = (ret << inode->i_blkbits); | ||
| 2010 | |||
| 2011 | if (ext4_should_order_data(inode)) { | ||
| 2012 | int retval; | ||
| 2013 | retval = ext4_jbd2_file_inode(handle, inode); | ||
| 2014 | if (retval) | ||
| 2015 | /* | ||
| 2016 | * Failed to add inode for ordered mode. Don't | ||
| 2017 | * update file size | ||
| 2018 | */ | ||
| 2019 | return retval; | ||
| 2020 | } | ||
| 2021 | |||
| 2022 | /* | ||
| 2023 | * Update on-disk size along with block allocation we don't | ||
| 2024 | * use 'extend_disksize' as size may change within already | ||
| 2025 | * allocated block -bzzz | ||
| 2026 | */ | ||
| 2027 | disksize = ((loff_t) iblock + ret) << inode->i_blkbits; | ||
| 2028 | if (disksize > i_size_read(inode)) | ||
| 2029 | disksize = i_size_read(inode); | ||
| 2030 | if (disksize > EXT4_I(inode)->i_disksize) { | ||
| 2031 | ext4_update_i_disksize(inode, disksize); | ||
| 2032 | ret = ext4_mark_inode_dirty(handle, inode); | ||
| 2033 | return ret; | ||
| 2034 | } | ||
| 2035 | return 0; | ||
| 2036 | } | ||
| 2037 | |||
| 2038 | /* | 2117 | /* |
| 2039 | * mpage_da_map_blocks - go through given space | 2118 | * mpage_da_map_blocks - go through given space |
| 2040 | * | 2119 | * |
| @@ -2045,29 +2124,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, | |||
| 2045 | */ | 2124 | */ |
| 2046 | static int mpage_da_map_blocks(struct mpage_da_data *mpd) | 2125 | static int mpage_da_map_blocks(struct mpage_da_data *mpd) |
| 2047 | { | 2126 | { |
| 2048 | int err = 0; | 2127 | int err, blks, get_blocks_flags; |
| 2049 | struct buffer_head new; | 2128 | struct buffer_head new; |
| 2050 | sector_t next; | 2129 | sector_t next = mpd->b_blocknr; |
| 2130 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; | ||
| 2131 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; | ||
| 2132 | handle_t *handle = NULL; | ||
| 2051 | 2133 | ||
| 2052 | /* | 2134 | /* |
| 2053 | * We consider only non-mapped and non-allocated blocks | 2135 | * We consider only non-mapped and non-allocated blocks |
| 2054 | */ | 2136 | */ |
| 2055 | if ((mpd->b_state & (1 << BH_Mapped)) && | 2137 | if ((mpd->b_state & (1 << BH_Mapped)) && |
| 2056 | !(mpd->b_state & (1 << BH_Delay))) | 2138 | !(mpd->b_state & (1 << BH_Delay)) && |
| 2139 | !(mpd->b_state & (1 << BH_Unwritten))) | ||
| 2057 | return 0; | 2140 | return 0; |
| 2058 | new.b_state = mpd->b_state; | 2141 | |
| 2059 | new.b_blocknr = 0; | ||
| 2060 | new.b_size = mpd->b_size; | ||
| 2061 | next = mpd->b_blocknr; | ||
| 2062 | /* | 2142 | /* |
| 2063 | * If we didn't accumulate anything | 2143 | * If we didn't accumulate anything to write simply return |
| 2064 | * to write simply return | ||
| 2065 | */ | 2144 | */ |
| 2066 | if (!new.b_size) | 2145 | if (!mpd->b_size) |
| 2067 | return 0; | 2146 | return 0; |
| 2068 | 2147 | ||
| 2069 | err = ext4_da_get_block_write(mpd->inode, next, &new, 1); | 2148 | handle = ext4_journal_current_handle(); |
| 2070 | if (err) { | 2149 | BUG_ON(!handle); |
| 2150 | |||
| 2151 | /* | ||
| 2152 | * Call ext4_get_blocks() to allocate any delayed allocation | ||
| 2153 | * blocks, or to convert an uninitialized extent to be | ||
| 2154 | * initialized (in the case where we have written into | ||
| 2155 | * one or more preallocated blocks). | ||
| 2156 | * | ||
| 2157 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to | ||
| 2158 | * indicate that we are on the delayed allocation path. This | ||
| 2159 | * affects functions in many different parts of the allocation | ||
| 2160 | * call path. This flag exists primarily because we don't | ||
| 2161 | * want to change *many* call functions, so ext4_get_blocks() | ||
| 2162 | * will set the magic i_delalloc_reserved_flag once the | ||
| 2163 | * inode's allocation semaphore is taken. | ||
| 2164 | * | ||
| 2165 | * If the blocks in questions were delalloc blocks, set | ||
| 2166 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting | ||
| 2167 | * variables are updated after the blocks have been allocated. | ||
| 2168 | */ | ||
| 2169 | new.b_state = 0; | ||
| 2170 | get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | | ||
| 2171 | EXT4_GET_BLOCKS_DELALLOC_RESERVE); | ||
| 2172 | if (mpd->b_state & (1 << BH_Delay)) | ||
| 2173 | get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; | ||
| 2174 | blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, | ||
| 2175 | &new, get_blocks_flags); | ||
| 2176 | if (blks < 0) { | ||
| 2177 | err = blks; | ||
| 2071 | /* | 2178 | /* |
| 2072 | * If get block returns with error we simply | 2179 | * If get block returns with error we simply |
| 2073 | * return. Later writepage will redirty the page and | 2180 | * return. Later writepage will redirty the page and |
| @@ -2100,12 +2207,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
| 2100 | if (err == -ENOSPC) { | 2207 | if (err == -ENOSPC) { |
| 2101 | ext4_print_free_blocks(mpd->inode); | 2208 | ext4_print_free_blocks(mpd->inode); |
| 2102 | } | 2209 | } |
| 2103 | /* invlaidate all the pages */ | 2210 | /* invalidate all the pages */ |
| 2104 | ext4_da_block_invalidatepages(mpd, next, | 2211 | ext4_da_block_invalidatepages(mpd, next, |
| 2105 | mpd->b_size >> mpd->inode->i_blkbits); | 2212 | mpd->b_size >> mpd->inode->i_blkbits); |
| 2106 | return err; | 2213 | return err; |
| 2107 | } | 2214 | } |
| 2108 | BUG_ON(new.b_size == 0); | 2215 | BUG_ON(blks == 0); |
| 2216 | |||
| 2217 | new.b_size = (blks << mpd->inode->i_blkbits); | ||
| 2109 | 2218 | ||
| 2110 | if (buffer_new(&new)) | 2219 | if (buffer_new(&new)) |
| 2111 | __unmap_underlying_blocks(mpd->inode, &new); | 2220 | __unmap_underlying_blocks(mpd->inode, &new); |
| @@ -2118,6 +2227,23 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
| 2118 | (mpd->b_state & (1 << BH_Unwritten))) | 2227 | (mpd->b_state & (1 << BH_Unwritten))) |
| 2119 | mpage_put_bnr_to_bhs(mpd, next, &new); | 2228 | mpage_put_bnr_to_bhs(mpd, next, &new); |
| 2120 | 2229 | ||
| 2230 | if (ext4_should_order_data(mpd->inode)) { | ||
| 2231 | err = ext4_jbd2_file_inode(handle, mpd->inode); | ||
| 2232 | if (err) | ||
| 2233 | return err; | ||
| 2234 | } | ||
| 2235 | |||
| 2236 | /* | ||
| 2237 | * Update on-disk size along with block allocation. | ||
| 2238 | */ | ||
| 2239 | disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; | ||
| 2240 | if (disksize > i_size_read(mpd->inode)) | ||
| 2241 | disksize = i_size_read(mpd->inode); | ||
| 2242 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { | ||
| 2243 | ext4_update_i_disksize(mpd->inode, disksize); | ||
| 2244 | return ext4_mark_inode_dirty(handle, mpd->inode); | ||
| 2245 | } | ||
| 2246 | |||
| 2121 | return 0; | 2247 | return 0; |
| 2122 | } | 2248 | } |
| 2123 | 2249 | ||
| @@ -2192,6 +2318,17 @@ flush_it: | |||
| 2192 | return; | 2318 | return; |
| 2193 | } | 2319 | } |
| 2194 | 2320 | ||
| 2321 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | ||
| 2322 | { | ||
| 2323 | /* | ||
| 2324 | * unmapped buffer is possible for holes. | ||
| 2325 | * delay buffer is possible with delayed allocation. | ||
| 2326 | * We also need to consider unwritten buffer as unmapped. | ||
| 2327 | */ | ||
| 2328 | return (!buffer_mapped(bh) || buffer_delay(bh) || | ||
| 2329 | buffer_unwritten(bh)) && buffer_dirty(bh); | ||
| 2330 | } | ||
| 2331 | |||
| 2195 | /* | 2332 | /* |
| 2196 | * __mpage_da_writepage - finds extent of pages and blocks | 2333 | * __mpage_da_writepage - finds extent of pages and blocks |
| 2197 | * | 2334 | * |
| @@ -2276,8 +2413,7 @@ static int __mpage_da_writepage(struct page *page, | |||
| 2276 | * Otherwise we won't make progress | 2413 | * Otherwise we won't make progress |
| 2277 | * with the page in ext4_da_writepage | 2414 | * with the page in ext4_da_writepage |
| 2278 | */ | 2415 | */ |
| 2279 | if (buffer_dirty(bh) && | 2416 | if (ext4_bh_unmapped_or_delay(NULL, bh)) { |
| 2280 | (!buffer_mapped(bh) || buffer_delay(bh))) { | ||
| 2281 | mpage_add_bh_to_extent(mpd, logical, | 2417 | mpage_add_bh_to_extent(mpd, logical, |
| 2282 | bh->b_size, | 2418 | bh->b_size, |
| 2283 | bh->b_state); | 2419 | bh->b_state); |
| @@ -2303,8 +2439,16 @@ static int __mpage_da_writepage(struct page *page, | |||
| 2303 | } | 2439 | } |
| 2304 | 2440 | ||
| 2305 | /* | 2441 | /* |
| 2306 | * this is a special callback for ->write_begin() only | 2442 | * This is a special get_blocks_t callback which is used by |
| 2307 | * it's intention is to return mapped block or reserve space | 2443 | * ext4_da_write_begin(). It will either return mapped block or |
| 2444 | * reserve space for a single block. | ||
| 2445 | * | ||
| 2446 | * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. | ||
| 2447 | * We also have b_blocknr = -1 and b_bdev initialized properly | ||
| 2448 | * | ||
| 2449 | * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. | ||
| 2450 | * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev | ||
| 2451 | * initialized properly. | ||
| 2308 | */ | 2452 | */ |
| 2309 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | 2453 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, |
| 2310 | struct buffer_head *bh_result, int create) | 2454 | struct buffer_head *bh_result, int create) |
| @@ -2323,7 +2467,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | |||
| 2323 | * preallocated blocks are unmapped but should treated | 2467 | * preallocated blocks are unmapped but should treated |
| 2324 | * the same as allocated blocks. | 2468 | * the same as allocated blocks. |
| 2325 | */ | 2469 | */ |
| 2326 | ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); | 2470 | ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); |
| 2327 | if ((ret == 0) && !buffer_delay(bh_result)) { | 2471 | if ((ret == 0) && !buffer_delay(bh_result)) { |
| 2328 | /* the block isn't (pre)allocated yet, let's reserve space */ | 2472 | /* the block isn't (pre)allocated yet, let's reserve space */ |
| 2329 | /* | 2473 | /* |
| @@ -2340,40 +2484,53 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | |||
| 2340 | set_buffer_delay(bh_result); | 2484 | set_buffer_delay(bh_result); |
| 2341 | } else if (ret > 0) { | 2485 | } else if (ret > 0) { |
| 2342 | bh_result->b_size = (ret << inode->i_blkbits); | 2486 | bh_result->b_size = (ret << inode->i_blkbits); |
| 2343 | /* | 2487 | if (buffer_unwritten(bh_result)) { |
| 2344 | * With sub-block writes into unwritten extents | 2488 | /* A delayed write to unwritten bh should |
| 2345 | * we also need to mark the buffer as new so that | 2489 | * be marked new and mapped. Mapped ensures |
| 2346 | * the unwritten parts of the buffer gets correctly zeroed. | 2490 | * that we don't do get_block multiple times |
| 2347 | */ | 2491 | * when we write to the same offset and new |
| 2348 | if (buffer_unwritten(bh_result)) | 2492 | * ensures that we do proper zero out for |
| 2493 | * partial write. | ||
| 2494 | */ | ||
| 2349 | set_buffer_new(bh_result); | 2495 | set_buffer_new(bh_result); |
| 2496 | set_buffer_mapped(bh_result); | ||
| 2497 | } | ||
| 2350 | ret = 0; | 2498 | ret = 0; |
| 2351 | } | 2499 | } |
| 2352 | 2500 | ||
| 2353 | return ret; | 2501 | return ret; |
| 2354 | } | 2502 | } |
| 2355 | 2503 | ||
| 2356 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | 2504 | /* |
| 2357 | { | 2505 | * This function is used as a standard get_block_t callback function |
| 2358 | /* | 2506 | * when there is no desire to allocate any blocks. It is used as a |
| 2359 | * unmapped buffer is possible for holes. | 2507 | * callback function for block_prepare_write(), nobh_writepage(), and |
| 2360 | * delay buffer is possible with delayed allocation | 2508 | * block_write_full_page(). These functions should only try to map a |
| 2361 | */ | 2509 | * single block at a time. |
| 2362 | return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); | 2510 | * |
| 2363 | } | 2511 | * Since this function doesn't do block allocations even if the caller |
| 2364 | 2512 | * requests it by passing in create=1, it is critically important that | |
| 2365 | static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, | 2513 | * any caller checks to make sure that any buffer heads are returned |
| 2514 | * by this function are either all already mapped or marked for | ||
| 2515 | * delayed allocation before calling nobh_writepage() or | ||
| 2516 | * block_write_full_page(). Otherwise, b_blocknr could be left | ||
| 2517 | * unitialized, and the page write functions will be taken by | ||
| 2518 | * surprise. | ||
| 2519 | */ | ||
| 2520 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, | ||
| 2366 | struct buffer_head *bh_result, int create) | 2521 | struct buffer_head *bh_result, int create) |
| 2367 | { | 2522 | { |
| 2368 | int ret = 0; | 2523 | int ret = 0; |
| 2369 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | 2524 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
| 2370 | 2525 | ||
| 2526 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | ||
| 2527 | |||
| 2371 | /* | 2528 | /* |
| 2372 | * we don't want to do block allocation in writepage | 2529 | * we don't want to do block allocation in writepage |
| 2373 | * so call get_block_wrap with create = 0 | 2530 | * so call get_block_wrap with create = 0 |
| 2374 | */ | 2531 | */ |
| 2375 | ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, | 2532 | ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); |
| 2376 | bh_result, 0, 0, 0); | 2533 | BUG_ON(create && ret == 0); |
| 2377 | if (ret > 0) { | 2534 | if (ret > 0) { |
| 2378 | bh_result->b_size = (ret << inode->i_blkbits); | 2535 | bh_result->b_size = (ret << inode->i_blkbits); |
| 2379 | ret = 0; | 2536 | ret = 0; |
| @@ -2382,10 +2539,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, | |||
| 2382 | } | 2539 | } |
| 2383 | 2540 | ||
| 2384 | /* | 2541 | /* |
| 2385 | * get called vi ext4_da_writepages after taking page lock (have journal handle) | 2542 | * This function can get called via... |
| 2386 | * get called via journal_submit_inode_data_buffers (no journal handle) | 2543 | * - ext4_da_writepages after taking page lock (have journal handle) |
| 2387 | * get called via shrink_page_list via pdflush (no journal handle) | 2544 | * - journal_submit_inode_data_buffers (no journal handle) |
| 2388 | * or grab_page_cache when doing write_begin (have journal handle) | 2545 | * - shrink_page_list via pdflush (no journal handle) |
| 2546 | * - grab_page_cache when doing write_begin (have journal handle) | ||
| 2389 | */ | 2547 | */ |
| 2390 | static int ext4_da_writepage(struct page *page, | 2548 | static int ext4_da_writepage(struct page *page, |
| 2391 | struct writeback_control *wbc) | 2549 | struct writeback_control *wbc) |
| @@ -2436,7 +2594,7 @@ static int ext4_da_writepage(struct page *page, | |||
| 2436 | * do block allocation here. | 2594 | * do block allocation here. |
| 2437 | */ | 2595 | */ |
| 2438 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | 2596 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, |
| 2439 | ext4_normal_get_block_write); | 2597 | noalloc_get_block_write); |
| 2440 | if (!ret) { | 2598 | if (!ret) { |
| 2441 | page_bufs = page_buffers(page); | 2599 | page_bufs = page_buffers(page); |
| 2442 | /* check whether all are mapped and non delay */ | 2600 | /* check whether all are mapped and non delay */ |
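The "check whether all are mapped and non delay" step referred to above matters because noalloc_get_block_write() never allocates: if any buffer is still unmapped, delayed, or unwritten, the page has to be redirtied rather than handed to nobh_writepage() or block_write_full_page(). A hedged sketch of that check — walk_page_buffers() is the per-buffer walker ext4 already uses, but the exact arguments here are assumptions:

    /* Sketch: refuse writeback of a page whose buffers still need allocation. */
    page_bufs = page_buffers(page);
    if (walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
                          NULL, ext4_bh_unmapped_or_delay)) {
            /* Leave the page dirty; the delalloc writepages path will
             * allocate blocks for it later. */
            redirty_page_for_writepage(wbc, page);
            unlock_page(page);
            return 0;
    }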
| @@ -2461,11 +2619,10 @@ static int ext4_da_writepage(struct page *page, | |||
| 2461 | } | 2619 | } |
| 2462 | 2620 | ||
| 2463 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | 2621 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) |
| 2464 | ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); | 2622 | ret = nobh_writepage(page, noalloc_get_block_write, wbc); |
| 2465 | else | 2623 | else |
| 2466 | ret = block_write_full_page(page, | 2624 | ret = block_write_full_page(page, noalloc_get_block_write, |
| 2467 | ext4_normal_get_block_write, | 2625 | wbc); |
| 2468 | wbc); | ||
| 2469 | 2626 | ||
| 2470 | return ret; | 2627 | return ret; |
| 2471 | } | 2628 | } |
| @@ -2777,7 +2934,7 @@ retry: | |||
| 2777 | *pagep = page; | 2934 | *pagep = page; |
| 2778 | 2935 | ||
| 2779 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 2936 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, |
| 2780 | ext4_da_get_block_prep); | 2937 | ext4_da_get_block_prep); |
| 2781 | if (ret < 0) { | 2938 | if (ret < 0) { |
| 2782 | unlock_page(page); | 2939 | unlock_page(page); |
| 2783 | ext4_journal_stop(handle); | 2940 | ext4_journal_stop(handle); |
| @@ -2815,7 +2972,7 @@ static int ext4_da_should_update_i_disksize(struct page *page, | |||
| 2815 | for (i = 0; i < idx; i++) | 2972 | for (i = 0; i < idx; i++) |
| 2816 | bh = bh->b_this_page; | 2973 | bh = bh->b_this_page; |
| 2817 | 2974 | ||
| 2818 | if (!buffer_mapped(bh) || (buffer_delay(bh))) | 2975 | if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) |
| 2819 | return 0; | 2976 | return 0; |
| 2820 | return 1; | 2977 | return 1; |
| 2821 | } | 2978 | } |
| @@ -3085,12 +3242,10 @@ static int __ext4_normal_writepage(struct page *page, | |||
| 3085 | struct inode *inode = page->mapping->host; | 3242 | struct inode *inode = page->mapping->host; |
| 3086 | 3243 | ||
| 3087 | if (test_opt(inode->i_sb, NOBH)) | 3244 | if (test_opt(inode->i_sb, NOBH)) |
| 3088 | return nobh_writepage(page, | 3245 | return nobh_writepage(page, noalloc_get_block_write, wbc); |
| 3089 | ext4_normal_get_block_write, wbc); | ||
| 3090 | else | 3246 | else |
| 3091 | return block_write_full_page(page, | 3247 | return block_write_full_page(page, noalloc_get_block_write, |
| 3092 | ext4_normal_get_block_write, | 3248 | wbc); |
| 3093 | wbc); | ||
| 3094 | } | 3249 | } |
| 3095 | 3250 | ||
| 3096 | static int ext4_normal_writepage(struct page *page, | 3251 | static int ext4_normal_writepage(struct page *page, |
| @@ -3142,7 +3297,7 @@ static int __ext4_journalled_writepage(struct page *page, | |||
| 3142 | int err; | 3297 | int err; |
| 3143 | 3298 | ||
| 3144 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | 3299 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, |
| 3145 | ext4_normal_get_block_write); | 3300 | noalloc_get_block_write); |
| 3146 | if (ret != 0) | 3301 | if (ret != 0) |
| 3147 | goto out_unlock; | 3302 | goto out_unlock; |
| 3148 | 3303 | ||
| @@ -3227,9 +3382,8 @@ static int ext4_journalled_writepage(struct page *page, | |||
| 3227 | * really know unless we go poke around in the buffer_heads. | 3382 | * really know unless we go poke around in the buffer_heads. |
| 3228 | * But block_write_full_page will do the right thing. | 3383 | * But block_write_full_page will do the right thing. |
| 3229 | */ | 3384 | */ |
| 3230 | return block_write_full_page(page, | 3385 | return block_write_full_page(page, noalloc_get_block_write, |
| 3231 | ext4_normal_get_block_write, | 3386 | wbc); |
| 3232 | wbc); | ||
| 3233 | } | 3387 | } |
| 3234 | no_write: | 3388 | no_write: |
| 3235 | redirty_page_for_writepage(wbc, page); | 3389 | redirty_page_for_writepage(wbc, page); |
| @@ -3973,7 +4127,8 @@ void ext4_truncate(struct inode *inode) | |||
| 3973 | if (!ext4_can_truncate(inode)) | 4127 | if (!ext4_can_truncate(inode)) |
| 3974 | return; | 4128 | return; |
| 3975 | 4129 | ||
| 3976 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | 4130 | if (ei->i_disksize && inode->i_size == 0 && |
| 4131 | !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | ||
| 3977 | ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; | 4132 | ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; |
| 3978 | 4133 | ||
| 3979 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 4134 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
| @@ -4715,25 +4870,6 @@ int ext4_write_inode(struct inode *inode, int wait) | |||
| 4715 | return ext4_force_commit(inode->i_sb); | 4870 | return ext4_force_commit(inode->i_sb); |
| 4716 | } | 4871 | } |
| 4717 | 4872 | ||
| 4718 | int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) | ||
| 4719 | { | ||
| 4720 | int err = 0; | ||
| 4721 | |||
| 4722 | mark_buffer_dirty(bh); | ||
| 4723 | if (inode && inode_needs_sync(inode)) { | ||
| 4724 | sync_dirty_buffer(bh); | ||
| 4725 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | ||
| 4726 | ext4_error(inode->i_sb, __func__, | ||
| 4727 | "IO error syncing inode, " | ||
| 4728 | "inode=%lu, block=%llu", | ||
| 4729 | inode->i_ino, | ||
| 4730 | (unsigned long long)bh->b_blocknr); | ||
| 4731 | err = -EIO; | ||
| 4732 | } | ||
| 4733 | } | ||
| 4734 | return err; | ||
| 4735 | } | ||
| 4736 | |||
| 4737 | /* | 4873 | /* |
| 4738 | * ext4_setattr() | 4874 | * ext4_setattr() |
| 4739 | * | 4875 | * |
| @@ -4930,7 +5066,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
| 4930 | */ | 5066 | */ |
| 4931 | int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 5067 | int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
| 4932 | { | 5068 | { |
| 4933 | int groups, gdpblocks; | 5069 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); |
| 5070 | int gdpblocks; | ||
| 4934 | int idxblocks; | 5071 | int idxblocks; |
| 4935 | int ret = 0; | 5072 | int ret = 0; |
| 4936 | 5073 | ||
| @@ -4957,8 +5094,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
| 4957 | groups += nrblocks; | 5094 | groups += nrblocks; |
| 4958 | 5095 | ||
| 4959 | gdpblocks = groups; | 5096 | gdpblocks = groups; |
| 4960 | if (groups > EXT4_SB(inode->i_sb)->s_groups_count) | 5097 | if (groups > ngroups) |
| 4961 | groups = EXT4_SB(inode->i_sb)->s_groups_count; | 5098 | groups = ngroups; |
| 4962 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) | 5099 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) |
| 4963 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; | 5100 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; |
| 4964 | 5101 | ||
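As a concrete example of the clamping above: if the worst-case estimate works out to groups = 50 on a filesystem with only ngroups = 16 block groups and s_gdb_count = 1 descriptor block, the credit calculation is capped at 16 block bitmaps and 1 group-descriptor block, since one transaction can never dirty more of either than actually exist.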
| @@ -4998,7 +5135,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) | |||
| 4998 | * Calculate the journal credits for a chunk of data modification. | 5135 | * Calculate the journal credits for a chunk of data modification. |
| 4999 | * | 5136 | * |
| 5000 | * This is called from DIO, fallocate or whoever calling | 5137 | * This is called from DIO, fallocate or whoever calling |
| 5001 | * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. | 5138 | * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. |
| 5002 | * | 5139 | * |
| 5003 | * journal buffers for data blocks are not included here, as DIO | 5140 | * journal buffers for data blocks are not included here, as DIO |
| 5004 | * and fallocate do no need to journal data buffers. | 5141 | * and fallocate do no need to journal data buffers. |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677a7984..ed8482e22c0e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
| @@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr) | |||
| 372 | ext4_set_bit(bit, addr); | 372 | ext4_set_bit(bit, addr); |
| 373 | } | 373 | } |
| 374 | 374 | ||
| 375 | static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) | ||
| 376 | { | ||
| 377 | addr = mb_correct_addr_and_bit(&bit, addr); | ||
| 378 | ext4_set_bit_atomic(lock, bit, addr); | ||
| 379 | } | ||
| 380 | |||
| 381 | static inline void mb_clear_bit(int bit, void *addr) | 375 | static inline void mb_clear_bit(int bit, void *addr) |
| 382 | { | 376 | { |
| 383 | addr = mb_correct_addr_and_bit(&bit, addr); | 377 | addr = mb_correct_addr_and_bit(&bit, addr); |
| 384 | ext4_clear_bit(bit, addr); | 378 | ext4_clear_bit(bit, addr); |
| 385 | } | 379 | } |
| 386 | 380 | ||
| 387 | static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) | ||
| 388 | { | ||
| 389 | addr = mb_correct_addr_and_bit(&bit, addr); | ||
| 390 | ext4_clear_bit_atomic(lock, bit, addr); | ||
| 391 | } | ||
| 392 | |||
| 393 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) | 381 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) |
| 394 | { | 382 | { |
| 395 | int fix = 0, ret, tmpmax; | 383 | int fix = 0, ret, tmpmax; |
| @@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, | |||
| 448 | 436 | ||
| 449 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) | 437 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) |
| 450 | return; | 438 | return; |
| 451 | BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); | 439 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); |
| 452 | for (i = 0; i < count; i++) { | 440 | for (i = 0; i < count; i++) { |
| 453 | if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { | 441 | if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { |
| 454 | ext4_fsblk_t blocknr; | 442 | ext4_fsblk_t blocknr; |
| @@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) | |||
| 472 | 460 | ||
| 473 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) | 461 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) |
| 474 | return; | 462 | return; |
| 475 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | 463 | assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); |
| 476 | for (i = 0; i < count; i++) { | 464 | for (i = 0; i < count; i++) { |
| 477 | BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); | 465 | BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); |
| 478 | mb_set_bit(first + i, e4b->bd_info->bb_bitmap); | 466 | mb_set_bit(first + i, e4b->bd_info->bb_bitmap); |
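mb_set_bit_atomic()/mb_clear_bit_atomic() disappear because every caller is now expected to hold the group's spinlock, which the debugging helpers above assert via assert_spin_locked(ext4_group_lock_ptr(...)). The resulting pattern, sketched from the lock/unlock calls visible elsewhere in this patch (the bitmap and range variables are illustrative):

    /* Sketch: with the group lock held, plain non-atomic bitops suffice. */
    ext4_lock_group(sb, group);
    mb_set_bits(bitmap_bh->b_data, start_bit, blocks);  /* no spinlock arg */
    ext4_unlock_group(sb, group);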
| @@ -739,6 +727,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, | |||
| 739 | 727 | ||
| 740 | static int ext4_mb_init_cache(struct page *page, char *incore) | 728 | static int ext4_mb_init_cache(struct page *page, char *incore) |
| 741 | { | 729 | { |
| 730 | ext4_group_t ngroups; | ||
| 742 | int blocksize; | 731 | int blocksize; |
| 743 | int blocks_per_page; | 732 | int blocks_per_page; |
| 744 | int groups_per_page; | 733 | int groups_per_page; |
| @@ -757,6 +746,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
| 757 | 746 | ||
| 758 | inode = page->mapping->host; | 747 | inode = page->mapping->host; |
| 759 | sb = inode->i_sb; | 748 | sb = inode->i_sb; |
| 749 | ngroups = ext4_get_groups_count(sb); | ||
| 760 | blocksize = 1 << inode->i_blkbits; | 750 | blocksize = 1 << inode->i_blkbits; |
| 761 | blocks_per_page = PAGE_CACHE_SIZE / blocksize; | 751 | blocks_per_page = PAGE_CACHE_SIZE / blocksize; |
| 762 | 752 | ||
| @@ -780,7 +770,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
| 780 | for (i = 0; i < groups_per_page; i++) { | 770 | for (i = 0; i < groups_per_page; i++) { |
| 781 | struct ext4_group_desc *desc; | 771 | struct ext4_group_desc *desc; |
| 782 | 772 | ||
| 783 | if (first_group + i >= EXT4_SB(sb)->s_groups_count) | 773 | if (first_group + i >= ngroups) |
| 784 | break; | 774 | break; |
| 785 | 775 | ||
| 786 | err = -EIO; | 776 | err = -EIO; |
| @@ -801,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
| 801 | unlock_buffer(bh[i]); | 791 | unlock_buffer(bh[i]); |
| 802 | continue; | 792 | continue; |
| 803 | } | 793 | } |
| 804 | spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | 794 | ext4_lock_group(sb, first_group + i); |
| 805 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 795 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 806 | ext4_init_block_bitmap(sb, bh[i], | 796 | ext4_init_block_bitmap(sb, bh[i], |
| 807 | first_group + i, desc); | 797 | first_group + i, desc); |
| 808 | set_bitmap_uptodate(bh[i]); | 798 | set_bitmap_uptodate(bh[i]); |
| 809 | set_buffer_uptodate(bh[i]); | 799 | set_buffer_uptodate(bh[i]); |
| 810 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | 800 | ext4_unlock_group(sb, first_group + i); |
| 811 | unlock_buffer(bh[i]); | 801 | unlock_buffer(bh[i]); |
| 812 | continue; | 802 | continue; |
| 813 | } | 803 | } |
| 814 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | 804 | ext4_unlock_group(sb, first_group + i); |
| 815 | if (buffer_uptodate(bh[i])) { | 805 | if (buffer_uptodate(bh[i])) { |
| 816 | /* | 806 | /* |
| 817 | * if not uninit if bh is uptodate, | 807 | * if not uninit if bh is uptodate, |
| @@ -852,7 +842,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
| 852 | struct ext4_group_info *grinfo; | 842 | struct ext4_group_info *grinfo; |
| 853 | 843 | ||
| 854 | group = (first_block + i) >> 1; | 844 | group = (first_block + i) >> 1; |
| 855 | if (group >= EXT4_SB(sb)->s_groups_count) | 845 | if (group >= ngroups) |
| 856 | break; | 846 | break; |
| 857 | 847 | ||
| 858 | /* | 848 | /* |
| @@ -1078,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) | |||
| 1078 | return 0; | 1068 | return 0; |
| 1079 | } | 1069 | } |
| 1080 | 1070 | ||
| 1081 | static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) | 1071 | static void mb_clear_bits(void *bm, int cur, int len) |
| 1082 | { | 1072 | { |
| 1083 | __u32 *addr; | 1073 | __u32 *addr; |
| 1084 | 1074 | ||
| @@ -1091,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) | |||
| 1091 | cur += 32; | 1081 | cur += 32; |
| 1092 | continue; | 1082 | continue; |
| 1093 | } | 1083 | } |
| 1094 | if (lock) | 1084 | mb_clear_bit(cur, bm); |
| 1095 | mb_clear_bit_atomic(lock, cur, bm); | ||
| 1096 | else | ||
| 1097 | mb_clear_bit(cur, bm); | ||
| 1098 | cur++; | 1085 | cur++; |
| 1099 | } | 1086 | } |
| 1100 | } | 1087 | } |
| 1101 | 1088 | ||
| 1102 | static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | 1089 | static void mb_set_bits(void *bm, int cur, int len) |
| 1103 | { | 1090 | { |
| 1104 | __u32 *addr; | 1091 | __u32 *addr; |
| 1105 | 1092 | ||
| @@ -1112,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | |||
| 1112 | cur += 32; | 1099 | cur += 32; |
| 1113 | continue; | 1100 | continue; |
| 1114 | } | 1101 | } |
| 1115 | if (lock) | 1102 | mb_set_bit(cur, bm); |
| 1116 | mb_set_bit_atomic(lock, cur, bm); | ||
| 1117 | else | ||
| 1118 | mb_set_bit(cur, bm); | ||
| 1119 | cur++; | 1103 | cur++; |
| 1120 | } | 1104 | } |
| 1121 | } | 1105 | } |
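
The two hunks above drop the mb_set_bit_atomic()/mb_clear_bit_atomic() variants entirely: callers are expected to hold the group's spinlock around the whole bitmap update, so the plain non-atomic helpers are enough inside the critical section. A minimal userspace sketch of that "lock once, then non-atomic bit ops" pattern, with a pthread mutex standing in for the group spinlock (all names here are illustrative, not ext4 API):

#include <pthread.h>
#include <stdint.h>

/* Non-atomic bit helpers: only safe while the caller holds the lock
 * protecting the bitmap, mirroring mb_set_bits()/mb_clear_bits() after
 * the change above. */
static inline void bm_set_bit(uint32_t *bm, int bit)
{
        bm[bit >> 5] |= 1u << (bit & 31);
}

static inline void bm_clear_bit(uint32_t *bm, int bit)
{
        bm[bit >> 5] &= ~(1u << (bit & 31));
}

struct group {
        pthread_mutex_t lock;           /* stands in for the group spinlock */
        uint32_t bitmap[256];
};

/* One lock round-trip covers the whole range instead of paying for an
 * atomic operation per bit. */
static void group_mark_used(struct group *g, int start, int len)
{
        pthread_mutex_lock(&g->lock);
        for (int i = 0; i < len; i++)
                bm_set_bit(g->bitmap, start + i);
        pthread_mutex_unlock(&g->lock);
}
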
| @@ -1131,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
| 1131 | struct super_block *sb = e4b->bd_sb; | 1115 | struct super_block *sb = e4b->bd_sb; |
| 1132 | 1116 | ||
| 1133 | BUG_ON(first + count > (sb->s_blocksize << 3)); | 1117 | BUG_ON(first + count > (sb->s_blocksize << 3)); |
| 1134 | BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); | 1118 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); |
| 1135 | mb_check_buddy(e4b); | 1119 | mb_check_buddy(e4b); |
| 1136 | mb_free_blocks_double(inode, e4b, first, count); | 1120 | mb_free_blocks_double(inode, e4b, first, count); |
| 1137 | 1121 | ||
| @@ -1212,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, | |||
| 1212 | int ord; | 1196 | int ord; |
| 1213 | void *buddy; | 1197 | void *buddy; |
| 1214 | 1198 | ||
| 1215 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | 1199 | assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); |
| 1216 | BUG_ON(ex == NULL); | 1200 | BUG_ON(ex == NULL); |
| 1217 | 1201 | ||
| 1218 | buddy = mb_find_buddy(e4b, order, &max); | 1202 | buddy = mb_find_buddy(e4b, order, &max); |
| @@ -1276,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
| 1276 | 1260 | ||
| 1277 | BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); | 1261 | BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); |
| 1278 | BUG_ON(e4b->bd_group != ex->fe_group); | 1262 | BUG_ON(e4b->bd_group != ex->fe_group); |
| 1279 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | 1263 | assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); |
| 1280 | mb_check_buddy(e4b); | 1264 | mb_check_buddy(e4b); |
| 1281 | mb_mark_used_double(e4b, start, len); | 1265 | mb_mark_used_double(e4b, start, len); |
| 1282 | 1266 | ||
| @@ -1330,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
| 1330 | e4b->bd_info->bb_counters[ord]++; | 1314 | e4b->bd_info->bb_counters[ord]++; |
| 1331 | } | 1315 | } |
| 1332 | 1316 | ||
| 1333 | mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), | 1317 | mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); |
| 1334 | EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | ||
| 1335 | mb_check_buddy(e4b); | 1318 | mb_check_buddy(e4b); |
| 1336 | 1319 | ||
| 1337 | return ret; | 1320 | return ret; |
| @@ -1726,7 +1709,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, | |||
| 1726 | unsigned free, fragments; | 1709 | unsigned free, fragments; |
| 1727 | unsigned i, bits; | 1710 | unsigned i, bits; |
| 1728 | int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); | 1711 | int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); |
| 1729 | struct ext4_group_desc *desc; | ||
| 1730 | struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); | 1712 | struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); |
| 1731 | 1713 | ||
| 1732 | BUG_ON(cr < 0 || cr >= 4); | 1714 | BUG_ON(cr < 0 || cr >= 4); |
| @@ -1742,10 +1724,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, | |||
| 1742 | switch (cr) { | 1724 | switch (cr) { |
| 1743 | case 0: | 1725 | case 0: |
| 1744 | BUG_ON(ac->ac_2order == 0); | 1726 | BUG_ON(ac->ac_2order == 0); |
| 1745 | /* If this group is uninitialized, skip it initially */ | ||
| 1746 | desc = ext4_get_group_desc(ac->ac_sb, group, NULL); | ||
| 1747 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) | ||
| 1748 | return 0; | ||
| 1749 | 1727 | ||
| 1750 | /* Avoid using the first bg of a flexgroup for data files */ | 1728 | /* Avoid using the first bg of a flexgroup for data files */ |
| 1751 | if ((ac->ac_flags & EXT4_MB_HINT_DATA) && | 1729 | if ((ac->ac_flags & EXT4_MB_HINT_DATA) && |
| @@ -1788,6 +1766,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) | |||
| 1788 | int block, pnum; | 1766 | int block, pnum; |
| 1789 | int blocks_per_page; | 1767 | int blocks_per_page; |
| 1790 | int groups_per_page; | 1768 | int groups_per_page; |
| 1769 | ext4_group_t ngroups = ext4_get_groups_count(sb); | ||
| 1791 | ext4_group_t first_group; | 1770 | ext4_group_t first_group; |
| 1792 | struct ext4_group_info *grp; | 1771 | struct ext4_group_info *grp; |
| 1793 | 1772 | ||
| @@ -1807,7 +1786,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) | |||
| 1807 | /* read all groups the page covers into the cache */ | 1786 | /* read all groups the page covers into the cache */ |
| 1808 | for (i = 0; i < groups_per_page; i++) { | 1787 | for (i = 0; i < groups_per_page; i++) { |
| 1809 | 1788 | ||
| 1810 | if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) | 1789 | if ((first_group + i) >= ngroups) |
| 1811 | break; | 1790 | break; |
| 1812 | grp = ext4_get_group_info(sb, first_group + i); | 1791 | grp = ext4_get_group_info(sb, first_group + i); |
| 1813 | /* take all groups write allocation | 1792 | /* take all groups write allocation |
| @@ -1945,8 +1924,7 @@ err: | |||
| 1945 | static noinline_for_stack int | 1924 | static noinline_for_stack int |
| 1946 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | 1925 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) |
| 1947 | { | 1926 | { |
| 1948 | ext4_group_t group; | 1927 | ext4_group_t ngroups, group, i; |
| 1949 | ext4_group_t i; | ||
| 1950 | int cr; | 1928 | int cr; |
| 1951 | int err = 0; | 1929 | int err = 0; |
| 1952 | int bsbits; | 1930 | int bsbits; |
| @@ -1957,6 +1935,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
| 1957 | 1935 | ||
| 1958 | sb = ac->ac_sb; | 1936 | sb = ac->ac_sb; |
| 1959 | sbi = EXT4_SB(sb); | 1937 | sbi = EXT4_SB(sb); |
| 1938 | ngroups = ext4_get_groups_count(sb); | ||
| 1960 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); | 1939 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); |
| 1961 | 1940 | ||
| 1962 | /* first, try the goal */ | 1941 | /* first, try the goal */ |
| @@ -2017,11 +1996,11 @@ repeat: | |||
| 2017 | */ | 1996 | */ |
| 2018 | group = ac->ac_g_ex.fe_group; | 1997 | group = ac->ac_g_ex.fe_group; |
| 2019 | 1998 | ||
| 2020 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { | 1999 | for (i = 0; i < ngroups; group++, i++) { |
| 2021 | struct ext4_group_info *grp; | 2000 | struct ext4_group_info *grp; |
| 2022 | struct ext4_group_desc *desc; | 2001 | struct ext4_group_desc *desc; |
| 2023 | 2002 | ||
| 2024 | if (group == EXT4_SB(sb)->s_groups_count) | 2003 | if (group == ngroups) |
| 2025 | group = 0; | 2004 | group = 0; |
| 2026 | 2005 | ||
| 2027 | /* quick check to skip empty groups */ | 2006 | /* quick check to skip empty groups */ |
| @@ -2064,9 +2043,7 @@ repeat: | |||
| 2064 | 2043 | ||
| 2065 | ac->ac_groups_scanned++; | 2044 | ac->ac_groups_scanned++; |
| 2066 | desc = ext4_get_group_desc(sb, group, NULL); | 2045 | desc = ext4_get_group_desc(sb, group, NULL); |
| 2067 | if (cr == 0 || (desc->bg_flags & | 2046 | if (cr == 0) |
| 2068 | cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && | ||
| 2069 | ac->ac_2order != 0)) | ||
| 2070 | ext4_mb_simple_scan_group(ac, &e4b); | 2047 | ext4_mb_simple_scan_group(ac, &e4b); |
| 2071 | else if (cr == 1 && | 2048 | else if (cr == 1 && |
| 2072 | ac->ac_g_ex.fe_len == sbi->s_stripe) | 2049 | ac->ac_g_ex.fe_len == sbi->s_stripe) |
| @@ -2315,12 +2292,10 @@ static struct file_operations ext4_mb_seq_history_fops = { | |||
| 2315 | static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) | 2292 | static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) |
| 2316 | { | 2293 | { |
| 2317 | struct super_block *sb = seq->private; | 2294 | struct super_block *sb = seq->private; |
| 2318 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 2319 | ext4_group_t group; | 2295 | ext4_group_t group; |
| 2320 | 2296 | ||
| 2321 | if (*pos < 0 || *pos >= sbi->s_groups_count) | 2297 | if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) |
| 2322 | return NULL; | 2298 | return NULL; |
| 2323 | |||
| 2324 | group = *pos + 1; | 2299 | group = *pos + 1; |
| 2325 | return (void *) ((unsigned long) group); | 2300 | return (void *) ((unsigned long) group); |
| 2326 | } | 2301 | } |
| @@ -2328,11 +2303,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) | |||
| 2328 | static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) | 2303 | static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) |
| 2329 | { | 2304 | { |
| 2330 | struct super_block *sb = seq->private; | 2305 | struct super_block *sb = seq->private; |
| 2331 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 2332 | ext4_group_t group; | 2306 | ext4_group_t group; |
| 2333 | 2307 | ||
| 2334 | ++*pos; | 2308 | ++*pos; |
| 2335 | if (*pos < 0 || *pos >= sbi->s_groups_count) | 2309 | if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) |
| 2336 | return NULL; | 2310 | return NULL; |
| 2337 | group = *pos + 1; | 2311 | group = *pos + 1; |
| 2338 | return (void *) ((unsigned long) group); | 2312 | return (void *) ((unsigned long) group); |
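
As in most loops in this file, the seq_file iterator now snapshots the group count once through ext4_get_groups_count() instead of re-reading sbi->s_groups_count on every check, since an online resize can grow the count underneath it. A hedged sketch of the caller-side idiom (get_groups_count() is assumed to be the barrier-protected accessor this series introduces; everything else is illustrative):

#include <stddef.h>

static void for_each_group(size_t (*get_groups_count)(void),
                           void (*visit)(size_t group))
{
        /* Read the count once; groups added after this point are simply
         * not visited by this pass, which is safe for a grow-only count. */
        size_t ngroups = get_groups_count();

        for (size_t i = 0; i < ngroups; i++)
                visit(i);
}
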
| @@ -2420,7 +2394,8 @@ static void ext4_mb_history_release(struct super_block *sb) | |||
| 2420 | 2394 | ||
| 2421 | if (sbi->s_proc != NULL) { | 2395 | if (sbi->s_proc != NULL) { |
| 2422 | remove_proc_entry("mb_groups", sbi->s_proc); | 2396 | remove_proc_entry("mb_groups", sbi->s_proc); |
| 2423 | remove_proc_entry("mb_history", sbi->s_proc); | 2397 | if (sbi->s_mb_history_max) |
| 2398 | remove_proc_entry("mb_history", sbi->s_proc); | ||
| 2424 | } | 2399 | } |
| 2425 | kfree(sbi->s_mb_history); | 2400 | kfree(sbi->s_mb_history); |
| 2426 | } | 2401 | } |
| @@ -2431,17 +2406,17 @@ static void ext4_mb_history_init(struct super_block *sb) | |||
| 2431 | int i; | 2406 | int i; |
| 2432 | 2407 | ||
| 2433 | if (sbi->s_proc != NULL) { | 2408 | if (sbi->s_proc != NULL) { |
| 2434 | proc_create_data("mb_history", S_IRUGO, sbi->s_proc, | 2409 | if (sbi->s_mb_history_max) |
| 2435 | &ext4_mb_seq_history_fops, sb); | 2410 | proc_create_data("mb_history", S_IRUGO, sbi->s_proc, |
| 2411 | &ext4_mb_seq_history_fops, sb); | ||
| 2436 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, | 2412 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, |
| 2437 | &ext4_mb_seq_groups_fops, sb); | 2413 | &ext4_mb_seq_groups_fops, sb); |
| 2438 | } | 2414 | } |
| 2439 | 2415 | ||
| 2440 | sbi->s_mb_history_max = 1000; | ||
| 2441 | sbi->s_mb_history_cur = 0; | 2416 | sbi->s_mb_history_cur = 0; |
| 2442 | spin_lock_init(&sbi->s_mb_history_lock); | 2417 | spin_lock_init(&sbi->s_mb_history_lock); |
| 2443 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); | 2418 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); |
| 2444 | sbi->s_mb_history = kzalloc(i, GFP_KERNEL); | 2419 | sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; |
| 2445 | /* if we can't allocate history, then we simply won't use it */ | 2420 | /* if we can't allocate history, then we simply won't use it */ |
| 2446 | } | 2421 | } |
| 2447 | 2422 | ||
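
With s_mb_history_max now supplied by a module parameter that may legitimately be 0, the hunk guards both the proc file creation and the allocation. The explicit "i ? kzalloc(i, GFP_KERNEL) : NULL" matters because kzalloc(0, ...) typically returns the kernel's special zero-size marker rather than NULL, so the later s_mb_history == NULL test would otherwise stop meaning "history disabled". A userspace sketch of the same guard, assuming nothing beyond standard C:

#include <stdlib.h>

struct history_buf {
        size_t max;      /* configured entry count; 0 means disabled */
        void *entries;   /* NULL when disabled or allocation failed */
};

static void history_init(struct history_buf *h, size_t max, size_t entry_size)
{
        h->max = max;
        /* Allocate only for a non-zero size so a plain NULL check later
         * reliably means "do not record history". */
        h->entries = max ? calloc(max, entry_size) : NULL;
}

static int history_enabled(const struct history_buf *h)
{
        return h->entries != NULL;
}
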
| @@ -2451,7 +2426,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) | |||
| 2451 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | 2426 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); |
| 2452 | struct ext4_mb_history h; | 2427 | struct ext4_mb_history h; |
| 2453 | 2428 | ||
| 2454 | if (unlikely(sbi->s_mb_history == NULL)) | 2429 | if (sbi->s_mb_history == NULL) |
| 2455 | return; | 2430 | return; |
| 2456 | 2431 | ||
| 2457 | if (!(ac->ac_op & sbi->s_mb_history_filter)) | 2432 | if (!(ac->ac_op & sbi->s_mb_history_filter)) |
| @@ -2587,6 +2562,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) | |||
| 2587 | 2562 | ||
| 2588 | static int ext4_mb_init_backend(struct super_block *sb) | 2563 | static int ext4_mb_init_backend(struct super_block *sb) |
| 2589 | { | 2564 | { |
| 2565 | ext4_group_t ngroups = ext4_get_groups_count(sb); | ||
| 2590 | ext4_group_t i; | 2566 | ext4_group_t i; |
| 2591 | int metalen; | 2567 | int metalen; |
| 2592 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2568 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| @@ -2598,7 +2574,7 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
| 2598 | struct ext4_group_desc *desc; | 2574 | struct ext4_group_desc *desc; |
| 2599 | 2575 | ||
| 2600 | /* This is the number of blocks used by GDT */ | 2576 | /* This is the number of blocks used by GDT */ |
| 2601 | num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - | 2577 | num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - |
| 2602 | 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); | 2578 | 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); |
| 2603 | 2579 | ||
| 2604 | /* | 2580 | /* |
| @@ -2644,7 +2620,7 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
| 2644 | for (i = 0; i < num_meta_group_infos; i++) { | 2620 | for (i = 0; i < num_meta_group_infos; i++) { |
| 2645 | if ((i + 1) == num_meta_group_infos) | 2621 | if ((i + 1) == num_meta_group_infos) |
| 2646 | metalen = sizeof(*meta_group_info) * | 2622 | metalen = sizeof(*meta_group_info) * |
| 2647 | (sbi->s_groups_count - | 2623 | (ngroups - |
| 2648 | (i << EXT4_DESC_PER_BLOCK_BITS(sb))); | 2624 | (i << EXT4_DESC_PER_BLOCK_BITS(sb))); |
| 2649 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | 2625 | meta_group_info = kmalloc(metalen, GFP_KERNEL); |
| 2650 | if (meta_group_info == NULL) { | 2626 | if (meta_group_info == NULL) { |
| @@ -2655,7 +2631,7 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
| 2655 | sbi->s_group_info[i] = meta_group_info; | 2631 | sbi->s_group_info[i] = meta_group_info; |
| 2656 | } | 2632 | } |
| 2657 | 2633 | ||
| 2658 | for (i = 0; i < sbi->s_groups_count; i++) { | 2634 | for (i = 0; i < ngroups; i++) { |
| 2659 | desc = ext4_get_group_desc(sb, i, NULL); | 2635 | desc = ext4_get_group_desc(sb, i, NULL); |
| 2660 | if (desc == NULL) { | 2636 | if (desc == NULL) { |
| 2661 | printk(KERN_ERR | 2637 | printk(KERN_ERR |
| @@ -2761,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
| 2761 | return 0; | 2737 | return 0; |
| 2762 | } | 2738 | } |
| 2763 | 2739 | ||
| 2764 | /* needs to be called with ext4 group lock (ext4_lock_group) */ | 2740 | /* needs to be called with the ext4 group lock held */ |
| 2765 | static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) | 2741 | static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) |
| 2766 | { | 2742 | { |
| 2767 | struct ext4_prealloc_space *pa; | 2743 | struct ext4_prealloc_space *pa; |
| @@ -2781,13 +2757,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) | |||
| 2781 | 2757 | ||
| 2782 | int ext4_mb_release(struct super_block *sb) | 2758 | int ext4_mb_release(struct super_block *sb) |
| 2783 | { | 2759 | { |
| 2760 | ext4_group_t ngroups = ext4_get_groups_count(sb); | ||
| 2784 | ext4_group_t i; | 2761 | ext4_group_t i; |
| 2785 | int num_meta_group_infos; | 2762 | int num_meta_group_infos; |
| 2786 | struct ext4_group_info *grinfo; | 2763 | struct ext4_group_info *grinfo; |
| 2787 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2764 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 2788 | 2765 | ||
| 2789 | if (sbi->s_group_info) { | 2766 | if (sbi->s_group_info) { |
| 2790 | for (i = 0; i < sbi->s_groups_count; i++) { | 2767 | for (i = 0; i < ngroups; i++) { |
| 2791 | grinfo = ext4_get_group_info(sb, i); | 2768 | grinfo = ext4_get_group_info(sb, i); |
| 2792 | #ifdef DOUBLE_CHECK | 2769 | #ifdef DOUBLE_CHECK |
| 2793 | kfree(grinfo->bb_bitmap); | 2770 | kfree(grinfo->bb_bitmap); |
| @@ -2797,7 +2774,7 @@ int ext4_mb_release(struct super_block *sb) | |||
| 2797 | ext4_unlock_group(sb, i); | 2774 | ext4_unlock_group(sb, i); |
| 2798 | kfree(grinfo); | 2775 | kfree(grinfo); |
| 2799 | } | 2776 | } |
| 2800 | num_meta_group_infos = (sbi->s_groups_count + | 2777 | num_meta_group_infos = (ngroups + |
| 2801 | EXT4_DESC_PER_BLOCK(sb) - 1) >> | 2778 | EXT4_DESC_PER_BLOCK(sb) - 1) >> |
| 2802 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2779 | EXT4_DESC_PER_BLOCK_BITS(sb); |
| 2803 | for (i = 0; i < num_meta_group_infos; i++) | 2780 | for (i = 0; i < num_meta_group_infos; i++) |
| @@ -2984,27 +2961,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
| 2984 | + le32_to_cpu(es->s_first_data_block); | 2961 | + le32_to_cpu(es->s_first_data_block); |
| 2985 | 2962 | ||
| 2986 | len = ac->ac_b_ex.fe_len; | 2963 | len = ac->ac_b_ex.fe_len; |
| 2987 | if (in_range(ext4_block_bitmap(sb, gdp), block, len) || | 2964 | if (!ext4_data_block_valid(sbi, block, len)) { |
| 2988 | in_range(ext4_inode_bitmap(sb, gdp), block, len) || | ||
| 2989 | in_range(block, ext4_inode_table(sb, gdp), | ||
| 2990 | EXT4_SB(sb)->s_itb_per_group) || | ||
| 2991 | in_range(block + len - 1, ext4_inode_table(sb, gdp), | ||
| 2992 | EXT4_SB(sb)->s_itb_per_group)) { | ||
| 2993 | ext4_error(sb, __func__, | 2965 | ext4_error(sb, __func__, |
| 2994 | "Allocating block %llu in system zone of %d group\n", | 2966 | "Allocating blocks %llu-%llu which overlap " |
| 2995 | block, ac->ac_b_ex.fe_group); | 2967 | "fs metadata\n", block, block+len); |
| 2996 | /* File system mounted not to panic on error | 2968 | /* File system mounted not to panic on error |
| 2997 | * Fix the bitmap and repeat the block allocation | 2969 | * Fix the bitmap and repeat the block allocation |
| 2998 | * We leak some of the blocks here. | 2970 | * We leak some of the blocks here. |
| 2999 | */ | 2971 | */ |
| 3000 | mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), | 2972 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); |
| 3001 | bitmap_bh->b_data, ac->ac_b_ex.fe_start, | 2973 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
| 3002 | ac->ac_b_ex.fe_len); | 2974 | ac->ac_b_ex.fe_len); |
| 2975 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | ||
| 3003 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); | 2976 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
| 3004 | if (!err) | 2977 | if (!err) |
| 3005 | err = -EAGAIN; | 2978 | err = -EAGAIN; |
| 3006 | goto out_err; | 2979 | goto out_err; |
| 3007 | } | 2980 | } |
| 2981 | |||
| 2982 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | ||
| 3008 | #ifdef AGGRESSIVE_CHECK | 2983 | #ifdef AGGRESSIVE_CHECK |
| 3009 | { | 2984 | { |
| 3010 | int i; | 2985 | int i; |
| @@ -3014,9 +2989,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
| 3014 | } | 2989 | } |
| 3015 | } | 2990 | } |
| 3016 | #endif | 2991 | #endif |
| 3017 | spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | 2992 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); |
| 3018 | mb_set_bits(NULL, bitmap_bh->b_data, | ||
| 3019 | ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); | ||
| 3020 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 2993 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 3021 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | 2994 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
| 3022 | ext4_free_blks_set(sb, gdp, | 2995 | ext4_free_blks_set(sb, gdp, |
| @@ -3026,7 +2999,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
| 3026 | len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; | 2999 | len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; |
| 3027 | ext4_free_blks_set(sb, gdp, len); | 3000 | ext4_free_blks_set(sb, gdp, len); |
| 3028 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); | 3001 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); |
| 3029 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | 3002 | |
| 3003 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | ||
| 3030 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); | 3004 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); |
| 3031 | /* | 3005 | /* |
| 3032 | * Now reduce the dirty block count also. Should not go negative | 3006 | * Now reduce the dirty block count also. Should not go negative |
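
The hunk above collapses four separate in_range() tests against the block bitmap, inode bitmap and inode table into one ext4_data_block_valid() call, and takes ext4_lock_group() around the bitmap fix-up. The check being replaced is plain interval overlap; a standalone version of that kind of range validation (the reserved ranges and names are illustrative, not the real ext4 system-zone data):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct block_range {
        uint64_t start;
        uint64_t len;
};

/* True if [block, block + len) overlaps [r->start, r->start + r->len). */
static bool overlaps(const struct block_range *r, uint64_t block, uint64_t len)
{
        return block < r->start + r->len && r->start < block + len;
}

/* An allocation is acceptable only if it misses every reserved range. */
static bool data_range_valid(const struct block_range *reserved, size_t n,
                             uint64_t block, uint64_t len)
{
        for (size_t i = 0; i < n; i++)
                if (overlaps(&reserved[i], block, len))
                        return false;
        return true;
}
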
| @@ -3459,7 +3433,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | |||
| 3459 | * the function goes through all blocks freed in the group | 3433 | * the function goes through all blocks freed in the group |
| 3460 | * but not yet committed and marks them used in in-core bitmap. | 3434 | * but not yet committed and marks them used in in-core bitmap. |
| 3461 | * buddy must be generated from this bitmap | 3435 | * buddy must be generated from this bitmap |
| 3462 | * Need to be called with ext4 group lock (ext4_lock_group) | 3436 | * Need to be called with the ext4 group lock held |
| 3463 | */ | 3437 | */ |
| 3464 | static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | 3438 | static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, |
| 3465 | ext4_group_t group) | 3439 | ext4_group_t group) |
| @@ -3473,9 +3447,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | |||
| 3473 | 3447 | ||
| 3474 | while (n) { | 3448 | while (n) { |
| 3475 | entry = rb_entry(n, struct ext4_free_data, node); | 3449 | entry = rb_entry(n, struct ext4_free_data, node); |
| 3476 | mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), | 3450 | mb_set_bits(bitmap, entry->start_blk, entry->count); |
| 3477 | bitmap, entry->start_blk, | ||
| 3478 | entry->count); | ||
| 3479 | n = rb_next(n); | 3451 | n = rb_next(n); |
| 3480 | } | 3452 | } |
| 3481 | return; | 3453 | return; |
| @@ -3484,7 +3456,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | |||
| 3484 | /* | 3456 | /* |
| 3485 | * the function goes through all preallocation in this group and marks them | 3457 | * the function goes through all preallocation in this group and marks them |
| 3486 | * used in in-core bitmap. buddy must be generated from this bitmap | 3458 | * used in in-core bitmap. buddy must be generated from this bitmap |
| 3487 | * Need to be called with ext4 group lock (ext4_lock_group) | 3459 | * Need to be called with ext4 group lock held |
| 3488 | */ | 3460 | */ |
| 3489 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | 3461 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, |
| 3490 | ext4_group_t group) | 3462 | ext4_group_t group) |
| @@ -3516,8 +3488,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |||
| 3516 | if (unlikely(len == 0)) | 3488 | if (unlikely(len == 0)) |
| 3517 | continue; | 3489 | continue; |
| 3518 | BUG_ON(groupnr != group); | 3490 | BUG_ON(groupnr != group); |
| 3519 | mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), | 3491 | mb_set_bits(bitmap, start, len); |
| 3520 | bitmap, start, len); | ||
| 3521 | preallocated += len; | 3492 | preallocated += len; |
| 3522 | count++; | 3493 | count++; |
| 3523 | } | 3494 | } |
| @@ -4121,7 +4092,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, | |||
| 4121 | static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | 4092 | static void ext4_mb_show_ac(struct ext4_allocation_context *ac) |
| 4122 | { | 4093 | { |
| 4123 | struct super_block *sb = ac->ac_sb; | 4094 | struct super_block *sb = ac->ac_sb; |
| 4124 | ext4_group_t i; | 4095 | ext4_group_t ngroups, i; |
| 4125 | 4096 | ||
| 4126 | printk(KERN_ERR "EXT4-fs: Can't allocate:" | 4097 | printk(KERN_ERR "EXT4-fs: Can't allocate:" |
| 4127 | " Allocation context details:\n"); | 4098 | " Allocation context details:\n"); |
| @@ -4145,7 +4116,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
| 4145 | printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, | 4116 | printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, |
| 4146 | ac->ac_found); | 4117 | ac->ac_found); |
| 4147 | printk(KERN_ERR "EXT4-fs: groups: \n"); | 4118 | printk(KERN_ERR "EXT4-fs: groups: \n"); |
| 4148 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 4119 | ngroups = ext4_get_groups_count(sb); |
| 4120 | for (i = 0; i < ngroups; i++) { | ||
| 4149 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); | 4121 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); |
| 4150 | struct ext4_prealloc_space *pa; | 4122 | struct ext4_prealloc_space *pa; |
| 4151 | ext4_grpblk_t start; | 4123 | ext4_grpblk_t start; |
| @@ -4469,13 +4441,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) | |||
| 4469 | 4441 | ||
| 4470 | static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) | 4442 | static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) |
| 4471 | { | 4443 | { |
| 4472 | ext4_group_t i; | 4444 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
| 4473 | int ret; | 4445 | int ret; |
| 4474 | int freed = 0; | 4446 | int freed = 0; |
| 4475 | 4447 | ||
| 4476 | trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", | 4448 | trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", |
| 4477 | sb->s_id, needed); | 4449 | sb->s_id, needed); |
| 4478 | for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { | 4450 | for (i = 0; i < ngroups && needed > 0; i++) { |
| 4479 | ret = ext4_mb_discard_group_preallocations(sb, i, needed); | 4451 | ret = ext4_mb_discard_group_preallocations(sb, i, needed); |
| 4480 | freed += ret; | 4452 | freed += ret; |
| 4481 | needed -= ret; | 4453 | needed -= ret; |
| @@ -4859,29 +4831,25 @@ do_more: | |||
| 4859 | new_entry->group = block_group; | 4831 | new_entry->group = block_group; |
| 4860 | new_entry->count = count; | 4832 | new_entry->count = count; |
| 4861 | new_entry->t_tid = handle->h_transaction->t_tid; | 4833 | new_entry->t_tid = handle->h_transaction->t_tid; |
| 4834 | |||
| 4862 | ext4_lock_group(sb, block_group); | 4835 | ext4_lock_group(sb, block_group); |
| 4863 | mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | 4836 | mb_clear_bits(bitmap_bh->b_data, bit, count); |
| 4864 | bit, count); | ||
| 4865 | ext4_mb_free_metadata(handle, &e4b, new_entry); | 4837 | ext4_mb_free_metadata(handle, &e4b, new_entry); |
| 4866 | ext4_unlock_group(sb, block_group); | ||
| 4867 | } else { | 4838 | } else { |
| 4868 | ext4_lock_group(sb, block_group); | ||
| 4869 | /* need to update group_info->bb_free and bitmap | 4839 | /* need to update group_info->bb_free and bitmap |
| 4870 | * with group lock held. generate_buddy looks at | 4840 | * with group lock held. generate_buddy looks at |
| 4871 | * them with group lock held | 4841 | * them with group lock held |
| 4872 | */ | 4842 | */ |
| 4873 | mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | 4843 | ext4_lock_group(sb, block_group); |
| 4874 | bit, count); | 4844 | mb_clear_bits(bitmap_bh->b_data, bit, count); |
| 4875 | mb_free_blocks(inode, &e4b, bit, count); | 4845 | mb_free_blocks(inode, &e4b, bit, count); |
| 4876 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | 4846 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); |
| 4877 | ext4_unlock_group(sb, block_group); | ||
| 4878 | } | 4847 | } |
| 4879 | 4848 | ||
| 4880 | spin_lock(sb_bgl_lock(sbi, block_group)); | ||
| 4881 | ret = ext4_free_blks_count(sb, gdp) + count; | 4849 | ret = ext4_free_blks_count(sb, gdp) + count; |
| 4882 | ext4_free_blks_set(sb, gdp, ret); | 4850 | ext4_free_blks_set(sb, gdp, ret); |
| 4883 | gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); | 4851 | gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); |
| 4884 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 4852 | ext4_unlock_group(sb, block_group); |
| 4885 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | 4853 | percpu_counter_add(&sbi->s_freeblocks_counter, count); |
| 4886 | 4854 | ||
| 4887 | if (sbi->s_log_groups_per_flex) { | 4855 | if (sbi->s_log_groups_per_flex) { |
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dd9e6cd5f6cf..75e34f69215b 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
| @@ -23,7 +23,6 @@ | |||
| 23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
| 24 | #include "ext4_jbd2.h" | 24 | #include "ext4_jbd2.h" |
| 25 | #include "ext4.h" | 25 | #include "ext4.h" |
| 26 | #include "group.h" | ||
| 27 | 26 | ||
| 28 | /* | 27 | /* |
| 29 | * with AGGRESSIVE_CHECK allocator runs consistency checks over | 28 | * with AGGRESSIVE_CHECK allocator runs consistency checks over |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 22098e1cd085..07eb6649e4fa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
| @@ -37,7 +37,6 @@ | |||
| 37 | #include "ext4.h" | 37 | #include "ext4.h" |
| 38 | #include "ext4_jbd2.h" | 38 | #include "ext4_jbd2.h" |
| 39 | 39 | ||
| 40 | #include "namei.h" | ||
| 41 | #include "xattr.h" | 40 | #include "xattr.h" |
| 42 | #include "acl.h" | 41 | #include "acl.h" |
| 43 | 42 | ||
| @@ -750,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, | |||
| 750 | ext4fs_dirhash(de->name, de->name_len, &h); | 749 | ext4fs_dirhash(de->name, de->name_len, &h); |
| 751 | map_tail--; | 750 | map_tail--; |
| 752 | map_tail->hash = h.hash; | 751 | map_tail->hash = h.hash; |
| 753 | map_tail->offs = (u16) ((char *) de - base); | 752 | map_tail->offs = ((char *) de - base)>>2; |
| 754 | map_tail->size = le16_to_cpu(de->rec_len); | 753 | map_tail->size = le16_to_cpu(de->rec_len); |
| 755 | count++; | 754 | count++; |
| 756 | cond_resched(); | 755 | cond_resched(); |
| @@ -1148,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, | |||
| 1148 | unsigned rec_len = 0; | 1147 | unsigned rec_len = 0; |
| 1149 | 1148 | ||
| 1150 | while (count--) { | 1149 | while (count--) { |
| 1151 | struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs); | 1150 | struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) |
| 1151 | (from + (map->offs<<2)); | ||
| 1152 | rec_len = EXT4_DIR_REC_LEN(de->name_len); | 1152 | rec_len = EXT4_DIR_REC_LEN(de->name_len); |
| 1153 | memcpy (to, de, rec_len); | 1153 | memcpy (to, de, rec_len); |
| 1154 | ((struct ext4_dir_entry_2 *) to)->rec_len = | 1154 | ((struct ext4_dir_entry_2 *) to)->rec_len = |
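
Worked out, the offs change in these two hunks is just a units conversion: directory entries are kept 4-byte aligned, so dx_make_map() stores the byte offset divided by 4 and dx_move_dirents() multiplies it back with offs<<2. A 16-bit offs field can then describe offsets up to 65535 * 4 = 262140 bytes (about 256 KiB) rather than 65535 bytes, presumably to keep the packed map entry usable with the largest supported block sizes.
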
| @@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
| 1997 | if (!ext4_handle_valid(handle)) | 1997 | if (!ext4_handle_valid(handle)) |
| 1998 | return 0; | 1998 | return 0; |
| 1999 | 1999 | ||
| 2000 | lock_super(sb); | 2000 | mutex_lock(&EXT4_SB(sb)->s_orphan_lock); |
| 2001 | if (!list_empty(&EXT4_I(inode)->i_orphan)) | 2001 | if (!list_empty(&EXT4_I(inode)->i_orphan)) |
| 2002 | goto out_unlock; | 2002 | goto out_unlock; |
| 2003 | 2003 | ||
| @@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
| 2006 | 2006 | ||
| 2007 | /* @@@ FIXME: Observation from aviro: | 2007 | /* @@@ FIXME: Observation from aviro: |
| 2008 | * I think I can trigger J_ASSERT in ext4_orphan_add(). We block | 2008 | * I think I can trigger J_ASSERT in ext4_orphan_add(). We block |
| 2009 | * here (on lock_super()), so race with ext4_link() which might bump | 2009 | * here (on s_orphan_lock), so race with ext4_link() which might bump |
| 2010 | * ->i_nlink. For, say it, character device. Not a regular file, | 2010 | * ->i_nlink. For, say it, character device. Not a regular file, |
| 2011 | * not a directory, not a symlink and ->i_nlink > 0. | 2011 | * not a directory, not a symlink and ->i_nlink > 0. |
| 2012 | * | ||
| 2013 | * tytso, 4/25/2009: I'm not sure how that could happen; | ||
| 2014 | * shouldn't the fs core protect us from these sort of | ||
| 2015 | * unlink()/link() races? | ||
| 2012 | */ | 2016 | */ |
| 2013 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 2017 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
| 2014 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); | 2018 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); |
| @@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
| 2045 | jbd_debug(4, "orphan inode %lu will point to %d\n", | 2049 | jbd_debug(4, "orphan inode %lu will point to %d\n", |
| 2046 | inode->i_ino, NEXT_ORPHAN(inode)); | 2050 | inode->i_ino, NEXT_ORPHAN(inode)); |
| 2047 | out_unlock: | 2051 | out_unlock: |
| 2048 | unlock_super(sb); | 2052 | mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); |
| 2049 | ext4_std_error(inode->i_sb, err); | 2053 | ext4_std_error(inode->i_sb, err); |
| 2050 | return err; | 2054 | return err; |
| 2051 | } | 2055 | } |
| @@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) | |||
| 2066 | if (!ext4_handle_valid(handle)) | 2070 | if (!ext4_handle_valid(handle)) |
| 2067 | return 0; | 2071 | return 0; |
| 2068 | 2072 | ||
| 2069 | lock_super(inode->i_sb); | 2073 | mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); |
| 2070 | if (list_empty(&ei->i_orphan)) { | 2074 | if (list_empty(&ei->i_orphan)) |
| 2071 | unlock_super(inode->i_sb); | 2075 | goto out; |
| 2072 | return 0; | ||
| 2073 | } | ||
| 2074 | 2076 | ||
| 2075 | ino_next = NEXT_ORPHAN(inode); | 2077 | ino_next = NEXT_ORPHAN(inode); |
| 2076 | prev = ei->i_orphan.prev; | 2078 | prev = ei->i_orphan.prev; |
| @@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) | |||
| 2120 | out_err: | 2122 | out_err: |
| 2121 | ext4_std_error(inode->i_sb, err); | 2123 | ext4_std_error(inode->i_sb, err); |
| 2122 | out: | 2124 | out: |
| 2123 | unlock_super(inode->i_sb); | 2125 | mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); |
| 2124 | return err; | 2126 | return err; |
| 2125 | 2127 | ||
| 2126 | out_brelse: | 2128 | out_brelse: |
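
The orphan-list hunks swap the old lock_super()/unlock_super() bracketing for a dedicated per-filesystem mutex, s_orphan_lock, and funnel the early "already listed / not listed" exits through the same unlock path. A small pthread analogue of protecting one intrusive list with its own mutex (illustrative only, not the ext4 data structures):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct orphan {
        struct orphan *next;
        bool on_list;
};

struct fs_info {
        pthread_mutex_t orphan_lock;    /* analogue of s_orphan_lock */
        struct orphan *orphans;
};

static void orphan_add(struct fs_info *fs, struct orphan *o)
{
        pthread_mutex_lock(&fs->orphan_lock);
        if (!o->on_list) {              /* mirrors the list_empty() early-out */
                o->next = fs->orphans;
                fs->orphans = o;
                o->on_list = true;
        }
        pthread_mutex_unlock(&fs->orphan_lock);
}

static void orphan_del(struct fs_info *fs, struct orphan *o)
{
        pthread_mutex_lock(&fs->orphan_lock);
        if (o->on_list) {
                struct orphan **pp = &fs->orphans;

                while (*pp && *pp != o)
                        pp = &(*pp)->next;
                if (*pp)
                        *pp = o->next;
                o->on_list = false;
        }
        pthread_mutex_unlock(&fs->orphan_lock);
}
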
| @@ -2533,6 +2535,7 @@ const struct inode_operations ext4_dir_inode_operations = { | |||
| 2533 | .removexattr = generic_removexattr, | 2535 | .removexattr = generic_removexattr, |
| 2534 | #endif | 2536 | #endif |
| 2535 | .permission = ext4_permission, | 2537 | .permission = ext4_permission, |
| 2538 | .fiemap = ext4_fiemap, | ||
| 2536 | }; | 2539 | }; |
| 2537 | 2540 | ||
| 2538 | const struct inode_operations ext4_special_inode_operations = { | 2541 | const struct inode_operations ext4_special_inode_operations = { |
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h deleted file mode 100644 index 5e4dfff36a00..000000000000 --- a/fs/ext4/namei.h +++ /dev/null | |||
| @@ -1,8 +0,0 @@ | |||
| 1 | /* linux/fs/ext4/namei.h | ||
| 2 | * | ||
| 3 | * Copyright (C) 2005 Simtec Electronics | ||
| 4 | * Ben Dooks <ben@simtec.co.uk> | ||
| 5 | * | ||
| 6 | */ | ||
| 7 | |||
| 8 | extern struct dentry *ext4_get_parent(struct dentry *child); | ||
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 546c7dd869e1..27eb289eea37 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 16 | 16 | ||
| 17 | #include "ext4_jbd2.h" | 17 | #include "ext4_jbd2.h" |
| 18 | #include "group.h" | ||
| 19 | 18 | ||
| 20 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) | 19 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) |
| 21 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) | 20 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) |
| @@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
| 193 | if (IS_ERR(handle)) | 192 | if (IS_ERR(handle)) |
| 194 | return PTR_ERR(handle); | 193 | return PTR_ERR(handle); |
| 195 | 194 | ||
| 196 | lock_super(sb); | 195 | mutex_lock(&sbi->s_resize_lock); |
| 197 | if (input->group != sbi->s_groups_count) { | 196 | if (input->group != sbi->s_groups_count) { |
| 198 | err = -EBUSY; | 197 | err = -EBUSY; |
| 199 | goto exit_journal; | 198 | goto exit_journal; |
| @@ -302,7 +301,7 @@ exit_bh: | |||
| 302 | brelse(bh); | 301 | brelse(bh); |
| 303 | 302 | ||
| 304 | exit_journal: | 303 | exit_journal: |
| 305 | unlock_super(sb); | 304 | mutex_unlock(&sbi->s_resize_lock); |
| 306 | if ((err2 = ext4_journal_stop(handle)) && !err) | 305 | if ((err2 = ext4_journal_stop(handle)) && !err) |
| 307 | err = err2; | 306 | err = err2; |
| 308 | 307 | ||
| @@ -643,11 +642,12 @@ exit_free: | |||
| 643 | * important part is that the new block and inode counts are in the backup | 642 | * important part is that the new block and inode counts are in the backup |
| 644 | * superblocks, and the location of the new group metadata in the GDT backups. | 643 | * superblocks, and the location of the new group metadata in the GDT backups. |
| 645 | * | 644 | * |
| 646 | * We do not need lock_super() for this, because these blocks are not | 645 | * We do not need to take the s_resize_lock for this, because these |
| 647 | * otherwise touched by the filesystem code when it is mounted. We don't | 646 | * blocks are not otherwise touched by the filesystem code when it is |
| 648 | * need to worry about last changing from sbi->s_groups_count, because the | 647 | * mounted. We don't need to worry about last changing from |
| 649 | * worst that can happen is that we do not copy the full number of backups | 648 | * sbi->s_groups_count, because the worst that can happen is that we |
| 650 | * at this time. The resize which changed s_groups_count will backup again. | 649 | * do not copy the full number of backups at this time. The resize |
| 650 | * which changed s_groups_count will backup again. | ||
| 651 | */ | 651 | */ |
| 652 | static void update_backups(struct super_block *sb, | 652 | static void update_backups(struct super_block *sb, |
| 653 | int blk_off, char *data, int size) | 653 | int blk_off, char *data, int size) |
| @@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 809 | goto exit_put; | 809 | goto exit_put; |
| 810 | } | 810 | } |
| 811 | 811 | ||
| 812 | lock_super(sb); | 812 | mutex_lock(&sbi->s_resize_lock); |
| 813 | if (input->group != sbi->s_groups_count) { | 813 | if (input->group != sbi->s_groups_count) { |
| 814 | ext4_warning(sb, __func__, | 814 | ext4_warning(sb, __func__, |
| 815 | "multiple resizers run on filesystem!"); | 815 | "multiple resizers run on filesystem!"); |
| @@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 840 | /* | 840 | /* |
| 841 | * OK, now we've set up the new group. Time to make it active. | 841 | * OK, now we've set up the new group. Time to make it active. |
| 842 | * | 842 | * |
| 843 | * Current kernels don't lock all allocations via lock_super(), | 843 | * We do not lock all allocations via s_resize_lock |
| 844 | * so we have to be safe wrt. concurrent accesses the group | 844 | * so we have to be safe wrt. concurrent accesses the group |
| 845 | * data. So we need to be careful to set all of the relevant | 845 | * data. So we need to be careful to set all of the relevant |
| 846 | * group descriptor data etc. *before* we enable the group. | 846 | * group descriptor data etc. *before* we enable the group. |
| @@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 900 | * | 900 | * |
| 901 | * The precise rules we use are: | 901 | * The precise rules we use are: |
| 902 | * | 902 | * |
| 903 | * * Writers of s_groups_count *must* hold lock_super | 903 | * * Writers of s_groups_count *must* hold s_resize_lock |
| 904 | * AND | 904 | * AND |
| 905 | * * Writers must perform a smp_wmb() after updating all dependent | 905 | * * Writers must perform a smp_wmb() after updating all dependent |
| 906 | * data and before modifying the groups count | 906 | * data and before modifying the groups count |
| 907 | * | 907 | * |
| 908 | * * Readers must hold lock_super() over the access | 908 | * * Readers must hold s_resize_lock over the access |
| 909 | * OR | 909 | * OR |
| 910 | * * Readers must perform an smp_rmb() after reading the groups count | 910 | * * Readers must perform an smp_rmb() after reading the groups count |
| 911 | * and before reading any dependent data. | 911 | * and before reading any dependent data. |
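
The comment above describes a publish/subscribe protocol: the resizer finishes writing all per-group data, issues smp_wmb(), and only then bumps s_groups_count; readers either hold s_resize_lock or pair their read of the count with smp_rmb(). In C11 terms that is roughly a release store paired with an acquire load; a hedged userspace sketch of the same ordering (not the kernel primitives, and the data layout is invented for the example):

#include <stdatomic.h>
#include <stddef.h>

struct group_desc {
        unsigned long free_blocks;
        /* ... other per-group fields ... */
};

#define MAX_GROUPS 1024

static struct group_desc groups[MAX_GROUPS];
static _Atomic size_t groups_count;

/* Resizer: fully initialize the new group, then publish. The release
 * store plays the role of smp_wmb() followed by the count update.
 * Caller must keep idx below MAX_GROUPS. */
static void add_group(size_t idx, unsigned long free_blocks)
{
        groups[idx].free_blocks = free_blocks;
        atomic_store_explicit(&groups_count, idx + 1, memory_order_release);
}

/* Reader: the acquire load plays the role of reading the count and then
 * smp_rmb(); every group below the returned count is fully initialized. */
static size_t snapshot_groups_count(void)
{
        return atomic_load_explicit(&groups_count, memory_order_acquire);
}
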
| @@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 948 | sb->s_dirt = 1; | 948 | sb->s_dirt = 1; |
| 949 | 949 | ||
| 950 | exit_journal: | 950 | exit_journal: |
| 951 | unlock_super(sb); | 951 | mutex_unlock(&sbi->s_resize_lock); |
| 952 | if ((err2 = ext4_journal_stop(handle)) && !err) | 952 | if ((err2 = ext4_journal_stop(handle)) && !err) |
| 953 | err = err2; | 953 | err = err2; |
| 954 | if (!err) { | 954 | if (!err) { |
| @@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
| 986 | 986 | ||
| 987 | /* We don't need to worry about locking wrt other resizers just | 987 | /* We don't need to worry about locking wrt other resizers just |
| 988 | * yet: we're going to revalidate es->s_blocks_count after | 988 | * yet: we're going to revalidate es->s_blocks_count after |
| 989 | * taking lock_super() below. */ | 989 | * taking the s_resize_lock below. */ |
| 990 | o_blocks_count = ext4_blocks_count(es); | 990 | o_blocks_count = ext4_blocks_count(es); |
| 991 | o_groups_count = EXT4_SB(sb)->s_groups_count; | 991 | o_groups_count = EXT4_SB(sb)->s_groups_count; |
| 992 | 992 | ||
| @@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
| 1056 | goto exit_put; | 1056 | goto exit_put; |
| 1057 | } | 1057 | } |
| 1058 | 1058 | ||
| 1059 | lock_super(sb); | 1059 | mutex_lock(&EXT4_SB(sb)->s_resize_lock); |
| 1060 | if (o_blocks_count != ext4_blocks_count(es)) { | 1060 | if (o_blocks_count != ext4_blocks_count(es)) { |
| 1061 | ext4_warning(sb, __func__, | 1061 | ext4_warning(sb, __func__, |
| 1062 | "multiple resizers run on filesystem!"); | 1062 | "multiple resizers run on filesystem!"); |
| 1063 | unlock_super(sb); | 1063 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); |
| 1064 | ext4_journal_stop(handle); | 1064 | ext4_journal_stop(handle); |
| 1065 | err = -EBUSY; | 1065 | err = -EBUSY; |
| 1066 | goto exit_put; | 1066 | goto exit_put; |
| @@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
| 1070 | EXT4_SB(sb)->s_sbh))) { | 1070 | EXT4_SB(sb)->s_sbh))) { |
| 1071 | ext4_warning(sb, __func__, | 1071 | ext4_warning(sb, __func__, |
| 1072 | "error %d on journal write access", err); | 1072 | "error %d on journal write access", err); |
| 1073 | unlock_super(sb); | 1073 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); |
| 1074 | ext4_journal_stop(handle); | 1074 | ext4_journal_stop(handle); |
| 1075 | goto exit_put; | 1075 | goto exit_put; |
| 1076 | } | 1076 | } |
| 1077 | ext4_blocks_count_set(es, o_blocks_count + add); | 1077 | ext4_blocks_count_set(es, o_blocks_count + add); |
| 1078 | ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); | 1078 | ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); |
| 1079 | sb->s_dirt = 1; | 1079 | sb->s_dirt = 1; |
| 1080 | unlock_super(sb); | 1080 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); |
| 1081 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, | 1081 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, |
| 1082 | o_blocks_count + add); | 1082 | o_blocks_count + add); |
| 1083 | /* We add the blocks to the bitmap and set the group need init bit */ | 1083 | /* We add the blocks to the bitmap and set the group need init bit */ |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e6f222..f016707597a7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/string.h> | 20 | #include <linux/string.h> |
| 21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
| 22 | #include <linux/time.h> | 22 | #include <linux/time.h> |
| 23 | #include <linux/vmalloc.h> | ||
| 23 | #include <linux/jbd2.h> | 24 | #include <linux/jbd2.h> |
| 24 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 25 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| @@ -45,16 +46,20 @@ | |||
| 45 | #include "ext4_jbd2.h" | 46 | #include "ext4_jbd2.h" |
| 46 | #include "xattr.h" | 47 | #include "xattr.h" |
| 47 | #include "acl.h" | 48 | #include "acl.h" |
| 48 | #include "namei.h" | 49 | |
| 49 | #include "group.h" | 50 | static int default_mb_history_length = 1000; |
| 51 | |||
| 52 | module_param_named(default_mb_history_length, default_mb_history_length, | ||
| 53 | int, 0644); | ||
| 54 | MODULE_PARM_DESC(default_mb_history_length, | ||
| 55 | "Default number of entries saved for mb_history"); | ||
| 50 | 56 | ||
| 51 | struct proc_dir_entry *ext4_proc_root; | 57 | struct proc_dir_entry *ext4_proc_root; |
| 52 | static struct kset *ext4_kset; | 58 | static struct kset *ext4_kset; |
| 53 | 59 | ||
| 54 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | 60 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, |
| 55 | unsigned long journal_devnum); | 61 | unsigned long journal_devnum); |
| 56 | static int ext4_commit_super(struct super_block *sb, | 62 | static int ext4_commit_super(struct super_block *sb, int sync); |
| 57 | struct ext4_super_block *es, int sync); | ||
| 58 | static void ext4_mark_recovery_complete(struct super_block *sb, | 63 | static void ext4_mark_recovery_complete(struct super_block *sb, |
| 59 | struct ext4_super_block *es); | 64 | struct ext4_super_block *es); |
| 60 | static void ext4_clear_journal_err(struct super_block *sb, | 65 | static void ext4_clear_journal_err(struct super_block *sb, |
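
Since default_mb_history_length is declared with module_param_named(..., 0644), it should also be adjustable without recompiling: through /sys/module/ext4/parameters/default_mb_history_length at runtime, or (with ext4 built in) as ext4.default_mb_history_length=N on the kernel command line; setting it to 0 disables mb_history allocation entirely, per the mballoc.c hunk earlier in this patch.
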
| @@ -74,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | |||
| 74 | { | 79 | { |
| 75 | return le32_to_cpu(bg->bg_block_bitmap_lo) | | 80 | return le32_to_cpu(bg->bg_block_bitmap_lo) | |
| 76 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 81 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
| 77 | (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); | 82 | (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); |
| 78 | } | 83 | } |
| 79 | 84 | ||
| 80 | ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, | 85 | ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, |
| @@ -82,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, | |||
| 82 | { | 87 | { |
| 83 | return le32_to_cpu(bg->bg_inode_bitmap_lo) | | 88 | return le32_to_cpu(bg->bg_inode_bitmap_lo) | |
| 84 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 89 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
| 85 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); | 90 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); |
| 86 | } | 91 | } |
| 87 | 92 | ||
| 88 | ext4_fsblk_t ext4_inode_table(struct super_block *sb, | 93 | ext4_fsblk_t ext4_inode_table(struct super_block *sb, |
| @@ -90,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, | |||
| 90 | { | 95 | { |
| 91 | return le32_to_cpu(bg->bg_inode_table_lo) | | 96 | return le32_to_cpu(bg->bg_inode_table_lo) | |
| 92 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 97 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
| 93 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); | 98 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); |
| 94 | } | 99 | } |
| 95 | 100 | ||
| 96 | __u32 ext4_free_blks_count(struct super_block *sb, | 101 | __u32 ext4_free_blks_count(struct super_block *sb, |
| @@ -98,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb, | |||
| 98 | { | 103 | { |
| 99 | return le16_to_cpu(bg->bg_free_blocks_count_lo) | | 104 | return le16_to_cpu(bg->bg_free_blocks_count_lo) | |
| 100 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 105 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
| 101 | (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); | 106 | (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); |
| 102 | } | 107 | } |
| 103 | 108 | ||
| 104 | __u32 ext4_free_inodes_count(struct super_block *sb, | 109 | __u32 ext4_free_inodes_count(struct super_block *sb, |
| @@ -106,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb, | |||
| 106 | { | 111 | { |
| 107 | return le16_to_cpu(bg->bg_free_inodes_count_lo) | | 112 | return le16_to_cpu(bg->bg_free_inodes_count_lo) | |
| 108 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 113 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
| 109 | (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); | 114 | (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); |
| 110 | } | 115 | } |
| 111 | 116 | ||
| 112 | __u32 ext4_used_dirs_count(struct super_block *sb, | 117 | __u32 ext4_used_dirs_count(struct super_block *sb, |
| @@ -114,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb, | |||
| 114 | { | 119 | { |
| 115 | return le16_to_cpu(bg->bg_used_dirs_count_lo) | | 120 | return le16_to_cpu(bg->bg_used_dirs_count_lo) | |
| 116 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 121 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
| 117 | (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); | 122 | (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); |
| 118 | } | 123 | } |
| 119 | 124 | ||
| 120 | __u32 ext4_itable_unused_count(struct super_block *sb, | 125 | __u32 ext4_itable_unused_count(struct super_block *sb, |
| @@ -122,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb, | |||
| 122 | { | 127 | { |
| 123 | return le16_to_cpu(bg->bg_itable_unused_lo) | | 128 | return le16_to_cpu(bg->bg_itable_unused_lo) | |
| 124 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 129 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
| 125 | (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); | 130 | (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); |
| 126 | } | 131 | } |
| 127 | 132 | ||
| 128 | void ext4_block_bitmap_set(struct super_block *sb, | 133 | void ext4_block_bitmap_set(struct super_block *sb, |
| @@ -202,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) | |||
| 202 | journal = EXT4_SB(sb)->s_journal; | 207 | journal = EXT4_SB(sb)->s_journal; |
| 203 | if (journal) { | 208 | if (journal) { |
| 204 | if (is_journal_aborted(journal)) { | 209 | if (is_journal_aborted(journal)) { |
| 205 | ext4_abort(sb, __func__, | 210 | ext4_abort(sb, __func__, "Detected aborted journal"); |
| 206 | "Detected aborted journal"); | ||
| 207 | return ERR_PTR(-EROFS); | 211 | return ERR_PTR(-EROFS); |
| 208 | } | 212 | } |
| 209 | return jbd2_journal_start(journal, nblocks); | 213 | return jbd2_journal_start(journal, nblocks); |
| @@ -302,10 +306,10 @@ static void ext4_handle_error(struct super_block *sb) | |||
| 302 | jbd2_journal_abort(journal, -EIO); | 306 | jbd2_journal_abort(journal, -EIO); |
| 303 | } | 307 | } |
| 304 | if (test_opt(sb, ERRORS_RO)) { | 308 | if (test_opt(sb, ERRORS_RO)) { |
| 305 | printk(KERN_CRIT "Remounting filesystem read-only\n"); | 309 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
| 306 | sb->s_flags |= MS_RDONLY; | 310 | sb->s_flags |= MS_RDONLY; |
| 307 | } | 311 | } |
| 308 | ext4_commit_super(sb, es, 1); | 312 | ext4_commit_super(sb, 1); |
| 309 | if (test_opt(sb, ERRORS_PANIC)) | 313 | if (test_opt(sb, ERRORS_PANIC)) |
| 310 | panic("EXT4-fs (device %s): panic forced after error\n", | 314 | panic("EXT4-fs (device %s): panic forced after error\n", |
| 311 | sb->s_id); | 315 | sb->s_id); |
| @@ -395,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function, | |||
| 395 | { | 399 | { |
| 396 | va_list args; | 400 | va_list args; |
| 397 | 401 | ||
| 398 | printk(KERN_CRIT "ext4_abort called.\n"); | ||
| 399 | |||
| 400 | va_start(args, fmt); | 402 | va_start(args, fmt); |
| 401 | printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); | 403 | printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); |
| 402 | vprintk(fmt, args); | 404 | vprintk(fmt, args); |
| @@ -409,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function, | |||
| 409 | if (sb->s_flags & MS_RDONLY) | 411 | if (sb->s_flags & MS_RDONLY) |
| 410 | return; | 412 | return; |
| 411 | 413 | ||
| 412 | printk(KERN_CRIT "Remounting filesystem read-only\n"); | 414 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
| 413 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 415 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
| 414 | sb->s_flags |= MS_RDONLY; | 416 | sb->s_flags |= MS_RDONLY; |
| 415 | EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; | 417 | EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; |
| @@ -417,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function, | |||
| 417 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); | 419 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); |
| 418 | } | 420 | } |
| 419 | 421 | ||
| 422 | void ext4_msg (struct super_block * sb, const char *prefix, | ||
| 423 | const char *fmt, ...) | ||
| 424 | { | ||
| 425 | va_list args; | ||
| 426 | |||
| 427 | va_start(args, fmt); | ||
| 428 | printk("%sEXT4-fs (%s): ", prefix, sb->s_id); | ||
| 429 | vprintk(fmt, args); | ||
| 430 | printk("\n"); | ||
| 431 | va_end(args); | ||
| 432 | } | ||
| 433 | |||
| 420 | void ext4_warning(struct super_block *sb, const char *function, | 434 | void ext4_warning(struct super_block *sb, const char *function, |
| 421 | const char *fmt, ...) | 435 | const char *fmt, ...) |
| 422 | { | 436 | { |
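Note on the helper added above: ext4_msg() supplies the "EXT4-fs (<device>): " prefix and the trailing newline itself, so call sites pass only the bare message. A minimal usage sketch (message text taken from a later hunk; the device name in the output is illustrative):

	ext4_msg(sb, KERN_ERR, "unable to read superblock");
	/* prints e.g.:  EXT4-fs (sda1): unable to read superblock */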
| @@ -431,7 +445,7 @@ void ext4_warning(struct super_block *sb, const char *function, | |||
| 431 | } | 445 | } |
| 432 | 446 | ||
| 433 | void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, | 447 | void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, |
| 434 | const char *function, const char *fmt, ...) | 448 | const char *function, const char *fmt, ...) |
| 435 | __releases(bitlock) | 449 | __releases(bitlock) |
| 436 | __acquires(bitlock) | 450 | __acquires(bitlock) |
| 437 | { | 451 | { |
| @@ -447,7 +461,7 @@ __acquires(bitlock) | |||
| 447 | if (test_opt(sb, ERRORS_CONT)) { | 461 | if (test_opt(sb, ERRORS_CONT)) { |
| 448 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 462 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
| 449 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 463 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
| 450 | ext4_commit_super(sb, es, 0); | 464 | ext4_commit_super(sb, 0); |
| 451 | return; | 465 | return; |
| 452 | } | 466 | } |
| 453 | ext4_unlock_group(sb, grp); | 467 | ext4_unlock_group(sb, grp); |
| @@ -467,7 +481,6 @@ __acquires(bitlock) | |||
| 467 | return; | 481 | return; |
| 468 | } | 482 | } |
| 469 | 483 | ||
| 470 | |||
| 471 | void ext4_update_dynamic_rev(struct super_block *sb) | 484 | void ext4_update_dynamic_rev(struct super_block *sb) |
| 472 | { | 485 | { |
| 473 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 486 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
| @@ -496,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) | |||
| 496 | /* | 509 | /* |
| 497 | * Open the external journal device | 510 | * Open the external journal device |
| 498 | */ | 511 | */ |
| 499 | static struct block_device *ext4_blkdev_get(dev_t dev) | 512 | static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) |
| 500 | { | 513 | { |
| 501 | struct block_device *bdev; | 514 | struct block_device *bdev; |
| 502 | char b[BDEVNAME_SIZE]; | 515 | char b[BDEVNAME_SIZE]; |
| @@ -507,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev) | |||
| 507 | return bdev; | 520 | return bdev; |
| 508 | 521 | ||
| 509 | fail: | 522 | fail: |
| 510 | printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", | 523 | ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", |
| 511 | __bdevname(dev, b), PTR_ERR(bdev)); | 524 | __bdevname(dev, b), PTR_ERR(bdev)); |
| 512 | return NULL; | 525 | return NULL; |
| 513 | } | 526 | } |
| @@ -543,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) | |||
| 543 | { | 556 | { |
| 544 | struct list_head *l; | 557 | struct list_head *l; |
| 545 | 558 | ||
| 546 | printk(KERN_ERR "sb orphan head is %d\n", | 559 | ext4_msg(sb, KERN_ERR, "sb orphan head is %d", |
| 547 | le32_to_cpu(sbi->s_es->s_last_orphan)); | 560 | le32_to_cpu(sbi->s_es->s_last_orphan)); |
| 548 | 561 | ||
| 549 | printk(KERN_ERR "sb_info orphan list:\n"); | 562 | printk(KERN_ERR "sb_info orphan list:\n"); |
| 550 | list_for_each(l, &sbi->s_orphan) { | 563 | list_for_each(l, &sbi->s_orphan) { |
| @@ -563,6 +576,7 @@ static void ext4_put_super(struct super_block *sb) | |||
| 563 | struct ext4_super_block *es = sbi->s_es; | 576 | struct ext4_super_block *es = sbi->s_es; |
| 564 | int i, err; | 577 | int i, err; |
| 565 | 578 | ||
| 579 | ext4_release_system_zone(sb); | ||
| 566 | ext4_mb_release(sb); | 580 | ext4_mb_release(sb); |
| 567 | ext4_ext_release(sb); | 581 | ext4_ext_release(sb); |
| 568 | ext4_xattr_put_super(sb); | 582 | ext4_xattr_put_super(sb); |
| @@ -576,7 +590,7 @@ static void ext4_put_super(struct super_block *sb) | |||
| 576 | if (!(sb->s_flags & MS_RDONLY)) { | 590 | if (!(sb->s_flags & MS_RDONLY)) { |
| 577 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 591 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
| 578 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 592 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
| 579 | ext4_commit_super(sb, es, 1); | 593 | ext4_commit_super(sb, 1); |
| 580 | } | 594 | } |
| 581 | if (sbi->s_proc) { | 595 | if (sbi->s_proc) { |
| 582 | remove_proc_entry(sb->s_id, ext4_proc_root); | 596 | remove_proc_entry(sb->s_id, ext4_proc_root); |
| @@ -586,7 +600,10 @@ static void ext4_put_super(struct super_block *sb) | |||
| 586 | for (i = 0; i < sbi->s_gdb_count; i++) | 600 | for (i = 0; i < sbi->s_gdb_count; i++) |
| 587 | brelse(sbi->s_group_desc[i]); | 601 | brelse(sbi->s_group_desc[i]); |
| 588 | kfree(sbi->s_group_desc); | 602 | kfree(sbi->s_group_desc); |
| 589 | kfree(sbi->s_flex_groups); | 603 | if (is_vmalloc_addr(sbi->s_flex_groups)) |
| 604 | vfree(sbi->s_flex_groups); | ||
| 605 | else | ||
| 606 | kfree(sbi->s_flex_groups); | ||
| 590 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 607 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
| 591 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 608 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
| 592 | percpu_counter_destroy(&sbi->s_dirs_counter); | 609 | percpu_counter_destroy(&sbi->s_dirs_counter); |
| @@ -629,7 +646,6 @@ static void ext4_put_super(struct super_block *sb) | |||
| 629 | lock_kernel(); | 646 | lock_kernel(); |
| 630 | kfree(sbi->s_blockgroup_lock); | 647 | kfree(sbi->s_blockgroup_lock); |
| 631 | kfree(sbi); | 648 | kfree(sbi); |
| 632 | return; | ||
| 633 | } | 649 | } |
| 634 | 650 | ||
| 635 | static struct kmem_cache *ext4_inode_cachep; | 651 | static struct kmem_cache *ext4_inode_cachep; |
| @@ -644,6 +660,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 644 | ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); | 660 | ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); |
| 645 | if (!ei) | 661 | if (!ei) |
| 646 | return NULL; | 662 | return NULL; |
| 663 | |||
| 647 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 664 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
| 648 | ei->i_acl = EXT4_ACL_NOT_CACHED; | 665 | ei->i_acl = EXT4_ACL_NOT_CACHED; |
| 649 | ei->i_default_acl = EXT4_ACL_NOT_CACHED; | 666 | ei->i_default_acl = EXT4_ACL_NOT_CACHED; |
| @@ -664,14 +681,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 664 | ei->i_allocated_meta_blocks = 0; | 681 | ei->i_allocated_meta_blocks = 0; |
| 665 | ei->i_delalloc_reserved_flag = 0; | 682 | ei->i_delalloc_reserved_flag = 0; |
| 666 | spin_lock_init(&(ei->i_block_reservation_lock)); | 683 | spin_lock_init(&(ei->i_block_reservation_lock)); |
| 684 | |||
| 667 | return &ei->vfs_inode; | 685 | return &ei->vfs_inode; |
| 668 | } | 686 | } |
| 669 | 687 | ||
| 670 | static void ext4_destroy_inode(struct inode *inode) | 688 | static void ext4_destroy_inode(struct inode *inode) |
| 671 | { | 689 | { |
| 672 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { | 690 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { |
| 673 | printk("EXT4 Inode %p: orphan list check failed!\n", | 691 | ext4_msg(inode->i_sb, KERN_ERR, |
| 674 | EXT4_I(inode)); | 692 | "Inode %lu (%p): orphan list check failed!", |
| 693 | inode->i_ino, EXT4_I(inode)); | ||
| 675 | print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, | 694 | print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, |
| 676 | EXT4_I(inode), sizeof(struct ext4_inode_info), | 695 | EXT4_I(inode), sizeof(struct ext4_inode_info), |
| 677 | true); | 696 | true); |
| @@ -870,12 +889,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
| 870 | seq_puts(seq, ",noauto_da_alloc"); | 889 | seq_puts(seq, ",noauto_da_alloc"); |
| 871 | 890 | ||
| 872 | ext4_show_quota_options(seq, sb); | 891 | ext4_show_quota_options(seq, sb); |
| 892 | |||
| 873 | return 0; | 893 | return 0; |
| 874 | } | 894 | } |
| 875 | 895 | ||
| 876 | |||
| 877 | static struct inode *ext4_nfs_get_inode(struct super_block *sb, | 896 | static struct inode *ext4_nfs_get_inode(struct super_block *sb, |
| 878 | u64 ino, u32 generation) | 897 | u64 ino, u32 generation) |
| 879 | { | 898 | { |
| 880 | struct inode *inode; | 899 | struct inode *inode; |
| 881 | 900 | ||
| @@ -904,14 +923,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, | |||
| 904 | } | 923 | } |
| 905 | 924 | ||
| 906 | static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, | 925 | static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, |
| 907 | int fh_len, int fh_type) | 926 | int fh_len, int fh_type) |
| 908 | { | 927 | { |
| 909 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, | 928 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, |
| 910 | ext4_nfs_get_inode); | 929 | ext4_nfs_get_inode); |
| 911 | } | 930 | } |
| 912 | 931 | ||
| 913 | static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, | 932 | static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, |
| 914 | int fh_len, int fh_type) | 933 | int fh_len, int fh_type) |
| 915 | { | 934 | { |
| 916 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, | 935 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, |
| 917 | ext4_nfs_get_inode); | 936 | ext4_nfs_get_inode); |
| @@ -923,7 +942,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, | |||
| 923 | * which would prevent try_to_free_buffers() from freeing them, we must use | 942 | * which would prevent try_to_free_buffers() from freeing them, we must use |
| 924 | * jbd2 layer's try_to_free_buffers() function to release them. | 943 | * jbd2 layer's try_to_free_buffers() function to release them. |
| 925 | */ | 944 | */ |
| 926 | static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) | 945 | static int bdev_try_to_free_page(struct super_block *sb, struct page *page, |
| 946 | gfp_t wait) | ||
| 927 | { | 947 | { |
| 928 | journal_t *journal = EXT4_SB(sb)->s_journal; | 948 | journal_t *journal = EXT4_SB(sb)->s_journal; |
| 929 | 949 | ||
| @@ -992,7 +1012,6 @@ static const struct super_operations ext4_sops = { | |||
| 992 | .dirty_inode = ext4_dirty_inode, | 1012 | .dirty_inode = ext4_dirty_inode, |
| 993 | .delete_inode = ext4_delete_inode, | 1013 | .delete_inode = ext4_delete_inode, |
| 994 | .put_super = ext4_put_super, | 1014 | .put_super = ext4_put_super, |
| 995 | .write_super = ext4_write_super, | ||
| 996 | .sync_fs = ext4_sync_fs, | 1015 | .sync_fs = ext4_sync_fs, |
| 997 | .freeze_fs = ext4_freeze, | 1016 | .freeze_fs = ext4_freeze, |
| 998 | .unfreeze_fs = ext4_unfreeze, | 1017 | .unfreeze_fs = ext4_unfreeze, |
| @@ -1007,6 +1026,25 @@ static const struct super_operations ext4_sops = { | |||
| 1007 | .bdev_try_to_free_page = bdev_try_to_free_page, | 1026 | .bdev_try_to_free_page = bdev_try_to_free_page, |
| 1008 | }; | 1027 | }; |
| 1009 | 1028 | ||
| 1029 | static const struct super_operations ext4_nojournal_sops = { | ||
| 1030 | .alloc_inode = ext4_alloc_inode, | ||
| 1031 | .destroy_inode = ext4_destroy_inode, | ||
| 1032 | .write_inode = ext4_write_inode, | ||
| 1033 | .dirty_inode = ext4_dirty_inode, | ||
| 1034 | .delete_inode = ext4_delete_inode, | ||
| 1035 | .write_super = ext4_write_super, | ||
| 1036 | .put_super = ext4_put_super, | ||
| 1037 | .statfs = ext4_statfs, | ||
| 1038 | .remount_fs = ext4_remount, | ||
| 1039 | .clear_inode = ext4_clear_inode, | ||
| 1040 | .show_options = ext4_show_options, | ||
| 1041 | #ifdef CONFIG_QUOTA | ||
| 1042 | .quota_read = ext4_quota_read, | ||
| 1043 | .quota_write = ext4_quota_write, | ||
| 1044 | #endif | ||
| 1045 | .bdev_try_to_free_page = bdev_try_to_free_page, | ||
| 1046 | }; | ||
| 1047 | |||
| 1010 | static const struct export_operations ext4_export_ops = { | 1048 | static const struct export_operations ext4_export_ops = { |
| 1011 | .fh_to_dentry = ext4_fh_to_dentry, | 1049 | .fh_to_dentry = ext4_fh_to_dentry, |
| 1012 | .fh_to_parent = ext4_fh_to_parent, | 1050 | .fh_to_parent = ext4_fh_to_parent, |
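The ext4_nojournal_sops table added above mirrors ext4_sops but keeps .write_super and drops the journal-backed .sync_fs/.freeze_fs/.unfreeze_fs entries. Presumably the mount path then picks one of the two tables depending on whether the filesystem carries a journal; a rough sketch of such a selection (the exact condition is an assumption, it is not shown in this hunk):

	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
		sb->s_op = &ext4_sops;			/* journalled: jbd2-aware ops */
	else
		sb->s_op = &ext4_nojournal_sops;	/* no journal: write_super-based ops */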
| @@ -1023,12 +1061,13 @@ enum { | |||
| 1023 | Opt_journal_update, Opt_journal_dev, | 1061 | Opt_journal_update, Opt_journal_dev, |
| 1024 | Opt_journal_checksum, Opt_journal_async_commit, | 1062 | Opt_journal_checksum, Opt_journal_async_commit, |
| 1025 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, | 1063 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, |
| 1026 | Opt_data_err_abort, Opt_data_err_ignore, | 1064 | Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, |
| 1027 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, | 1065 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, |
| 1028 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | 1066 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, |
| 1029 | Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, | 1067 | Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, |
| 1030 | Opt_usrquota, Opt_grpquota, Opt_i_version, | 1068 | Opt_usrquota, Opt_grpquota, Opt_i_version, |
| 1031 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, | 1069 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, |
| 1070 | Opt_block_validity, Opt_noblock_validity, | ||
| 1032 | Opt_inode_readahead_blks, Opt_journal_ioprio | 1071 | Opt_inode_readahead_blks, Opt_journal_ioprio |
| 1033 | }; | 1072 | }; |
| 1034 | 1073 | ||
| @@ -1069,6 +1108,7 @@ static const match_table_t tokens = { | |||
| 1069 | {Opt_data_writeback, "data=writeback"}, | 1108 | {Opt_data_writeback, "data=writeback"}, |
| 1070 | {Opt_data_err_abort, "data_err=abort"}, | 1109 | {Opt_data_err_abort, "data_err=abort"}, |
| 1071 | {Opt_data_err_ignore, "data_err=ignore"}, | 1110 | {Opt_data_err_ignore, "data_err=ignore"}, |
| 1111 | {Opt_mb_history_length, "mb_history_length=%u"}, | ||
| 1072 | {Opt_offusrjquota, "usrjquota="}, | 1112 | {Opt_offusrjquota, "usrjquota="}, |
| 1073 | {Opt_usrjquota, "usrjquota=%s"}, | 1113 | {Opt_usrjquota, "usrjquota=%s"}, |
| 1074 | {Opt_offgrpjquota, "grpjquota="}, | 1114 | {Opt_offgrpjquota, "grpjquota="}, |
| @@ -1087,6 +1127,8 @@ static const match_table_t tokens = { | |||
| 1087 | {Opt_resize, "resize"}, | 1127 | {Opt_resize, "resize"}, |
| 1088 | {Opt_delalloc, "delalloc"}, | 1128 | {Opt_delalloc, "delalloc"}, |
| 1089 | {Opt_nodelalloc, "nodelalloc"}, | 1129 | {Opt_nodelalloc, "nodelalloc"}, |
| 1130 | {Opt_block_validity, "block_validity"}, | ||
| 1131 | {Opt_noblock_validity, "noblock_validity"}, | ||
| 1090 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, | 1132 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, |
| 1091 | {Opt_journal_ioprio, "journal_ioprio=%u"}, | 1133 | {Opt_journal_ioprio, "journal_ioprio=%u"}, |
| 1092 | {Opt_auto_da_alloc, "auto_da_alloc=%u"}, | 1134 | {Opt_auto_da_alloc, "auto_da_alloc=%u"}, |
| @@ -1102,8 +1144,9 @@ static ext4_fsblk_t get_sb_block(void **data) | |||
| 1102 | 1144 | ||
| 1103 | if (!options || strncmp(options, "sb=", 3) != 0) | 1145 | if (!options || strncmp(options, "sb=", 3) != 0) |
| 1104 | return 1; /* Default location */ | 1146 | return 1; /* Default location */ |
| 1147 | |||
| 1105 | options += 3; | 1148 | options += 3; |
| 1106 | /*todo: use simple_strtoll with >32bit ext4 */ | 1149 | /* TODO: use simple_strtoll with >32bit ext4 */ |
| 1107 | sb_block = simple_strtoul(options, &options, 0); | 1150 | sb_block = simple_strtoul(options, &options, 0); |
| 1108 | if (*options && *options != ',') { | 1151 | if (*options && *options != ',') { |
| 1109 | printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", | 1152 | printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", |
| @@ -1113,6 +1156,7 @@ static ext4_fsblk_t get_sb_block(void **data) | |||
| 1113 | if (*options == ',') | 1156 | if (*options == ',') |
| 1114 | options++; | 1157 | options++; |
| 1115 | *data = (void *) options; | 1158 | *data = (void *) options; |
| 1159 | |||
| 1116 | return sb_block; | 1160 | return sb_block; |
| 1117 | } | 1161 | } |
| 1118 | 1162 | ||
| @@ -1206,8 +1250,7 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1206 | #else | 1250 | #else |
| 1207 | case Opt_user_xattr: | 1251 | case Opt_user_xattr: |
| 1208 | case Opt_nouser_xattr: | 1252 | case Opt_nouser_xattr: |
| 1209 | printk(KERN_ERR "EXT4 (no)user_xattr options " | 1253 | ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); |
| 1210 | "not supported\n"); | ||
| 1211 | break; | 1254 | break; |
| 1212 | #endif | 1255 | #endif |
| 1213 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 1256 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
| @@ -1220,8 +1263,7 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1220 | #else | 1263 | #else |
| 1221 | case Opt_acl: | 1264 | case Opt_acl: |
| 1222 | case Opt_noacl: | 1265 | case Opt_noacl: |
| 1223 | printk(KERN_ERR "EXT4 (no)acl options " | 1266 | ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); |
| 1224 | "not supported\n"); | ||
| 1225 | break; | 1267 | break; |
| 1226 | #endif | 1268 | #endif |
| 1227 | case Opt_journal_update: | 1269 | case Opt_journal_update: |
| @@ -1231,16 +1273,16 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1231 | user to specify an existing inode to be the | 1273 | user to specify an existing inode to be the |
| 1232 | journal file. */ | 1274 | journal file. */ |
| 1233 | if (is_remount) { | 1275 | if (is_remount) { |
| 1234 | printk(KERN_ERR "EXT4-fs: cannot specify " | 1276 | ext4_msg(sb, KERN_ERR, |
| 1235 | "journal on remount\n"); | 1277 | "Cannot specify journal on remount"); |
| 1236 | return 0; | 1278 | return 0; |
| 1237 | } | 1279 | } |
| 1238 | set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); | 1280 | set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); |
| 1239 | break; | 1281 | break; |
| 1240 | case Opt_journal_dev: | 1282 | case Opt_journal_dev: |
| 1241 | if (is_remount) { | 1283 | if (is_remount) { |
| 1242 | printk(KERN_ERR "EXT4-fs: cannot specify " | 1284 | ext4_msg(sb, KERN_ERR, |
| 1243 | "journal on remount\n"); | 1285 | "Cannot specify journal on remount"); |
| 1244 | return 0; | 1286 | return 0; |
| 1245 | } | 1287 | } |
| 1246 | if (match_int(&args[0], &option)) | 1288 | if (match_int(&args[0], &option)) |
| @@ -1294,9 +1336,8 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1294 | if (is_remount) { | 1336 | if (is_remount) { |
| 1295 | if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) | 1337 | if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) |
| 1296 | != data_opt) { | 1338 | != data_opt) { |
| 1297 | printk(KERN_ERR | 1339 | ext4_msg(sb, KERN_ERR, |
| 1298 | "EXT4-fs: cannot change data " | 1340 | "Cannot change data mode on remount"); |
| 1299 | "mode on remount\n"); | ||
| 1300 | return 0; | 1341 | return 0; |
| 1301 | } | 1342 | } |
| 1302 | } else { | 1343 | } else { |
| @@ -1310,6 +1351,13 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1310 | case Opt_data_err_ignore: | 1351 | case Opt_data_err_ignore: |
| 1311 | clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); | 1352 | clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); |
| 1312 | break; | 1353 | break; |
| 1354 | case Opt_mb_history_length: | ||
| 1355 | if (match_int(&args[0], &option)) | ||
| 1356 | return 0; | ||
| 1357 | if (option < 0) | ||
| 1358 | return 0; | ||
| 1359 | sbi->s_mb_history_max = option; | ||
| 1360 | break; | ||
| 1313 | #ifdef CONFIG_QUOTA | 1361 | #ifdef CONFIG_QUOTA |
| 1314 | case Opt_usrjquota: | 1362 | case Opt_usrjquota: |
| 1315 | qtype = USRQUOTA; | 1363 | qtype = USRQUOTA; |
| @@ -1319,31 +1367,31 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1319 | set_qf_name: | 1367 | set_qf_name: |
| 1320 | if (sb_any_quota_loaded(sb) && | 1368 | if (sb_any_quota_loaded(sb) && |
| 1321 | !sbi->s_qf_names[qtype]) { | 1369 | !sbi->s_qf_names[qtype]) { |
| 1322 | printk(KERN_ERR | 1370 | ext4_msg(sb, KERN_ERR, |
| 1323 | "EXT4-fs: Cannot change journaled " | 1371 | "Cannot change journaled " |
| 1324 | "quota options when quota turned on.\n"); | 1372 | "quota options when quota turned on"); |
| 1325 | return 0; | 1373 | return 0; |
| 1326 | } | 1374 | } |
| 1327 | qname = match_strdup(&args[0]); | 1375 | qname = match_strdup(&args[0]); |
| 1328 | if (!qname) { | 1376 | if (!qname) { |
| 1329 | printk(KERN_ERR | 1377 | ext4_msg(sb, KERN_ERR, |
| 1330 | "EXT4-fs: not enough memory for " | 1378 | "Not enough memory for " |
| 1331 | "storing quotafile name.\n"); | 1379 | "storing quotafile name"); |
| 1332 | return 0; | 1380 | return 0; |
| 1333 | } | 1381 | } |
| 1334 | if (sbi->s_qf_names[qtype] && | 1382 | if (sbi->s_qf_names[qtype] && |
| 1335 | strcmp(sbi->s_qf_names[qtype], qname)) { | 1383 | strcmp(sbi->s_qf_names[qtype], qname)) { |
| 1336 | printk(KERN_ERR | 1384 | ext4_msg(sb, KERN_ERR, |
| 1337 | "EXT4-fs: %s quota file already " | 1385 | "%s quota file already " |
| 1338 | "specified.\n", QTYPE2NAME(qtype)); | 1386 | "specified", QTYPE2NAME(qtype)); |
| 1339 | kfree(qname); | 1387 | kfree(qname); |
| 1340 | return 0; | 1388 | return 0; |
| 1341 | } | 1389 | } |
| 1342 | sbi->s_qf_names[qtype] = qname; | 1390 | sbi->s_qf_names[qtype] = qname; |
| 1343 | if (strchr(sbi->s_qf_names[qtype], '/')) { | 1391 | if (strchr(sbi->s_qf_names[qtype], '/')) { |
| 1344 | printk(KERN_ERR | 1392 | ext4_msg(sb, KERN_ERR, |
| 1345 | "EXT4-fs: quotafile must be on " | 1393 | "quotafile must be on " |
| 1346 | "filesystem root.\n"); | 1394 | "filesystem root"); |
| 1347 | kfree(sbi->s_qf_names[qtype]); | 1395 | kfree(sbi->s_qf_names[qtype]); |
| 1348 | sbi->s_qf_names[qtype] = NULL; | 1396 | sbi->s_qf_names[qtype] = NULL; |
| 1349 | return 0; | 1397 | return 0; |
| @@ -1358,9 +1406,9 @@ set_qf_name: | |||
| 1358 | clear_qf_name: | 1406 | clear_qf_name: |
| 1359 | if (sb_any_quota_loaded(sb) && | 1407 | if (sb_any_quota_loaded(sb) && |
| 1360 | sbi->s_qf_names[qtype]) { | 1408 | sbi->s_qf_names[qtype]) { |
| 1361 | printk(KERN_ERR "EXT4-fs: Cannot change " | 1409 | ext4_msg(sb, KERN_ERR, "Cannot change " |
| 1362 | "journaled quota options when " | 1410 | "journaled quota options when " |
| 1363 | "quota turned on.\n"); | 1411 | "quota turned on"); |
| 1364 | return 0; | 1412 | return 0; |
| 1365 | } | 1413 | } |
| 1366 | /* | 1414 | /* |
| @@ -1377,9 +1425,9 @@ clear_qf_name: | |||
| 1377 | set_qf_format: | 1425 | set_qf_format: |
| 1378 | if (sb_any_quota_loaded(sb) && | 1426 | if (sb_any_quota_loaded(sb) && |
| 1379 | sbi->s_jquota_fmt != qfmt) { | 1427 | sbi->s_jquota_fmt != qfmt) { |
| 1380 | printk(KERN_ERR "EXT4-fs: Cannot change " | 1428 | ext4_msg(sb, KERN_ERR, "Cannot change " |
| 1381 | "journaled quota options when " | 1429 | "journaled quota options when " |
| 1382 | "quota turned on.\n"); | 1430 | "quota turned on"); |
| 1383 | return 0; | 1431 | return 0; |
| 1384 | } | 1432 | } |
| 1385 | sbi->s_jquota_fmt = qfmt; | 1433 | sbi->s_jquota_fmt = qfmt; |
| @@ -1395,8 +1443,8 @@ set_qf_format: | |||
| 1395 | break; | 1443 | break; |
| 1396 | case Opt_noquota: | 1444 | case Opt_noquota: |
| 1397 | if (sb_any_quota_loaded(sb)) { | 1445 | if (sb_any_quota_loaded(sb)) { |
| 1398 | printk(KERN_ERR "EXT4-fs: Cannot change quota " | 1446 | ext4_msg(sb, KERN_ERR, "Cannot change quota " |
| 1399 | "options when quota turned on.\n"); | 1447 | "options when quota turned on"); |
| 1400 | return 0; | 1448 | return 0; |
| 1401 | } | 1449 | } |
| 1402 | clear_opt(sbi->s_mount_opt, QUOTA); | 1450 | clear_opt(sbi->s_mount_opt, QUOTA); |
| @@ -1407,8 +1455,8 @@ set_qf_format: | |||
| 1407 | case Opt_quota: | 1455 | case Opt_quota: |
| 1408 | case Opt_usrquota: | 1456 | case Opt_usrquota: |
| 1409 | case Opt_grpquota: | 1457 | case Opt_grpquota: |
| 1410 | printk(KERN_ERR | 1458 | ext4_msg(sb, KERN_ERR, |
| 1411 | "EXT4-fs: quota options not supported.\n"); | 1459 | "quota options not supported"); |
| 1412 | break; | 1460 | break; |
| 1413 | case Opt_usrjquota: | 1461 | case Opt_usrjquota: |
| 1414 | case Opt_grpjquota: | 1462 | case Opt_grpjquota: |
| @@ -1416,9 +1464,8 @@ set_qf_format: | |||
| 1416 | case Opt_offgrpjquota: | 1464 | case Opt_offgrpjquota: |
| 1417 | case Opt_jqfmt_vfsold: | 1465 | case Opt_jqfmt_vfsold: |
| 1418 | case Opt_jqfmt_vfsv0: | 1466 | case Opt_jqfmt_vfsv0: |
| 1419 | printk(KERN_ERR | 1467 | ext4_msg(sb, KERN_ERR, |
| 1420 | "EXT4-fs: journaled quota options not " | 1468 | "journaled quota options not supported"); |
| 1421 | "supported.\n"); | ||
| 1422 | break; | 1469 | break; |
| 1423 | case Opt_noquota: | 1470 | case Opt_noquota: |
| 1424 | break; | 1471 | break; |
| @@ -1443,8 +1490,9 @@ set_qf_format: | |||
| 1443 | break; | 1490 | break; |
| 1444 | case Opt_resize: | 1491 | case Opt_resize: |
| 1445 | if (!is_remount) { | 1492 | if (!is_remount) { |
| 1446 | printk("EXT4-fs: resize option only available " | 1493 | ext4_msg(sb, KERN_ERR, |
| 1447 | "for remount\n"); | 1494 | "resize option only available " |
| 1495 | "for remount"); | ||
| 1448 | return 0; | 1496 | return 0; |
| 1449 | } | 1497 | } |
| 1450 | if (match_int(&args[0], &option) != 0) | 1498 | if (match_int(&args[0], &option) != 0) |
| @@ -1474,14 +1522,21 @@ set_qf_format: | |||
| 1474 | case Opt_delalloc: | 1522 | case Opt_delalloc: |
| 1475 | set_opt(sbi->s_mount_opt, DELALLOC); | 1523 | set_opt(sbi->s_mount_opt, DELALLOC); |
| 1476 | break; | 1524 | break; |
| 1525 | case Opt_block_validity: | ||
| 1526 | set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); | ||
| 1527 | break; | ||
| 1528 | case Opt_noblock_validity: | ||
| 1529 | clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); | ||
| 1530 | break; | ||
| 1477 | case Opt_inode_readahead_blks: | 1531 | case Opt_inode_readahead_blks: |
| 1478 | if (match_int(&args[0], &option)) | 1532 | if (match_int(&args[0], &option)) |
| 1479 | return 0; | 1533 | return 0; |
| 1480 | if (option < 0 || option > (1 << 30)) | 1534 | if (option < 0 || option > (1 << 30)) |
| 1481 | return 0; | 1535 | return 0; |
| 1482 | if (option & (option - 1)) { | 1536 | if (!is_power_of_2(option)) { |
| 1483 | printk(KERN_ERR "EXT4-fs: inode_readahead_blks" | 1537 | ext4_msg(sb, KERN_ERR, |
| 1484 | " must be a power of 2\n"); | 1538 | "EXT4-fs: inode_readahead_blks" |
| 1539 | " must be a power of 2"); | ||
| 1485 | return 0; | 1540 | return 0; |
| 1486 | } | 1541 | } |
| 1487 | sbi->s_inode_readahead_blks = option; | 1542 | sbi->s_inode_readahead_blks = option; |
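The options wired up above combine like any other ext4 mount options; an illustrative invocation (device and mount point hypothetical):

	mount -t ext4 -o block_validity,mb_history_length=1000,inode_readahead_blks=64 /dev/sdb1 /mnt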
| @@ -1508,9 +1563,9 @@ set_qf_format: | |||
| 1508 | set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); | 1563 | set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); |
| 1509 | break; | 1564 | break; |
| 1510 | default: | 1565 | default: |
| 1511 | printk(KERN_ERR | 1566 | ext4_msg(sb, KERN_ERR, |
| 1512 | "EXT4-fs: Unrecognized mount option \"%s\" " | 1567 | "Unrecognized mount option \"%s\" " |
| 1513 | "or missing value\n", p); | 1568 | "or missing value", p); |
| 1514 | return 0; | 1569 | return 0; |
| 1515 | } | 1570 | } |
| 1516 | } | 1571 | } |
| @@ -1528,21 +1583,21 @@ set_qf_format: | |||
| 1528 | (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || | 1583 | (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || |
| 1529 | (sbi->s_qf_names[GRPQUOTA] && | 1584 | (sbi->s_qf_names[GRPQUOTA] && |
| 1530 | (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { | 1585 | (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { |
| 1531 | printk(KERN_ERR "EXT4-fs: old and new quota " | 1586 | ext4_msg(sb, KERN_ERR, "old and new quota " |
| 1532 | "format mixing.\n"); | 1587 | "format mixing"); |
| 1533 | return 0; | 1588 | return 0; |
| 1534 | } | 1589 | } |
| 1535 | 1590 | ||
| 1536 | if (!sbi->s_jquota_fmt) { | 1591 | if (!sbi->s_jquota_fmt) { |
| 1537 | printk(KERN_ERR "EXT4-fs: journaled quota format " | 1592 | ext4_msg(sb, KERN_ERR, "journaled quota format " |
| 1538 | "not specified.\n"); | 1593 | "not specified"); |
| 1539 | return 0; | 1594 | return 0; |
| 1540 | } | 1595 | } |
| 1541 | } else { | 1596 | } else { |
| 1542 | if (sbi->s_jquota_fmt) { | 1597 | if (sbi->s_jquota_fmt) { |
| 1543 | printk(KERN_ERR "EXT4-fs: journaled quota format " | 1598 | ext4_msg(sb, KERN_ERR, "journaled quota format " |
| 1544 | "specified with no journaling " | 1599 | "specified with no journaling " |
| 1545 | "enabled.\n"); | 1600 | "enabled"); |
| 1546 | return 0; | 1601 | return 0; |
| 1547 | } | 1602 | } |
| 1548 | } | 1603 | } |
| @@ -1557,32 +1612,32 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
| 1557 | int res = 0; | 1612 | int res = 0; |
| 1558 | 1613 | ||
| 1559 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { | 1614 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { |
| 1560 | printk(KERN_ERR "EXT4-fs warning: revision level too high, " | 1615 | ext4_msg(sb, KERN_ERR, "revision level too high, " |
| 1561 | "forcing read-only mode\n"); | 1616 | "forcing read-only mode"); |
| 1562 | res = MS_RDONLY; | 1617 | res = MS_RDONLY; |
| 1563 | } | 1618 | } |
| 1564 | if (read_only) | 1619 | if (read_only) |
| 1565 | return res; | 1620 | return res; |
| 1566 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) | 1621 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) |
| 1567 | printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " | 1622 | ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " |
| 1568 | "running e2fsck is recommended\n"); | 1623 | "running e2fsck is recommended"); |
| 1569 | else if ((sbi->s_mount_state & EXT4_ERROR_FS)) | 1624 | else if ((sbi->s_mount_state & EXT4_ERROR_FS)) |
| 1570 | printk(KERN_WARNING | 1625 | ext4_msg(sb, KERN_WARNING, |
| 1571 | "EXT4-fs warning: mounting fs with errors, " | 1626 | "warning: mounting fs with errors, " |
| 1572 | "running e2fsck is recommended\n"); | 1627 | "running e2fsck is recommended"); |
| 1573 | else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && | 1628 | else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && |
| 1574 | le16_to_cpu(es->s_mnt_count) >= | 1629 | le16_to_cpu(es->s_mnt_count) >= |
| 1575 | (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) | 1630 | (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) |
| 1576 | printk(KERN_WARNING | 1631 | ext4_msg(sb, KERN_WARNING, |
| 1577 | "EXT4-fs warning: maximal mount count reached, " | 1632 | "warning: maximal mount count reached, " |
| 1578 | "running e2fsck is recommended\n"); | 1633 | "running e2fsck is recommended"); |
| 1579 | else if (le32_to_cpu(es->s_checkinterval) && | 1634 | else if (le32_to_cpu(es->s_checkinterval) && |
| 1580 | (le32_to_cpu(es->s_lastcheck) + | 1635 | (le32_to_cpu(es->s_lastcheck) + |
| 1581 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) | 1636 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) |
| 1582 | printk(KERN_WARNING | 1637 | ext4_msg(sb, KERN_WARNING, |
| 1583 | "EXT4-fs warning: checktime reached, " | 1638 | "warning: checktime reached, " |
| 1584 | "running e2fsck is recommended\n"); | 1639 | "running e2fsck is recommended"); |
| 1585 | if (!sbi->s_journal) | 1640 | if (!sbi->s_journal) |
| 1586 | es->s_state &= cpu_to_le16(~EXT4_VALID_FS); | 1641 | es->s_state &= cpu_to_le16(~EXT4_VALID_FS); |
| 1587 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) | 1642 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) |
| 1588 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); | 1643 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); |
| @@ -1592,7 +1647,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
| 1592 | if (sbi->s_journal) | 1647 | if (sbi->s_journal) |
| 1593 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 1648 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
| 1594 | 1649 | ||
| 1595 | ext4_commit_super(sb, es, 1); | 1650 | ext4_commit_super(sb, 1); |
| 1596 | if (test_opt(sb, DEBUG)) | 1651 | if (test_opt(sb, DEBUG)) |
| 1597 | printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " | 1652 | printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " |
| 1598 | "bpg=%lu, ipg=%lu, mo=%04lx]\n", | 1653 | "bpg=%lu, ipg=%lu, mo=%04lx]\n", |
| @@ -1603,11 +1658,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
| 1603 | sbi->s_mount_opt); | 1658 | sbi->s_mount_opt); |
| 1604 | 1659 | ||
| 1605 | if (EXT4_SB(sb)->s_journal) { | 1660 | if (EXT4_SB(sb)->s_journal) { |
| 1606 | printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", | 1661 | ext4_msg(sb, KERN_INFO, "%s journal on %s", |
| 1607 | sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : | 1662 | EXT4_SB(sb)->s_journal->j_inode ? "internal" : |
| 1608 | "external", EXT4_SB(sb)->s_journal->j_devname); | 1663 | "external", EXT4_SB(sb)->s_journal->j_devname); |
| 1609 | } else { | 1664 | } else { |
| 1610 | printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); | 1665 | ext4_msg(sb, KERN_INFO, "no journal"); |
| 1611 | } | 1666 | } |
| 1612 | return res; | 1667 | return res; |
| 1613 | } | 1668 | } |
| @@ -1616,10 +1671,10 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
| 1616 | { | 1671 | { |
| 1617 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1672 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 1618 | struct ext4_group_desc *gdp = NULL; | 1673 | struct ext4_group_desc *gdp = NULL; |
| 1619 | struct buffer_head *bh; | ||
| 1620 | ext4_group_t flex_group_count; | 1674 | ext4_group_t flex_group_count; |
| 1621 | ext4_group_t flex_group; | 1675 | ext4_group_t flex_group; |
| 1622 | int groups_per_flex = 0; | 1676 | int groups_per_flex = 0; |
| 1677 | size_t size; | ||
| 1623 | int i; | 1678 | int i; |
| 1624 | 1679 | ||
| 1625 | if (!sbi->s_es->s_log_groups_per_flex) { | 1680 | if (!sbi->s_es->s_log_groups_per_flex) { |
| @@ -1634,16 +1689,21 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
| 1634 | flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + | 1689 | flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + |
| 1635 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << | 1690 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << |
| 1636 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; | 1691 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; |
| 1637 | sbi->s_flex_groups = kzalloc(flex_group_count * | 1692 | size = flex_group_count * sizeof(struct flex_groups); |
| 1638 | sizeof(struct flex_groups), GFP_KERNEL); | 1693 | sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); |
| 1694 | if (sbi->s_flex_groups == NULL) { | ||
| 1695 | sbi->s_flex_groups = vmalloc(size); | ||
| 1696 | if (sbi->s_flex_groups) | ||
| 1697 | memset(sbi->s_flex_groups, 0, size); | ||
| 1698 | } | ||
| 1639 | if (sbi->s_flex_groups == NULL) { | 1699 | if (sbi->s_flex_groups == NULL) { |
| 1640 | printk(KERN_ERR "EXT4-fs: not enough memory for " | 1700 | ext4_msg(sb, KERN_ERR, "not enough memory for " |
| 1641 | "%u flex groups\n", flex_group_count); | 1701 | "%u flex groups", flex_group_count); |
| 1642 | goto failed; | 1702 | goto failed; |
| 1643 | } | 1703 | } |
| 1644 | 1704 | ||
| 1645 | for (i = 0; i < sbi->s_groups_count; i++) { | 1705 | for (i = 0; i < sbi->s_groups_count; i++) { |
| 1646 | gdp = ext4_get_group_desc(sb, i, &bh); | 1706 | gdp = ext4_get_group_desc(sb, i, NULL); |
| 1647 | 1707 | ||
| 1648 | flex_group = ext4_flex_group(sbi, i); | 1708 | flex_group = ext4_flex_group(sbi, i); |
| 1649 | atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, | 1709 | atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, |
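The allocation above pairs with the release path changed earlier in ext4_put_super(); a minimal sketch of the pattern (kzalloc first, vmalloc as a fallback for large flex-group arrays, then the matching free):

	p = kzalloc(size, GFP_KERNEL);		/* fast path for small arrays */
	if (p == NULL) {
		p = vmalloc(size);		/* fall back when a kmalloc-sized chunk is unavailable */
		if (p)
			memset(p, 0, size);	/* vmalloc does not zero */
	}
	...
	if (is_vmalloc_addr(p))			/* free with the allocator that provided it */
		vfree(p);
	else
		kfree(p);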
| @@ -1724,44 +1784,44 @@ static int ext4_check_descriptors(struct super_block *sb) | |||
| 1724 | 1784 | ||
| 1725 | block_bitmap = ext4_block_bitmap(sb, gdp); | 1785 | block_bitmap = ext4_block_bitmap(sb, gdp); |
| 1726 | if (block_bitmap < first_block || block_bitmap > last_block) { | 1786 | if (block_bitmap < first_block || block_bitmap > last_block) { |
| 1727 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1787 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
| 1728 | "Block bitmap for group %u not in group " | 1788 | "Block bitmap for group %u not in group " |
| 1729 | "(block %llu)!\n", i, block_bitmap); | 1789 | "(block %llu)!", i, block_bitmap); |
| 1730 | return 0; | 1790 | return 0; |
| 1731 | } | 1791 | } |
| 1732 | inode_bitmap = ext4_inode_bitmap(sb, gdp); | 1792 | inode_bitmap = ext4_inode_bitmap(sb, gdp); |
| 1733 | if (inode_bitmap < first_block || inode_bitmap > last_block) { | 1793 | if (inode_bitmap < first_block || inode_bitmap > last_block) { |
| 1734 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1794 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
| 1735 | "Inode bitmap for group %u not in group " | 1795 | "Inode bitmap for group %u not in group " |
| 1736 | "(block %llu)!\n", i, inode_bitmap); | 1796 | "(block %llu)!", i, inode_bitmap); |
| 1737 | return 0; | 1797 | return 0; |
| 1738 | } | 1798 | } |
| 1739 | inode_table = ext4_inode_table(sb, gdp); | 1799 | inode_table = ext4_inode_table(sb, gdp); |
| 1740 | if (inode_table < first_block || | 1800 | if (inode_table < first_block || |
| 1741 | inode_table + sbi->s_itb_per_group - 1 > last_block) { | 1801 | inode_table + sbi->s_itb_per_group - 1 > last_block) { |
| 1742 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1802 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
| 1743 | "Inode table for group %u not in group " | 1803 | "Inode table for group %u not in group " |
| 1744 | "(block %llu)!\n", i, inode_table); | 1804 | "(block %llu)!", i, inode_table); |
| 1745 | return 0; | 1805 | return 0; |
| 1746 | } | 1806 | } |
| 1747 | spin_lock(sb_bgl_lock(sbi, i)); | 1807 | ext4_lock_group(sb, i); |
| 1748 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { | 1808 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { |
| 1749 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1809 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
| 1750 | "Checksum for group %u failed (%u!=%u)\n", | 1810 | "Checksum for group %u failed (%u!=%u)", |
| 1751 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, | 1811 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, |
| 1752 | gdp)), le16_to_cpu(gdp->bg_checksum)); | 1812 | gdp)), le16_to_cpu(gdp->bg_checksum)); |
| 1753 | if (!(sb->s_flags & MS_RDONLY)) { | 1813 | if (!(sb->s_flags & MS_RDONLY)) { |
| 1754 | spin_unlock(sb_bgl_lock(sbi, i)); | 1814 | ext4_unlock_group(sb, i); |
| 1755 | return 0; | 1815 | return 0; |
| 1756 | } | 1816 | } |
| 1757 | } | 1817 | } |
| 1758 | spin_unlock(sb_bgl_lock(sbi, i)); | 1818 | ext4_unlock_group(sb, i); |
| 1759 | if (!flexbg_flag) | 1819 | if (!flexbg_flag) |
| 1760 | first_block += EXT4_BLOCKS_PER_GROUP(sb); | 1820 | first_block += EXT4_BLOCKS_PER_GROUP(sb); |
| 1761 | } | 1821 | } |
| 1762 | 1822 | ||
| 1763 | ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); | 1823 | ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); |
| 1764 | sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); | 1824 | sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); |
| 1765 | return 1; | 1825 | return 1; |
| 1766 | } | 1826 | } |
| 1767 | 1827 | ||
| @@ -1796,8 +1856,8 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 1796 | } | 1856 | } |
| 1797 | 1857 | ||
| 1798 | if (bdev_read_only(sb->s_bdev)) { | 1858 | if (bdev_read_only(sb->s_bdev)) { |
| 1799 | printk(KERN_ERR "EXT4-fs: write access " | 1859 | ext4_msg(sb, KERN_ERR, "write access " |
| 1800 | "unavailable, skipping orphan cleanup.\n"); | 1860 | "unavailable, skipping orphan cleanup"); |
| 1801 | return; | 1861 | return; |
| 1802 | } | 1862 | } |
| 1803 | 1863 | ||
| @@ -1811,8 +1871,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 1811 | } | 1871 | } |
| 1812 | 1872 | ||
| 1813 | if (s_flags & MS_RDONLY) { | 1873 | if (s_flags & MS_RDONLY) { |
| 1814 | printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n", | 1874 | ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); |
| 1815 | sb->s_id); | ||
| 1816 | sb->s_flags &= ~MS_RDONLY; | 1875 | sb->s_flags &= ~MS_RDONLY; |
| 1817 | } | 1876 | } |
| 1818 | #ifdef CONFIG_QUOTA | 1877 | #ifdef CONFIG_QUOTA |
| @@ -1823,9 +1882,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 1823 | if (EXT4_SB(sb)->s_qf_names[i]) { | 1882 | if (EXT4_SB(sb)->s_qf_names[i]) { |
| 1824 | int ret = ext4_quota_on_mount(sb, i); | 1883 | int ret = ext4_quota_on_mount(sb, i); |
| 1825 | if (ret < 0) | 1884 | if (ret < 0) |
| 1826 | printk(KERN_ERR | 1885 | ext4_msg(sb, KERN_ERR, |
| 1827 | "EXT4-fs: Cannot turn on journaled " | 1886 | "Cannot turn on journaled " |
| 1828 | "quota: error %d\n", ret); | 1887 | "quota: error %d", ret); |
| 1829 | } | 1888 | } |
| 1830 | } | 1889 | } |
| 1831 | #endif | 1890 | #endif |
| @@ -1842,16 +1901,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 1842 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); | 1901 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); |
| 1843 | vfs_dq_init(inode); | 1902 | vfs_dq_init(inode); |
| 1844 | if (inode->i_nlink) { | 1903 | if (inode->i_nlink) { |
| 1845 | printk(KERN_DEBUG | 1904 | ext4_msg(sb, KERN_DEBUG, |
| 1846 | "%s: truncating inode %lu to %lld bytes\n", | 1905 | "%s: truncating inode %lu to %lld bytes", |
| 1847 | __func__, inode->i_ino, inode->i_size); | 1906 | __func__, inode->i_ino, inode->i_size); |
| 1848 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", | 1907 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", |
| 1849 | inode->i_ino, inode->i_size); | 1908 | inode->i_ino, inode->i_size); |
| 1850 | ext4_truncate(inode); | 1909 | ext4_truncate(inode); |
| 1851 | nr_truncates++; | 1910 | nr_truncates++; |
| 1852 | } else { | 1911 | } else { |
| 1853 | printk(KERN_DEBUG | 1912 | ext4_msg(sb, KERN_DEBUG, |
| 1854 | "%s: deleting unreferenced inode %lu\n", | 1913 | "%s: deleting unreferenced inode %lu", |
| 1855 | __func__, inode->i_ino); | 1914 | __func__, inode->i_ino); |
| 1856 | jbd_debug(2, "deleting unreferenced inode %lu\n", | 1915 | jbd_debug(2, "deleting unreferenced inode %lu\n", |
| 1857 | inode->i_ino); | 1916 | inode->i_ino); |
| @@ -1863,11 +1922,11 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 1863 | #define PLURAL(x) (x), ((x) == 1) ? "" : "s" | 1922 | #define PLURAL(x) (x), ((x) == 1) ? "" : "s" |
| 1864 | 1923 | ||
| 1865 | if (nr_orphans) | 1924 | if (nr_orphans) |
| 1866 | printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", | 1925 | ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", |
| 1867 | sb->s_id, PLURAL(nr_orphans)); | 1926 | PLURAL(nr_orphans)); |
| 1868 | if (nr_truncates) | 1927 | if (nr_truncates) |
| 1869 | printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n", | 1928 | ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", |
| 1870 | sb->s_id, PLURAL(nr_truncates)); | 1929 | PLURAL(nr_truncates)); |
| 1871 | #ifdef CONFIG_QUOTA | 1930 | #ifdef CONFIG_QUOTA |
| 1872 | /* Turn quotas off */ | 1931 | /* Turn quotas off */ |
| 1873 | for (i = 0; i < MAXQUOTAS; i++) { | 1932 | for (i = 0; i < MAXQUOTAS; i++) { |
| @@ -1877,6 +1936,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 1877 | #endif | 1936 | #endif |
| 1878 | sb->s_flags = s_flags; /* Restore MS_RDONLY status */ | 1937 | sb->s_flags = s_flags; /* Restore MS_RDONLY status */ |
| 1879 | } | 1938 | } |
| 1939 | |||
| 1880 | /* | 1940 | /* |
| 1881 | * Maximal extent format file size. | 1941 | * Maximal extent format file size. |
| 1882 | * Resulting logical blkno at s_maxbytes must fit in our on-disk | 1942 | * Resulting logical blkno at s_maxbytes must fit in our on-disk |
| @@ -1927,19 +1987,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) | |||
| 1927 | loff_t res = EXT4_NDIR_BLOCKS; | 1987 | loff_t res = EXT4_NDIR_BLOCKS; |
| 1928 | int meta_blocks; | 1988 | int meta_blocks; |
| 1929 | loff_t upper_limit; | 1989 | loff_t upper_limit; |
| 1930 | /* This is calculated to be the largest file size for a | 1990 | /* This is calculated to be the largest file size for a dense, block |
| 1931 | * dense, bitmapped file such that the total number of | 1991 | * mapped file such that the file's total number of 512-byte sectors, |
| 1932 | * sectors in the file, including data and all indirect blocks, | 1992 | * including data and all indirect blocks, does not exceed (2^48 - 1). |
| 1933 | * does not exceed 2^48 -1 | 1993 | * |
| 1934 | * __u32 i_blocks_lo and _u16 i_blocks_high representing the | 1994 | * __u32 i_blocks_lo and _u16 i_blocks_high represent the total |
| 1935 | * total number of 512 bytes blocks of the file | 1995 | * number of 512-byte sectors of the file. |
| 1936 | */ | 1996 | */ |
| 1937 | 1997 | ||
| 1938 | if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { | 1998 | if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { |
| 1939 | /* | 1999 | /* |
| 1940 | * !has_huge_files or CONFIG_LBD is not enabled | 2000 | * !has_huge_files or CONFIG_LBD not enabled implies that |
| 1941 | * implies the inode i_block represent total blocks in | 2001 | * the inode i_block field represents total file blocks in |
| 1942 | * 512 bytes 32 == size of vfs inode i_blocks * 8 | 2002 | * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 |
| 1943 | */ | 2003 | */ |
| 1944 | upper_limit = (1LL << 32) - 1; | 2004 | upper_limit = (1LL << 32) - 1; |
| 1945 | 2005 | ||
| @@ -1981,7 +2041,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) | |||
| 1981 | } | 2041 | } |
| 1982 | 2042 | ||
| 1983 | static ext4_fsblk_t descriptor_loc(struct super_block *sb, | 2043 | static ext4_fsblk_t descriptor_loc(struct super_block *sb, |
| 1984 | ext4_fsblk_t logical_sb_block, int nr) | 2044 | ext4_fsblk_t logical_sb_block, int nr) |
| 1985 | { | 2045 | { |
| 1986 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2046 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 1987 | ext4_group_t bg, first_meta_bg; | 2047 | ext4_group_t bg, first_meta_bg; |
| @@ -1995,6 +2055,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, | |||
| 1995 | bg = sbi->s_desc_per_block * nr; | 2055 | bg = sbi->s_desc_per_block * nr; |
| 1996 | if (ext4_bg_has_super(sb, bg)) | 2056 | if (ext4_bg_has_super(sb, bg)) |
| 1997 | has_super = 1; | 2057 | has_super = 1; |
| 2058 | |||
| 1998 | return (has_super + ext4_group_first_block_no(sb, bg)); | 2059 | return (has_super + ext4_group_first_block_no(sb, bg)); |
| 1999 | } | 2060 | } |
| 2000 | 2061 | ||
| @@ -2091,8 +2152,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | |||
| 2091 | if (parse_strtoul(buf, 0x40000000, &t)) | 2152 | if (parse_strtoul(buf, 0x40000000, &t)) |
| 2092 | return -EINVAL; | 2153 | return -EINVAL; |
| 2093 | 2154 | ||
| 2094 | /* inode_readahead_blks must be a power of 2 */ | 2155 | if (!is_power_of_2(t)) |
| 2095 | if (t & (t-1)) | ||
| 2096 | return -EINVAL; | 2156 | return -EINVAL; |
| 2097 | 2157 | ||
| 2098 | sbi->s_inode_readahead_blks = t; | 2158 | sbi->s_inode_readahead_blks = t; |
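For reference, is_power_of_2() from <linux/log2.h> is essentially:

	static inline bool is_power_of_2(unsigned long n)
	{
		return n != 0 && (n & (n - 1)) == 0;
	}

so, unlike the open-coded (t & (t-1)) test it replaces, it also rejects t == 0.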
| @@ -2100,7 +2160,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | |||
| 2100 | } | 2160 | } |
| 2101 | 2161 | ||
| 2102 | static ssize_t sbi_ui_show(struct ext4_attr *a, | 2162 | static ssize_t sbi_ui_show(struct ext4_attr *a, |
| 2103 | struct ext4_sb_info *sbi, char *buf) | 2163 | struct ext4_sb_info *sbi, char *buf) |
| 2104 | { | 2164 | { |
| 2105 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); | 2165 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); |
| 2106 | 2166 | ||
| @@ -2205,7 +2265,6 @@ static struct kobj_type ext4_ktype = { | |||
| 2205 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | 2265 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
| 2206 | __releases(kernel_lock) | 2266 | __releases(kernel_lock) |
| 2207 | __acquires(kernel_lock) | 2267 | __acquires(kernel_lock) |
| 2208 | |||
| 2209 | { | 2268 | { |
| 2210 | struct buffer_head *bh; | 2269 | struct buffer_head *bh; |
| 2211 | struct ext4_super_block *es = NULL; | 2270 | struct ext4_super_block *es = NULL; |
| @@ -2256,7 +2315,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2256 | 2315 | ||
| 2257 | blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); | 2316 | blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); |
| 2258 | if (!blocksize) { | 2317 | if (!blocksize) { |
| 2259 | printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); | 2318 | ext4_msg(sb, KERN_ERR, "unable to set blocksize"); |
| 2260 | goto out_fail; | 2319 | goto out_fail; |
| 2261 | } | 2320 | } |
| 2262 | 2321 | ||
| @@ -2272,7 +2331,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2272 | } | 2331 | } |
| 2273 | 2332 | ||
| 2274 | if (!(bh = sb_bread(sb, logical_sb_block))) { | 2333 | if (!(bh = sb_bread(sb, logical_sb_block))) { |
| 2275 | printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); | 2334 | ext4_msg(sb, KERN_ERR, "unable to read superblock"); |
| 2276 | goto out_fail; | 2335 | goto out_fail; |
| 2277 | } | 2336 | } |
| 2278 | /* | 2337 | /* |
| @@ -2321,6 +2380,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2321 | sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; | 2380 | sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; |
| 2322 | sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; | 2381 | sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; |
| 2323 | sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; | 2382 | sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; |
| 2383 | sbi->s_mb_history_max = default_mb_history_length; | ||
| 2324 | 2384 | ||
| 2325 | set_opt(sbi->s_mount_opt, BARRIER); | 2385 | set_opt(sbi->s_mount_opt, BARRIER); |
| 2326 | 2386 | ||
| @@ -2330,7 +2390,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2330 | */ | 2390 | */ |
| 2331 | set_opt(sbi->s_mount_opt, DELALLOC); | 2391 | set_opt(sbi->s_mount_opt, DELALLOC); |
| 2332 | 2392 | ||
| 2333 | |||
| 2334 | if (!parse_options((char *) data, sb, &journal_devnum, | 2393 | if (!parse_options((char *) data, sb, &journal_devnum, |
| 2335 | &journal_ioprio, NULL, 0)) | 2394 | &journal_ioprio, NULL, 0)) |
| 2336 | goto failed_mount; | 2395 | goto failed_mount; |
| @@ -2342,9 +2401,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2342 | (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || | 2401 | (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || |
| 2343 | EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || | 2402 | EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || |
| 2344 | EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) | 2403 | EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) |
| 2345 | printk(KERN_WARNING | 2404 | ext4_msg(sb, KERN_WARNING, |
| 2346 | "EXT4-fs warning: feature flags set on rev 0 fs, " | 2405 | "feature flags set on rev 0 fs, " |
| 2347 | "running e2fsck is recommended\n"); | 2406 | "running e2fsck is recommended"); |
| 2348 | 2407 | ||
| 2349 | /* | 2408 | /* |
| 2350 | * Check feature flags regardless of the revision level, since we | 2409 | * Check feature flags regardless of the revision level, since we |
| @@ -2353,16 +2412,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2353 | */ | 2412 | */ |
| 2354 | features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); | 2413 | features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); |
| 2355 | if (features) { | 2414 | if (features) { |
| 2356 | printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " | 2415 | ext4_msg(sb, KERN_ERR, |
| 2357 | "unsupported optional features (%x).\n", sb->s_id, | 2416 | "Couldn't mount because of " |
| 2417 | "unsupported optional features (%x)", | ||
| 2358 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & | 2418 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & |
| 2359 | ~EXT4_FEATURE_INCOMPAT_SUPP)); | 2419 | ~EXT4_FEATURE_INCOMPAT_SUPP)); |
| 2360 | goto failed_mount; | 2420 | goto failed_mount; |
| 2361 | } | 2421 | } |
| 2362 | features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); | 2422 | features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); |
| 2363 | if (!(sb->s_flags & MS_RDONLY) && features) { | 2423 | if (!(sb->s_flags & MS_RDONLY) && features) { |
| 2364 | printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " | 2424 | ext4_msg(sb, KERN_ERR, |
| 2365 | "unsupported optional features (%x).\n", sb->s_id, | 2425 | "Couldn't mount RDWR because of " |
| 2426 | "unsupported optional features (%x)", | ||
| 2366 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & | 2427 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & |
| 2367 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | 2428 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); |
| 2368 | goto failed_mount; | 2429 | goto failed_mount; |
| @@ -2376,9 +2437,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2376 | */ | 2437 | */ |
| 2377 | if (sizeof(root->i_blocks) < sizeof(u64) && | 2438 | if (sizeof(root->i_blocks) < sizeof(u64) && |
| 2378 | !(sb->s_flags & MS_RDONLY)) { | 2439 | !(sb->s_flags & MS_RDONLY)) { |
| 2379 | printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " | 2440 | ext4_msg(sb, KERN_ERR, "Filesystem with huge " |
| 2380 | "files cannot be mounted read-write " | 2441 | "files cannot be mounted read-write " |
| 2381 | "without CONFIG_LBD.\n", sb->s_id); | 2442 | "without CONFIG_LBD"); |
| 2382 | goto failed_mount; | 2443 | goto failed_mount; |
| 2383 | } | 2444 | } |
| 2384 | } | 2445 | } |
| @@ -2386,17 +2447,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2386 | 2447 | ||
| 2387 | if (blocksize < EXT4_MIN_BLOCK_SIZE || | 2448 | if (blocksize < EXT4_MIN_BLOCK_SIZE || |
| 2388 | blocksize > EXT4_MAX_BLOCK_SIZE) { | 2449 | blocksize > EXT4_MAX_BLOCK_SIZE) { |
| 2389 | printk(KERN_ERR | 2450 | ext4_msg(sb, KERN_ERR, |
| 2390 | "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n", | 2451 | "Unsupported filesystem blocksize %d", blocksize); |
| 2391 | blocksize, sb->s_id); | ||
| 2392 | goto failed_mount; | 2452 | goto failed_mount; |
| 2393 | } | 2453 | } |
| 2394 | 2454 | ||
| 2395 | if (sb->s_blocksize != blocksize) { | 2455 | if (sb->s_blocksize != blocksize) { |
| 2396 | |||
| 2397 | /* Validate the filesystem blocksize */ | 2456 | /* Validate the filesystem blocksize */ |
| 2398 | if (!sb_set_blocksize(sb, blocksize)) { | 2457 | if (!sb_set_blocksize(sb, blocksize)) { |
| 2399 | printk(KERN_ERR "EXT4-fs: bad block size %d.\n", | 2458 | ext4_msg(sb, KERN_ERR, "bad block size %d", |
| 2400 | blocksize); | 2459 | blocksize); |
| 2401 | goto failed_mount; | 2460 | goto failed_mount; |
| 2402 | } | 2461 | } |
| @@ -2406,15 +2465,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2406 | offset = do_div(logical_sb_block, blocksize); | 2465 | offset = do_div(logical_sb_block, blocksize); |
| 2407 | bh = sb_bread(sb, logical_sb_block); | 2466 | bh = sb_bread(sb, logical_sb_block); |
| 2408 | if (!bh) { | 2467 | if (!bh) { |
| 2409 | printk(KERN_ERR | 2468 | ext4_msg(sb, KERN_ERR, |
| 2410 | "EXT4-fs: Can't read superblock on 2nd try.\n"); | 2469 | "Can't read superblock on 2nd try"); |
| 2411 | goto failed_mount; | 2470 | goto failed_mount; |
| 2412 | } | 2471 | } |
| 2413 | es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); | 2472 | es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); |
| 2414 | sbi->s_es = es; | 2473 | sbi->s_es = es; |
| 2415 | if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { | 2474 | if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { |
| 2416 | printk(KERN_ERR | 2475 | ext4_msg(sb, KERN_ERR, |
| 2417 | "EXT4-fs: Magic mismatch, very weird !\n"); | 2476 | "Magic mismatch, very weird!"); |
| 2418 | goto failed_mount; | 2477 | goto failed_mount; |
| 2419 | } | 2478 | } |
| 2420 | } | 2479 | } |
| @@ -2432,30 +2491,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2432 | if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || | 2491 | if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || |
| 2433 | (!is_power_of_2(sbi->s_inode_size)) || | 2492 | (!is_power_of_2(sbi->s_inode_size)) || |
| 2434 | (sbi->s_inode_size > blocksize)) { | 2493 | (sbi->s_inode_size > blocksize)) { |
| 2435 | printk(KERN_ERR | 2494 | ext4_msg(sb, KERN_ERR, |
| 2436 | "EXT4-fs: unsupported inode size: %d\n", | 2495 | "unsupported inode size: %d", |
| 2437 | sbi->s_inode_size); | 2496 | sbi->s_inode_size); |
| 2438 | goto failed_mount; | 2497 | goto failed_mount; |
| 2439 | } | 2498 | } |
| 2440 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) | 2499 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) |
| 2441 | sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); | 2500 | sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); |
| 2442 | } | 2501 | } |
| 2502 | |||
| 2443 | sbi->s_desc_size = le16_to_cpu(es->s_desc_size); | 2503 | sbi->s_desc_size = le16_to_cpu(es->s_desc_size); |
| 2444 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { | 2504 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { |
| 2445 | if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || | 2505 | if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || |
| 2446 | sbi->s_desc_size > EXT4_MAX_DESC_SIZE || | 2506 | sbi->s_desc_size > EXT4_MAX_DESC_SIZE || |
| 2447 | !is_power_of_2(sbi->s_desc_size)) { | 2507 | !is_power_of_2(sbi->s_desc_size)) { |
| 2448 | printk(KERN_ERR | 2508 | ext4_msg(sb, KERN_ERR, |
| 2449 | "EXT4-fs: unsupported descriptor size %lu\n", | 2509 | "unsupported descriptor size %lu", |
| 2450 | sbi->s_desc_size); | 2510 | sbi->s_desc_size); |
| 2451 | goto failed_mount; | 2511 | goto failed_mount; |
| 2452 | } | 2512 | } |
| 2453 | } else | 2513 | } else |
| 2454 | sbi->s_desc_size = EXT4_MIN_DESC_SIZE; | 2514 | sbi->s_desc_size = EXT4_MIN_DESC_SIZE; |
| 2515 | |||
| 2455 | sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); | 2516 | sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); |
| 2456 | sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); | 2517 | sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); |
| 2457 | if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) | 2518 | if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) |
| 2458 | goto cantfind_ext4; | 2519 | goto cantfind_ext4; |
| 2520 | |||
| 2459 | sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); | 2521 | sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); |
| 2460 | if (sbi->s_inodes_per_block == 0) | 2522 | if (sbi->s_inodes_per_block == 0) |
| 2461 | goto cantfind_ext4; | 2523 | goto cantfind_ext4; |
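The inode-size checks above require at least 128 bytes, a power of two, and no more than one block; the inodes-per-block figure then follows directly. A small sketch of the same constraints with made-up example values:

```c
/* Sketch of the inode-size sanity checks and the inodes-per-block
 * calculation.  Values are illustrative only. */
#include <stdio.h>
#include <stdbool.h>

#define GOOD_OLD_INODE_SIZE 128

static bool is_power_of_2(unsigned int n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static bool inode_size_ok(unsigned int inode_size, unsigned int blocksize)
{
	return inode_size >= GOOD_OLD_INODE_SIZE &&
	       is_power_of_2(inode_size) &&
	       inode_size <= blocksize;
}

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int sizes[] = { 128, 256, 300, 8192 };

	for (int i = 0; i < 4; i++) {
		unsigned int sz = sizes[i];

		if (inode_size_ok(sz, blocksize))
			printf("inode size %4u: ok, %u inodes per block\n",
			       sz, blocksize / sz);
		else
			printf("inode size %4u: rejected\n", sz);
	}
	return 0;
}
```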
| @@ -2466,6 +2528,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2466 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 2528 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
| 2467 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); | 2529 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); |
| 2468 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); | 2530 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); |
| 2531 | |||
| 2469 | for (i = 0; i < 4; i++) | 2532 | for (i = 0; i < 4; i++) |
| 2470 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | 2533 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); |
| 2471 | sbi->s_def_hash_version = es->s_def_hash_version; | 2534 | sbi->s_def_hash_version = es->s_def_hash_version; |
| @@ -2483,25 +2546,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2483 | } | 2546 | } |
| 2484 | 2547 | ||
| 2485 | if (sbi->s_blocks_per_group > blocksize * 8) { | 2548 | if (sbi->s_blocks_per_group > blocksize * 8) { |
| 2486 | printk(KERN_ERR | 2549 | ext4_msg(sb, KERN_ERR, |
| 2487 | "EXT4-fs: #blocks per group too big: %lu\n", | 2550 | "#blocks per group too big: %lu", |
| 2488 | sbi->s_blocks_per_group); | 2551 | sbi->s_blocks_per_group); |
| 2489 | goto failed_mount; | 2552 | goto failed_mount; |
| 2490 | } | 2553 | } |
| 2491 | if (sbi->s_inodes_per_group > blocksize * 8) { | 2554 | if (sbi->s_inodes_per_group > blocksize * 8) { |
| 2492 | printk(KERN_ERR | 2555 | ext4_msg(sb, KERN_ERR, |
| 2493 | "EXT4-fs: #inodes per group too big: %lu\n", | 2556 | "#inodes per group too big: %lu", |
| 2494 | sbi->s_inodes_per_group); | 2557 | sbi->s_inodes_per_group); |
| 2495 | goto failed_mount; | 2558 | goto failed_mount; |
| 2496 | } | 2559 | } |
| 2497 | 2560 | ||
| 2498 | if (ext4_blocks_count(es) > | 2561 | if (ext4_blocks_count(es) > |
| 2499 | (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { | 2562 | (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { |
| 2500 | printk(KERN_ERR "EXT4-fs: filesystem on %s:" | 2563 | ext4_msg(sb, KERN_ERR, "filesystem" |
| 2501 | " too large to mount safely\n", sb->s_id); | 2564 | " too large to mount safely"); |
| 2502 | if (sizeof(sector_t) < 8) | 2565 | if (sizeof(sector_t) < 8) |
| 2503 | printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " | 2566 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled"); |
| 2504 | "enabled\n"); | ||
| 2505 | goto failed_mount; | 2567 | goto failed_mount; |
| 2506 | } | 2568 | } |
| 2507 | 2569 | ||
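The "too large to mount safely" test above bounds the block count by what a sector_t can address once blocks are converted to 512-byte sectors; without CONFIG_LBD, sector_t is only 32 bits wide. A sketch of that bound, with sector_t width modelled by plain integer widths:

```c
/* Sketch of the (~0 >> (blocksize_bits - 9)) mount-size ceiling. */
#include <stdio.h>
#include <stdint.h>

static uint64_t max_blocks(unsigned int sector_bits, unsigned int blocksize_bits)
{
	uint64_t max_sector = (sector_bits == 64) ? ~0ULL : 0xffffffffULL;

	return max_sector >> (blocksize_bits - 9);
}

int main(void)
{
	/* 4 KiB blocks: blocksize_bits == 12 */
	printf("32-bit sector_t, 4K blocks: %llu blocks max\n",
	       (unsigned long long)max_blocks(32, 12));
	printf("64-bit sector_t, 4K blocks: %llu blocks max\n",
	       (unsigned long long)max_blocks(64, 12));
	return 0;
}
```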
| @@ -2511,21 +2573,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2511 | /* check blocks count against device size */ | 2573 | /* check blocks count against device size */ |
| 2512 | blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; | 2574 | blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; |
| 2513 | if (blocks_count && ext4_blocks_count(es) > blocks_count) { | 2575 | if (blocks_count && ext4_blocks_count(es) > blocks_count) { |
| 2514 | printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " | 2576 | ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " |
| 2515 | "exceeds size of device (%llu blocks)\n", | 2577 | "exceeds size of device (%llu blocks)", |
| 2516 | ext4_blocks_count(es), blocks_count); | 2578 | ext4_blocks_count(es), blocks_count); |
| 2517 | goto failed_mount; | 2579 | goto failed_mount; |
| 2518 | } | 2580 | } |
| 2519 | 2581 | ||
| 2520 | /* | 2582 | /* |
| 2521 | * It makes no sense for the first data block to be beyond the end | 2583 | * It makes no sense for the first data block to be beyond the end |
| 2522 | * of the filesystem. | 2584 | * of the filesystem. |
| 2523 | */ | 2585 | */ |
| 2524 | if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { | 2586 | if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { |
| 2525 | printk(KERN_WARNING "EXT4-fs: bad geometry: first data " | 2587 | ext4_msg(sb, KERN_WARNING, "bad geometry: first data " |

| 2526 | "block %u is beyond end of filesystem (%llu)\n", | 2588 | "block %u is beyond end of filesystem (%llu)", |
| 2527 | le32_to_cpu(es->s_first_data_block), | 2589 | le32_to_cpu(es->s_first_data_block), |
| 2528 | ext4_blocks_count(es)); | 2590 | ext4_blocks_count(es)); |
| 2529 | goto failed_mount; | 2591 | goto failed_mount; |
| 2530 | } | 2592 | } |
| 2531 | blocks_count = (ext4_blocks_count(es) - | 2593 | blocks_count = (ext4_blocks_count(es) - |
| @@ -2533,9 +2595,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2533 | EXT4_BLOCKS_PER_GROUP(sb) - 1); | 2595 | EXT4_BLOCKS_PER_GROUP(sb) - 1); |
| 2534 | do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); | 2596 | do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); |
| 2535 | if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { | 2597 | if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { |
| 2536 | printk(KERN_WARNING "EXT4-fs: groups count too large: %u " | 2598 | ext4_msg(sb, KERN_WARNING, "groups count too large: %u " |
| 2537 | "(block count %llu, first data block %u, " | 2599 | "(block count %llu, first data block %u, " |
| 2538 | "blocks per group %lu)\n", sbi->s_groups_count, | 2600 | "blocks per group %lu)", sbi->s_groups_count, |
| 2539 | ext4_blocks_count(es), | 2601 | ext4_blocks_count(es), |
| 2540 | le32_to_cpu(es->s_first_data_block), | 2602 | le32_to_cpu(es->s_first_data_block), |
| 2541 | EXT4_BLOCKS_PER_GROUP(sb)); | 2603 | EXT4_BLOCKS_PER_GROUP(sb)); |
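The groups-count computation above is a ceiling divide of the usable blocks by the blocks-per-group figure, followed by a cap of just under 2^32 groups. A sketch of the divide with invented numbers:

```c
/* Sketch of the block-group count arithmetic; the sizes are made up. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t total_blocks     = 26214400;  /* 100 GiB of 4 KiB blocks */
	uint32_t first_data_block = 0;         /* 0 when blocksize > 1 KiB */
	uint64_t blocks_per_group = 32768;     /* 8 bits per byte * 4096   */

	uint64_t groups = (total_blocks - first_data_block +
			   blocks_per_group - 1) / blocks_per_group;

	printf("%llu blocks -> %llu block groups\n",
	       (unsigned long long)total_blocks,
	       (unsigned long long)groups);
	return 0;
}
```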
| @@ -2547,7 +2609,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2547 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), | 2609 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), |
| 2548 | GFP_KERNEL); | 2610 | GFP_KERNEL); |
| 2549 | if (sbi->s_group_desc == NULL) { | 2611 | if (sbi->s_group_desc == NULL) { |
| 2550 | printk(KERN_ERR "EXT4-fs: not enough memory\n"); | 2612 | ext4_msg(sb, KERN_ERR, "not enough memory"); |
| 2551 | goto failed_mount; | 2613 | goto failed_mount; |
| 2552 | } | 2614 | } |
| 2553 | 2615 | ||
| @@ -2562,21 +2624,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2562 | block = descriptor_loc(sb, logical_sb_block, i); | 2624 | block = descriptor_loc(sb, logical_sb_block, i); |
| 2563 | sbi->s_group_desc[i] = sb_bread(sb, block); | 2625 | sbi->s_group_desc[i] = sb_bread(sb, block); |
| 2564 | if (!sbi->s_group_desc[i]) { | 2626 | if (!sbi->s_group_desc[i]) { |
| 2565 | printk(KERN_ERR "EXT4-fs: " | 2627 | ext4_msg(sb, KERN_ERR, |
| 2566 | "can't read group descriptor %d\n", i); | 2628 | "can't read group descriptor %d", i); |
| 2567 | db_count = i; | 2629 | db_count = i; |
| 2568 | goto failed_mount2; | 2630 | goto failed_mount2; |
| 2569 | } | 2631 | } |
| 2570 | } | 2632 | } |
| 2571 | if (!ext4_check_descriptors(sb)) { | 2633 | if (!ext4_check_descriptors(sb)) { |
| 2572 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); | 2634 | ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); |
| 2573 | goto failed_mount2; | 2635 | goto failed_mount2; |
| 2574 | } | 2636 | } |
| 2575 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) | 2637 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) |
| 2576 | if (!ext4_fill_flex_info(sb)) { | 2638 | if (!ext4_fill_flex_info(sb)) { |
| 2577 | printk(KERN_ERR | 2639 | ext4_msg(sb, KERN_ERR, |
| 2578 | "EXT4-fs: unable to initialize " | 2640 | "unable to initialize " |
| 2579 | "flex_bg meta info!\n"); | 2641 | "flex_bg meta info!"); |
| 2580 | goto failed_mount2; | 2642 | goto failed_mount2; |
| 2581 | } | 2643 | } |
| 2582 | 2644 | ||
| @@ -2598,7 +2660,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2598 | err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); | 2660 | err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); |
| 2599 | } | 2661 | } |
| 2600 | if (err) { | 2662 | if (err) { |
| 2601 | printk(KERN_ERR "EXT4-fs: insufficient memory\n"); | 2663 | ext4_msg(sb, KERN_ERR, "insufficient memory"); |
| 2602 | goto failed_mount3; | 2664 | goto failed_mount3; |
| 2603 | } | 2665 | } |
| 2604 | 2666 | ||
| @@ -2607,7 +2669,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2607 | /* | 2669 | /* |
| 2608 | * set up enough so that it can read an inode | 2670 | * set up enough so that it can read an inode |
| 2609 | */ | 2671 | */ |
| 2610 | sb->s_op = &ext4_sops; | 2672 | if (!test_opt(sb, NOLOAD) && |
| 2673 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) | ||
| 2674 | sb->s_op = &ext4_sops; | ||
| 2675 | else | ||
| 2676 | sb->s_op = &ext4_nojournal_sops; | ||
| 2611 | sb->s_export_op = &ext4_export_ops; | 2677 | sb->s_export_op = &ext4_export_ops; |
| 2612 | sb->s_xattr = ext4_xattr_handlers; | 2678 | sb->s_xattr = ext4_xattr_handlers; |
| 2613 | #ifdef CONFIG_QUOTA | 2679 | #ifdef CONFIG_QUOTA |
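The s_op selection added above chooses between two superblock operation tables at mount time rather than testing for a journal on every call. A toy sketch of that function-pointer-table pattern (names are hypothetical, not the ext4 symbols):

```c
/* Sketch: select an ops table once, at setup time, based on a feature. */
#include <stdio.h>
#include <stdbool.h>

struct sb_ops {
	const char *name;
	void (*sync)(void);
};

static void sync_via_journal(void) { printf("commit the journal\n"); }
static void sync_direct(void)      { printf("write superblock directly\n"); }

static const struct sb_ops journal_ops   = { "journal",    sync_via_journal };
static const struct sb_ops nojournal_ops = { "no-journal", sync_direct };

int main(void)
{
	bool has_journal = false;   /* e.g. mounted with -o noload */
	const struct sb_ops *ops = has_journal ? &journal_ops : &nojournal_ops;

	printf("selected %s ops\n", ops->name);
	ops->sync();
	return 0;
}
```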
| @@ -2615,6 +2681,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2615 | sb->dq_op = &ext4_quota_operations; | 2681 | sb->dq_op = &ext4_quota_operations; |
| 2616 | #endif | 2682 | #endif |
| 2617 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ | 2683 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ |
| 2684 | mutex_init(&sbi->s_orphan_lock); | ||
| 2685 | mutex_init(&sbi->s_resize_lock); | ||
| 2618 | 2686 | ||
| 2619 | sb->s_root = NULL; | 2687 | sb->s_root = NULL; |
| 2620 | 2688 | ||
| @@ -2632,13 +2700,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2632 | goto failed_mount3; | 2700 | goto failed_mount3; |
| 2633 | if (!(sb->s_flags & MS_RDONLY) && | 2701 | if (!(sb->s_flags & MS_RDONLY) && |
| 2634 | EXT4_SB(sb)->s_journal->j_failed_commit) { | 2702 | EXT4_SB(sb)->s_journal->j_failed_commit) { |
| 2635 | printk(KERN_CRIT "EXT4-fs error (device %s): " | 2703 | ext4_msg(sb, KERN_CRIT, "error: " |
| 2636 | "ext4_fill_super: Journal transaction " | 2704 | "ext4_fill_super: Journal transaction " |
| 2637 | "%u is corrupt\n", sb->s_id, | 2705 | "%u is corrupt", |
| 2638 | EXT4_SB(sb)->s_journal->j_failed_commit); | 2706 | EXT4_SB(sb)->s_journal->j_failed_commit); |
| 2639 | if (test_opt(sb, ERRORS_RO)) { | 2707 | if (test_opt(sb, ERRORS_RO)) { |
| 2640 | printk(KERN_CRIT | 2708 | ext4_msg(sb, KERN_CRIT, |
| 2641 | "Mounting filesystem read-only\n"); | 2709 | "Mounting filesystem read-only"); |
| 2642 | sb->s_flags |= MS_RDONLY; | 2710 | sb->s_flags |= MS_RDONLY; |
| 2643 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 2711 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
| 2644 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 2712 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
| @@ -2646,14 +2714,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2646 | if (test_opt(sb, ERRORS_PANIC)) { | 2714 | if (test_opt(sb, ERRORS_PANIC)) { |
| 2647 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 2715 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
| 2648 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 2716 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
| 2649 | ext4_commit_super(sb, es, 1); | 2717 | ext4_commit_super(sb, 1); |
| 2650 | goto failed_mount4; | 2718 | goto failed_mount4; |
| 2651 | } | 2719 | } |
| 2652 | } | 2720 | } |
| 2653 | } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && | 2721 | } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && |
| 2654 | EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { | 2722 | EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { |
| 2655 | printk(KERN_ERR "EXT4-fs: required journal recovery " | 2723 | ext4_msg(sb, KERN_ERR, "required journal recovery " |
| 2656 | "suppressed and not mounted read-only\n"); | 2724 | "suppressed and not mounted read-only"); |
| 2657 | goto failed_mount4; | 2725 | goto failed_mount4; |
| 2658 | } else { | 2726 | } else { |
| 2659 | clear_opt(sbi->s_mount_opt, DATA_FLAGS); | 2727 | clear_opt(sbi->s_mount_opt, DATA_FLAGS); |
| @@ -2666,7 +2734,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2666 | if (ext4_blocks_count(es) > 0xffffffffULL && | 2734 | if (ext4_blocks_count(es) > 0xffffffffULL && |
| 2667 | !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, | 2735 | !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, |
| 2668 | JBD2_FEATURE_INCOMPAT_64BIT)) { | 2736 | JBD2_FEATURE_INCOMPAT_64BIT)) { |
| 2669 | printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); | 2737 | ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); |
| 2670 | goto failed_mount4; | 2738 | goto failed_mount4; |
| 2671 | } | 2739 | } |
| 2672 | 2740 | ||
| @@ -2704,8 +2772,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 2704 | case EXT4_MOUNT_WRITEBACK_DATA: | 2772 | case EXT4_MOUNT_WRITEBACK_DATA: |
| 2705 | if (!jbd2_journal_check_available_features | 2773 | if (!jbd2_journal_check_available_features |
| 2706 | (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { | 2774 | (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { |
| 2707 | printk(KERN_ERR "EXT4-fs: Journal does not support " | 2775 | ext4_msg(sb, KERN_ERR, "Journal does not support " |
| 2708 | "requested data journaling mode\n"); | 2776 | "requested data journaling mode"); |
| 2709 | goto failed_mount4; | 2777 | goto failed_mount4; |
| 2710 | } | 2778 | } |
| 2711 | default: | 2779 | default: |
| @@ -2717,8 +2785,8 @@ no_journal: | |||
| 2717 | 2785 | ||
| 2718 | if (test_opt(sb, NOBH)) { | 2786 | if (test_opt(sb, NOBH)) { |
| 2719 | if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { | 2787 | if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { |
| 2720 | printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " | 2788 | ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " |
| 2721 | "it's supported only with writeback mode\n"); | 2789 | "it's supported only with writeback mode"); |
| 2722 | clear_opt(sbi->s_mount_opt, NOBH); | 2790 | clear_opt(sbi->s_mount_opt, NOBH); |
| 2723 | } | 2791 | } |
| 2724 | } | 2792 | } |
| @@ -2729,18 +2797,18 @@ no_journal: | |||
| 2729 | 2797 | ||
| 2730 | root = ext4_iget(sb, EXT4_ROOT_INO); | 2798 | root = ext4_iget(sb, EXT4_ROOT_INO); |
| 2731 | if (IS_ERR(root)) { | 2799 | if (IS_ERR(root)) { |
| 2732 | printk(KERN_ERR "EXT4-fs: get root inode failed\n"); | 2800 | ext4_msg(sb, KERN_ERR, "get root inode failed"); |
| 2733 | ret = PTR_ERR(root); | 2801 | ret = PTR_ERR(root); |
| 2734 | goto failed_mount4; | 2802 | goto failed_mount4; |
| 2735 | } | 2803 | } |
| 2736 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { | 2804 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { |
| 2737 | iput(root); | 2805 | iput(root); |
| 2738 | printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); | 2806 | ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); |
| 2739 | goto failed_mount4; | 2807 | goto failed_mount4; |
| 2740 | } | 2808 | } |
| 2741 | sb->s_root = d_alloc_root(root); | 2809 | sb->s_root = d_alloc_root(root); |
| 2742 | if (!sb->s_root) { | 2810 | if (!sb->s_root) { |
| 2743 | printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); | 2811 | ext4_msg(sb, KERN_ERR, "get root dentry failed"); |
| 2744 | iput(root); | 2812 | iput(root); |
| 2745 | ret = -ENOMEM; | 2813 | ret = -ENOMEM; |
| 2746 | goto failed_mount4; | 2814 | goto failed_mount4; |
| @@ -2769,22 +2837,29 @@ no_journal: | |||
| 2769 | sbi->s_inode_size) { | 2837 | sbi->s_inode_size) { |
| 2770 | sbi->s_want_extra_isize = sizeof(struct ext4_inode) - | 2838 | sbi->s_want_extra_isize = sizeof(struct ext4_inode) - |
| 2771 | EXT4_GOOD_OLD_INODE_SIZE; | 2839 | EXT4_GOOD_OLD_INODE_SIZE; |
| 2772 | printk(KERN_INFO "EXT4-fs: required extra inode space not " | 2840 | ext4_msg(sb, KERN_INFO, "required extra inode space not " |
| 2773 | "available.\n"); | 2841 | "available"); |
| 2774 | } | 2842 | } |
| 2775 | 2843 | ||
| 2776 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | 2844 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { |
| 2777 | printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " | 2845 | ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " |
| 2778 | "requested data journaling mode\n"); | 2846 | "requested data journaling mode"); |
| 2779 | clear_opt(sbi->s_mount_opt, DELALLOC); | 2847 | clear_opt(sbi->s_mount_opt, DELALLOC); |
| 2780 | } else if (test_opt(sb, DELALLOC)) | 2848 | } else if (test_opt(sb, DELALLOC)) |
| 2781 | printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); | 2849 | ext4_msg(sb, KERN_INFO, "delayed allocation enabled"); |
| 2850 | |||
| 2851 | err = ext4_setup_system_zone(sb); | ||
| 2852 | if (err) { | ||
| 2853 | ext4_msg(sb, KERN_ERR, "failed to initialize system " | ||
| 2854 | "zone (%d)\n", err); | ||
| 2855 | goto failed_mount4; | ||
| 2856 | } | ||
| 2782 | 2857 | ||
| 2783 | ext4_ext_init(sb); | 2858 | ext4_ext_init(sb); |
| 2784 | err = ext4_mb_init(sb, needs_recovery); | 2859 | err = ext4_mb_init(sb, needs_recovery); |
| 2785 | if (err) { | 2860 | if (err) { |
| 2786 | printk(KERN_ERR "EXT4-fs: failed to initialize mballoc (%d)\n", | 2861 | ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", |
| 2787 | err); | 2862 | err); |
| 2788 | goto failed_mount4; | 2863 | goto failed_mount4; |
| 2789 | } | 2864 | } |
| 2790 | 2865 | ||
| @@ -2798,19 +2873,11 @@ no_journal: | |||
| 2798 | goto failed_mount4; | 2873 | goto failed_mount4; |
| 2799 | }; | 2874 | }; |
| 2800 | 2875 | ||
| 2801 | /* | ||
| 2802 | * akpm: core read_super() calls in here with the superblock locked. | ||
| 2803 | * That deadlocks, because orphan cleanup needs to lock the superblock | ||
| 2804 | * in numerous places. Here we just pop the lock - it's relatively | ||
| 2805 | * harmless, because we are now ready to accept write_super() requests, | ||
| 2806 | * and aviro says that's the only reason for hanging onto the | ||
| 2807 | * superblock lock. | ||
| 2808 | */ | ||
| 2809 | EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; | 2876 | EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; |
| 2810 | ext4_orphan_cleanup(sb, es); | 2877 | ext4_orphan_cleanup(sb, es); |
| 2811 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; | 2878 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; |
| 2812 | if (needs_recovery) { | 2879 | if (needs_recovery) { |
| 2813 | printk(KERN_INFO "EXT4-fs: recovery complete.\n"); | 2880 | ext4_msg(sb, KERN_INFO, "recovery complete"); |
| 2814 | ext4_mark_recovery_complete(sb, es); | 2881 | ext4_mark_recovery_complete(sb, es); |
| 2815 | } | 2882 | } |
| 2816 | if (EXT4_SB(sb)->s_journal) { | 2883 | if (EXT4_SB(sb)->s_journal) { |
| @@ -2823,25 +2890,30 @@ no_journal: | |||
| 2823 | } else | 2890 | } else |
| 2824 | descr = "out journal"; | 2891 | descr = "out journal"; |
| 2825 | 2892 | ||
| 2826 | printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", | 2893 | ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); |
| 2827 | sb->s_id, descr); | ||
| 2828 | 2894 | ||
| 2829 | lock_kernel(); | 2895 | lock_kernel(); |
| 2830 | return 0; | 2896 | return 0; |
| 2831 | 2897 | ||
| 2832 | cantfind_ext4: | 2898 | cantfind_ext4: |
| 2833 | if (!silent) | 2899 | if (!silent) |
| 2834 | printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", | 2900 | ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); |
| 2835 | sb->s_id); | ||
| 2836 | goto failed_mount; | 2901 | goto failed_mount; |
| 2837 | 2902 | ||
| 2838 | failed_mount4: | 2903 | failed_mount4: |
| 2839 | printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); | 2904 | ext4_msg(sb, KERN_ERR, "mount failed"); |
| 2905 | ext4_release_system_zone(sb); | ||
| 2840 | if (sbi->s_journal) { | 2906 | if (sbi->s_journal) { |
| 2841 | jbd2_journal_destroy(sbi->s_journal); | 2907 | jbd2_journal_destroy(sbi->s_journal); |
| 2842 | sbi->s_journal = NULL; | 2908 | sbi->s_journal = NULL; |
| 2843 | } | 2909 | } |
| 2844 | failed_mount3: | 2910 | failed_mount3: |
| 2911 | if (sbi->s_flex_groups) { | ||
| 2912 | if (is_vmalloc_addr(sbi->s_flex_groups)) | ||
| 2913 | vfree(sbi->s_flex_groups); | ||
| 2914 | else | ||
| 2915 | kfree(sbi->s_flex_groups); | ||
| 2916 | } | ||
| 2845 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 2917 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
| 2846 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 2918 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
| 2847 | percpu_counter_destroy(&sbi->s_dirs_counter); | 2919 | percpu_counter_destroy(&sbi->s_dirs_counter); |
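The new failed_mount3 cleanup frees s_flex_groups with either vfree() or kfree() depending on which allocator produced it. A userspace analogue of that allocation-aware free (the tag field stands in for is_vmalloc_addr(); the threshold is arbitrary):

```c
/* Userspace analogue: the free path must know which allocator was used. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct flex_array {
	int from_big_allocator;   /* stands in for is_vmalloc_addr() */
	size_t nbytes;
	unsigned char data[];
};

static struct flex_array *alloc_flex(size_t nbytes)
{
	struct flex_array *fa = malloc(sizeof(*fa) + nbytes);

	if (!fa)
		return NULL;
	fa->from_big_allocator = nbytes > 4096;  /* arbitrary cut-over */
	fa->nbytes = nbytes;
	memset(fa->data, 0, nbytes);
	return fa;
}

static void free_flex(struct flex_array *fa)
{
	if (!fa)
		return;
	/* In the kernel this is the vfree()-vs-kfree() decision. */
	printf("freeing %zu bytes via %s allocator\n", fa->nbytes,
	       fa->from_big_allocator ? "big" : "small");
	free(fa);
}

int main(void)
{
	free_flex(alloc_flex(512));
	free_flex(alloc_flex(1 << 20));
	return 0;
}
```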
| @@ -2862,6 +2934,7 @@ failed_mount: | |||
| 2862 | brelse(bh); | 2934 | brelse(bh); |
| 2863 | out_fail: | 2935 | out_fail: |
| 2864 | sb->s_fs_info = NULL; | 2936 | sb->s_fs_info = NULL; |
| 2937 | kfree(sbi->s_blockgroup_lock); | ||
| 2865 | kfree(sbi); | 2938 | kfree(sbi); |
| 2866 | lock_kernel(); | 2939 | lock_kernel(); |
| 2867 | return ret; | 2940 | return ret; |
| @@ -2906,27 +2979,27 @@ static journal_t *ext4_get_journal(struct super_block *sb, | |||
| 2906 | 2979 | ||
| 2907 | journal_inode = ext4_iget(sb, journal_inum); | 2980 | journal_inode = ext4_iget(sb, journal_inum); |
| 2908 | if (IS_ERR(journal_inode)) { | 2981 | if (IS_ERR(journal_inode)) { |
| 2909 | printk(KERN_ERR "EXT4-fs: no journal found.\n"); | 2982 | ext4_msg(sb, KERN_ERR, "no journal found"); |
| 2910 | return NULL; | 2983 | return NULL; |
| 2911 | } | 2984 | } |
| 2912 | if (!journal_inode->i_nlink) { | 2985 | if (!journal_inode->i_nlink) { |
| 2913 | make_bad_inode(journal_inode); | 2986 | make_bad_inode(journal_inode); |
| 2914 | iput(journal_inode); | 2987 | iput(journal_inode); |
| 2915 | printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); | 2988 | ext4_msg(sb, KERN_ERR, "journal inode is deleted"); |
| 2916 | return NULL; | 2989 | return NULL; |
| 2917 | } | 2990 | } |
| 2918 | 2991 | ||
| 2919 | jbd_debug(2, "Journal inode found at %p: %lld bytes\n", | 2992 | jbd_debug(2, "Journal inode found at %p: %lld bytes\n", |
| 2920 | journal_inode, journal_inode->i_size); | 2993 | journal_inode, journal_inode->i_size); |
| 2921 | if (!S_ISREG(journal_inode->i_mode)) { | 2994 | if (!S_ISREG(journal_inode->i_mode)) { |
| 2922 | printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); | 2995 | ext4_msg(sb, KERN_ERR, "invalid journal inode"); |
| 2923 | iput(journal_inode); | 2996 | iput(journal_inode); |
| 2924 | return NULL; | 2997 | return NULL; |
| 2925 | } | 2998 | } |
| 2926 | 2999 | ||
| 2927 | journal = jbd2_journal_init_inode(journal_inode); | 3000 | journal = jbd2_journal_init_inode(journal_inode); |
| 2928 | if (!journal) { | 3001 | if (!journal) { |
| 2929 | printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); | 3002 | ext4_msg(sb, KERN_ERR, "Could not load journal inode"); |
| 2930 | iput(journal_inode); | 3003 | iput(journal_inode); |
| 2931 | return NULL; | 3004 | return NULL; |
| 2932 | } | 3005 | } |
| @@ -2950,22 +3023,22 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
| 2950 | 3023 | ||
| 2951 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | 3024 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); |
| 2952 | 3025 | ||
| 2953 | bdev = ext4_blkdev_get(j_dev); | 3026 | bdev = ext4_blkdev_get(j_dev, sb); |
| 2954 | if (bdev == NULL) | 3027 | if (bdev == NULL) |
| 2955 | return NULL; | 3028 | return NULL; |
| 2956 | 3029 | ||
| 2957 | if (bd_claim(bdev, sb)) { | 3030 | if (bd_claim(bdev, sb)) { |
| 2958 | printk(KERN_ERR | 3031 | ext4_msg(sb, KERN_ERR, |
| 2959 | "EXT4-fs: failed to claim external journal device.\n"); | 3032 | "failed to claim external journal device"); |
| 2960 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); | 3033 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); |
| 2961 | return NULL; | 3034 | return NULL; |
| 2962 | } | 3035 | } |
| 2963 | 3036 | ||
| 2964 | blocksize = sb->s_blocksize; | 3037 | blocksize = sb->s_blocksize; |
| 2965 | hblock = bdev_hardsect_size(bdev); | 3038 | hblock = bdev_logical_block_size(bdev); |
| 2966 | if (blocksize < hblock) { | 3039 | if (blocksize < hblock) { |
| 2967 | printk(KERN_ERR | 3040 | ext4_msg(sb, KERN_ERR, |
| 2968 | "EXT4-fs: blocksize too small for journal device.\n"); | 3041 | "blocksize too small for journal device"); |
| 2969 | goto out_bdev; | 3042 | goto out_bdev; |
| 2970 | } | 3043 | } |
| 2971 | 3044 | ||
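The external-journal check above compares the filesystem block size with the journal device's logical block size (the helper formerly named bdev_hardsect_size()). A trivial sketch of the acceptance rule:

```c
/* Sketch: the fs block size must be at least the journal device's
 * logical block size, or journal blocks cannot be read in whole sectors. */
#include <stdio.h>

static int journal_blocksize_ok(unsigned int fs_blocksize,
				unsigned int dev_logical_block_size)
{
	return fs_blocksize >= dev_logical_block_size;
}

int main(void)
{
	printf("fs 4096 / dev  512: %s\n",
	       journal_blocksize_ok(4096, 512) ? "ok" : "rejected");
	printf("fs 1024 / dev 4096: %s\n",
	       journal_blocksize_ok(1024, 4096) ? "ok" : "rejected");
	return 0;
}
```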
| @@ -2973,8 +3046,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
| 2973 | offset = EXT4_MIN_BLOCK_SIZE % blocksize; | 3046 | offset = EXT4_MIN_BLOCK_SIZE % blocksize; |
| 2974 | set_blocksize(bdev, blocksize); | 3047 | set_blocksize(bdev, blocksize); |
| 2975 | if (!(bh = __bread(bdev, sb_block, blocksize))) { | 3048 | if (!(bh = __bread(bdev, sb_block, blocksize))) { |
| 2976 | printk(KERN_ERR "EXT4-fs: couldn't read superblock of " | 3049 | ext4_msg(sb, KERN_ERR, "couldn't read superblock of " |
| 2977 | "external journal\n"); | 3050 | "external journal"); |
| 2978 | goto out_bdev; | 3051 | goto out_bdev; |
| 2979 | } | 3052 | } |
| 2980 | 3053 | ||
| @@ -2982,14 +3055,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
| 2982 | if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || | 3055 | if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || |
| 2983 | !(le32_to_cpu(es->s_feature_incompat) & | 3056 | !(le32_to_cpu(es->s_feature_incompat) & |
| 2984 | EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { | 3057 | EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { |
| 2985 | printk(KERN_ERR "EXT4-fs: external journal has " | 3058 | ext4_msg(sb, KERN_ERR, "external journal has " |
| 2986 | "bad superblock\n"); | 3059 | "bad superblock"); |
| 2987 | brelse(bh); | 3060 | brelse(bh); |
| 2988 | goto out_bdev; | 3061 | goto out_bdev; |
| 2989 | } | 3062 | } |
| 2990 | 3063 | ||
| 2991 | if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { | 3064 | if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { |
| 2992 | printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); | 3065 | ext4_msg(sb, KERN_ERR, "journal UUID does not match"); |
| 2993 | brelse(bh); | 3066 | brelse(bh); |
| 2994 | goto out_bdev; | 3067 | goto out_bdev; |
| 2995 | } | 3068 | } |
| @@ -3001,25 +3074,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
| 3001 | journal = jbd2_journal_init_dev(bdev, sb->s_bdev, | 3074 | journal = jbd2_journal_init_dev(bdev, sb->s_bdev, |
| 3002 | start, len, blocksize); | 3075 | start, len, blocksize); |
| 3003 | if (!journal) { | 3076 | if (!journal) { |
| 3004 | printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); | 3077 | ext4_msg(sb, KERN_ERR, "failed to create device journal"); |
| 3005 | goto out_bdev; | 3078 | goto out_bdev; |
| 3006 | } | 3079 | } |
| 3007 | journal->j_private = sb; | 3080 | journal->j_private = sb; |
| 3008 | ll_rw_block(READ, 1, &journal->j_sb_buffer); | 3081 | ll_rw_block(READ, 1, &journal->j_sb_buffer); |
| 3009 | wait_on_buffer(journal->j_sb_buffer); | 3082 | wait_on_buffer(journal->j_sb_buffer); |
| 3010 | if (!buffer_uptodate(journal->j_sb_buffer)) { | 3083 | if (!buffer_uptodate(journal->j_sb_buffer)) { |
| 3011 | printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); | 3084 | ext4_msg(sb, KERN_ERR, "I/O error on journal device"); |
| 3012 | goto out_journal; | 3085 | goto out_journal; |
| 3013 | } | 3086 | } |
| 3014 | if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { | 3087 | if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { |
| 3015 | printk(KERN_ERR "EXT4-fs: External journal has more than one " | 3088 | ext4_msg(sb, KERN_ERR, "External journal has more than one " |
| 3016 | "user (unsupported) - %d\n", | 3089 | "user (unsupported) - %d", |
| 3017 | be32_to_cpu(journal->j_superblock->s_nr_users)); | 3090 | be32_to_cpu(journal->j_superblock->s_nr_users)); |
| 3018 | goto out_journal; | 3091 | goto out_journal; |
| 3019 | } | 3092 | } |
| 3020 | EXT4_SB(sb)->journal_bdev = bdev; | 3093 | EXT4_SB(sb)->journal_bdev = bdev; |
| 3021 | ext4_init_journal_params(sb, journal); | 3094 | ext4_init_journal_params(sb, journal); |
| 3022 | return journal; | 3095 | return journal; |
| 3096 | |||
| 3023 | out_journal: | 3097 | out_journal: |
| 3024 | jbd2_journal_destroy(journal); | 3098 | jbd2_journal_destroy(journal); |
| 3025 | out_bdev: | 3099 | out_bdev: |
| @@ -3041,8 +3115,8 @@ static int ext4_load_journal(struct super_block *sb, | |||
| 3041 | 3115 | ||
| 3042 | if (journal_devnum && | 3116 | if (journal_devnum && |
| 3043 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { | 3117 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { |
| 3044 | printk(KERN_INFO "EXT4-fs: external journal device major/minor " | 3118 | ext4_msg(sb, KERN_INFO, "external journal device major/minor " |
| 3045 | "numbers have changed\n"); | 3119 | "numbers have changed"); |
| 3046 | journal_dev = new_decode_dev(journal_devnum); | 3120 | journal_dev = new_decode_dev(journal_devnum); |
| 3047 | } else | 3121 | } else |
| 3048 | journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); | 3122 | journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); |
| @@ -3054,24 +3128,23 @@ static int ext4_load_journal(struct super_block *sb, | |||
| 3054 | * crash? For recovery, we need to check in advance whether we | 3128 | * crash? For recovery, we need to check in advance whether we |
| 3055 | * can get read-write access to the device. | 3129 | * can get read-write access to the device. |
| 3056 | */ | 3130 | */ |
| 3057 | |||
| 3058 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { | 3131 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { |
| 3059 | if (sb->s_flags & MS_RDONLY) { | 3132 | if (sb->s_flags & MS_RDONLY) { |
| 3060 | printk(KERN_INFO "EXT4-fs: INFO: recovery " | 3133 | ext4_msg(sb, KERN_INFO, "INFO: recovery " |
| 3061 | "required on readonly filesystem.\n"); | 3134 | "required on readonly filesystem"); |
| 3062 | if (really_read_only) { | 3135 | if (really_read_only) { |
| 3063 | printk(KERN_ERR "EXT4-fs: write access " | 3136 | ext4_msg(sb, KERN_ERR, "write access " |
| 3064 | "unavailable, cannot proceed.\n"); | 3137 | "unavailable, cannot proceed"); |
| 3065 | return -EROFS; | 3138 | return -EROFS; |
| 3066 | } | 3139 | } |
| 3067 | printk(KERN_INFO "EXT4-fs: write access will " | 3140 | ext4_msg(sb, KERN_INFO, "write access will " |
| 3068 | "be enabled during recovery.\n"); | 3141 | "be enabled during recovery"); |
| 3069 | } | 3142 | } |
| 3070 | } | 3143 | } |
| 3071 | 3144 | ||
| 3072 | if (journal_inum && journal_dev) { | 3145 | if (journal_inum && journal_dev) { |
| 3073 | printk(KERN_ERR "EXT4-fs: filesystem has both journal " | 3146 | ext4_msg(sb, KERN_ERR, "filesystem has both journal " |
| 3074 | "and inode journals!\n"); | 3147 | "and inode journals!"); |
| 3075 | return -EINVAL; | 3148 | return -EINVAL; |
| 3076 | } | 3149 | } |
| 3077 | 3150 | ||
| @@ -3084,14 +3157,14 @@ static int ext4_load_journal(struct super_block *sb, | |||
| 3084 | } | 3157 | } |
| 3085 | 3158 | ||
| 3086 | if (journal->j_flags & JBD2_BARRIER) | 3159 | if (journal->j_flags & JBD2_BARRIER) |
| 3087 | printk(KERN_INFO "EXT4-fs: barriers enabled\n"); | 3160 | ext4_msg(sb, KERN_INFO, "barriers enabled"); |
| 3088 | else | 3161 | else |
| 3089 | printk(KERN_INFO "EXT4-fs: barriers disabled\n"); | 3162 | ext4_msg(sb, KERN_INFO, "barriers disabled"); |
| 3090 | 3163 | ||
| 3091 | if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { | 3164 | if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { |
| 3092 | err = jbd2_journal_update_format(journal); | 3165 | err = jbd2_journal_update_format(journal); |
| 3093 | if (err) { | 3166 | if (err) { |
| 3094 | printk(KERN_ERR "EXT4-fs: error updating journal.\n"); | 3167 | ext4_msg(sb, KERN_ERR, "error updating journal"); |
| 3095 | jbd2_journal_destroy(journal); | 3168 | jbd2_journal_destroy(journal); |
| 3096 | return err; | 3169 | return err; |
| 3097 | } | 3170 | } |
| @@ -3103,7 +3176,7 @@ static int ext4_load_journal(struct super_block *sb, | |||
| 3103 | err = jbd2_journal_load(journal); | 3176 | err = jbd2_journal_load(journal); |
| 3104 | 3177 | ||
| 3105 | if (err) { | 3178 | if (err) { |
| 3106 | printk(KERN_ERR "EXT4-fs: error loading journal.\n"); | 3179 | ext4_msg(sb, KERN_ERR, "error loading journal"); |
| 3107 | jbd2_journal_destroy(journal); | 3180 | jbd2_journal_destroy(journal); |
| 3108 | return err; | 3181 | return err; |
| 3109 | } | 3182 | } |
| @@ -3114,18 +3187,17 @@ static int ext4_load_journal(struct super_block *sb, | |||
| 3114 | if (journal_devnum && | 3187 | if (journal_devnum && |
| 3115 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { | 3188 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { |
| 3116 | es->s_journal_dev = cpu_to_le32(journal_devnum); | 3189 | es->s_journal_dev = cpu_to_le32(journal_devnum); |
| 3117 | sb->s_dirt = 1; | ||
| 3118 | 3190 | ||
| 3119 | /* Make sure we flush the recovery flag to disk. */ | 3191 | /* Make sure we flush the recovery flag to disk. */ |
| 3120 | ext4_commit_super(sb, es, 1); | 3192 | ext4_commit_super(sb, 1); |
| 3121 | } | 3193 | } |
| 3122 | 3194 | ||
| 3123 | return 0; | 3195 | return 0; |
| 3124 | } | 3196 | } |
| 3125 | 3197 | ||
| 3126 | static int ext4_commit_super(struct super_block *sb, | 3198 | static int ext4_commit_super(struct super_block *sb, int sync) |
| 3127 | struct ext4_super_block *es, int sync) | ||
| 3128 | { | 3199 | { |
| 3200 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | ||
| 3129 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; | 3201 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; |
| 3130 | int error = 0; | 3202 | int error = 0; |
| 3131 | 3203 | ||
| @@ -3140,8 +3212,8 @@ static int ext4_commit_super(struct super_block *sb, | |||
| 3140 | * be remapped. Nothing we can do but to retry the | 3212 | * be remapped. Nothing we can do but to retry the |
| 3141 | * write and hope for the best. | 3213 | * write and hope for the best. |
| 3142 | */ | 3214 | */ |
| 3143 | printk(KERN_ERR "EXT4-fs: previous I/O error to " | 3215 | ext4_msg(sb, KERN_ERR, "previous I/O error to " |
| 3144 | "superblock detected for %s.\n", sb->s_id); | 3216 | "superblock detected"); |
| 3145 | clear_buffer_write_io_error(sbh); | 3217 | clear_buffer_write_io_error(sbh); |
| 3146 | set_buffer_uptodate(sbh); | 3218 | set_buffer_uptodate(sbh); |
| 3147 | } | 3219 | } |
| @@ -3154,7 +3226,7 @@ static int ext4_commit_super(struct super_block *sb, | |||
| 3154 | &EXT4_SB(sb)->s_freeblocks_counter)); | 3226 | &EXT4_SB(sb)->s_freeblocks_counter)); |
| 3155 | es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( | 3227 | es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( |
| 3156 | &EXT4_SB(sb)->s_freeinodes_counter)); | 3228 | &EXT4_SB(sb)->s_freeinodes_counter)); |
| 3157 | 3229 | sb->s_dirt = 0; | |
| 3158 | BUFFER_TRACE(sbh, "marking dirty"); | 3230 | BUFFER_TRACE(sbh, "marking dirty"); |
| 3159 | mark_buffer_dirty(sbh); | 3231 | mark_buffer_dirty(sbh); |
| 3160 | if (sync) { | 3232 | if (sync) { |
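Before the superblock is written, the free-block and free-inode counts are folded up from per-CPU counters and clamped at zero. A sketch of that fold, with a plain array standing in for the per-CPU structure:

```c
/* Sketch of percpu_counter_sum_positive()-style accounting. */
#include <stdio.h>
#include <stdint.h>

static int64_t percpu_sum_positive(const int64_t *deltas, int ncpus)
{
	int64_t sum = 0;

	for (int i = 0; i < ncpus; i++)
		sum += deltas[i];
	return sum > 0 ? sum : 0;   /* never write back a negative count */
}

int main(void)
{
	int64_t free_blocks[4] = { 1000, -250, 300, -75 };

	printf("free blocks to write back: %lld\n",
	       (long long)percpu_sum_positive(free_blocks, 4));
	return 0;
}
```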
| @@ -3164,8 +3236,8 @@ static int ext4_commit_super(struct super_block *sb, | |||
| 3164 | 3236 | ||
| 3165 | error = buffer_write_io_error(sbh); | 3237 | error = buffer_write_io_error(sbh); |
| 3166 | if (error) { | 3238 | if (error) { |
| 3167 | printk(KERN_ERR "EXT4-fs: I/O error while writing " | 3239 | ext4_msg(sb, KERN_ERR, "I/O error while writing " |
| 3168 | "superblock for %s.\n", sb->s_id); | 3240 | "superblock"); |
| 3169 | clear_buffer_write_io_error(sbh); | 3241 | clear_buffer_write_io_error(sbh); |
| 3170 | set_buffer_uptodate(sbh); | 3242 | set_buffer_uptodate(sbh); |
| 3171 | } | 3243 | } |
| @@ -3173,7 +3245,6 @@ static int ext4_commit_super(struct super_block *sb, | |||
| 3173 | return error; | 3245 | return error; |
| 3174 | } | 3246 | } |
| 3175 | 3247 | ||
| 3176 | |||
| 3177 | /* | 3248 | /* |
| 3178 | * Have we just finished recovery? If so, and if we are mounting (or | 3249 | * Have we just finished recovery? If so, and if we are mounting (or |
| 3179 | * remounting) the filesystem readonly, then we will end up with a | 3250 | * remounting) the filesystem readonly, then we will end up with a |
| @@ -3192,14 +3263,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb, | |||
| 3192 | if (jbd2_journal_flush(journal) < 0) | 3263 | if (jbd2_journal_flush(journal) < 0) |
| 3193 | goto out; | 3264 | goto out; |
| 3194 | 3265 | ||
| 3195 | lock_super(sb); | ||
| 3196 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && | 3266 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && |
| 3197 | sb->s_flags & MS_RDONLY) { | 3267 | sb->s_flags & MS_RDONLY) { |
| 3198 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 3268 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
| 3199 | sb->s_dirt = 0; | 3269 | ext4_commit_super(sb, 1); |
| 3200 | ext4_commit_super(sb, es, 1); | ||
| 3201 | } | 3270 | } |
| 3202 | unlock_super(sb); | ||
| 3203 | 3271 | ||
| 3204 | out: | 3272 | out: |
| 3205 | jbd2_journal_unlock_updates(journal); | 3273 | jbd2_journal_unlock_updates(journal); |
| @@ -3238,7 +3306,7 @@ static void ext4_clear_journal_err(struct super_block *sb, | |||
| 3238 | 3306 | ||
| 3239 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 3307 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
| 3240 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 3308 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
| 3241 | ext4_commit_super(sb, es, 1); | 3309 | ext4_commit_super(sb, 1); |
| 3242 | 3310 | ||
| 3243 | jbd2_journal_clear_err(journal); | 3311 | jbd2_journal_clear_err(journal); |
| 3244 | } | 3312 | } |
| @@ -3257,29 +3325,15 @@ int ext4_force_commit(struct super_block *sb) | |||
| 3257 | return 0; | 3325 | return 0; |
| 3258 | 3326 | ||
| 3259 | journal = EXT4_SB(sb)->s_journal; | 3327 | journal = EXT4_SB(sb)->s_journal; |
| 3260 | if (journal) { | 3328 | if (journal) |
| 3261 | sb->s_dirt = 0; | ||
| 3262 | ret = ext4_journal_force_commit(journal); | 3329 | ret = ext4_journal_force_commit(journal); |
| 3263 | } | ||
| 3264 | 3330 | ||
| 3265 | return ret; | 3331 | return ret; |
| 3266 | } | 3332 | } |
| 3267 | 3333 | ||
| 3268 | /* | ||
| 3269 | * Ext4 always journals updates to the superblock itself, so we don't | ||
| 3270 | * have to propagate any other updates to the superblock on disk at this | ||
| 3271 | * point. (We can probably nuke this function altogether, and remove | ||
| 3272 | * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) | ||
| 3273 | */ | ||
| 3274 | static void ext4_write_super(struct super_block *sb) | 3334 | static void ext4_write_super(struct super_block *sb) |
| 3275 | { | 3335 | { |
| 3276 | if (EXT4_SB(sb)->s_journal) { | 3336 | ext4_commit_super(sb, 1); |
| 3277 | if (mutex_trylock(&sb->s_lock) != 0) | ||
| 3278 | BUG(); | ||
| 3279 | sb->s_dirt = 0; | ||
| 3280 | } else { | ||
| 3281 | ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); | ||
| 3282 | } | ||
| 3283 | } | 3337 | } |
| 3284 | 3338 | ||
| 3285 | static int ext4_sync_fs(struct super_block *sb, int wait) | 3339 | static int ext4_sync_fs(struct super_block *sb, int wait) |
| @@ -3288,16 +3342,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
| 3288 | tid_t target; | 3342 | tid_t target; |
| 3289 | 3343 | ||
| 3290 | trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); | 3344 | trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); |
| 3291 | sb->s_dirt = 0; | 3345 | if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { |
| 3292 | if (EXT4_SB(sb)->s_journal) { | 3346 | if (wait) |
| 3293 | if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, | 3347 | jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); |
| 3294 | &target)) { | ||
| 3295 | if (wait) | ||
| 3296 | jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, | ||
| 3297 | target); | ||
| 3298 | } | ||
| 3299 | } else { | ||
| 3300 | ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); | ||
| 3301 | } | 3348 | } |
| 3302 | return ret; | 3349 | return ret; |
| 3303 | } | 3350 | } |
| @@ -3310,34 +3357,32 @@ static int ext4_freeze(struct super_block *sb) | |||
| 3310 | { | 3357 | { |
| 3311 | int error = 0; | 3358 | int error = 0; |
| 3312 | journal_t *journal; | 3359 | journal_t *journal; |
| 3313 | sb->s_dirt = 0; | ||
| 3314 | 3360 | ||
| 3315 | if (!(sb->s_flags & MS_RDONLY)) { | 3361 | if (sb->s_flags & MS_RDONLY) |
| 3316 | journal = EXT4_SB(sb)->s_journal; | 3362 | return 0; |
| 3317 | 3363 | ||
| 3318 | if (journal) { | 3364 | journal = EXT4_SB(sb)->s_journal; |
| 3319 | /* Now we set up the journal barrier. */ | ||
| 3320 | jbd2_journal_lock_updates(journal); | ||
| 3321 | 3365 | ||
| 3322 | /* | 3366 | /* Now we set up the journal barrier. */ |
| 3323 | * We don't want to clear needs_recovery flag when we | 3367 | jbd2_journal_lock_updates(journal); |
| 3324 | * failed to flush the journal. | ||
| 3325 | */ | ||
| 3326 | error = jbd2_journal_flush(journal); | ||
| 3327 | if (error < 0) | ||
| 3328 | goto out; | ||
| 3329 | } | ||
| 3330 | 3368 | ||
| 3331 | /* Journal blocked and flushed, clear needs_recovery flag. */ | 3369 | /* |
| 3332 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 3370 | * Don't clear the needs_recovery flag if we failed to flush |
| 3333 | error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); | 3371 | * the journal. |
| 3334 | if (error) | 3372 | */ |
| 3335 | goto out; | 3373 | error = jbd2_journal_flush(journal); |
| 3374 | if (error < 0) { | ||
| 3375 | out: | ||
| 3376 | jbd2_journal_unlock_updates(journal); | ||
| 3377 | return error; | ||
| 3336 | } | 3378 | } |
| 3379 | |||
| 3380 | /* Journal blocked and flushed, clear needs_recovery flag. */ | ||
| 3381 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | ||
| 3382 | error = ext4_commit_super(sb, 1); | ||
| 3383 | if (error) | ||
| 3384 | goto out; | ||
| 3337 | return 0; | 3385 | return 0; |
| 3338 | out: | ||
| 3339 | jbd2_journal_unlock_updates(journal); | ||
| 3340 | return error; | ||
| 3341 | } | 3386 | } |
| 3342 | 3387 | ||
| 3343 | /* | 3388 | /* |
| @@ -3346,14 +3391,15 @@ out: | |||
| 3346 | */ | 3391 | */ |
| 3347 | static int ext4_unfreeze(struct super_block *sb) | 3392 | static int ext4_unfreeze(struct super_block *sb) |
| 3348 | { | 3393 | { |
| 3349 | if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { | 3394 | if (sb->s_flags & MS_RDONLY) |
| 3350 | lock_super(sb); | 3395 | return 0; |
| 3351 | /* Reser the needs_recovery flag before the fs is unlocked. */ | 3396 | |
| 3352 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 3397 | lock_super(sb); |
| 3353 | ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); | 3398 | /* Reset the needs_recovery flag before the fs is unlocked. */ |
| 3354 | unlock_super(sb); | 3399 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
| 3355 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | 3400 | ext4_commit_super(sb, 1); |
| 3356 | } | 3401 | unlock_super(sb); |
| 3402 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | ||
| 3357 | return 0; | 3403 | return 0; |
| 3358 | } | 3404 | } |
| 3359 | 3405 | ||
| @@ -3432,22 +3478,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 3432 | (sbi->s_mount_state & EXT4_VALID_FS)) | 3478 | (sbi->s_mount_state & EXT4_VALID_FS)) |
| 3433 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 3479 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
| 3434 | 3480 | ||
| 3435 | /* | 3481 | if (sbi->s_journal) |
| 3436 | * We have to unlock super so that we can wait for | ||
| 3437 | * transactions. | ||
| 3438 | */ | ||
| 3439 | if (sbi->s_journal) { | ||
| 3440 | unlock_super(sb); | ||
| 3441 | ext4_mark_recovery_complete(sb, es); | 3482 | ext4_mark_recovery_complete(sb, es); |
| 3442 | lock_super(sb); | ||
| 3443 | } | ||
| 3444 | } else { | 3483 | } else { |
| 3445 | int ret; | 3484 | int ret; |
| 3446 | if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3485 | if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
| 3447 | ~EXT4_FEATURE_RO_COMPAT_SUPP))) { | 3486 | ~EXT4_FEATURE_RO_COMPAT_SUPP))) { |
| 3448 | printk(KERN_WARNING "EXT4-fs: %s: couldn't " | 3487 | ext4_msg(sb, KERN_WARNING, "couldn't " |
| 3449 | "remount RDWR because of unsupported " | 3488 | "remount RDWR because of unsupported " |
| 3450 | "optional features (%x).\n", sb->s_id, | 3489 | "optional features (%x)", |
| 3451 | (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & | 3490 | (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & |
| 3452 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | 3491 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); |
| 3453 | err = -EROFS; | 3492 | err = -EROFS; |
| @@ -3456,17 +3495,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 3456 | 3495 | ||
| 3457 | /* | 3496 | /* |
| 3458 | * Make sure the group descriptor checksums | 3497 | * Make sure the group descriptor checksums |
| 3459 | * are sane. If they aren't, refuse to | 3498 | * are sane. If they aren't, refuse to remount r/w. |
| 3460 | * remount r/w. | ||
| 3461 | */ | 3499 | */ |
| 3462 | for (g = 0; g < sbi->s_groups_count; g++) { | 3500 | for (g = 0; g < sbi->s_groups_count; g++) { |
| 3463 | struct ext4_group_desc *gdp = | 3501 | struct ext4_group_desc *gdp = |
| 3464 | ext4_get_group_desc(sb, g, NULL); | 3502 | ext4_get_group_desc(sb, g, NULL); |
| 3465 | 3503 | ||
| 3466 | if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { | 3504 | if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { |
| 3467 | printk(KERN_ERR | 3505 | ext4_msg(sb, KERN_ERR, |
| 3468 | "EXT4-fs: ext4_remount: " | 3506 | "ext4_remount: Checksum for group %u failed (%u!=%u)", |
| 3469 | "Checksum for group %u failed (%u!=%u)\n", | ||
| 3470 | g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), | 3507 | g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), |
| 3471 | le16_to_cpu(gdp->bg_checksum)); | 3508 | le16_to_cpu(gdp->bg_checksum)); |
| 3472 | err = -EINVAL; | 3509 | err = -EINVAL; |
| @@ -3480,11 +3517,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 3480 | * require a full umount/remount for now. | 3517 | * require a full umount/remount for now. |
| 3481 | */ | 3518 | */ |
| 3482 | if (es->s_last_orphan) { | 3519 | if (es->s_last_orphan) { |
| 3483 | printk(KERN_WARNING "EXT4-fs: %s: couldn't " | 3520 | ext4_msg(sb, KERN_WARNING, "Couldn't " |
| 3484 | "remount RDWR because of unprocessed " | 3521 | "remount RDWR because of unprocessed " |
| 3485 | "orphan inode list. Please " | 3522 | "orphan inode list. Please " |
| 3486 | "umount/remount instead.\n", | 3523 | "umount/remount instead"); |
| 3487 | sb->s_id); | ||
| 3488 | err = -EINVAL; | 3524 | err = -EINVAL; |
| 3489 | goto restore_opts; | 3525 | goto restore_opts; |
| 3490 | } | 3526 | } |
| @@ -3504,8 +3540,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 3504 | sb->s_flags &= ~MS_RDONLY; | 3540 | sb->s_flags &= ~MS_RDONLY; |
| 3505 | } | 3541 | } |
| 3506 | } | 3542 | } |
| 3543 | ext4_setup_system_zone(sb); | ||
| 3507 | if (sbi->s_journal == NULL) | 3544 | if (sbi->s_journal == NULL) |
| 3508 | ext4_commit_super(sb, es, 1); | 3545 | ext4_commit_super(sb, 1); |
| 3509 | 3546 | ||
| 3510 | #ifdef CONFIG_QUOTA | 3547 | #ifdef CONFIG_QUOTA |
| 3511 | /* Release old quota file names */ | 3548 | /* Release old quota file names */ |
| @@ -3515,6 +3552,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 3515 | kfree(old_opts.s_qf_names[i]); | 3552 | kfree(old_opts.s_qf_names[i]); |
| 3516 | #endif | 3553 | #endif |
| 3517 | return 0; | 3554 | return 0; |
| 3555 | |||
| 3518 | restore_opts: | 3556 | restore_opts: |
| 3519 | sb->s_flags = old_sb_flags; | 3557 | sb->s_flags = old_sb_flags; |
| 3520 | sbi->s_mount_opt = old_opts.s_mount_opt; | 3558 | sbi->s_mount_opt = old_opts.s_mount_opt; |
| @@ -3545,9 +3583,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 3545 | if (test_opt(sb, MINIX_DF)) { | 3583 | if (test_opt(sb, MINIX_DF)) { |
| 3546 | sbi->s_overhead_last = 0; | 3584 | sbi->s_overhead_last = 0; |
| 3547 | } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { | 3585 | } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { |
| 3548 | ext4_group_t ngroups = sbi->s_groups_count, i; | 3586 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
| 3549 | ext4_fsblk_t overhead = 0; | 3587 | ext4_fsblk_t overhead = 0; |
| 3550 | smp_rmb(); | ||
| 3551 | 3588 | ||
| 3552 | /* | 3589 | /* |
| 3553 | * Compute the overhead (FS structures). This is constant | 3590 | * Compute the overhead (FS structures). This is constant |
| @@ -3599,11 +3636,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 3599 | le64_to_cpup((void *)es->s_uuid + sizeof(u64)); | 3636 | le64_to_cpup((void *)es->s_uuid + sizeof(u64)); |
| 3600 | buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; | 3637 | buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; |
| 3601 | buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; | 3638 | buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; |
| 3639 | |||
| 3602 | return 0; | 3640 | return 0; |
| 3603 | } | 3641 | } |
| 3604 | 3642 | ||
| 3605 | /* Helper function for writing quotas on sync - we need to start transaction before quota file | 3643 | /* Helper function for writing quotas on sync - we need to start transaction |
| 3606 | * is locked for write. Otherwise there are possible deadlocks: | 3644 | * before quota file is locked for write. Otherwise there are possible deadlocks: |
| 3607 | * Process 1 Process 2 | 3645 | * Process 1 Process 2 |
| 3608 | * ext4_create() quota_sync() | 3646 | * ext4_create() quota_sync() |
| 3609 | * jbd2_journal_start() write_dquot() | 3647 | * jbd2_journal_start() write_dquot() |
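The f_fsid derivation at the top of the hunk above XORs the two little-endian halves of the 16-byte filesystem UUID and splits the result into two 32-bit words. A sketch with a made-up UUID:

```c
/* Sketch of folding a 128-bit UUID into the two f_fsid words. */
#include <stdio.h>
#include <stdint.h>

static uint64_t le64_from(const unsigned char *p)
{
	uint64_t v = 0;

	for (int i = 7; i >= 0; i--)
		v = (v << 8) | p[i];   /* little-endian read */
	return v;
}

int main(void)
{
	unsigned char uuid[16] = { 0xde, 0xad, 0xbe, 0xef, 1, 2, 3, 4,
				   0xca, 0xfe, 0xba, 0xbe, 5, 6, 7, 8 };
	uint64_t fsid = le64_from(uuid) ^ le64_from(uuid + 8);
	uint32_t val0 = (uint32_t)(fsid & 0xFFFFFFFFUL);
	uint32_t val1 = (uint32_t)(fsid >> 32);

	printf("f_fsid = { 0x%08x, 0x%08x }\n", val0, val1);
	return 0;
}
```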
| @@ -3627,7 +3665,7 @@ static int ext4_write_dquot(struct dquot *dquot) | |||
| 3627 | 3665 | ||
| 3628 | inode = dquot_to_inode(dquot); | 3666 | inode = dquot_to_inode(dquot); |
| 3629 | handle = ext4_journal_start(inode, | 3667 | handle = ext4_journal_start(inode, |
| 3630 | EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); | 3668 | EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); |
| 3631 | if (IS_ERR(handle)) | 3669 | if (IS_ERR(handle)) |
| 3632 | return PTR_ERR(handle); | 3670 | return PTR_ERR(handle); |
| 3633 | ret = dquot_commit(dquot); | 3671 | ret = dquot_commit(dquot); |
| @@ -3643,7 +3681,7 @@ static int ext4_acquire_dquot(struct dquot *dquot) | |||
| 3643 | handle_t *handle; | 3681 | handle_t *handle; |
| 3644 | 3682 | ||
| 3645 | handle = ext4_journal_start(dquot_to_inode(dquot), | 3683 | handle = ext4_journal_start(dquot_to_inode(dquot), |
| 3646 | EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); | 3684 | EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); |
| 3647 | if (IS_ERR(handle)) | 3685 | if (IS_ERR(handle)) |
| 3648 | return PTR_ERR(handle); | 3686 | return PTR_ERR(handle); |
| 3649 | ret = dquot_acquire(dquot); | 3687 | ret = dquot_acquire(dquot); |
| @@ -3659,7 +3697,7 @@ static int ext4_release_dquot(struct dquot *dquot) | |||
| 3659 | handle_t *handle; | 3697 | handle_t *handle; |
| 3660 | 3698 | ||
| 3661 | handle = ext4_journal_start(dquot_to_inode(dquot), | 3699 | handle = ext4_journal_start(dquot_to_inode(dquot), |
| 3662 | EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); | 3700 | EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); |
| 3663 | if (IS_ERR(handle)) { | 3701 | if (IS_ERR(handle)) { |
| 3664 | /* Release dquot anyway to avoid endless cycle in dqput() */ | 3702 | /* Release dquot anyway to avoid endless cycle in dqput() */ |
| 3665 | dquot_release(dquot); | 3703 | dquot_release(dquot); |
| @@ -3707,7 +3745,7 @@ static int ext4_write_info(struct super_block *sb, int type) | |||
| 3707 | static int ext4_quota_on_mount(struct super_block *sb, int type) | 3745 | static int ext4_quota_on_mount(struct super_block *sb, int type) |
| 3708 | { | 3746 | { |
| 3709 | return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], | 3747 | return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], |
| 3710 | EXT4_SB(sb)->s_jquota_fmt, type); | 3748 | EXT4_SB(sb)->s_jquota_fmt, type); |
| 3711 | } | 3749 | } |
| 3712 | 3750 | ||
| 3713 | /* | 3751 | /* |
| @@ -3738,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, | |||
| 3738 | if (EXT4_SB(sb)->s_qf_names[type]) { | 3776 | if (EXT4_SB(sb)->s_qf_names[type]) { |
| 3739 | /* Quotafile not in fs root? */ | 3777 | /* Quotafile not in fs root? */ |
| 3740 | if (path.dentry->d_parent != sb->s_root) | 3778 | if (path.dentry->d_parent != sb->s_root) |
| 3741 | printk(KERN_WARNING | 3779 | ext4_msg(sb, KERN_WARNING, |
| 3742 | "EXT4-fs: Quota file not on filesystem root. " | 3780 | "Quota file not on filesystem root. " |
| 3743 | "Journaled quota will not work.\n"); | 3781 | "Journaled quota will not work"); |
| 3744 | } | 3782 | } |
| 3745 | 3783 | ||
| 3746 | /* | 3784 | /* |
| @@ -3823,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
| 3823 | handle_t *handle = journal_current_handle(); | 3861 | handle_t *handle = journal_current_handle(); |
| 3824 | 3862 | ||
| 3825 | if (EXT4_SB(sb)->s_journal && !handle) { | 3863 | if (EXT4_SB(sb)->s_journal && !handle) { |
| 3826 | printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" | 3864 | ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" |
| 3827 | " cancelled because transaction is not started.\n", | 3865 | " cancelled because transaction is not started", |
| 3828 | (unsigned long long)off, (unsigned long long)len); | 3866 | (unsigned long long)off, (unsigned long long)len); |
| 3829 | return -EIO; | 3867 | return -EIO; |
| 3830 | } | 3868 | } |
| @@ -3878,10 +3916,10 @@ out: | |||
| 3878 | 3916 | ||
| 3879 | #endif | 3917 | #endif |
| 3880 | 3918 | ||
| 3881 | static int ext4_get_sb(struct file_system_type *fs_type, | 3919 | static int ext4_get_sb(struct file_system_type *fs_type, int flags, |
| 3882 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) | 3920 | const char *dev_name, void *data, struct vfsmount *mnt) |
| 3883 | { | 3921 | { |
| 3884 | return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); | 3922 | return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); |
| 3885 | } | 3923 | } |
| 3886 | 3924 | ||
| 3887 | static struct file_system_type ext4_fs_type = { | 3925 | static struct file_system_type ext4_fs_type = { |
| @@ -3893,14 +3931,14 @@ static struct file_system_type ext4_fs_type = { | |||
| 3893 | }; | 3931 | }; |
| 3894 | 3932 | ||
| 3895 | #ifdef CONFIG_EXT4DEV_COMPAT | 3933 | #ifdef CONFIG_EXT4DEV_COMPAT |
| 3896 | static int ext4dev_get_sb(struct file_system_type *fs_type, | 3934 | static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, |
| 3897 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) | 3935 | const char *dev_name, void *data,struct vfsmount *mnt) |
| 3898 | { | 3936 | { |
| 3899 | printk(KERN_WARNING "EXT4-fs: Update your userspace programs " | 3937 | printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs " |
| 3900 | "to mount using ext4\n"); | 3938 | "to mount using ext4\n", dev_name); |
| 3901 | printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " | 3939 | printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility " |
| 3902 | "will go away by 2.6.31\n"); | 3940 | "will go away by 2.6.31\n", dev_name); |
| 3903 | return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); | 3941 | return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); |
| 3904 | } | 3942 | } |
| 3905 | 3943 | ||
| 3906 | static struct file_system_type ext4dev_fs_type = { | 3944 | static struct file_system_type ext4dev_fs_type = { |
| @@ -3917,13 +3955,16 @@ static int __init init_ext4_fs(void) | |||
| 3917 | { | 3955 | { |
| 3918 | int err; | 3956 | int err; |
| 3919 | 3957 | ||
| 3958 | err = init_ext4_system_zone(); | ||
| 3959 | if (err) | ||
| 3960 | return err; | ||
| 3920 | ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); | 3961 | ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); |
| 3921 | if (!ext4_kset) | 3962 | if (!ext4_kset) |
| 3922 | return -ENOMEM; | 3963 | goto out4; |
| 3923 | ext4_proc_root = proc_mkdir("fs/ext4", NULL); | 3964 | ext4_proc_root = proc_mkdir("fs/ext4", NULL); |
| 3924 | err = init_ext4_mballoc(); | 3965 | err = init_ext4_mballoc(); |
| 3925 | if (err) | 3966 | if (err) |
| 3926 | return err; | 3967 | goto out3; |
| 3927 | 3968 | ||
| 3928 | err = init_ext4_xattr(); | 3969 | err = init_ext4_xattr(); |
| 3929 | if (err) | 3970 | if (err) |
| @@ -3948,6 +3989,11 @@ out1: | |||
| 3948 | exit_ext4_xattr(); | 3989 | exit_ext4_xattr(); |
| 3949 | out2: | 3990 | out2: |
| 3950 | exit_ext4_mballoc(); | 3991 | exit_ext4_mballoc(); |
| 3992 | out3: | ||
| 3993 | remove_proc_entry("fs/ext4", NULL); | ||
| 3994 | kset_unregister(ext4_kset); | ||
| 3995 | out4: | ||
| 3996 | exit_ext4_system_zone(); | ||
| 3951 | return err; | 3997 | return err; |
| 3952 | } | 3998 | } |
| 3953 | 3999 | ||
| @@ -3962,6 +4008,7 @@ static void __exit exit_ext4_fs(void) | |||
| 3962 | exit_ext4_mballoc(); | 4008 | exit_ext4_mballoc(); |
| 3963 | remove_proc_entry("fs/ext4", NULL); | 4009 | remove_proc_entry("fs/ext4", NULL); |
| 3964 | kset_unregister(ext4_kset); | 4010 | kset_unregister(ext4_kset); |
| 4011 | exit_ext4_system_zone(); | ||
| 3965 | } | 4012 | } |
| 3966 | 4013 | ||
| 3967 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); | 4014 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); |
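The init_ext4_fs() hunk above registers the new block-validity machinery (init_ext4_system_zone()) before anything else and turns the early returns into goto-based unwinding, with exit_ext4_fs() gaining the matching exit_ext4_system_zone() call. A minimal sketch of that staged init/teardown pattern; the errno on the kset failure path is an assumption, since the hunk itself only jumps to out4:

	static int __init example_init(void)
	{
		int err;

		err = init_ext4_system_zone();		/* first in, last out */
		if (err)
			return err;

		ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
		if (!ext4_kset) {
			err = -ENOMEM;			/* assumed errno; not in the hunk */
			goto out4;
		}
		ext4_proc_root = proc_mkdir("fs/ext4", NULL);

		err = init_ext4_mballoc();
		if (err)
			goto out3;
		/* ... remaining steps, each adding its own unwind label ... */
		return 0;

	out3:
		remove_proc_entry("fs/ext4", NULL);	/* unwind in reverse order */
		kset_unregister(ext4_kset);
	out4:
		exit_ext4_system_zone();
		return err;
	}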
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 3a981b7f64ca..cad957cdb1e5 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig | |||
| @@ -7,6 +7,7 @@ config GFS2_FS | |||
| 7 | select IP_SCTP if DLM_SCTP | 7 | select IP_SCTP if DLM_SCTP |
| 8 | select FS_POSIX_ACL | 8 | select FS_POSIX_ACL |
| 9 | select CRC32 | 9 | select CRC32 |
| 10 | select SLOW_WORK | ||
| 10 | help | 11 | help |
| 11 | A cluster filesystem. | 12 | A cluster filesystem. |
| 12 | 13 | ||
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index a851ea4bdf70..d53a9bea1c2f 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile | |||
| @@ -1,8 +1,8 @@ | |||
| 1 | obj-$(CONFIG_GFS2_FS) += gfs2.o | 1 | obj-$(CONFIG_GFS2_FS) += gfs2.o |
| 2 | gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ | 2 | gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ |
| 3 | glops.o inode.o log.o lops.o main.o meta_io.o \ | 3 | glops.o inode.o log.o lops.o main.o meta_io.o \ |
| 4 | mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ | 4 | aops.o dentry.o export.o file.o \ |
| 5 | ops_fstype.o ops_inode.o ops_super.o quota.o \ | 5 | ops_fstype.o ops_inode.o quota.o \ |
| 6 | recovery.o rgrp.o super.o sys.o trans.o util.o | 6 | recovery.o rgrp.o super.o sys.o trans.o util.o |
| 7 | 7 | ||
| 8 | gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o | 8 | gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o |
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c index a6dde1751e17..03ebb439ace0 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/aops.c | |||
| @@ -28,7 +28,6 @@ | |||
| 28 | #include "inode.h" | 28 | #include "inode.h" |
| 29 | #include "log.h" | 29 | #include "log.h" |
| 30 | #include "meta_io.h" | 30 | #include "meta_io.h" |
| 31 | #include "ops_address.h" | ||
| 32 | #include "quota.h" | 31 | #include "quota.h" |
| 33 | #include "trans.h" | 32 | #include "trans.h" |
| 34 | #include "rgrp.h" | 33 | #include "rgrp.h" |
| @@ -781,10 +780,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, | |||
| 781 | unlock_page(page); | 780 | unlock_page(page); |
| 782 | page_cache_release(page); | 781 | page_cache_release(page); |
| 783 | 782 | ||
| 784 | if (inode->i_size < to) { | 783 | if (copied) { |
| 785 | i_size_write(inode, to); | 784 | if (inode->i_size < to) { |
| 786 | ip->i_disksize = inode->i_size; | 785 | i_size_write(inode, to); |
| 787 | di->di_size = cpu_to_be64(inode->i_size); | 786 | ip->i_disksize = inode->i_size; |
| 787 | } | ||
| 788 | gfs2_dinode_out(ip, di); | ||
| 788 | mark_inode_dirty(inode); | 789 | mark_inode_dirty(inode); |
| 789 | } | 790 | } |
| 790 | 791 | ||
| @@ -824,7 +825,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
| 824 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 825 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
| 825 | struct buffer_head *dibh; | 826 | struct buffer_head *dibh; |
| 826 | struct gfs2_alloc *al = ip->i_alloc; | 827 | struct gfs2_alloc *al = ip->i_alloc; |
| 827 | struct gfs2_dinode *di; | ||
| 828 | unsigned int from = pos & (PAGE_CACHE_SIZE - 1); | 828 | unsigned int from = pos & (PAGE_CACHE_SIZE - 1); |
| 829 | unsigned int to = from + len; | 829 | unsigned int to = from + len; |
| 830 | int ret; | 830 | int ret; |
| @@ -847,11 +847,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
| 847 | gfs2_page_add_databufs(ip, page, from, to); | 847 | gfs2_page_add_databufs(ip, page, from, to); |
| 848 | 848 | ||
| 849 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); | 849 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); |
| 850 | 850 | if (ret > 0) { | |
| 851 | if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { | 851 | if (inode->i_size > ip->i_disksize) |
| 852 | di = (struct gfs2_dinode *)dibh->b_data; | 852 | ip->i_disksize = inode->i_size; |
| 853 | ip->i_disksize = inode->i_size; | 853 | gfs2_dinode_out(ip, dibh->b_data); |
| 854 | di->di_size = cpu_to_be64(inode->i_size); | ||
| 855 | mark_inode_dirty(inode); | 854 | mark_inode_dirty(inode); |
| 856 | } | 855 | } |
| 857 | 856 | ||
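Both write_end paths above stop writing di_size into the buffer by hand and instead let gfs2_dinode_out() serialise the whole in-core dinode, and they only update the size when data was actually copied. A sketch of the common shape these hunks converge on; the helper name is hypothetical:

	static void example_write_end_update(struct gfs2_inode *ip, struct inode *inode,
					     struct buffer_head *dibh, loff_t to)
	{
		if (inode->i_size < to) {
			i_size_write(inode, to);
			ip->i_disksize = inode->i_size;	/* keep on-disk size in step */
		}
		gfs2_dinode_out(ip, dibh->b_data);	/* write back the full dinode */
		mark_inode_dirty(inode);
	}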
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3a5d3f883e10..329763530dc0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
| @@ -25,7 +25,6 @@ | |||
| 25 | #include "trans.h" | 25 | #include "trans.h" |
| 26 | #include "dir.h" | 26 | #include "dir.h" |
| 27 | #include "util.h" | 27 | #include "util.h" |
| 28 | #include "ops_address.h" | ||
| 29 | 28 | ||
| 30 | /* This doesn't need to be that large as max 64 bit pointers in a 4k | 29 | /* This doesn't need to be that large as max 64 bit pointers in a 4k |
| 31 | * block is 512, so __u16 is fine for that. It saves stack space to | 30 | * block is 512, so __u16 is fine for that. It saves stack space to |
| @@ -136,7 +135,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
| 136 | and write it out to disk */ | 135 | and write it out to disk */ |
| 137 | 136 | ||
| 138 | unsigned int n = 1; | 137 | unsigned int n = 1; |
| 139 | block = gfs2_alloc_block(ip, &n); | 138 | error = gfs2_alloc_block(ip, &block, &n); |
| 139 | if (error) | ||
| 140 | goto out_brelse; | ||
| 140 | if (isdir) { | 141 | if (isdir) { |
| 141 | gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); | 142 | gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); |
| 142 | error = gfs2_dir_get_new_buffer(ip, block, &bh); | 143 | error = gfs2_dir_get_new_buffer(ip, block, &bh); |
| @@ -476,8 +477,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
| 476 | blks = dblks + iblks; | 477 | blks = dblks + iblks; |
| 477 | i = sheight; | 478 | i = sheight; |
| 478 | do { | 479 | do { |
| 480 | int error; | ||
| 479 | n = blks - alloced; | 481 | n = blks - alloced; |
| 480 | bn = gfs2_alloc_block(ip, &n); | 482 | error = gfs2_alloc_block(ip, &bn, &n); |
| 483 | if (error) | ||
| 484 | return error; | ||
| 481 | alloced += n; | 485 | alloced += n; |
| 482 | if (state != ALLOC_DATA || gfs2_is_jdata(ip)) | 486 | if (state != ALLOC_DATA || gfs2_is_jdata(ip)) |
| 483 | gfs2_trans_add_unrevoke(sdp, bn, n); | 487 | gfs2_trans_add_unrevoke(sdp, bn, n); |
| @@ -1008,7 +1012,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) | |||
| 1008 | gfs2_trans_add_bh(ip->i_gl, bh, 0); | 1012 | gfs2_trans_add_bh(ip->i_gl, bh, 0); |
| 1009 | 1013 | ||
| 1010 | zero_user(page, offset, length); | 1014 | zero_user(page, offset, length); |
| 1011 | 1015 | mark_buffer_dirty(bh); | |
| 1012 | unlock: | 1016 | unlock: |
| 1013 | unlock_page(page); | 1017 | unlock_page(page); |
| 1014 | page_cache_release(page); | 1018 | page_cache_release(page); |
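This hunk, together with the matching ones in dir.c and eattr.c below, changes gfs2_alloc_block() from returning the block number to returning an errno, with the block passed back through a pointer, so allocation failures propagate to the caller instead of being ignored. A usage sketch of the new convention; the wrapper function is hypothetical:

	static int example_alloc_one(struct gfs2_inode *ip, u64 *blkno)
	{
		unsigned int n = 1;			/* ask for a single block */
		int error;

		error = gfs2_alloc_block(ip, blkno, &n);	/* new signature */
		if (error)
			return error;			/* failure now reaches the caller */

		gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), *blkno, n);
		return 0;
	}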
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c index 022c66cd5606..022c66cd5606 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/dentry.c | |||
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index aef4d0c06748..297d7e5cebad 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
| @@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, | |||
| 803 | { | 803 | { |
| 804 | struct gfs2_inode *ip = GFS2_I(inode); | 804 | struct gfs2_inode *ip = GFS2_I(inode); |
| 805 | unsigned int n = 1; | 805 | unsigned int n = 1; |
| 806 | u64 bn = gfs2_alloc_block(ip, &n); | 806 | u64 bn; |
| 807 | struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); | 807 | int error; |
| 808 | struct buffer_head *bh; | ||
| 808 | struct gfs2_leaf *leaf; | 809 | struct gfs2_leaf *leaf; |
| 809 | struct gfs2_dirent *dent; | 810 | struct gfs2_dirent *dent; |
| 810 | struct qstr name = { .name = "", .len = 0, .hash = 0 }; | 811 | struct qstr name = { .name = "", .len = 0, .hash = 0 }; |
| 812 | |||
| 813 | error = gfs2_alloc_block(ip, &bn, &n); | ||
| 814 | if (error) | ||
| 815 | return NULL; | ||
| 816 | bh = gfs2_meta_new(ip->i_gl, bn); | ||
| 811 | if (!bh) | 817 | if (!bh) |
| 812 | return NULL; | 818 | return NULL; |
| 819 | |||
| 813 | gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); | 820 | gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); |
| 814 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 821 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
| 815 | gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); | 822 | gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); |
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 899763aed217..07ea9529adda 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c | |||
| @@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) | |||
| 582 | struct gfs2_ea_header *ea; | 582 | struct gfs2_ea_header *ea; |
| 583 | unsigned int n = 1; | 583 | unsigned int n = 1; |
| 584 | u64 block; | 584 | u64 block; |
| 585 | int error; | ||
| 585 | 586 | ||
| 586 | block = gfs2_alloc_block(ip, &n); | 587 | error = gfs2_alloc_block(ip, &block, &n); |
| 588 | if (error) | ||
| 589 | return error; | ||
| 587 | gfs2_trans_add_unrevoke(sdp, block, 1); | 590 | gfs2_trans_add_unrevoke(sdp, block, 1); |
| 588 | *bhp = gfs2_meta_new(ip->i_gl, block); | 591 | *bhp = gfs2_meta_new(ip->i_gl, block); |
| 589 | gfs2_trans_add_bh(ip->i_gl, *bhp, 1); | 592 | gfs2_trans_add_bh(ip->i_gl, *bhp, 1); |
| @@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, | |||
| 617 | struct gfs2_ea_request *er) | 620 | struct gfs2_ea_request *er) |
| 618 | { | 621 | { |
| 619 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 622 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
| 623 | int error; | ||
| 620 | 624 | ||
| 621 | ea->ea_data_len = cpu_to_be32(er->er_data_len); | 625 | ea->ea_data_len = cpu_to_be32(er->er_data_len); |
| 622 | ea->ea_name_len = er->er_name_len; | 626 | ea->ea_name_len = er->er_name_len; |
| @@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, | |||
| 642 | int mh_size = sizeof(struct gfs2_meta_header); | 646 | int mh_size = sizeof(struct gfs2_meta_header); |
| 643 | unsigned int n = 1; | 647 | unsigned int n = 1; |
| 644 | 648 | ||
| 645 | block = gfs2_alloc_block(ip, &n); | 649 | error = gfs2_alloc_block(ip, &block, &n); |
| 650 | if (error) | ||
| 651 | return error; | ||
| 646 | gfs2_trans_add_unrevoke(sdp, block, 1); | 652 | gfs2_trans_add_unrevoke(sdp, block, 1); |
| 647 | bh = gfs2_meta_new(ip->i_gl, block); | 653 | bh = gfs2_meta_new(ip->i_gl, block); |
| 648 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 654 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
| @@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
| 963 | } else { | 969 | } else { |
| 964 | u64 blk; | 970 | u64 blk; |
| 965 | unsigned int n = 1; | 971 | unsigned int n = 1; |
| 966 | blk = gfs2_alloc_block(ip, &n); | 972 | error = gfs2_alloc_block(ip, &blk, &n); |
| 973 | if (error) | ||
| 974 | return error; | ||
| 967 | gfs2_trans_add_unrevoke(sdp, blk, 1); | 975 | gfs2_trans_add_unrevoke(sdp, blk, 1); |
| 968 | indbh = gfs2_meta_new(ip->i_gl, blk); | 976 | indbh = gfs2_meta_new(ip->i_gl, blk); |
| 969 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); | 977 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); |
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c index 9200ef221716..9200ef221716 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/export.c | |||
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/file.c index 5d82e91887e3..73318a3ce6f1 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/file.c | |||
| @@ -39,7 +39,6 @@ | |||
| 39 | #include "trans.h" | 39 | #include "trans.h" |
| 40 | #include "util.h" | 40 | #include "util.h" |
| 41 | #include "eaops.h" | 41 | #include "eaops.h" |
| 42 | #include "ops_address.h" | ||
| 43 | 42 | ||
| 44 | /** | 43 | /** |
| 45 | * gfs2_llseek - seek to a location in a file | 44 | * gfs2_llseek - seek to a location in a file |
| @@ -425,33 +424,36 @@ static struct vm_operations_struct gfs2_vm_ops = { | |||
| 425 | .page_mkwrite = gfs2_page_mkwrite, | 424 | .page_mkwrite = gfs2_page_mkwrite, |
| 426 | }; | 425 | }; |
| 427 | 426 | ||
| 428 | |||
| 429 | /** | 427 | /** |
| 430 | * gfs2_mmap - | 428 | * gfs2_mmap - |
| 431 | * @file: The file to map | 429 | * @file: The file to map |
| 432 | * @vma: The VMA which described the mapping | 430 | * @vma: The VMA which described the mapping |
| 433 | * | 431 | * |
| 434 | * Returns: 0 or error code | 432 | * There is no need to get a lock here unless we should be updating |
| 433 | * atime. We ignore any locking errors since the only consequence is | ||
| 434 | * a missed atime update (which will just be deferred until later). | ||
| 435 | * | ||
| 436 | * Returns: 0 | ||
| 435 | */ | 437 | */ |
| 436 | 438 | ||
| 437 | static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) | 439 | static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) |
| 438 | { | 440 | { |
| 439 | struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); | 441 | struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); |
| 440 | struct gfs2_holder i_gh; | ||
| 441 | int error; | ||
| 442 | 442 | ||
| 443 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); | 443 | if (!(file->f_flags & O_NOATIME)) { |
| 444 | error = gfs2_glock_nq(&i_gh); | 444 | struct gfs2_holder i_gh; |
| 445 | if (error) { | 445 | int error; |
| 446 | gfs2_holder_uninit(&i_gh); | ||
| 447 | return error; | ||
| 448 | } | ||
| 449 | 446 | ||
| 447 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); | ||
| 448 | error = gfs2_glock_nq(&i_gh); | ||
| 449 | file_accessed(file); | ||
| 450 | if (error == 0) | ||
| 451 | gfs2_glock_dq_uninit(&i_gh); | ||
| 452 | } | ||
| 450 | vma->vm_ops = &gfs2_vm_ops; | 453 | vma->vm_ops = &gfs2_vm_ops; |
| 454 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
| 451 | 455 | ||
| 452 | gfs2_glock_dq_uninit(&i_gh); | 456 | return 0; |
| 453 | |||
| 454 | return error; | ||
| 455 | } | 457 | } |
| 456 | 458 | ||
| 457 | /** | 459 | /** |
| @@ -692,12 +694,10 @@ static void do_unflock(struct file *file, struct file_lock *fl) | |||
| 692 | 694 | ||
| 693 | static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) | 695 | static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) |
| 694 | { | 696 | { |
| 695 | struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); | ||
| 696 | |||
| 697 | if (!(fl->fl_flags & FL_FLOCK)) | 697 | if (!(fl->fl_flags & FL_FLOCK)) |
| 698 | return -ENOLCK; | 698 | return -ENOLCK; |
| 699 | if (__mandatory_lock(&ip->i_inode)) | 699 | if (fl->fl_type & LOCK_MAND) |
| 700 | return -ENOLCK; | 700 | return -EOPNOTSUPP; |
| 701 | 701 | ||
| 702 | if (fl->fl_type == F_UNLCK) { | 702 | if (fl->fl_type == F_UNLCK) { |
| 703 | do_unflock(file, fl); | 703 | do_unflock(file, fl); |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ff4981090489..2bf62bcc5181 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
| @@ -796,22 +796,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) | |||
| 796 | gh->gh_ip = 0; | 796 | gh->gh_ip = 0; |
| 797 | } | 797 | } |
| 798 | 798 | ||
| 799 | static int just_schedule(void *word) | 799 | /** |
| 800 | * gfs2_glock_holder_wait | ||
| 801 | * @word: unused | ||
| 802 | * | ||
| 803 | * This function and gfs2_glock_demote_wait both show up in the WCHAN | ||
| 804 | * field. Thus I've separated these otherwise identical functions in | ||
| 805 | * order to be more informative to the user. | ||
| 806 | */ | ||
| 807 | |||
| 808 | static int gfs2_glock_holder_wait(void *word) | ||
| 800 | { | 809 | { |
| 801 | schedule(); | 810 | schedule(); |
| 802 | return 0; | 811 | return 0; |
| 803 | } | 812 | } |
| 804 | 813 | ||
| 814 | static int gfs2_glock_demote_wait(void *word) | ||
| 815 | { | ||
| 816 | schedule(); | ||
| 817 | return 0; | ||
| 818 | } | ||
| 819 | |||
| 805 | static void wait_on_holder(struct gfs2_holder *gh) | 820 | static void wait_on_holder(struct gfs2_holder *gh) |
| 806 | { | 821 | { |
| 807 | might_sleep(); | 822 | might_sleep(); |
| 808 | wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); | 823 | wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); |
| 809 | } | 824 | } |
| 810 | 825 | ||
| 811 | static void wait_on_demote(struct gfs2_glock *gl) | 826 | static void wait_on_demote(struct gfs2_glock *gl) |
| 812 | { | 827 | { |
| 813 | might_sleep(); | 828 | might_sleep(); |
| 814 | wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); | 829 | wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); |
| 815 | } | 830 | } |
| 816 | 831 | ||
| 817 | /** | 832 | /** |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 70f87f43afa2..d5e4ab155ca0 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
| @@ -310,24 +310,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) | |||
| 310 | } | 310 | } |
| 311 | 311 | ||
| 312 | /** | 312 | /** |
| 313 | * rgrp_go_dump - print out an rgrp | ||
| 314 | * @seq: The iterator | ||
| 315 | * @gl: The glock in question | ||
| 316 | * | ||
| 317 | */ | ||
| 318 | |||
| 319 | static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | ||
| 320 | { | ||
| 321 | const struct gfs2_rgrpd *rgd = gl->gl_object; | ||
| 322 | if (rgd == NULL) | ||
| 323 | return 0; | ||
| 324 | gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", | ||
| 325 | (unsigned long long)rgd->rd_addr, rgd->rd_flags, | ||
| 326 | rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); | ||
| 327 | return 0; | ||
| 328 | } | ||
| 329 | |||
| 330 | /** | ||
| 331 | * trans_go_sync - promote/demote the transaction glock | 313 | * trans_go_sync - promote/demote the transaction glock |
| 332 | * @gl: the glock | 314 | * @gl: the glock |
| 333 | * @state: the requested state | 315 | * @state: the requested state |
| @@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { | |||
| 410 | .go_demote_ok = rgrp_go_demote_ok, | 392 | .go_demote_ok = rgrp_go_demote_ok, |
| 411 | .go_lock = rgrp_go_lock, | 393 | .go_lock = rgrp_go_lock, |
| 412 | .go_unlock = rgrp_go_unlock, | 394 | .go_unlock = rgrp_go_unlock, |
| 413 | .go_dump = rgrp_go_dump, | 395 | .go_dump = gfs2_rgrp_dump, |
| 414 | .go_type = LM_TYPE_RGRP, | 396 | .go_type = LM_TYPE_RGRP, |
| 415 | .go_min_hold_time = HZ / 5, | 397 | .go_min_hold_time = HZ / 5, |
| 416 | }; | 398 | }; |
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 399d1b978049..225347fbff3c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | 12 | ||
| 13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
| 14 | #include <linux/workqueue.h> | 14 | #include <linux/workqueue.h> |
| 15 | #include <linux/slow-work.h> | ||
| 15 | #include <linux/dlm.h> | 16 | #include <linux/dlm.h> |
| 16 | #include <linux/buffer_head.h> | 17 | #include <linux/buffer_head.h> |
| 17 | 18 | ||
| @@ -63,9 +64,12 @@ struct gfs2_log_element { | |||
| 63 | const struct gfs2_log_operations *le_ops; | 64 | const struct gfs2_log_operations *le_ops; |
| 64 | }; | 65 | }; |
| 65 | 66 | ||
| 67 | #define GBF_FULL 1 | ||
| 68 | |||
| 66 | struct gfs2_bitmap { | 69 | struct gfs2_bitmap { |
| 67 | struct buffer_head *bi_bh; | 70 | struct buffer_head *bi_bh; |
| 68 | char *bi_clone; | 71 | char *bi_clone; |
| 72 | unsigned long bi_flags; | ||
| 69 | u32 bi_offset; | 73 | u32 bi_offset; |
| 70 | u32 bi_start; | 74 | u32 bi_start; |
| 71 | u32 bi_len; | 75 | u32 bi_len; |
| @@ -90,10 +94,11 @@ struct gfs2_rgrpd { | |||
| 90 | struct gfs2_sbd *rd_sbd; | 94 | struct gfs2_sbd *rd_sbd; |
| 91 | unsigned int rd_bh_count; | 95 | unsigned int rd_bh_count; |
| 92 | u32 rd_last_alloc; | 96 | u32 rd_last_alloc; |
| 93 | unsigned char rd_flags; | 97 | u32 rd_flags; |
| 94 | #define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ | 98 | #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ |
| 95 | #define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ | 99 | #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ |
| 96 | #define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ | 100 | #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ |
| 101 | #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ | ||
| 97 | }; | 102 | }; |
| 98 | 103 | ||
| 99 | enum gfs2_state_bits { | 104 | enum gfs2_state_bits { |
| @@ -376,11 +381,11 @@ struct gfs2_journal_extent { | |||
| 376 | struct gfs2_jdesc { | 381 | struct gfs2_jdesc { |
| 377 | struct list_head jd_list; | 382 | struct list_head jd_list; |
| 378 | struct list_head extent_list; | 383 | struct list_head extent_list; |
| 379 | 384 | struct slow_work jd_work; | |
| 380 | struct inode *jd_inode; | 385 | struct inode *jd_inode; |
| 386 | unsigned long jd_flags; | ||
| 387 | #define JDF_RECOVERY 1 | ||
| 381 | unsigned int jd_jid; | 388 | unsigned int jd_jid; |
| 382 | int jd_dirty; | ||
| 383 | |||
| 384 | unsigned int jd_blocks; | 389 | unsigned int jd_blocks; |
| 385 | }; | 390 | }; |
| 386 | 391 | ||
| @@ -390,9 +395,6 @@ struct gfs2_statfs_change_host { | |||
| 390 | s64 sc_dinodes; | 395 | s64 sc_dinodes; |
| 391 | }; | 396 | }; |
| 392 | 397 | ||
| 393 | #define GFS2_GLOCKD_DEFAULT 1 | ||
| 394 | #define GFS2_GLOCKD_MAX 16 | ||
| 395 | |||
| 396 | #define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF | 398 | #define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF |
| 397 | #define GFS2_QUOTA_OFF 0 | 399 | #define GFS2_QUOTA_OFF 0 |
| 398 | #define GFS2_QUOTA_ACCOUNT 1 | 400 | #define GFS2_QUOTA_ACCOUNT 1 |
| @@ -418,6 +420,7 @@ struct gfs2_args { | |||
| 418 | unsigned int ar_data:2; /* ordered/writeback */ | 420 | unsigned int ar_data:2; /* ordered/writeback */ |
| 419 | unsigned int ar_meta:1; /* mount metafs */ | 421 | unsigned int ar_meta:1; /* mount metafs */ |
| 420 | unsigned int ar_discard:1; /* discard requests */ | 422 | unsigned int ar_discard:1; /* discard requests */ |
| 423 | int ar_commit; /* Commit interval */ | ||
| 421 | }; | 424 | }; |
| 422 | 425 | ||
| 423 | struct gfs2_tune { | 426 | struct gfs2_tune { |
| @@ -426,7 +429,6 @@ struct gfs2_tune { | |||
| 426 | unsigned int gt_incore_log_blocks; | 429 | unsigned int gt_incore_log_blocks; |
| 427 | unsigned int gt_log_flush_secs; | 430 | unsigned int gt_log_flush_secs; |
| 428 | 431 | ||
| 429 | unsigned int gt_recoverd_secs; | ||
| 430 | unsigned int gt_logd_secs; | 432 | unsigned int gt_logd_secs; |
| 431 | 433 | ||
| 432 | unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ | 434 | unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ |
| @@ -447,6 +449,7 @@ enum { | |||
| 447 | SDF_JOURNAL_LIVE = 1, | 449 | SDF_JOURNAL_LIVE = 1, |
| 448 | SDF_SHUTDOWN = 2, | 450 | SDF_SHUTDOWN = 2, |
| 449 | SDF_NOBARRIERS = 3, | 451 | SDF_NOBARRIERS = 3, |
| 452 | SDF_NORECOVERY = 4, | ||
| 450 | }; | 453 | }; |
| 451 | 454 | ||
| 452 | #define GFS2_FSNAME_LEN 256 | 455 | #define GFS2_FSNAME_LEN 256 |
| @@ -493,7 +496,6 @@ struct lm_lockstruct { | |||
| 493 | unsigned long ls_flags; | 496 | unsigned long ls_flags; |
| 494 | dlm_lockspace_t *ls_dlm; | 497 | dlm_lockspace_t *ls_dlm; |
| 495 | 498 | ||
| 496 | int ls_recover_jid; | ||
| 497 | int ls_recover_jid_done; | 499 | int ls_recover_jid_done; |
| 498 | int ls_recover_jid_status; | 500 | int ls_recover_jid_status; |
| 499 | }; | 501 | }; |
| @@ -582,7 +584,6 @@ struct gfs2_sbd { | |||
| 582 | 584 | ||
| 583 | /* Daemon stuff */ | 585 | /* Daemon stuff */ |
| 584 | 586 | ||
| 585 | struct task_struct *sd_recoverd_process; | ||
| 586 | struct task_struct *sd_logd_process; | 587 | struct task_struct *sd_logd_process; |
| 587 | struct task_struct *sd_quotad_process; | 588 | struct task_struct *sd_quotad_process; |
| 588 | 589 | ||
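With rd_flags widened to u32 and the internal GFS2_RDF_* bits moved into the top nibble behind GFS2_RDF_MASK, one word can now hold both the on-disk rg_flags and the in-core state. An illustrative merge, assuming the low bits mirror the on-disk flags; the helper is hypothetical:

	static inline void example_set_rgrp_flags(struct gfs2_rgrpd *rgd, u32 rg_flags)
	{
		rgd->rd_flags &= GFS2_RDF_MASK;			/* keep internal bits */
		rgd->rd_flags |= rg_flags & ~GFS2_RDF_MASK;	/* merge on-disk bits */
	}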
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5a31d426116f..2f94bd723698 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
| @@ -30,7 +30,6 @@ | |||
| 30 | #include "inode.h" | 30 | #include "inode.h" |
| 31 | #include "log.h" | 31 | #include "log.h" |
| 32 | #include "meta_io.h" | 32 | #include "meta_io.h" |
| 33 | #include "ops_address.h" | ||
| 34 | #include "quota.h" | 33 | #include "quota.h" |
| 35 | #include "rgrp.h" | 34 | #include "rgrp.h" |
| 36 | #include "trans.h" | 35 | #include "trans.h" |
| @@ -1047,154 +1046,7 @@ fail: | |||
| 1047 | return ERR_PTR(error); | 1046 | return ERR_PTR(error); |
| 1048 | } | 1047 | } |
| 1049 | 1048 | ||
| 1050 | /** | 1049 | static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) |
| 1051 | * gfs2_rmdiri - Remove a directory | ||
| 1052 | * @dip: The parent directory of the directory to be removed | ||
| 1053 | * @name: The name of the directory to be removed | ||
| 1054 | * @ip: The GFS2 inode of the directory to be removed | ||
| 1055 | * | ||
| 1056 | * Assumes Glocks on dip and ip are held | ||
| 1057 | * | ||
| 1058 | * Returns: errno | ||
| 1059 | */ | ||
| 1060 | |||
| 1061 | int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | ||
| 1062 | struct gfs2_inode *ip) | ||
| 1063 | { | ||
| 1064 | struct qstr dotname; | ||
| 1065 | int error; | ||
| 1066 | |||
| 1067 | if (ip->i_entries != 2) { | ||
| 1068 | if (gfs2_consist_inode(ip)) | ||
| 1069 | gfs2_dinode_print(ip); | ||
| 1070 | return -EIO; | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | error = gfs2_dir_del(dip, name); | ||
| 1074 | if (error) | ||
| 1075 | return error; | ||
| 1076 | |||
| 1077 | error = gfs2_change_nlink(dip, -1); | ||
| 1078 | if (error) | ||
| 1079 | return error; | ||
| 1080 | |||
| 1081 | gfs2_str2qstr(&dotname, "."); | ||
| 1082 | error = gfs2_dir_del(ip, &dotname); | ||
| 1083 | if (error) | ||
| 1084 | return error; | ||
| 1085 | |||
| 1086 | gfs2_str2qstr(&dotname, ".."); | ||
| 1087 | error = gfs2_dir_del(ip, &dotname); | ||
| 1088 | if (error) | ||
| 1089 | return error; | ||
| 1090 | |||
| 1091 | /* It looks odd, but it really should be done twice */ | ||
| 1092 | error = gfs2_change_nlink(ip, -1); | ||
| 1093 | if (error) | ||
| 1094 | return error; | ||
| 1095 | |||
| 1096 | error = gfs2_change_nlink(ip, -1); | ||
| 1097 | if (error) | ||
| 1098 | return error; | ||
| 1099 | |||
| 1100 | return error; | ||
| 1101 | } | ||
| 1102 | |||
| 1103 | /* | ||
| 1104 | * gfs2_unlink_ok - check to see that a inode is still in a directory | ||
| 1105 | * @dip: the directory | ||
| 1106 | * @name: the name of the file | ||
| 1107 | * @ip: the inode | ||
| 1108 | * | ||
| 1109 | * Assumes that the lock on (at least) @dip is held. | ||
| 1110 | * | ||
| 1111 | * Returns: 0 if the parent/child relationship is correct, errno if it isn't | ||
| 1112 | */ | ||
| 1113 | |||
| 1114 | int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, | ||
| 1115 | const struct gfs2_inode *ip) | ||
| 1116 | { | ||
| 1117 | int error; | ||
| 1118 | |||
| 1119 | if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) | ||
| 1120 | return -EPERM; | ||
| 1121 | |||
| 1122 | if ((dip->i_inode.i_mode & S_ISVTX) && | ||
| 1123 | dip->i_inode.i_uid != current_fsuid() && | ||
| 1124 | ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) | ||
| 1125 | return -EPERM; | ||
| 1126 | |||
| 1127 | if (IS_APPEND(&dip->i_inode)) | ||
| 1128 | return -EPERM; | ||
| 1129 | |||
| 1130 | error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); | ||
| 1131 | if (error) | ||
| 1132 | return error; | ||
| 1133 | |||
| 1134 | error = gfs2_dir_check(&dip->i_inode, name, ip); | ||
| 1135 | if (error) | ||
| 1136 | return error; | ||
| 1137 | |||
| 1138 | return 0; | ||
| 1139 | } | ||
| 1140 | |||
| 1141 | /** | ||
| 1142 | * gfs2_readlinki - return the contents of a symlink | ||
| 1143 | * @ip: the symlink's inode | ||
| 1144 | * @buf: a pointer to the buffer to be filled | ||
| 1145 | * @len: a pointer to the length of @buf | ||
| 1146 | * | ||
| 1147 | * If @buf is too small, a piece of memory is kmalloc()ed and needs | ||
| 1148 | * to be freed by the caller. | ||
| 1149 | * | ||
| 1150 | * Returns: errno | ||
| 1151 | */ | ||
| 1152 | |||
| 1153 | int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) | ||
| 1154 | { | ||
| 1155 | struct gfs2_holder i_gh; | ||
| 1156 | struct buffer_head *dibh; | ||
| 1157 | unsigned int x; | ||
| 1158 | int error; | ||
| 1159 | |||
| 1160 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); | ||
| 1161 | error = gfs2_glock_nq(&i_gh); | ||
| 1162 | if (error) { | ||
| 1163 | gfs2_holder_uninit(&i_gh); | ||
| 1164 | return error; | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | if (!ip->i_disksize) { | ||
| 1168 | gfs2_consist_inode(ip); | ||
| 1169 | error = -EIO; | ||
| 1170 | goto out; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
| 1174 | if (error) | ||
| 1175 | goto out; | ||
| 1176 | |||
| 1177 | x = ip->i_disksize + 1; | ||
| 1178 | if (x > *len) { | ||
| 1179 | *buf = kmalloc(x, GFP_NOFS); | ||
| 1180 | if (!*buf) { | ||
| 1181 | error = -ENOMEM; | ||
| 1182 | goto out_brelse; | ||
| 1183 | } | ||
| 1184 | } | ||
| 1185 | |||
| 1186 | memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); | ||
| 1187 | *len = x; | ||
| 1188 | |||
| 1189 | out_brelse: | ||
| 1190 | brelse(dibh); | ||
| 1191 | out: | ||
| 1192 | gfs2_glock_dq_uninit(&i_gh); | ||
| 1193 | return error; | ||
| 1194 | } | ||
| 1195 | |||
| 1196 | static int | ||
| 1197 | __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) | ||
| 1198 | { | 1050 | { |
| 1199 | struct buffer_head *dibh; | 1051 | struct buffer_head *dibh; |
| 1200 | int error; | 1052 | int error; |
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c30be2b66580..c341aaf67adb 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h | |||
| @@ -11,8 +11,16 @@ | |||
| 11 | #define __INODE_DOT_H__ | 11 | #define __INODE_DOT_H__ |
| 12 | 12 | ||
| 13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
| 14 | #include <linux/buffer_head.h> | ||
| 15 | #include <linux/mm.h> | ||
| 14 | #include "util.h" | 16 | #include "util.h" |
| 15 | 17 | ||
| 18 | extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); | ||
| 19 | extern int gfs2_internal_read(struct gfs2_inode *ip, | ||
| 20 | struct file_ra_state *ra_state, | ||
| 21 | char *buf, loff_t *pos, unsigned size); | ||
| 22 | extern void gfs2_set_aops(struct inode *inode); | ||
| 23 | |||
| 16 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) | 24 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) |
| 17 | { | 25 | { |
| 18 | return !ip->i_height; | 26 | return !ip->i_height; |
| @@ -73,30 +81,26 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, | |||
| 73 | } | 81 | } |
| 74 | 82 | ||
| 75 | 83 | ||
| 76 | void gfs2_set_iop(struct inode *inode); | 84 | extern void gfs2_set_iop(struct inode *inode); |
| 77 | struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, | 85 | extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, |
| 78 | u64 no_addr, u64 no_formal_ino, | 86 | u64 no_addr, u64 no_formal_ino, |
| 79 | int skip_freeing); | 87 | int skip_freeing); |
| 80 | struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); | 88 | extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); |
| 81 | 89 | ||
| 82 | int gfs2_inode_refresh(struct gfs2_inode *ip); | 90 | extern int gfs2_inode_refresh(struct gfs2_inode *ip); |
| 83 | 91 | ||
| 84 | int gfs2_dinode_dealloc(struct gfs2_inode *inode); | 92 | extern int gfs2_dinode_dealloc(struct gfs2_inode *inode); |
| 85 | int gfs2_change_nlink(struct gfs2_inode *ip, int diff); | 93 | extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff); |
| 86 | struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, | 94 | extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, |
| 87 | int is_root); | 95 | int is_root); |
| 88 | struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, | 96 | extern struct inode *gfs2_createi(struct gfs2_holder *ghs, |
| 89 | unsigned int mode, dev_t dev); | 97 | const struct qstr *name, |
| 90 | int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | 98 | unsigned int mode, dev_t dev); |
| 91 | struct gfs2_inode *ip); | 99 | extern int gfs2_permission(struct inode *inode, int mask); |
| 92 | int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, | 100 | extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); |
| 93 | const struct gfs2_inode *ip); | 101 | extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); |
| 94 | int gfs2_permission(struct inode *inode, int mask); | 102 | extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); |
| 95 | int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); | 103 | extern void gfs2_dinode_print(const struct gfs2_inode *ip); |
| 96 | int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); | ||
| 97 | struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); | ||
| 98 | void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); | ||
| 99 | void gfs2_dinode_print(const struct gfs2_inode *ip); | ||
| 100 | 104 | ||
| 101 | extern const struct inode_operations gfs2_file_iops; | 105 | extern const struct inode_operations gfs2_file_iops; |
| 102 | extern const struct inode_operations gfs2_dir_iops; | 106 | extern const struct inode_operations gfs2_dir_iops; |
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 98918a756410..aa62cf5976e8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c | |||
| @@ -120,7 +120,7 @@ __acquires(&sdp->sd_log_lock) | |||
| 120 | lock_buffer(bh); | 120 | lock_buffer(bh); |
| 121 | if (test_clear_buffer_dirty(bh)) { | 121 | if (test_clear_buffer_dirty(bh)) { |
| 122 | bh->b_end_io = end_buffer_write_sync; | 122 | bh->b_end_io = end_buffer_write_sync; |
| 123 | submit_bh(WRITE, bh); | 123 | submit_bh(WRITE_SYNC_PLUG, bh); |
| 124 | } else { | 124 | } else { |
| 125 | unlock_buffer(bh); | 125 | unlock_buffer(bh); |
| 126 | brelse(bh); | 126 | brelse(bh); |
| @@ -604,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) | |||
| 604 | if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) | 604 | if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) |
| 605 | goto skip_barrier; | 605 | goto skip_barrier; |
| 606 | get_bh(bh); | 606 | get_bh(bh); |
| 607 | submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); | 607 | submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); |
| 608 | wait_on_buffer(bh); | 608 | wait_on_buffer(bh); |
| 609 | if (buffer_eopnotsupp(bh)) { | 609 | if (buffer_eopnotsupp(bh)) { |
| 610 | clear_buffer_eopnotsupp(bh); | 610 | clear_buffer_eopnotsupp(bh); |
| @@ -664,7 +664,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) | |||
| 664 | lock_buffer(bh); | 664 | lock_buffer(bh); |
| 665 | if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { | 665 | if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { |
| 666 | bh->b_end_io = end_buffer_write_sync; | 666 | bh->b_end_io = end_buffer_write_sync; |
| 667 | submit_bh(WRITE, bh); | 667 | submit_bh(WRITE_SYNC_PLUG, bh); |
| 668 | } else { | 668 | } else { |
| 669 | unlock_buffer(bh); | 669 | unlock_buffer(bh); |
| 670 | brelse(bh); | 670 | brelse(bh); |
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 80e4f5f898bb..00315f50fa46 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c | |||
| @@ -13,6 +13,8 @@ | |||
| 13 | #include <linux/completion.h> | 13 | #include <linux/completion.h> |
| 14 | #include <linux/buffer_head.h> | 14 | #include <linux/buffer_head.h> |
| 15 | #include <linux/gfs2_ondisk.h> | 15 | #include <linux/gfs2_ondisk.h> |
| 16 | #include <linux/bio.h> | ||
| 17 | #include <linux/fs.h> | ||
| 16 | 18 | ||
| 17 | #include "gfs2.h" | 19 | #include "gfs2.h" |
| 18 | #include "incore.h" | 20 | #include "incore.h" |
| @@ -189,7 +191,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) | |||
| 189 | } | 191 | } |
| 190 | 192 | ||
| 191 | gfs2_log_unlock(sdp); | 193 | gfs2_log_unlock(sdp); |
| 192 | submit_bh(WRITE, bh); | 194 | submit_bh(WRITE_SYNC_PLUG, bh); |
| 193 | gfs2_log_lock(sdp); | 195 | gfs2_log_lock(sdp); |
| 194 | 196 | ||
| 195 | n = 0; | 197 | n = 0; |
| @@ -199,7 +201,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) | |||
| 199 | gfs2_log_unlock(sdp); | 201 | gfs2_log_unlock(sdp); |
| 200 | lock_buffer(bd2->bd_bh); | 202 | lock_buffer(bd2->bd_bh); |
| 201 | bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); | 203 | bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); |
| 202 | submit_bh(WRITE, bh); | 204 | submit_bh(WRITE_SYNC_PLUG, bh); |
| 203 | gfs2_log_lock(sdp); | 205 | gfs2_log_lock(sdp); |
| 204 | if (++n >= num) | 206 | if (++n >= num) |
| 205 | break; | 207 | break; |
| @@ -341,7 +343,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) | |||
| 341 | sdp->sd_log_num_revoke--; | 343 | sdp->sd_log_num_revoke--; |
| 342 | 344 | ||
| 343 | if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { | 345 | if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { |
| 344 | submit_bh(WRITE, bh); | 346 | submit_bh(WRITE_SYNC_PLUG, bh); |
| 345 | 347 | ||
| 346 | bh = gfs2_log_get_buf(sdp); | 348 | bh = gfs2_log_get_buf(sdp); |
| 347 | mh = (struct gfs2_meta_header *)bh->b_data; | 349 | mh = (struct gfs2_meta_header *)bh->b_data; |
| @@ -358,7 +360,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) | |||
| 358 | } | 360 | } |
| 359 | gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); | 361 | gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); |
| 360 | 362 | ||
| 361 | submit_bh(WRITE, bh); | 363 | submit_bh(WRITE_SYNC_PLUG, bh); |
| 362 | } | 364 | } |
| 363 | 365 | ||
| 364 | static void revoke_lo_before_scan(struct gfs2_jdesc *jd, | 366 | static void revoke_lo_before_scan(struct gfs2_jdesc *jd, |
| @@ -560,7 +562,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, | |||
| 560 | ptr = bh_log_ptr(bh); | 562 | ptr = bh_log_ptr(bh); |
| 561 | 563 | ||
| 562 | get_bh(bh); | 564 | get_bh(bh); |
| 563 | submit_bh(WRITE, bh); | 565 | submit_bh(WRITE_SYNC_PLUG, bh); |
| 564 | gfs2_log_lock(sdp); | 566 | gfs2_log_lock(sdp); |
| 565 | while(!list_empty(list)) { | 567 | while(!list_empty(list)) { |
| 566 | bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); | 568 | bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); |
| @@ -586,7 +588,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, | |||
| 586 | } else { | 588 | } else { |
| 587 | bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); | 589 | bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); |
| 588 | } | 590 | } |
| 589 | submit_bh(WRITE, bh1); | 591 | submit_bh(WRITE_SYNC_PLUG, bh1); |
| 590 | gfs2_log_lock(sdp); | 592 | gfs2_log_lock(sdp); |
| 591 | ptr += 2; | 593 | ptr += 2; |
| 592 | } | 594 | } |
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a6892ed0840a..eacd78a5d082 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
| 16 | #include <linux/gfs2_ondisk.h> | 16 | #include <linux/gfs2_ondisk.h> |
| 17 | #include <asm/atomic.h> | 17 | #include <asm/atomic.h> |
| 18 | #include <linux/slow-work.h> | ||
| 18 | 19 | ||
| 19 | #include "gfs2.h" | 20 | #include "gfs2.h" |
| 20 | #include "incore.h" | 21 | #include "incore.h" |
| @@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void) | |||
| 113 | if (error) | 114 | if (error) |
| 114 | goto fail_unregister; | 115 | goto fail_unregister; |
| 115 | 116 | ||
| 117 | error = slow_work_register_user(); | ||
| 118 | if (error) | ||
| 119 | goto fail_slow; | ||
| 120 | |||
| 116 | gfs2_register_debugfs(); | 121 | gfs2_register_debugfs(); |
| 117 | 122 | ||
| 118 | printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); | 123 | printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); |
| 119 | 124 | ||
| 120 | return 0; | 125 | return 0; |
| 121 | 126 | ||
| 127 | fail_slow: | ||
| 128 | unregister_filesystem(&gfs2meta_fs_type); | ||
| 122 | fail_unregister: | 129 | fail_unregister: |
| 123 | unregister_filesystem(&gfs2_fs_type); | 130 | unregister_filesystem(&gfs2_fs_type); |
| 124 | fail: | 131 | fail: |
| @@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void) | |||
| 156 | gfs2_unregister_debugfs(); | 163 | gfs2_unregister_debugfs(); |
| 157 | unregister_filesystem(&gfs2_fs_type); | 164 | unregister_filesystem(&gfs2_fs_type); |
| 158 | unregister_filesystem(&gfs2meta_fs_type); | 165 | unregister_filesystem(&gfs2meta_fs_type); |
| 166 | slow_work_unregister_user(); | ||
| 159 | 167 | ||
| 160 | kmem_cache_destroy(gfs2_quotad_cachep); | 168 | kmem_cache_destroy(gfs2_quotad_cachep); |
| 161 | kmem_cache_destroy(gfs2_rgrpd_cachep); | 169 | kmem_cache_destroy(gfs2_rgrpd_cachep); |
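main.c now registers the slow-work facility at module load and unregisters it on exit, while ops_fstype.c below initialises each journal descriptor's jd_work with gfs2_recover_ops, replacing the per-mount recoverd thread. A sketch of how recovery could be queued under this model, assuming the slow-work enqueue API of this kernel series; the wrapper and its return values are illustrative only:

	static int example_queue_recovery(struct gfs2_jdesc *jd)
	{
		if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
			return -EBUSY;			/* already queued; errno chosen for illustration */
		return slow_work_enqueue(&jd->jd_work);	/* executes gfs2_recover_ops */
	}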
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 8d6f13256b26..cb8d7a93d5ec 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c | |||
| @@ -31,19 +31,66 @@ | |||
| 31 | #include "rgrp.h" | 31 | #include "rgrp.h" |
| 32 | #include "trans.h" | 32 | #include "trans.h" |
| 33 | #include "util.h" | 33 | #include "util.h" |
| 34 | #include "ops_address.h" | ||
| 35 | 34 | ||
| 36 | static int aspace_get_block(struct inode *inode, sector_t lblock, | 35 | static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) |
| 37 | struct buffer_head *bh_result, int create) | ||
| 38 | { | 36 | { |
| 39 | gfs2_assert_warn(inode->i_sb->s_fs_info, 0); | 37 | int err; |
| 40 | return -EOPNOTSUPP; | 38 | struct buffer_head *bh, *head; |
| 41 | } | 39 | int nr_underway = 0; |
| 40 | int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? | ||
| 41 | WRITE_SYNC_PLUG : WRITE)); | ||
| 42 | |||
| 43 | BUG_ON(!PageLocked(page)); | ||
| 44 | BUG_ON(!page_has_buffers(page)); | ||
| 45 | |||
| 46 | head = page_buffers(page); | ||
| 47 | bh = head; | ||
| 48 | |||
| 49 | do { | ||
| 50 | if (!buffer_mapped(bh)) | ||
| 51 | continue; | ||
| 52 | /* | ||
| 53 | * If it's a fully non-blocking write attempt and we cannot | ||
| 54 | * lock the buffer then redirty the page. Note that this can | ||
| 55 | * potentially cause a busy-wait loop from pdflush and kswapd | ||
| 56 | * activity, but those code paths have their own higher-level | ||
| 57 | * throttling. | ||
| 58 | */ | ||
| 59 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | ||
| 60 | lock_buffer(bh); | ||
| 61 | } else if (!trylock_buffer(bh)) { | ||
| 62 | redirty_page_for_writepage(wbc, page); | ||
| 63 | continue; | ||
| 64 | } | ||
| 65 | if (test_clear_buffer_dirty(bh)) { | ||
| 66 | mark_buffer_async_write(bh); | ||
| 67 | } else { | ||
| 68 | unlock_buffer(bh); | ||
| 69 | } | ||
| 70 | } while ((bh = bh->b_this_page) != head); | ||
| 71 | |||
| 72 | /* | ||
| 73 | * The page and its buffers are protected by PageWriteback(), so we can | ||
| 74 | * drop the bh refcounts early. | ||
| 75 | */ | ||
| 76 | BUG_ON(PageWriteback(page)); | ||
| 77 | set_page_writeback(page); | ||
| 78 | |||
| 79 | do { | ||
| 80 | struct buffer_head *next = bh->b_this_page; | ||
| 81 | if (buffer_async_write(bh)) { | ||
| 82 | submit_bh(write_op, bh); | ||
| 83 | nr_underway++; | ||
| 84 | } | ||
| 85 | bh = next; | ||
| 86 | } while (bh != head); | ||
| 87 | unlock_page(page); | ||
| 42 | 88 | ||
| 43 | static int gfs2_aspace_writepage(struct page *page, | 89 | err = 0; |
| 44 | struct writeback_control *wbc) | 90 | if (nr_underway == 0) |
| 45 | { | 91 | end_page_writeback(page); |
| 46 | return block_write_full_page(page, aspace_get_block, wbc); | 92 | |
| 93 | return err; | ||
| 47 | } | 94 | } |
| 48 | 95 | ||
| 49 | static const struct address_space_operations aspace_aops = { | 96 | static const struct address_space_operations aspace_aops = { |
| @@ -201,16 +248,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) | |||
| 201 | int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, | 248 | int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, |
| 202 | struct buffer_head **bhp) | 249 | struct buffer_head **bhp) |
| 203 | { | 250 | { |
| 204 | *bhp = gfs2_getbuf(gl, blkno, CREATE); | 251 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 205 | if (!buffer_uptodate(*bhp)) { | 252 | struct buffer_head *bh; |
| 206 | ll_rw_block(READ_META, 1, bhp); | 253 | |
| 207 | if (flags & DIO_WAIT) { | 254 | if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 208 | int error = gfs2_meta_wait(gl->gl_sbd, *bhp); | 255 | return -EIO; |
| 209 | if (error) { | 256 | |
| 210 | brelse(*bhp); | 257 | *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); |
| 211 | return error; | 258 | |
| 212 | } | 259 | lock_buffer(bh); |
| 213 | } | 260 | if (buffer_uptodate(bh)) { |
| 261 | unlock_buffer(bh); | ||
| 262 | return 0; | ||
| 263 | } | ||
| 264 | bh->b_end_io = end_buffer_read_sync; | ||
| 265 | get_bh(bh); | ||
| 266 | submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); | ||
| 267 | if (!(flags & DIO_WAIT)) | ||
| 268 | return 0; | ||
| 269 | |||
| 270 | wait_on_buffer(bh); | ||
| 271 | if (unlikely(!buffer_uptodate(bh))) { | ||
| 272 | struct gfs2_trans *tr = current->journal_info; | ||
| 273 | if (tr && tr->tr_touched) | ||
| 274 | gfs2_io_error_bh(sdp, bh); | ||
| 275 | brelse(bh); | ||
| 276 | return -EIO; | ||
| 214 | } | 277 | } |
| 215 | 278 | ||
| 216 | return 0; | 279 | return 0; |
| @@ -404,7 +467,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) | |||
| 404 | if (buffer_uptodate(first_bh)) | 467 | if (buffer_uptodate(first_bh)) |
| 405 | goto out; | 468 | goto out; |
| 406 | if (!buffer_locked(first_bh)) | 469 | if (!buffer_locked(first_bh)) |
| 407 | ll_rw_block(READ_META, 1, &first_bh); | 470 | ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); |
| 408 | 471 | ||
| 409 | dblock++; | 472 | dblock++; |
| 410 | extlen--; | 473 | extlen--; |
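gfs2_meta_read() now submits the read itself with READ_SYNC plus the metadata hint instead of going through ll_rw_block(), and it fails fast with -EIO once SDF_SHUTDOWN is set. Callers keep the same interface; errors just arrive more promptly. A usage sketch, with the surrounding function being hypothetical:

	static int example_read_meta(struct gfs2_glock *gl, u64 blkno)
	{
		struct buffer_head *bh;
		int error;

		error = gfs2_meta_read(gl, blkno, DIO_WAIT, &bh);	/* waits for I/O */
		if (error)		/* -EIO after withdraw, or on a failed read */
			return error;
		/* ... examine bh->b_data ... */
		brelse(bh);
		return 0;
	}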
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c deleted file mode 100644 index f7e8527a21e0..000000000000 --- a/fs/gfs2/mount.c +++ /dev/null | |||
| @@ -1,185 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
| 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
| 4 | * | ||
| 5 | * This copyrighted material is made available to anyone wishing to use, | ||
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
| 7 | * of the GNU General Public License version 2. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/spinlock.h> | ||
| 12 | #include <linux/completion.h> | ||
| 13 | #include <linux/buffer_head.h> | ||
| 14 | #include <linux/gfs2_ondisk.h> | ||
| 15 | #include <linux/parser.h> | ||
| 16 | |||
| 17 | #include "gfs2.h" | ||
| 18 | #include "incore.h" | ||
| 19 | #include "super.h" | ||
| 20 | #include "sys.h" | ||
| 21 | #include "util.h" | ||
| 22 | |||
| 23 | enum { | ||
| 24 | Opt_lockproto, | ||
| 25 | Opt_locktable, | ||
| 26 | Opt_hostdata, | ||
| 27 | Opt_spectator, | ||
| 28 | Opt_ignore_local_fs, | ||
| 29 | Opt_localflocks, | ||
| 30 | Opt_localcaching, | ||
| 31 | Opt_debug, | ||
| 32 | Opt_nodebug, | ||
| 33 | Opt_upgrade, | ||
| 34 | Opt_acl, | ||
| 35 | Opt_noacl, | ||
| 36 | Opt_quota_off, | ||
| 37 | Opt_quota_account, | ||
| 38 | Opt_quota_on, | ||
| 39 | Opt_quota, | ||
| 40 | Opt_noquota, | ||
| 41 | Opt_suiddir, | ||
| 42 | Opt_nosuiddir, | ||
| 43 | Opt_data_writeback, | ||
| 44 | Opt_data_ordered, | ||
| 45 | Opt_meta, | ||
| 46 | Opt_discard, | ||
| 47 | Opt_nodiscard, | ||
| 48 | Opt_err, | ||
| 49 | }; | ||
| 50 | |||
| 51 | static const match_table_t tokens = { | ||
| 52 | {Opt_lockproto, "lockproto=%s"}, | ||
| 53 | {Opt_locktable, "locktable=%s"}, | ||
| 54 | {Opt_hostdata, "hostdata=%s"}, | ||
| 55 | {Opt_spectator, "spectator"}, | ||
| 56 | {Opt_ignore_local_fs, "ignore_local_fs"}, | ||
| 57 | {Opt_localflocks, "localflocks"}, | ||
| 58 | {Opt_localcaching, "localcaching"}, | ||
| 59 | {Opt_debug, "debug"}, | ||
| 60 | {Opt_nodebug, "nodebug"}, | ||
| 61 | {Opt_upgrade, "upgrade"}, | ||
| 62 | {Opt_acl, "acl"}, | ||
| 63 | {Opt_noacl, "noacl"}, | ||
| 64 | {Opt_quota_off, "quota=off"}, | ||
| 65 | {Opt_quota_account, "quota=account"}, | ||
| 66 | {Opt_quota_on, "quota=on"}, | ||
| 67 | {Opt_quota, "quota"}, | ||
| 68 | {Opt_noquota, "noquota"}, | ||
| 69 | {Opt_suiddir, "suiddir"}, | ||
| 70 | {Opt_nosuiddir, "nosuiddir"}, | ||
| 71 | {Opt_data_writeback, "data=writeback"}, | ||
| 72 | {Opt_data_ordered, "data=ordered"}, | ||
| 73 | {Opt_meta, "meta"}, | ||
| 74 | {Opt_discard, "discard"}, | ||
| 75 | {Opt_nodiscard, "nodiscard"}, | ||
| 76 | {Opt_err, NULL} | ||
| 77 | }; | ||
| 78 | |||
| 79 | /** | ||
| 80 | * gfs2_mount_args - Parse mount options | ||
| 81 | * @sdp: | ||
| 82 | * @data: | ||
| 83 | * | ||
| 84 | * Return: errno | ||
| 85 | */ | ||
| 86 | |||
| 87 | int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) | ||
| 88 | { | ||
| 89 | char *o; | ||
| 90 | int token; | ||
| 91 | substring_t tmp[MAX_OPT_ARGS]; | ||
| 92 | |||
| 93 | /* Split the options into tokens with the "," character and | ||
| 94 | process them */ | ||
| 95 | |||
| 96 | while (1) { | ||
| 97 | o = strsep(&options, ","); | ||
| 98 | if (o == NULL) | ||
| 99 | break; | ||
| 100 | if (*o == '\0') | ||
| 101 | continue; | ||
| 102 | |||
| 103 | token = match_token(o, tokens, tmp); | ||
| 104 | switch (token) { | ||
| 105 | case Opt_lockproto: | ||
| 106 | match_strlcpy(args->ar_lockproto, &tmp[0], | ||
| 107 | GFS2_LOCKNAME_LEN); | ||
| 108 | break; | ||
| 109 | case Opt_locktable: | ||
| 110 | match_strlcpy(args->ar_locktable, &tmp[0], | ||
| 111 | GFS2_LOCKNAME_LEN); | ||
| 112 | break; | ||
| 113 | case Opt_hostdata: | ||
| 114 | match_strlcpy(args->ar_hostdata, &tmp[0], | ||
| 115 | GFS2_LOCKNAME_LEN); | ||
| 116 | break; | ||
| 117 | case Opt_spectator: | ||
| 118 | args->ar_spectator = 1; | ||
| 119 | break; | ||
| 120 | case Opt_ignore_local_fs: | ||
| 121 | args->ar_ignore_local_fs = 1; | ||
| 122 | break; | ||
| 123 | case Opt_localflocks: | ||
| 124 | args->ar_localflocks = 1; | ||
| 125 | break; | ||
| 126 | case Opt_localcaching: | ||
| 127 | args->ar_localcaching = 1; | ||
| 128 | break; | ||
| 129 | case Opt_debug: | ||
| 130 | args->ar_debug = 1; | ||
| 131 | break; | ||
| 132 | case Opt_nodebug: | ||
| 133 | args->ar_debug = 0; | ||
| 134 | break; | ||
| 135 | case Opt_upgrade: | ||
| 136 | args->ar_upgrade = 1; | ||
| 137 | break; | ||
| 138 | case Opt_acl: | ||
| 139 | args->ar_posix_acl = 1; | ||
| 140 | break; | ||
| 141 | case Opt_noacl: | ||
| 142 | args->ar_posix_acl = 0; | ||
| 143 | break; | ||
| 144 | case Opt_quota_off: | ||
| 145 | case Opt_noquota: | ||
| 146 | args->ar_quota = GFS2_QUOTA_OFF; | ||
| 147 | break; | ||
| 148 | case Opt_quota_account: | ||
| 149 | args->ar_quota = GFS2_QUOTA_ACCOUNT; | ||
| 150 | break; | ||
| 151 | case Opt_quota_on: | ||
| 152 | case Opt_quota: | ||
| 153 | args->ar_quota = GFS2_QUOTA_ON; | ||
| 154 | break; | ||
| 155 | case Opt_suiddir: | ||
| 156 | args->ar_suiddir = 1; | ||
| 157 | break; | ||
| 158 | case Opt_nosuiddir: | ||
| 159 | args->ar_suiddir = 0; | ||
| 160 | break; | ||
| 161 | case Opt_data_writeback: | ||
| 162 | args->ar_data = GFS2_DATA_WRITEBACK; | ||
| 163 | break; | ||
| 164 | case Opt_data_ordered: | ||
| 165 | args->ar_data = GFS2_DATA_ORDERED; | ||
| 166 | break; | ||
| 167 | case Opt_meta: | ||
| 168 | args->ar_meta = 1; | ||
| 169 | break; | ||
| 170 | case Opt_discard: | ||
| 171 | args->ar_discard = 1; | ||
| 172 | break; | ||
| 173 | case Opt_nodiscard: | ||
| 174 | args->ar_discard = 0; | ||
| 175 | break; | ||
| 176 | case Opt_err: | ||
| 177 | default: | ||
| 178 | fs_info(sdp, "invalid mount option: %s\n", o); | ||
| 179 | return -EINVAL; | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 183 | return 0; | ||
| 184 | } | ||
| 185 | |||
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h deleted file mode 100644 index 5da21285bba4..000000000000 --- a/fs/gfs2/ops_address.h +++ /dev/null | |||
| @@ -1,23 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
| 3 | * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | ||
| 4 | * | ||
| 5 | * This copyrighted material is made available to anyone wishing to use, | ||
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
| 7 | * of the GNU General Public License version 2. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #ifndef __OPS_ADDRESS_DOT_H__ | ||
| 11 | #define __OPS_ADDRESS_DOT_H__ | ||
| 12 | |||
| 13 | #include <linux/fs.h> | ||
| 14 | #include <linux/buffer_head.h> | ||
| 15 | #include <linux/mm.h> | ||
| 16 | |||
| 17 | extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); | ||
| 18 | extern int gfs2_internal_read(struct gfs2_inode *ip, | ||
| 19 | struct file_ra_state *ra_state, | ||
| 20 | char *buf, loff_t *pos, unsigned size); | ||
| 21 | extern void gfs2_set_aops(struct inode *inode); | ||
| 22 | |||
| 23 | #endif /* __OPS_ADDRESS_DOT_H__ */ | ||
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473ea753..cc34f271b3e7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/namei.h> | 17 | #include <linux/namei.h> |
| 18 | #include <linux/mount.h> | 18 | #include <linux/mount.h> |
| 19 | #include <linux/gfs2_ondisk.h> | 19 | #include <linux/gfs2_ondisk.h> |
| 20 | #include <linux/slow-work.h> | ||
| 20 | 21 | ||
| 21 | #include "gfs2.h" | 22 | #include "gfs2.h" |
| 22 | #include "incore.h" | 23 | #include "incore.h" |
| @@ -55,8 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) | |||
| 55 | spin_lock_init(&gt->gt_spin); | 56 | spin_lock_init(&gt->gt_spin); |
| 56 | 57 | ||
| 57 | gt->gt_incore_log_blocks = 1024; | 58 | gt->gt_incore_log_blocks = 1024; |
| 58 | gt->gt_log_flush_secs = 60; | ||
| 59 | gt->gt_recoverd_secs = 60; | ||
| 60 | gt->gt_logd_secs = 1; | 59 | gt->gt_logd_secs = 1; |
| 61 | gt->gt_quota_simul_sync = 64; | 60 | gt->gt_quota_simul_sync = 64; |
| 62 | gt->gt_quota_warn_period = 10; | 61 | gt->gt_quota_warn_period = 10; |
| @@ -526,11 +525,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) | |||
| 526 | } | 525 | } |
| 527 | 526 | ||
| 528 | /* Set up the buffer cache and SB for real */ | 527 | /* Set up the buffer cache and SB for real */ |
| 529 | if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { | 528 | if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { |
| 530 | ret = -EINVAL; | 529 | ret = -EINVAL; |
| 531 | fs_err(sdp, "FS block size (%u) is too small for device " | 530 | fs_err(sdp, "FS block size (%u) is too small for device " |
| 532 | "block size (%u)\n", | 531 | "block size (%u)\n", |
| 533 | sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); | 532 | sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); |
| 534 | goto out; | 533 | goto out; |
| 535 | } | 534 | } |
| 536 | if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { | 535 | if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { |
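bdev_logical_block_size() is the block layer's newer name for bdev_hardsect_size(): it reports the device's logical sector size, which the filesystem block size must not fall below (and, per the check that follows, must not exceed PAGE_SIZE). For builds against kernels that predate the rename, a compatibility shim along these lines is sometimes carried; it is an assumption for illustration, not part of this patch.

	#include <linux/version.h>
	#include <linux/blkdev.h>

	/* Hedged sketch: fall back to the old accessor on pre-rename kernels. */
	#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 31)
	#define bdev_logical_block_size(bdev) bdev_hardsect_size(bdev)
	#endif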
| @@ -676,6 +675,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) | |||
| 676 | break; | 675 | break; |
| 677 | 676 | ||
| 678 | INIT_LIST_HEAD(&jd->extent_list); | 677 | INIT_LIST_HEAD(&jd->extent_list); |
| 678 | slow_work_init(&jd->jd_work, &gfs2_recover_ops); | ||
| 679 | jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); | 679 | jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); |
| 680 | if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { | 680 | if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { |
| 681 | if (!jd->jd_inode) | 681 | if (!jd->jd_inode) |
| @@ -701,14 +701,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) | |||
| 701 | { | 701 | { |
| 702 | struct inode *master = sdp->sd_master_dir->d_inode; | 702 | struct inode *master = sdp->sd_master_dir->d_inode; |
| 703 | struct gfs2_holder ji_gh; | 703 | struct gfs2_holder ji_gh; |
| 704 | struct task_struct *p; | ||
| 705 | struct gfs2_inode *ip; | 704 | struct gfs2_inode *ip; |
| 706 | int jindex = 1; | 705 | int jindex = 1; |
| 707 | int error = 0; | 706 | int error = 0; |
| 708 | 707 | ||
| 709 | if (undo) { | 708 | if (undo) { |
| 710 | jindex = 0; | 709 | jindex = 0; |
| 711 | goto fail_recoverd; | 710 | goto fail_jinode_gh; |
| 712 | } | 711 | } |
| 713 | 712 | ||
| 714 | sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); | 713 | sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); |
| @@ -801,18 +800,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) | |||
| 801 | gfs2_glock_dq_uninit(&ji_gh); | 800 | gfs2_glock_dq_uninit(&ji_gh); |
| 802 | jindex = 0; | 801 | jindex = 0; |
| 803 | 802 | ||
| 804 | p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd"); | ||
| 805 | error = IS_ERR(p); | ||
| 806 | if (error) { | ||
| 807 | fs_err(sdp, "can't start recoverd thread: %d\n", error); | ||
| 808 | goto fail_jinode_gh; | ||
| 809 | } | ||
| 810 | sdp->sd_recoverd_process = p; | ||
| 811 | |||
| 812 | return 0; | 803 | return 0; |
| 813 | 804 | ||
| 814 | fail_recoverd: | ||
| 815 | kthread_stop(sdp->sd_recoverd_process); | ||
| 816 | fail_jinode_gh: | 805 | fail_jinode_gh: |
| 817 | if (!sdp->sd_args.ar_spectator) | 806 | if (!sdp->sd_args.ar_spectator) |
| 818 | gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); | 807 | gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); |
| @@ -1165,6 +1154,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) | |||
| 1165 | 1154 | ||
| 1166 | sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; | 1155 | sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; |
| 1167 | sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; | 1156 | sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; |
| 1157 | sdp->sd_args.ar_commit = 60; | ||
| 1168 | 1158 | ||
| 1169 | error = gfs2_mount_args(sdp, &sdp->sd_args, data); | 1159 | error = gfs2_mount_args(sdp, &sdp->sd_args, data); |
| 1170 | if (error) { | 1160 | if (error) { |
| @@ -1172,8 +1162,10 @@ static int fill_super(struct super_block *sb, void *data, int silent) | |||
| 1172 | goto fail; | 1162 | goto fail; |
| 1173 | } | 1163 | } |
| 1174 | 1164 | ||
| 1175 | if (sdp->sd_args.ar_spectator) | 1165 | if (sdp->sd_args.ar_spectator) { |
| 1176 | sb->s_flags |= MS_RDONLY; | 1166 | sb->s_flags |= MS_RDONLY; |
| 1167 | set_bit(SDF_NORECOVERY, &sdp->sd_flags); | ||
| 1168 | } | ||
| 1177 | if (sdp->sd_args.ar_posix_acl) | 1169 | if (sdp->sd_args.ar_posix_acl) |
| 1178 | sb->s_flags |= MS_POSIXACL; | 1170 | sb->s_flags |= MS_POSIXACL; |
| 1179 | 1171 | ||
| @@ -1191,6 +1183,8 @@ static int fill_super(struct super_block *sb, void *data, int silent) | |||
| 1191 | GFS2_BASIC_BLOCK_SHIFT; | 1183 | GFS2_BASIC_BLOCK_SHIFT; |
| 1192 | sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; | 1184 | sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; |
| 1193 | 1185 | ||
| 1186 | sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; | ||
| 1187 | |||
| 1194 | error = init_names(sdp, silent); | 1188 | error = init_names(sdp, silent); |
| 1195 | if (error) | 1189 | if (error) |
| 1196 | goto fail; | 1190 | goto fail; |
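fill_super() now seeds ar_commit with a 60-second default and, a few lines further down, copies it into gt_log_flush_secs, so the journal flush interval becomes a per-mount setting rather than a fixed tunable. The option that feeds ar_commit is handled in gfs2_mount_args(); a hedged sketch of what that case presumably looks like (the Opt_commit token, the match_int() usage and the error text are assumptions, not shown in this hunk):

		case Opt_commit:
			rv = match_int(&tmp[0], &args->ar_commit);
			if (rv || args->ar_commit <= 0) {
				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
				return rv ? rv : -EINVAL;
			}
			break;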
| @@ -1279,9 +1273,22 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, | |||
| 1279 | return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); | 1273 | return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); |
| 1280 | } | 1274 | } |
| 1281 | 1275 | ||
| 1282 | static struct super_block *get_gfs2_sb(const char *dev_name) | 1276 | static int test_meta_super(struct super_block *s, void *ptr) |
| 1277 | { | ||
| 1278 | struct block_device *bdev = ptr; | ||
| 1279 | return (bdev == s->s_bdev); | ||
| 1280 | } | ||
| 1281 | |||
| 1282 | static int set_meta_super(struct super_block *s, void *ptr) | ||
| 1283 | { | 1283 | { |
| 1284 | struct super_block *sb; | 1284 | return -EINVAL; |
| 1285 | } | ||
| 1286 | |||
| 1287 | static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, | ||
| 1288 | const char *dev_name, void *data, struct vfsmount *mnt) | ||
| 1289 | { | ||
| 1290 | struct super_block *s; | ||
| 1291 | struct gfs2_sbd *sdp; | ||
| 1285 | struct path path; | 1292 | struct path path; |
| 1286 | int error; | 1293 | int error; |
| 1287 | 1294 | ||
| @@ -1289,30 +1296,17 @@ static struct super_block *get_gfs2_sb(const char *dev_name) | |||
| 1289 | if (error) { | 1296 | if (error) { |
| 1290 | printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", | 1297 | printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", |
| 1291 | dev_name, error); | 1298 | dev_name, error); |
| 1292 | return NULL; | 1299 | return error; |
| 1293 | } | 1300 | } |
| 1294 | sb = path.dentry->d_inode->i_sb; | 1301 | s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, |
| 1295 | if (sb && (sb->s_type == &gfs2_fs_type)) | 1302 | path.dentry->d_inode->i_sb->s_bdev); |
| 1296 | atomic_inc(&sb->s_active); | ||
| 1297 | else | ||
| 1298 | sb = NULL; | ||
| 1299 | path_put(&path); | 1303 | path_put(&path); |
| 1300 | return sb; | 1304 | if (IS_ERR(s)) { |
| 1301 | } | ||
| 1302 | |||
| 1303 | static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, | ||
| 1304 | const char *dev_name, void *data, struct vfsmount *mnt) | ||
| 1305 | { | ||
| 1306 | struct super_block *sb = NULL; | ||
| 1307 | struct gfs2_sbd *sdp; | ||
| 1308 | |||
| 1309 | sb = get_gfs2_sb(dev_name); | ||
| 1310 | if (!sb) { | ||
| 1311 | printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); | 1305 | printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); |
| 1312 | return -ENOENT; | 1306 | return PTR_ERR(s); |
| 1313 | } | 1307 | } |
| 1314 | sdp = sb->s_fs_info; | 1308 | sdp = s->s_fs_info; |
| 1315 | mnt->mnt_sb = sb; | 1309 | mnt->mnt_sb = s; |
| 1316 | mnt->mnt_root = dget(sdp->sd_master_dir); | 1310 | mnt->mnt_root = dget(sdp->sd_master_dir); |
| 1317 | return 0; | 1311 | return 0; |
| 1318 | } | 1312 | } |
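With this change the meta mount no longer grabs a reference on a superblock it found by hand; it asks sget() for the gfs2 superblock that already owns the block device resolved from dev_name. Because set_meta_super() unconditionally returns -EINVAL, sget() can only hand back an existing, already-mounted superblock and can never create one. A condensed sketch of the idiom, using the callbacks defined above:

	/* Hedged sketch of the "find existing, never create" sget() pattern. */
	static struct super_block *find_gfs2_super(struct block_device *bdev)
	{
		/* test_meta_super() matches on s_bdev; set_meta_super() refuses to
		 * set up a new superblock, so a miss surfaces as ERR_PTR(-EINVAL). */
		return sget(&gfs2_fs_type, test_meta_super, set_meta_super, bdev);
	}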
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1c70fa5168d6..f8bd20baf99c 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c | |||
| @@ -262,6 +262,44 @@ out_parent: | |||
| 262 | return error; | 262 | return error; |
| 263 | } | 263 | } |
| 264 | 264 | ||
| 265 | /* | ||
| 266 | * gfs2_unlink_ok - check to see that an inode is still in a directory | ||
| 267 | * @dip: the directory | ||
| 268 | * @name: the name of the file | ||
| 269 | * @ip: the inode | ||
| 270 | * | ||
| 271 | * Assumes that the lock on (at least) @dip is held. | ||
| 272 | * | ||
| 273 | * Returns: 0 if the parent/child relationship is correct, errno if it isn't | ||
| 274 | */ | ||
| 275 | |||
| 276 | static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, | ||
| 277 | const struct gfs2_inode *ip) | ||
| 278 | { | ||
| 279 | int error; | ||
| 280 | |||
| 281 | if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) | ||
| 282 | return -EPERM; | ||
| 283 | |||
| 284 | if ((dip->i_inode.i_mode & S_ISVTX) && | ||
| 285 | dip->i_inode.i_uid != current_fsuid() && | ||
| 286 | ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) | ||
| 287 | return -EPERM; | ||
| 288 | |||
| 289 | if (IS_APPEND(&dip->i_inode)) | ||
| 290 | return -EPERM; | ||
| 291 | |||
| 292 | error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); | ||
| 293 | if (error) | ||
| 294 | return error; | ||
| 295 | |||
| 296 | error = gfs2_dir_check(&dip->i_inode, name, ip); | ||
| 297 | if (error) | ||
| 298 | return error; | ||
| 299 | |||
| 300 | return 0; | ||
| 301 | } | ||
| 302 | |||
| 265 | /** | 303 | /** |
| 266 | * gfs2_unlink - Unlink a file | 304 | * gfs2_unlink - Unlink a file |
| 267 | * @dir: The inode of the directory containing the file to unlink | 305 | * @dir: The inode of the directory containing the file to unlink |
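gfs2_unlink_ok() above gathers the immutable/append, sticky-bit and directory write-permission checks, plus the gfs2_dir_check() that confirms the name still maps to this inode, so they all run while the relevant glocks are held. A hedged sketch of how a caller such as gfs2_unlink() is expected to use it (locking and transaction setup elided; label names are assumptions):

	/* Hedged sketch: revalidate under the held glocks before deleting. */
	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
	if (error)
		goto out_gunlock;

	error = gfs2_dir_del(dip, &dentry->d_name);
	if (error)
		goto out_end_trans;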
| @@ -473,6 +511,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 473 | } | 511 | } |
| 474 | 512 | ||
| 475 | /** | 513 | /** |
| 514 | * gfs2_rmdiri - Remove a directory | ||
| 515 | * @dip: The parent directory of the directory to be removed | ||
| 516 | * @name: The name of the directory to be removed | ||
| 517 | * @ip: The GFS2 inode of the directory to be removed | ||
| 518 | * | ||
| 519 | * Assumes Glocks on dip and ip are held | ||
| 520 | * | ||
| 521 | * Returns: errno | ||
| 522 | */ | ||
| 523 | |||
| 524 | static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | ||
| 525 | struct gfs2_inode *ip) | ||
| 526 | { | ||
| 527 | struct qstr dotname; | ||
| 528 | int error; | ||
| 529 | |||
| 530 | if (ip->i_entries != 2) { | ||
| 531 | if (gfs2_consist_inode(ip)) | ||
| 532 | gfs2_dinode_print(ip); | ||
| 533 | return -EIO; | ||
| 534 | } | ||
| 535 | |||
| 536 | error = gfs2_dir_del(dip, name); | ||
| 537 | if (error) | ||
| 538 | return error; | ||
| 539 | |||
| 540 | error = gfs2_change_nlink(dip, -1); | ||
| 541 | if (error) | ||
| 542 | return error; | ||
| 543 | |||
| 544 | gfs2_str2qstr(&dotname, "."); | ||
| 545 | error = gfs2_dir_del(ip, &dotname); | ||
| 546 | if (error) | ||
| 547 | return error; | ||
| 548 | |||
| 549 | gfs2_str2qstr(&dotname, ".."); | ||
| 550 | error = gfs2_dir_del(ip, &dotname); | ||
| 551 | if (error) | ||
| 552 | return error; | ||
| 553 | |||
| 554 | /* It looks odd, but it really should be done twice */ | ||
| 555 | error = gfs2_change_nlink(ip, -1); | ||
| 556 | if (error) | ||
| 557 | return error; | ||
| 558 | |||
| 559 | error = gfs2_change_nlink(ip, -1); | ||
| 560 | if (error) | ||
| 561 | return error; | ||
| 562 | |||
| 563 | return error; | ||
| 564 | } | ||
| 565 | |||
| 566 | /** | ||
| 476 | * gfs2_rmdir - Remove a directory | 567 | * gfs2_rmdir - Remove a directory |
| 477 | * @dir: The parent directory of the directory to be removed | 568 | * @dir: The parent directory of the directory to be removed |
| 478 | * @dentry: The dentry of the directory to remove | 569 | * @dentry: The dentry of the directory to remove |
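The i_entries != 2 check above encodes the rule that a removable directory holds exactly its "." and ".." entries, and the seemingly odd pair of gfs2_change_nlink(ip, -1) calls drops one link for the entry just removed from the parent and one for the "." entry the victim carried. A hedged sketch of the expected call site in gfs2_rmdir(), with the unlink-style permission checks and transaction setup elided:

	/* Hedged sketch: dip and ip glocks held, transaction already started. */
	error = gfs2_rmdiri(dip, &dentry->d_name, ip);
	if (error)
		goto out_end_trans;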
| @@ -885,6 +976,61 @@ out: | |||
| 885 | } | 976 | } |
| 886 | 977 | ||
| 887 | /** | 978 | /** |
| 979 | * gfs2_readlinki - return the contents of a symlink | ||
| 980 | * @ip: the symlink's inode | ||
| 981 | * @buf: a pointer to the buffer to be filled | ||
| 982 | * @len: a pointer to the length of @buf | ||
| 983 | * | ||
| 984 | * If @buf is too small, a piece of memory is kmalloc()ed and needs | ||
| 985 | * to be freed by the caller. | ||
| 986 | * | ||
| 987 | * Returns: errno | ||
| 988 | */ | ||
| 989 | |||
| 990 | static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) | ||
| 991 | { | ||
| 992 | struct gfs2_holder i_gh; | ||
| 993 | struct buffer_head *dibh; | ||
| 994 | unsigned int x; | ||
| 995 | int error; | ||
| 996 | |||
| 997 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); | ||
| 998 | error = gfs2_glock_nq(&i_gh); | ||
| 999 | if (error) { | ||
| 1000 | gfs2_holder_uninit(&i_gh); | ||
| 1001 | return error; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | if (!ip->i_disksize) { | ||
| 1005 | gfs2_consist_inode(ip); | ||
| 1006 | error = -EIO; | ||
| 1007 | goto out; | ||
| 1008 | } | ||
| 1009 | |||
| 1010 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
| 1011 | if (error) | ||
| 1012 | goto out; | ||
| 1013 | |||
| 1014 | x = ip->i_disksize + 1; | ||
| 1015 | if (x > *len) { | ||
| 1016 | *buf = kmalloc(x, GFP_NOFS); | ||
| 1017 | if (!*buf) { | ||
| 1018 | error = -ENOMEM; | ||
| 1019 | goto out_brelse; | ||
| 1020 | } | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); | ||
| 1024 | *len = x; | ||
| 1025 | |||
| 1026 | out_brelse: | ||
| 1027 | brelse(dibh); | ||
| 1028 | out: | ||
| 1029 | gfs2_glock_dq_uninit(&i_gh); | ||
| 1030 | return error; | ||
| 1031 | } | ||
| 1032 | |||
| 1033 | /** | ||
| 888 | * gfs2_readlink - Read the value of a symlink | 1034 | * gfs2_readlink - Read the value of a symlink |
| 889 | * @dentry: the symlink | 1035 | * @dentry: the symlink |
| 890 | * @buf: the buffer to read the symlink data into | 1036 | * @buf: the buffer to read the symlink data into |
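gfs2_readlinki() copies i_disksize + 1 bytes (the target plus its trailing NUL) either into the caller's buffer or, if that is too small, into a freshly kmalloc()ed one that replaces *buf and must be freed by the caller. A hedged sketch of the expected consumer in gfs2_readlink(); the stack buffer size constant and the exact return convention are assumptions:

	char array[GFS2_FAST_NAME_SIZE], *buf = array;
	unsigned int len = GFS2_FAST_NAME_SIZE;
	int error;

	error = gfs2_readlinki(ip, &buf, &len);
	if (error)
		return error;

	if (user_size > len - 1)
		user_size = len - 1;		/* don't copy the trailing NUL */
	if (copy_to_user(user_buf, buf, user_size))
		error = -EFAULT;
	else
		error = user_size;

	if (buf != array)
		kfree(buf);			/* only if gfs2_readlinki() grew it */
	return error;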
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c deleted file mode 100644 index 458019569dcb..000000000000 --- a/fs/gfs2/ops_super.c +++ /dev/null | |||
| @@ -1,723 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
| 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. | ||
| 4 | * | ||
| 5 | * This copyrighted material is made available to anyone wishing to use, | ||
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
| 7 | * of the GNU General Public License version 2. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/sched.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/spinlock.h> | ||
| 13 | #include <linux/completion.h> | ||
| 14 | #include <linux/buffer_head.h> | ||
| 15 | #include <linux/statfs.h> | ||
| 16 | #include <linux/seq_file.h> | ||
| 17 | #include <linux/mount.h> | ||
| 18 | #include <linux/kthread.h> | ||
| 19 | #include <linux/delay.h> | ||
| 20 | #include <linux/gfs2_ondisk.h> | ||
| 21 | #include <linux/crc32.h> | ||
| 22 | #include <linux/time.h> | ||
| 23 | |||
| 24 | #include "gfs2.h" | ||
| 25 | #include "incore.h" | ||
| 26 | #include "glock.h" | ||
| 27 | #include "inode.h" | ||
| 28 | #include "log.h" | ||
| 29 | #include "quota.h" | ||
| 30 | #include "recovery.h" | ||
| 31 | #include "rgrp.h" | ||
| 32 | #include "super.h" | ||
| 33 | #include "sys.h" | ||
| 34 | #include "util.h" | ||
| 35 | #include "trans.h" | ||
| 36 | #include "dir.h" | ||
| 37 | #include "eattr.h" | ||
| 38 | #include "bmap.h" | ||
| 39 | #include "meta_io.h" | ||
| 40 | |||
| 41 | #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) | ||
| 42 | |||
| 43 | /** | ||
| 44 | * gfs2_write_inode - Make sure the inode is stable on the disk | ||
| 45 | * @inode: The inode | ||
| 46 | * @sync: synchronous write flag | ||
| 47 | * | ||
| 48 | * Returns: errno | ||
| 49 | */ | ||
| 50 | |||
| 51 | static int gfs2_write_inode(struct inode *inode, int sync) | ||
| 52 | { | ||
| 53 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 54 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
| 55 | struct gfs2_holder gh; | ||
| 56 | struct buffer_head *bh; | ||
| 57 | struct timespec atime; | ||
| 58 | struct gfs2_dinode *di; | ||
| 59 | int ret = 0; | ||
| 60 | |||
| 61 | /* Check this is a "normal" inode, etc */ | ||
| 62 | if (!test_bit(GIF_USER, &ip->i_flags) || | ||
| 63 | (current->flags & PF_MEMALLOC)) | ||
| 64 | return 0; | ||
| 65 | ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | ||
| 66 | if (ret) | ||
| 67 | goto do_flush; | ||
| 68 | ret = gfs2_trans_begin(sdp, RES_DINODE, 0); | ||
| 69 | if (ret) | ||
| 70 | goto do_unlock; | ||
| 71 | ret = gfs2_meta_inode_buffer(ip, &bh); | ||
| 72 | if (ret == 0) { | ||
| 73 | di = (struct gfs2_dinode *)bh->b_data; | ||
| 74 | atime.tv_sec = be64_to_cpu(di->di_atime); | ||
| 75 | atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); | ||
| 76 | if (timespec_compare(&inode->i_atime, &atime) > 0) { | ||
| 77 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | ||
| 78 | gfs2_dinode_out(ip, bh->b_data); | ||
| 79 | } | ||
| 80 | brelse(bh); | ||
| 81 | } | ||
| 82 | gfs2_trans_end(sdp); | ||
| 83 | do_unlock: | ||
| 84 | gfs2_glock_dq_uninit(&gh); | ||
| 85 | do_flush: | ||
| 86 | if (sync != 0) | ||
| 87 | gfs2_log_flush(GFS2_SB(inode), ip->i_gl); | ||
| 88 | return ret; | ||
| 89 | } | ||
| 90 | |||
| 91 | /** | ||
| 92 | * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one | ||
| 93 | * @sdp: the filesystem | ||
| 94 | * | ||
| 95 | * Returns: errno | ||
| 96 | */ | ||
| 97 | |||
| 98 | static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) | ||
| 99 | { | ||
| 100 | struct gfs2_holder t_gh; | ||
| 101 | int error; | ||
| 102 | |||
| 103 | gfs2_quota_sync(sdp); | ||
| 104 | gfs2_statfs_sync(sdp); | ||
| 105 | |||
| 106 | error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, | ||
| 107 | &t_gh); | ||
| 108 | if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
| 109 | return error; | ||
| 110 | |||
| 111 | gfs2_meta_syncfs(sdp); | ||
| 112 | gfs2_log_shutdown(sdp); | ||
| 113 | |||
| 114 | clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); | ||
| 115 | |||
| 116 | if (t_gh.gh_gl) | ||
| 117 | gfs2_glock_dq_uninit(&t_gh); | ||
| 118 | |||
| 119 | gfs2_quota_cleanup(sdp); | ||
| 120 | |||
| 121 | return error; | ||
| 122 | } | ||
| 123 | |||
| 124 | /** | ||
| 125 | * gfs2_put_super - Unmount the filesystem | ||
| 126 | * @sb: The VFS superblock | ||
| 127 | * | ||
| 128 | */ | ||
| 129 | |||
| 130 | static void gfs2_put_super(struct super_block *sb) | ||
| 131 | { | ||
| 132 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 133 | int error; | ||
| 134 | |||
| 135 | /* Unfreeze the filesystem, if we need to */ | ||
| 136 | |||
| 137 | mutex_lock(&sdp->sd_freeze_lock); | ||
| 138 | if (sdp->sd_freeze_count) | ||
| 139 | gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); | ||
| 140 | mutex_unlock(&sdp->sd_freeze_lock); | ||
| 141 | |||
| 142 | kthread_stop(sdp->sd_quotad_process); | ||
| 143 | kthread_stop(sdp->sd_logd_process); | ||
| 144 | kthread_stop(sdp->sd_recoverd_process); | ||
| 145 | |||
| 146 | if (!(sb->s_flags & MS_RDONLY)) { | ||
| 147 | error = gfs2_make_fs_ro(sdp); | ||
| 148 | if (error) | ||
| 149 | gfs2_io_error(sdp); | ||
| 150 | } | ||
| 151 | /* At this point, we're through modifying the disk */ | ||
| 152 | |||
| 153 | /* Release stuff */ | ||
| 154 | |||
| 155 | iput(sdp->sd_jindex); | ||
| 156 | iput(sdp->sd_inum_inode); | ||
| 157 | iput(sdp->sd_statfs_inode); | ||
| 158 | iput(sdp->sd_rindex); | ||
| 159 | iput(sdp->sd_quota_inode); | ||
| 160 | |||
| 161 | gfs2_glock_put(sdp->sd_rename_gl); | ||
| 162 | gfs2_glock_put(sdp->sd_trans_gl); | ||
| 163 | |||
| 164 | if (!sdp->sd_args.ar_spectator) { | ||
| 165 | gfs2_glock_dq_uninit(&sdp->sd_journal_gh); | ||
| 166 | gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); | ||
| 167 | gfs2_glock_dq_uninit(&sdp->sd_ir_gh); | ||
| 168 | gfs2_glock_dq_uninit(&sdp->sd_sc_gh); | ||
| 169 | gfs2_glock_dq_uninit(&sdp->sd_qc_gh); | ||
| 170 | iput(sdp->sd_ir_inode); | ||
| 171 | iput(sdp->sd_sc_inode); | ||
| 172 | iput(sdp->sd_qc_inode); | ||
| 173 | } | ||
| 174 | |||
| 175 | gfs2_glock_dq_uninit(&sdp->sd_live_gh); | ||
| 176 | gfs2_clear_rgrpd(sdp); | ||
| 177 | gfs2_jindex_free(sdp); | ||
| 178 | /* Take apart glock structures and buffer lists */ | ||
| 179 | gfs2_gl_hash_clear(sdp); | ||
| 180 | /* Unmount the locking protocol */ | ||
| 181 | gfs2_lm_unmount(sdp); | ||
| 182 | |||
| 183 | /* At this point, we're through participating in the lockspace */ | ||
| 184 | gfs2_sys_fs_del(sdp); | ||
| 185 | } | ||
| 186 | |||
| 187 | /** | ||
| 188 | * gfs2_write_super | ||
| 189 | * @sb: the superblock | ||
| 190 | * | ||
| 191 | */ | ||
| 192 | |||
| 193 | static void gfs2_write_super(struct super_block *sb) | ||
| 194 | { | ||
| 195 | sb->s_dirt = 0; | ||
| 196 | } | ||
| 197 | |||
| 198 | /** | ||
| 199 | * gfs2_sync_fs - sync the filesystem | ||
| 200 | * @sb: the superblock | ||
| 201 | * | ||
| 202 | * Flushes the log to disk. | ||
| 203 | */ | ||
| 204 | |||
| 205 | static int gfs2_sync_fs(struct super_block *sb, int wait) | ||
| 206 | { | ||
| 207 | sb->s_dirt = 0; | ||
| 208 | if (wait && sb->s_fs_info) | ||
| 209 | gfs2_log_flush(sb->s_fs_info, NULL); | ||
| 210 | return 0; | ||
| 211 | } | ||
| 212 | |||
| 213 | /** | ||
| 214 | * gfs2_freeze - prevent further writes to the filesystem | ||
| 215 | * @sb: the VFS structure for the filesystem | ||
| 216 | * | ||
| 217 | */ | ||
| 218 | |||
| 219 | static int gfs2_freeze(struct super_block *sb) | ||
| 220 | { | ||
| 221 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 222 | int error; | ||
| 223 | |||
| 224 | if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
| 225 | return -EINVAL; | ||
| 226 | |||
| 227 | for (;;) { | ||
| 228 | error = gfs2_freeze_fs(sdp); | ||
| 229 | if (!error) | ||
| 230 | break; | ||
| 231 | |||
| 232 | switch (error) { | ||
| 233 | case -EBUSY: | ||
| 234 | fs_err(sdp, "waiting for recovery before freeze\n"); | ||
| 235 | break; | ||
| 236 | |||
| 237 | default: | ||
| 238 | fs_err(sdp, "error freezing FS: %d\n", error); | ||
| 239 | break; | ||
| 240 | } | ||
| 241 | |||
| 242 | fs_err(sdp, "retrying...\n"); | ||
| 243 | msleep(1000); | ||
| 244 | } | ||
| 245 | return 0; | ||
| 246 | } | ||
| 247 | |||
| 248 | /** | ||
| 249 | * gfs2_unfreeze - reallow writes to the filesystem | ||
| 250 | * @sb: the VFS structure for the filesystem | ||
| 251 | * | ||
| 252 | */ | ||
| 253 | |||
| 254 | static int gfs2_unfreeze(struct super_block *sb) | ||
| 255 | { | ||
| 256 | gfs2_unfreeze_fs(sb->s_fs_info); | ||
| 257 | return 0; | ||
| 258 | } | ||
| 259 | |||
| 260 | /** | ||
| 261 | * statfs_fill - fill in the sg for a given RG | ||
| 262 | * @rgd: the RG | ||
| 263 | * @sc: the sc structure | ||
| 264 | * | ||
| 265 | * Returns: 0 on success, -ESTALE if the LVB is invalid | ||
| 266 | */ | ||
| 267 | |||
| 268 | static int statfs_slow_fill(struct gfs2_rgrpd *rgd, | ||
| 269 | struct gfs2_statfs_change_host *sc) | ||
| 270 | { | ||
| 271 | gfs2_rgrp_verify(rgd); | ||
| 272 | sc->sc_total += rgd->rd_data; | ||
| 273 | sc->sc_free += rgd->rd_free; | ||
| 274 | sc->sc_dinodes += rgd->rd_dinodes; | ||
| 275 | return 0; | ||
| 276 | } | ||
| 277 | |||
| 278 | /** | ||
| 279 | * gfs2_statfs_slow - Stat a filesystem using asynchronous locking | ||
| 280 | * @sdp: the filesystem | ||
| 281 | * @sc: the sc info that will be returned | ||
| 282 | * | ||
| 283 | * Any error (other than a signal) will cause this routine to fall back | ||
| 284 | * to the synchronous version. | ||
| 285 | * | ||
| 286 | * FIXME: This really shouldn't busy wait like this. | ||
| 287 | * | ||
| 288 | * Returns: errno | ||
| 289 | */ | ||
| 290 | |||
| 291 | static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
| 292 | { | ||
| 293 | struct gfs2_holder ri_gh; | ||
| 294 | struct gfs2_rgrpd *rgd_next; | ||
| 295 | struct gfs2_holder *gha, *gh; | ||
| 296 | unsigned int slots = 64; | ||
| 297 | unsigned int x; | ||
| 298 | int done; | ||
| 299 | int error = 0, err; | ||
| 300 | |||
| 301 | memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); | ||
| 302 | gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); | ||
| 303 | if (!gha) | ||
| 304 | return -ENOMEM; | ||
| 305 | |||
| 306 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
| 307 | if (error) | ||
| 308 | goto out; | ||
| 309 | |||
| 310 | rgd_next = gfs2_rgrpd_get_first(sdp); | ||
| 311 | |||
| 312 | for (;;) { | ||
| 313 | done = 1; | ||
| 314 | |||
| 315 | for (x = 0; x < slots; x++) { | ||
| 316 | gh = gha + x; | ||
| 317 | |||
| 318 | if (gh->gh_gl && gfs2_glock_poll(gh)) { | ||
| 319 | err = gfs2_glock_wait(gh); | ||
| 320 | if (err) { | ||
| 321 | gfs2_holder_uninit(gh); | ||
| 322 | error = err; | ||
| 323 | } else { | ||
| 324 | if (!error) | ||
| 325 | error = statfs_slow_fill( | ||
| 326 | gh->gh_gl->gl_object, sc); | ||
| 327 | gfs2_glock_dq_uninit(gh); | ||
| 328 | } | ||
| 329 | } | ||
| 330 | |||
| 331 | if (gh->gh_gl) | ||
| 332 | done = 0; | ||
| 333 | else if (rgd_next && !error) { | ||
| 334 | error = gfs2_glock_nq_init(rgd_next->rd_gl, | ||
| 335 | LM_ST_SHARED, | ||
| 336 | GL_ASYNC, | ||
| 337 | gh); | ||
| 338 | rgd_next = gfs2_rgrpd_get_next(rgd_next); | ||
| 339 | done = 0; | ||
| 340 | } | ||
| 341 | |||
| 342 | if (signal_pending(current)) | ||
| 343 | error = -ERESTARTSYS; | ||
| 344 | } | ||
| 345 | |||
| 346 | if (done) | ||
| 347 | break; | ||
| 348 | |||
| 349 | yield(); | ||
| 350 | } | ||
| 351 | |||
| 352 | gfs2_glock_dq_uninit(&ri_gh); | ||
| 353 | |||
| 354 | out: | ||
| 355 | kfree(gha); | ||
| 356 | return error; | ||
| 357 | } | ||
| 358 | |||
| 359 | /** | ||
| 360 | * gfs2_statfs_i - Do a statfs | ||
| 361 | * @sdp: the filesystem | ||
| 362 | * @sg: the sg structure | ||
| 363 | * | ||
| 364 | * Returns: errno | ||
| 365 | */ | ||
| 366 | |||
| 367 | static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
| 368 | { | ||
| 369 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; | ||
| 370 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; | ||
| 371 | |||
| 372 | spin_lock(&sdp->sd_statfs_spin); | ||
| 373 | |||
| 374 | *sc = *m_sc; | ||
| 375 | sc->sc_total += l_sc->sc_total; | ||
| 376 | sc->sc_free += l_sc->sc_free; | ||
| 377 | sc->sc_dinodes += l_sc->sc_dinodes; | ||
| 378 | |||
| 379 | spin_unlock(&sdp->sd_statfs_spin); | ||
| 380 | |||
| 381 | if (sc->sc_free < 0) | ||
| 382 | sc->sc_free = 0; | ||
| 383 | if (sc->sc_free > sc->sc_total) | ||
| 384 | sc->sc_free = sc->sc_total; | ||
| 385 | if (sc->sc_dinodes < 0) | ||
| 386 | sc->sc_dinodes = 0; | ||
| 387 | |||
| 388 | return 0; | ||
| 389 | } | ||
| 390 | |||
| 391 | /** | ||
| 392 | * gfs2_statfs - Gather and return stats about the filesystem | ||
| 393 | * @sb: The superblock | ||
| 394 | * @statfsbuf: The buffer | ||
| 395 | * | ||
| 396 | * Returns: 0 on success or error code | ||
| 397 | */ | ||
| 398 | |||
| 399 | static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
| 400 | { | ||
| 401 | struct super_block *sb = dentry->d_inode->i_sb; | ||
| 402 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 403 | struct gfs2_statfs_change_host sc; | ||
| 404 | int error; | ||
| 405 | |||
| 406 | if (gfs2_tune_get(sdp, gt_statfs_slow)) | ||
| 407 | error = gfs2_statfs_slow(sdp, &sc); | ||
| 408 | else | ||
| 409 | error = gfs2_statfs_i(sdp, &sc); | ||
| 410 | |||
| 411 | if (error) | ||
| 412 | return error; | ||
| 413 | |||
| 414 | buf->f_type = GFS2_MAGIC; | ||
| 415 | buf->f_bsize = sdp->sd_sb.sb_bsize; | ||
| 416 | buf->f_blocks = sc.sc_total; | ||
| 417 | buf->f_bfree = sc.sc_free; | ||
| 418 | buf->f_bavail = sc.sc_free; | ||
| 419 | buf->f_files = sc.sc_dinodes + sc.sc_free; | ||
| 420 | buf->f_ffree = sc.sc_free; | ||
| 421 | buf->f_namelen = GFS2_FNAMESIZE; | ||
| 422 | |||
| 423 | return 0; | ||
| 424 | } | ||
| 425 | |||
| 426 | /** | ||
| 427 | * gfs2_remount_fs - called when the FS is remounted | ||
| 428 | * @sb: the filesystem | ||
| 429 | * @flags: the remount flags | ||
| 430 | * @data: extra data passed in (not used right now) | ||
| 431 | * | ||
| 432 | * Returns: errno | ||
| 433 | */ | ||
| 434 | |||
| 435 | static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) | ||
| 436 | { | ||
| 437 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 438 | struct gfs2_args args = sdp->sd_args; /* Default to current settings */ | ||
| 439 | int error; | ||
| 440 | |||
| 441 | error = gfs2_mount_args(sdp, &args, data); | ||
| 442 | if (error) | ||
| 443 | return error; | ||
| 444 | |||
| 445 | /* Not allowed to change locking details */ | ||
| 446 | if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || | ||
| 447 | strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || | ||
| 448 | strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) | ||
| 449 | return -EINVAL; | ||
| 450 | |||
| 451 | /* Some flags must not be changed */ | ||
| 452 | if (args_neq(&args, &sdp->sd_args, spectator) || | ||
| 453 | args_neq(&args, &sdp->sd_args, ignore_local_fs) || | ||
| 454 | args_neq(&args, &sdp->sd_args, localflocks) || | ||
| 455 | args_neq(&args, &sdp->sd_args, localcaching) || | ||
| 456 | args_neq(&args, &sdp->sd_args, meta)) | ||
| 457 | return -EINVAL; | ||
| 458 | |||
| 459 | if (sdp->sd_args.ar_spectator) | ||
| 460 | *flags |= MS_RDONLY; | ||
| 461 | |||
| 462 | if ((sb->s_flags ^ *flags) & MS_RDONLY) { | ||
| 463 | if (*flags & MS_RDONLY) | ||
| 464 | error = gfs2_make_fs_ro(sdp); | ||
| 465 | else | ||
| 466 | error = gfs2_make_fs_rw(sdp); | ||
| 467 | if (error) | ||
| 468 | return error; | ||
| 469 | } | ||
| 470 | |||
| 471 | sdp->sd_args = args; | ||
| 472 | if (sdp->sd_args.ar_posix_acl) | ||
| 473 | sb->s_flags |= MS_POSIXACL; | ||
| 474 | else | ||
| 475 | sb->s_flags &= ~MS_POSIXACL; | ||
| 476 | return 0; | ||
| 477 | } | ||
| 478 | |||
| 479 | /** | ||
| 480 | * gfs2_drop_inode - Drop an inode (test for remote unlink) | ||
| 481 | * @inode: The inode to drop | ||
| 482 | * | ||
| 483 | * If we've received a callback on an iopen lock then it's because a | ||
| 484 | * remote node tried to deallocate the inode but failed due to this node | ||
| 485 | * still having the inode open. Here we mark the link count zero | ||
| 486 | * since we know that it must have reached zero if the GLF_DEMOTE flag | ||
| 487 | * is set on the iopen glock. If we didn't do a disk read since the | ||
| 488 | * remote node removed the final link then we might otherwise miss | ||
| 489 | * this event. This check ensures that this node will deallocate the | ||
| 490 | * inode's blocks, or alternatively pass the baton on to another | ||
| 491 | * node for later deallocation. | ||
| 492 | */ | ||
| 493 | |||
| 494 | static void gfs2_drop_inode(struct inode *inode) | ||
| 495 | { | ||
| 496 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 497 | |||
| 498 | if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { | ||
| 499 | struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; | ||
| 500 | if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) | ||
| 501 | clear_nlink(inode); | ||
| 502 | } | ||
| 503 | generic_drop_inode(inode); | ||
| 504 | } | ||
| 505 | |||
| 506 | /** | ||
| 507 | * gfs2_clear_inode - Deallocate an inode when VFS is done with it | ||
| 508 | * @inode: The VFS inode | ||
| 509 | * | ||
| 510 | */ | ||
| 511 | |||
| 512 | static void gfs2_clear_inode(struct inode *inode) | ||
| 513 | { | ||
| 514 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 515 | |||
| 516 | /* This tells us its a "real" inode and not one which only | ||
| 517 | * serves to contain an address space (see rgrp.c, meta_io.c) | ||
| 518 | * which therefore doesn't have its own glocks. | ||
| 519 | */ | ||
| 520 | if (test_bit(GIF_USER, &ip->i_flags)) { | ||
| 521 | ip->i_gl->gl_object = NULL; | ||
| 522 | gfs2_glock_put(ip->i_gl); | ||
| 523 | ip->i_gl = NULL; | ||
| 524 | if (ip->i_iopen_gh.gh_gl) { | ||
| 525 | ip->i_iopen_gh.gh_gl->gl_object = NULL; | ||
| 526 | gfs2_glock_dq_uninit(&ip->i_iopen_gh); | ||
| 527 | } | ||
| 528 | } | ||
| 529 | } | ||
| 530 | |||
| 531 | static int is_ancestor(const struct dentry *d1, const struct dentry *d2) | ||
| 532 | { | ||
| 533 | do { | ||
| 534 | if (d1 == d2) | ||
| 535 | return 1; | ||
| 536 | d1 = d1->d_parent; | ||
| 537 | } while (!IS_ROOT(d1)); | ||
| 538 | return 0; | ||
| 539 | } | ||
| 540 | |||
| 541 | /** | ||
| 542 | * gfs2_show_options - Show mount options for /proc/mounts | ||
| 543 | * @s: seq_file structure | ||
| 544 | * @mnt: vfsmount | ||
| 545 | * | ||
| 546 | * Returns: 0 on success or error code | ||
| 547 | */ | ||
| 548 | |||
| 549 | static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | ||
| 550 | { | ||
| 551 | struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; | ||
| 552 | struct gfs2_args *args = &sdp->sd_args; | ||
| 553 | |||
| 554 | if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) | ||
| 555 | seq_printf(s, ",meta"); | ||
| 556 | if (args->ar_lockproto[0]) | ||
| 557 | seq_printf(s, ",lockproto=%s", args->ar_lockproto); | ||
| 558 | if (args->ar_locktable[0]) | ||
| 559 | seq_printf(s, ",locktable=%s", args->ar_locktable); | ||
| 560 | if (args->ar_hostdata[0]) | ||
| 561 | seq_printf(s, ",hostdata=%s", args->ar_hostdata); | ||
| 562 | if (args->ar_spectator) | ||
| 563 | seq_printf(s, ",spectator"); | ||
| 564 | if (args->ar_ignore_local_fs) | ||
| 565 | seq_printf(s, ",ignore_local_fs"); | ||
| 566 | if (args->ar_localflocks) | ||
| 567 | seq_printf(s, ",localflocks"); | ||
| 568 | if (args->ar_localcaching) | ||
| 569 | seq_printf(s, ",localcaching"); | ||
| 570 | if (args->ar_debug) | ||
| 571 | seq_printf(s, ",debug"); | ||
| 572 | if (args->ar_upgrade) | ||
| 573 | seq_printf(s, ",upgrade"); | ||
| 574 | if (args->ar_posix_acl) | ||
| 575 | seq_printf(s, ",acl"); | ||
| 576 | if (args->ar_quota != GFS2_QUOTA_DEFAULT) { | ||
| 577 | char *state; | ||
| 578 | switch (args->ar_quota) { | ||
| 579 | case GFS2_QUOTA_OFF: | ||
| 580 | state = "off"; | ||
| 581 | break; | ||
| 582 | case GFS2_QUOTA_ACCOUNT: | ||
| 583 | state = "account"; | ||
| 584 | break; | ||
| 585 | case GFS2_QUOTA_ON: | ||
| 586 | state = "on"; | ||
| 587 | break; | ||
| 588 | default: | ||
| 589 | state = "unknown"; | ||
| 590 | break; | ||
| 591 | } | ||
| 592 | seq_printf(s, ",quota=%s", state); | ||
| 593 | } | ||
| 594 | if (args->ar_suiddir) | ||
| 595 | seq_printf(s, ",suiddir"); | ||
| 596 | if (args->ar_data != GFS2_DATA_DEFAULT) { | ||
| 597 | char *state; | ||
| 598 | switch (args->ar_data) { | ||
| 599 | case GFS2_DATA_WRITEBACK: | ||
| 600 | state = "writeback"; | ||
| 601 | break; | ||
| 602 | case GFS2_DATA_ORDERED: | ||
| 603 | state = "ordered"; | ||
| 604 | break; | ||
| 605 | default: | ||
| 606 | state = "unknown"; | ||
| 607 | break; | ||
| 608 | } | ||
| 609 | seq_printf(s, ",data=%s", state); | ||
| 610 | } | ||
| 611 | if (args->ar_discard) | ||
| 612 | seq_printf(s, ",discard"); | ||
| 613 | |||
| 614 | return 0; | ||
| 615 | } | ||
| 616 | |||
| 617 | /* | ||
| 618 | * We have to (at the moment) hold the inodes main lock to cover | ||
| 619 | * the gap between unlocking the shared lock on the iopen lock and | ||
| 620 | * taking the exclusive lock. I'd rather do a shared -> exclusive | ||
| 621 | * conversion on the iopen lock, but we can change that later. This | ||
| 622 | * is safe, just less efficient. | ||
| 623 | */ | ||
| 624 | |||
| 625 | static void gfs2_delete_inode(struct inode *inode) | ||
| 626 | { | ||
| 627 | struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; | ||
| 628 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 629 | struct gfs2_holder gh; | ||
| 630 | int error; | ||
| 631 | |||
| 632 | if (!test_bit(GIF_USER, &ip->i_flags)) | ||
| 633 | goto out; | ||
| 634 | |||
| 635 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | ||
| 636 | if (unlikely(error)) { | ||
| 637 | gfs2_glock_dq_uninit(&ip->i_iopen_gh); | ||
| 638 | goto out; | ||
| 639 | } | ||
| 640 | |||
| 641 | gfs2_glock_dq_wait(&ip->i_iopen_gh); | ||
| 642 | gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); | ||
| 643 | error = gfs2_glock_nq(&ip->i_iopen_gh); | ||
| 644 | if (error) | ||
| 645 | goto out_truncate; | ||
| 646 | |||
| 647 | if (S_ISDIR(inode->i_mode) && | ||
| 648 | (ip->i_diskflags & GFS2_DIF_EXHASH)) { | ||
| 649 | error = gfs2_dir_exhash_dealloc(ip); | ||
| 650 | if (error) | ||
| 651 | goto out_unlock; | ||
| 652 | } | ||
| 653 | |||
| 654 | if (ip->i_eattr) { | ||
| 655 | error = gfs2_ea_dealloc(ip); | ||
| 656 | if (error) | ||
| 657 | goto out_unlock; | ||
| 658 | } | ||
| 659 | |||
| 660 | if (!gfs2_is_stuffed(ip)) { | ||
| 661 | error = gfs2_file_dealloc(ip); | ||
| 662 | if (error) | ||
| 663 | goto out_unlock; | ||
| 664 | } | ||
| 665 | |||
| 666 | error = gfs2_dinode_dealloc(ip); | ||
| 667 | if (error) | ||
| 668 | goto out_unlock; | ||
| 669 | |||
| 670 | out_truncate: | ||
| 671 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); | ||
| 672 | if (error) | ||
| 673 | goto out_unlock; | ||
| 674 | /* Needs to be done before glock release & also in a transaction */ | ||
| 675 | truncate_inode_pages(&inode->i_data, 0); | ||
| 676 | gfs2_trans_end(sdp); | ||
| 677 | |||
| 678 | out_unlock: | ||
| 679 | if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) | ||
| 680 | gfs2_glock_dq(&ip->i_iopen_gh); | ||
| 681 | gfs2_holder_uninit(&ip->i_iopen_gh); | ||
| 682 | gfs2_glock_dq_uninit(&gh); | ||
| 683 | if (error && error != GLR_TRYFAILED) | ||
| 684 | fs_warn(sdp, "gfs2_delete_inode: %d\n", error); | ||
| 685 | out: | ||
| 686 | truncate_inode_pages(&inode->i_data, 0); | ||
| 687 | clear_inode(inode); | ||
| 688 | } | ||
| 689 | |||
| 690 | static struct inode *gfs2_alloc_inode(struct super_block *sb) | ||
| 691 | { | ||
| 692 | struct gfs2_inode *ip; | ||
| 693 | |||
| 694 | ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); | ||
| 695 | if (ip) { | ||
| 696 | ip->i_flags = 0; | ||
| 697 | ip->i_gl = NULL; | ||
| 698 | } | ||
| 699 | return &ip->i_inode; | ||
| 700 | } | ||
| 701 | |||
| 702 | static void gfs2_destroy_inode(struct inode *inode) | ||
| 703 | { | ||
| 704 | kmem_cache_free(gfs2_inode_cachep, inode); | ||
| 705 | } | ||
| 706 | |||
| 707 | const struct super_operations gfs2_super_ops = { | ||
| 708 | .alloc_inode = gfs2_alloc_inode, | ||
| 709 | .destroy_inode = gfs2_destroy_inode, | ||
| 710 | .write_inode = gfs2_write_inode, | ||
| 711 | .delete_inode = gfs2_delete_inode, | ||
| 712 | .put_super = gfs2_put_super, | ||
| 713 | .write_super = gfs2_write_super, | ||
| 714 | .sync_fs = gfs2_sync_fs, | ||
| 715 | .freeze_fs = gfs2_freeze, | ||
| 716 | .unfreeze_fs = gfs2_unfreeze, | ||
| 717 | .statfs = gfs2_statfs, | ||
| 718 | .remount_fs = gfs2_remount_fs, | ||
| 719 | .clear_inode = gfs2_clear_inode, | ||
| 720 | .drop_inode = gfs2_drop_inode, | ||
| 721 | .show_options = gfs2_show_options, | ||
| 722 | }; | ||
| 723 | |||
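One detail worth keeping in mind from the file above: the fast statfs path in gfs2_statfs_i() folds a per-node delta (sd_statfs_local) into the cluster-wide master copy under sd_statfs_spin, so ordinary allocations never need a cluster round trip. A hedged sketch of the producer side of that scheme; the helper name and exact bookkeeping are assumptions drawn from the surrounding code, not from this hunk:

	/* Hedged sketch: an allocation path feeding the local statfs delta that
	 * gfs2_statfs_i() later folds into the master copy. */
	static void statfs_note_alloc(struct gfs2_sbd *sdp, unsigned int blocks)
	{
		struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;

		spin_lock(&sdp->sd_statfs_spin);
		l_sc->sc_free -= blocks;   /* synced back periodically by gfs2_statfs_sync() */
		spin_unlock(&sdp->sd_statfs_spin);
	}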
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 152e6c4a0dca..2e9b9326bfc9 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
| @@ -60,7 +60,6 @@ | |||
| 60 | #include "super.h" | 60 | #include "super.h" |
| 61 | #include "trans.h" | 61 | #include "trans.h" |
| 62 | #include "inode.h" | 62 | #include "inode.h" |
| 63 | #include "ops_address.h" | ||
| 64 | #include "util.h" | 63 | #include "util.h" |
| 65 | 64 | ||
| 66 | #define QUOTA_USER 1 | 65 | #define QUOTA_USER 1 |
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 247e8f7d6b3d..59d2695509d3 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c | |||
| @@ -13,8 +13,7 @@ | |||
| 13 | #include <linux/buffer_head.h> | 13 | #include <linux/buffer_head.h> |
| 14 | #include <linux/gfs2_ondisk.h> | 14 | #include <linux/gfs2_ondisk.h> |
| 15 | #include <linux/crc32.h> | 15 | #include <linux/crc32.h> |
| 16 | #include <linux/kthread.h> | 16 | #include <linux/slow-work.h> |
| 17 | #include <linux/freezer.h> | ||
| 18 | 17 | ||
| 19 | #include "gfs2.h" | 18 | #include "gfs2.h" |
| 20 | #include "incore.h" | 19 | #include "incore.h" |
| @@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, | |||
| 441 | kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); | 440 | kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); |
| 442 | } | 441 | } |
| 443 | 442 | ||
| 444 | /** | 443 | static int gfs2_recover_get_ref(struct slow_work *work) |
| 445 | * gfs2_recover_journal - recover a given journal | 444 | { |
| 446 | * @jd: the struct gfs2_jdesc describing the journal | 445 | struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); |
| 447 | * | 446 | if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) |
| 448 | * Acquire the journal's lock, check to see if the journal is clean, and | 447 | return -EBUSY; |
| 449 | * do recovery if necessary. | 448 | return 0; |
| 450 | * | 449 | } |
| 451 | * Returns: errno | ||
| 452 | */ | ||
| 453 | 450 | ||
| 454 | int gfs2_recover_journal(struct gfs2_jdesc *jd) | 451 | static void gfs2_recover_put_ref(struct slow_work *work) |
| 452 | { | ||
| 453 | struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); | ||
| 454 | clear_bit(JDF_RECOVERY, &jd->jd_flags); | ||
| 455 | smp_mb__after_clear_bit(); | ||
| 456 | wake_up_bit(&jd->jd_flags, JDF_RECOVERY); | ||
| 457 | } | ||
| 458 | |||
| 459 | static void gfs2_recover_work(struct slow_work *work) | ||
| 455 | { | 460 | { |
| 461 | struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); | ||
| 456 | struct gfs2_inode *ip = GFS2_I(jd->jd_inode); | 462 | struct gfs2_inode *ip = GFS2_I(jd->jd_inode); |
| 457 | struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); | 463 | struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); |
| 458 | struct gfs2_log_header_host head; | 464 | struct gfs2_log_header_host head; |
| @@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) | |||
| 569 | gfs2_glock_dq_uninit(&j_gh); | 575 | gfs2_glock_dq_uninit(&j_gh); |
| 570 | 576 | ||
| 571 | fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); | 577 | fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); |
| 572 | return 0; | 578 | return; |
| 573 | 579 | ||
| 574 | fail_gunlock_tr: | 580 | fail_gunlock_tr: |
| 575 | gfs2_glock_dq_uninit(&t_gh); | 581 | gfs2_glock_dq_uninit(&t_gh); |
| @@ -584,70 +590,28 @@ fail_gunlock_j: | |||
| 584 | 590 | ||
| 585 | fail: | 591 | fail: |
| 586 | gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); | 592 | gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); |
| 587 | return error; | ||
| 588 | } | 593 | } |
| 589 | 594 | ||
| 590 | static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) | 595 | struct slow_work_ops gfs2_recover_ops = { |
| 591 | { | 596 | .get_ref = gfs2_recover_get_ref, |
| 592 | struct gfs2_jdesc *jd; | 597 | .put_ref = gfs2_recover_put_ref, |
| 593 | int found = 0; | 598 | .execute = gfs2_recover_work, |
| 594 | 599 | }; | |
| 595 | spin_lock(&sdp->sd_jindex_spin); | ||
| 596 | 600 | ||
| 597 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | ||
| 598 | if (jd->jd_dirty) { | ||
| 599 | jd->jd_dirty = 0; | ||
| 600 | found = 1; | ||
| 601 | break; | ||
| 602 | } | ||
| 603 | } | ||
| 604 | spin_unlock(&sdp->sd_jindex_spin); | ||
| 605 | |||
| 606 | if (!found) | ||
| 607 | jd = NULL; | ||
| 608 | 601 | ||
| 609 | return jd; | 602 | static int gfs2_recovery_wait(void *word) |
| 610 | } | ||
| 611 | |||
| 612 | /** | ||
| 613 | * gfs2_check_journals - Recover any dirty journals | ||
| 614 | * @sdp: the filesystem | ||
| 615 | * | ||
| 616 | */ | ||
| 617 | |||
| 618 | static void gfs2_check_journals(struct gfs2_sbd *sdp) | ||
| 619 | { | 603 | { |
| 620 | struct gfs2_jdesc *jd; | 604 | schedule(); |
| 621 | 605 | return 0; | |
| 622 | for (;;) { | ||
| 623 | jd = gfs2_jdesc_find_dirty(sdp); | ||
| 624 | if (!jd) | ||
| 625 | break; | ||
| 626 | |||
| 627 | if (jd != sdp->sd_jdesc) | ||
| 628 | gfs2_recover_journal(jd); | ||
| 629 | } | ||
| 630 | } | 606 | } |
| 631 | 607 | ||
| 632 | /** | 608 | int gfs2_recover_journal(struct gfs2_jdesc *jd) |
| 633 | * gfs2_recoverd - Recover dead machine's journals | ||
| 634 | * @sdp: Pointer to GFS2 superblock | ||
| 635 | * | ||
| 636 | */ | ||
| 637 | |||
| 638 | int gfs2_recoverd(void *data) | ||
| 639 | { | 609 | { |
| 640 | struct gfs2_sbd *sdp = data; | 610 | int rv; |
| 641 | unsigned long t; | 611 | rv = slow_work_enqueue(&jd->jd_work); |
| 642 | 612 | if (rv) | |
| 643 | while (!kthread_should_stop()) { | 613 | return rv; |
| 644 | gfs2_check_journals(sdp); | 614 | wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); |
| 645 | t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; | ||
| 646 | if (freezing(current)) | ||
| 647 | refrigerator(); | ||
| 648 | schedule_timeout_interruptible(t); | ||
| 649 | } | ||
| 650 | |||
| 651 | return 0; | 615 | return 0; |
| 652 | } | 616 | } |
| 653 | 617 | ||
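gfs2_recover_journal() is now just an enqueue-and-wait wrapper: JDF_RECOVERY marks a journal whose recovery item is in flight, and wait_on_bit() sleeps until gfs2_recover_put_ref() clears it and wakes the waiter. For slow_work_enqueue() to have a thread pool to run on, the module has to register as a slow-work user at init time; a minimal sketch of that wiring, assuming the slow-work API of this kernel generation (slow_work_register_user()/slow_work_unregister_user() taking no arguments):

	#include <linux/slow-work.h>

	static int __init init_gfs2_fs(void)
	{
		int error;

		/* Hedged sketch: bring up the shared slow-work pool before any
		 * journal recovery can be queued. */
		error = slow_work_register_user();
		if (error)
			return error;
		/* ... register caches and filesystem types ... */
		return 0;
	}

	static void __exit exit_gfs2_fs(void)
	{
		/* ... unregister filesystem types ... */
		slow_work_unregister_user();
	}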
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index a8218ea15b57..1616ac22569a 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h | |||
| @@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); | |||
| 28 | extern int gfs2_find_jhead(struct gfs2_jdesc *jd, | 28 | extern int gfs2_find_jhead(struct gfs2_jdesc *jd, |
| 29 | struct gfs2_log_header_host *head); | 29 | struct gfs2_log_header_host *head); |
| 30 | extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); | 30 | extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); |
| 31 | extern int gfs2_recoverd(void *data); | 31 | extern struct slow_work_ops gfs2_recover_ops; |
| 32 | 32 | ||
| 33 | #endif /* __RECOVERY_DOT_H__ */ | 33 | #endif /* __RECOVERY_DOT_H__ */ |
| 34 | 34 | ||
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 565038243fa2..de3239731db8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
| @@ -29,7 +29,6 @@ | |||
| 29 | #include "util.h" | 29 | #include "util.h" |
| 30 | #include "log.h" | 30 | #include "log.h" |
| 31 | #include "inode.h" | 31 | #include "inode.h" |
| 32 | #include "ops_address.h" | ||
| 33 | 32 | ||
| 34 | #define BFITNOENT ((u32)~0) | 33 | #define BFITNOENT ((u32)~0) |
| 35 | #define NO_BLOCK ((u64)~0) | 34 | #define NO_BLOCK ((u64)~0) |
| @@ -442,6 +441,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) | |||
| 442 | for (x = 0; x < length; x++) { | 441 | for (x = 0; x < length; x++) { |
| 443 | bi = rgd->rd_bits + x; | 442 | bi = rgd->rd_bits + x; |
| 444 | 443 | ||
| 444 | bi->bi_flags = 0; | ||
| 445 | /* small rgrp; bitmap stored completely in header block */ | 445 | /* small rgrp; bitmap stored completely in header block */ |
| 446 | if (length == 1) { | 446 | if (length == 1) { |
| 447 | bytes = bytes_left; | 447 | bytes = bytes_left; |
| @@ -580,7 +580,6 @@ static int read_rindex_entry(struct gfs2_inode *ip, | |||
| 580 | 580 | ||
| 581 | rgd->rd_gl->gl_object = rgd; | 581 | rgd->rd_gl->gl_object = rgd; |
| 582 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; | 582 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; |
| 583 | rgd->rd_flags |= GFS2_RDF_CHECK; | ||
| 584 | return error; | 583 | return error; |
| 585 | } | 584 | } |
| 586 | 585 | ||
| @@ -701,10 +700,9 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) | |||
| 701 | u32 rg_flags; | 700 | u32 rg_flags; |
| 702 | 701 | ||
| 703 | rg_flags = be32_to_cpu(str->rg_flags); | 702 | rg_flags = be32_to_cpu(str->rg_flags); |
| 704 | if (rg_flags & GFS2_RGF_NOALLOC) | 703 | rg_flags &= ~GFS2_RDF_MASK; |
| 705 | rgd->rd_flags |= GFS2_RDF_NOALLOC; | 704 | rgd->rd_flags &= GFS2_RDF_MASK; |
| 706 | else | 705 | rgd->rd_flags |= rg_flags; |
| 707 | rgd->rd_flags &= ~GFS2_RDF_NOALLOC; | ||
| 708 | rgd->rd_free = be32_to_cpu(str->rg_free); | 706 | rgd->rd_free = be32_to_cpu(str->rg_free); |
| 709 | rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); | 707 | rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); |
| 710 | rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); | 708 | rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); |
| @@ -713,11 +711,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) | |||
| 713 | static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) | 711 | static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) |
| 714 | { | 712 | { |
| 715 | struct gfs2_rgrp *str = buf; | 713 | struct gfs2_rgrp *str = buf; |
| 716 | u32 rg_flags = 0; | ||
| 717 | 714 | ||
| 718 | if (rgd->rd_flags & GFS2_RDF_NOALLOC) | 715 | str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); |
| 719 | rg_flags |= GFS2_RGF_NOALLOC; | ||
| 720 | str->rg_flags = cpu_to_be32(rg_flags); | ||
| 721 | str->rg_free = cpu_to_be32(rgd->rd_free); | 716 | str->rg_free = cpu_to_be32(rgd->rd_free); |
| 722 | str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); | 717 | str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); |
| 723 | str->__pad = cpu_to_be32(0); | 718 | str->__pad = cpu_to_be32(0); |
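After this change rd_flags carries both the on-disk resource-group flags (such as GFS2_RGF_NOALLOC, which gfs2_rgrp_in() now copies through untouched) and purely in-core state; GFS2_RDF_MASK names the in-core bits so gfs2_rgrp_out() can strip them before writing the header back. The definitions live in incore.h rather than in this hunk; the intended layout is along these lines, with the exact bit values shown here being assumptions:

	/* Hedged sketch: on-disk GFS2_RGF_* flags keep the low bits, in-core
	 * GFS2_RDF_* state uses the high bits reserved by GFS2_RDF_MASK. */
	#define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
	#define GFS2_RDF_UPTODATE	0x20000000 /* rgrp header is up to date */
	#define GFS2_RDF_ERROR		0x40000000 /* allocation error on this rgrp */
	#define GFS2_RDF_MASK		0xf0000000 /* mask of in-core-only flags */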
| @@ -775,8 +770,10 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) | |||
| 775 | } | 770 | } |
| 776 | 771 | ||
| 777 | if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { | 772 | if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { |
| 773 | for (x = 0; x < length; x++) | ||
| 774 | clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); | ||
| 778 | gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); | 775 | gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); |
| 779 | rgd->rd_flags |= GFS2_RDF_UPTODATE; | 776 | rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); |
| 780 | } | 777 | } |
| 781 | 778 | ||
| 782 | spin_lock(&sdp->sd_rindex_spin); | 779 | spin_lock(&sdp->sd_rindex_spin); |
| @@ -845,7 +842,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, | |||
| 845 | struct super_block *sb = sdp->sd_vfs; | 842 | struct super_block *sb = sdp->sd_vfs; |
| 846 | struct block_device *bdev = sb->s_bdev; | 843 | struct block_device *bdev = sb->s_bdev; |
| 847 | const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / | 844 | const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / |
| 848 | bdev_hardsect_size(sb->s_bdev); | 845 | bdev_logical_block_size(sb->s_bdev); |
| 849 | u64 blk; | 846 | u64 blk; |
| 850 | sector_t start = 0; | 847 | sector_t start = 0; |
| 851 | sector_t nr_sects = 0; | 848 | sector_t nr_sects = 0; |
| @@ -903,6 +900,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) | |||
| 903 | continue; | 900 | continue; |
| 904 | if (sdp->sd_args.ar_discard) | 901 | if (sdp->sd_args.ar_discard) |
| 905 | gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); | 902 | gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); |
| 903 | clear_bit(GBF_FULL, &bi->bi_flags); | ||
| 906 | memcpy(bi->bi_clone + bi->bi_offset, | 904 | memcpy(bi->bi_clone + bi->bi_offset, |
| 907 | bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); | 905 | bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); |
| 908 | } | 906 | } |
| @@ -942,7 +940,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) | |||
| 942 | struct gfs2_sbd *sdp = rgd->rd_sbd; | 940 | struct gfs2_sbd *sdp = rgd->rd_sbd; |
| 943 | int ret = 0; | 941 | int ret = 0; |
| 944 | 942 | ||
| 945 | if (rgd->rd_flags & GFS2_RDF_NOALLOC) | 943 | if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) |
| 946 | return 0; | 944 | return 0; |
| 947 | 945 | ||
| 948 | spin_lock(&sdp->sd_rindex_spin); | 946 | spin_lock(&sdp->sd_rindex_spin); |
| @@ -1315,30 +1313,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, | |||
| 1315 | { | 1313 | { |
| 1316 | struct gfs2_bitmap *bi = NULL; | 1314 | struct gfs2_bitmap *bi = NULL; |
| 1317 | const u32 length = rgd->rd_length; | 1315 | const u32 length = rgd->rd_length; |
| 1318 | u32 blk = 0; | 1316 | u32 blk = BFITNOENT; |
| 1319 | unsigned int buf, x; | 1317 | unsigned int buf, x; |
| 1320 | const unsigned int elen = *n; | 1318 | const unsigned int elen = *n; |
| 1321 | const u8 *buffer; | 1319 | const u8 *buffer = NULL; |
| 1322 | 1320 | ||
| 1323 | *n = 0; | 1321 | *n = 0; |
| 1324 | /* Find bitmap block that contains bits for goal block */ | 1322 | /* Find bitmap block that contains bits for goal block */ |
| 1325 | for (buf = 0; buf < length; buf++) { | 1323 | for (buf = 0; buf < length; buf++) { |
| 1326 | bi = rgd->rd_bits + buf; | 1324 | bi = rgd->rd_bits + buf; |
| 1327 | if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) | 1325 | /* Convert scope of "goal" from rgrp-wide to within found bit block */ |
| 1328 | break; | 1326 | if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { |
| 1327 | goal -= bi->bi_start * GFS2_NBBY; | ||
| 1328 | goto do_search; | ||
| 1329 | } | ||
| 1329 | } | 1330 | } |
| 1331 | buf = 0; | ||
| 1332 | goal = 0; | ||
| 1330 | 1333 | ||
| 1331 | gfs2_assert(rgd->rd_sbd, buf < length); | 1334 | do_search: |
| 1332 | |||
| 1333 | /* Convert scope of "goal" from rgrp-wide to within found bit block */ | ||
| 1334 | goal -= bi->bi_start * GFS2_NBBY; | ||
| 1335 | |||
| 1336 | /* Search (up to entire) bitmap in this rgrp for allocatable block. | 1335 | /* Search (up to entire) bitmap in this rgrp for allocatable block. |
| 1337 | "x <= length", instead of "x < length", because we typically start | 1336 | "x <= length", instead of "x < length", because we typically start |
| 1338 | the search in the middle of a bit block, but if we can't find an | 1337 | the search in the middle of a bit block, but if we can't find an |
| 1339 | allocatable block anywhere else, we want to be able to wrap around and | 1338 | allocatable block anywhere else, we want to be able to wrap around and |
| 1340 | search in the first part of our first-searched bit block. */ | 1339 | search in the first part of our first-searched bit block. */ |
| 1341 | for (x = 0; x <= length; x++) { | 1340 | for (x = 0; x <= length; x++) { |
| 1341 | bi = rgd->rd_bits + buf; | ||
| 1342 | |||
| 1343 | if (test_bit(GBF_FULL, &bi->bi_flags) && | ||
| 1344 | (old_state == GFS2_BLKST_FREE)) | ||
| 1345 | goto skip; | ||
| 1346 | |||
| 1342 | /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone | 1347 | /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone |
| 1343 | bitmaps, so we must search the originals for that. */ | 1348 | bitmaps, so we must search the originals for that. */ |
| 1344 | buffer = bi->bi_bh->b_data + bi->bi_offset; | 1349 | buffer = bi->bi_bh->b_data + bi->bi_offset; |
| @@ -1349,33 +1354,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, | |||
| 1349 | if (blk != BFITNOENT) | 1354 | if (blk != BFITNOENT) |
| 1350 | break; | 1355 | break; |
| 1351 | 1356 | ||
| 1357 | if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) | ||
| 1358 | set_bit(GBF_FULL, &bi->bi_flags); | ||
| 1359 | |||
| 1352 | /* Try next bitmap block (wrap back to rgrp header if at end) */ | 1360 | /* Try next bitmap block (wrap back to rgrp header if at end) */ |
| 1353 | buf = (buf + 1) % length; | 1361 | skip: |
| 1354 | bi = rgd->rd_bits + buf; | 1362 | buf++; |
| 1363 | buf %= length; | ||
| 1355 | goal = 0; | 1364 | goal = 0; |
| 1356 | } | 1365 | } |
| 1357 | 1366 | ||
| 1358 | if (blk != BFITNOENT && old_state != new_state) { | 1367 | if (blk == BFITNOENT) |
| 1359 | *n = 1; | 1368 | return blk; |
| 1360 | gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); | 1369 | *n = 1; |
| 1370 | if (old_state == new_state) | ||
| 1371 | goto out; | ||
| 1372 | |||
| 1373 | gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); | ||
| 1374 | gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, | ||
| 1375 | bi->bi_len, blk, new_state); | ||
| 1376 | goal = blk; | ||
| 1377 | while (*n < elen) { | ||
| 1378 | goal++; | ||
| 1379 | if (goal >= (bi->bi_len * GFS2_NBBY)) | ||
| 1380 | break; | ||
| 1381 | if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != | ||
| 1382 | GFS2_BLKST_FREE) | ||
| 1383 | break; | ||
| 1361 | gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, | 1384 | gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, |
| 1362 | bi->bi_len, blk, new_state); | 1385 | bi->bi_len, goal, new_state); |
| 1363 | goal = blk; | 1386 | (*n)++; |
| 1364 | while (*n < elen) { | ||
| 1365 | goal++; | ||
| 1366 | if (goal >= (bi->bi_len * GFS2_NBBY)) | ||
| 1367 | break; | ||
| 1368 | if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != | ||
| 1369 | GFS2_BLKST_FREE) | ||
| 1370 | break; | ||
| 1371 | gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, | ||
| 1372 | bi->bi_offset, bi->bi_len, goal, | ||
| 1373 | new_state); | ||
| 1374 | (*n)++; | ||
| 1375 | } | ||
| 1376 | } | 1387 | } |
| 1377 | 1388 | out: | |
| 1378 | return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; | 1389 | return (bi->bi_start * GFS2_NBBY) + blk; |
| 1379 | } | 1390 | } |
| 1380 | 1391 | ||
| 1381 | /** | 1392 | /** |
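The rgblk_search() hunk above introduces a per-bitmap GBF_FULL flag: once a bitmap block has been scanned from its start (goal == 0) without yielding a free block, it is marked full, and later searches for GFS2_BLKST_FREE skip it without rescanning. A minimal sketch of the idea follows; the types are simplified and scan_one_bitmap() is a hypothetical stand-in for the real per-bitmap search.

/* Sketch only: simplified types; scan_one_bitmap() is a hypothetical
 * stand-in for the real per-bitmap search. */
#include <stdbool.h>

#define NOENT ((unsigned int)~0)

struct bitmap_blk {
	bool full;			/* set once a full scan found nothing free */
	/* ... bitmap data ... */
};

unsigned int scan_one_bitmap(struct bitmap_blk *bi, unsigned int goal);

unsigned int search_rgrp(struct bitmap_blk *bits, unsigned int nbufs,
			 unsigned int buf, unsigned int goal)
{
	unsigned int x, blk = NOENT;

	/* "<=" lets the search wrap back into the bitmap it started in */
	for (x = 0; x <= nbufs; x++) {
		struct bitmap_blk *bi = &bits[buf];

		if (!bi->full) {
			blk = scan_one_bitmap(bi, goal);
			if (blk != NOENT)
				break;
			if (goal == 0)	/* whole bitmap scanned, nothing free */
				bi->full = true;
		}
		buf = (buf + 1) % nbufs;	/* next bitmap, wrap at the end */
		goal = 0;
	}
	return blk;
}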
| @@ -1435,13 +1446,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, | |||
| 1435 | } | 1446 | } |
| 1436 | 1447 | ||
| 1437 | /** | 1448 | /** |
| 1438 | * gfs2_alloc_block - Allocate a block | 1449 | * gfs2_rgrp_dump - print out an rgrp |
| 1450 |  * @seq: The seq_file to print into (may be NULL) | ||
| 1451 | * @gl: The glock in question | ||
| 1452 | * | ||
| 1453 | */ | ||
| 1454 | |||
| 1455 | int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) | ||
| 1456 | { | ||
| 1457 | const struct gfs2_rgrpd *rgd = gl->gl_object; | ||
| 1458 | if (rgd == NULL) | ||
| 1459 | return 0; | ||
| 1460 | gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", | ||
| 1461 | (unsigned long long)rgd->rd_addr, rgd->rd_flags, | ||
| 1462 | rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); | ||
| 1463 | return 0; | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | /** | ||
| 1467 | * gfs2_alloc_block - Allocate one or more blocks | ||
| 1439 | * @ip: the inode to allocate the block for | 1468 | * @ip: the inode to allocate the block for |
| 1469 | * @bn: Used to return the starting block number | ||
| 1470 | * @n: requested number of blocks/extent length (value/result) | ||
| 1440 | * | 1471 | * |
| 1441 | * Returns: the allocated block | 1472 | * Returns: 0 or error |
| 1442 | */ | 1473 | */ |
| 1443 | 1474 | ||
| 1444 | u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) | 1475 | int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) |
| 1445 | { | 1476 | { |
| 1446 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1477 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
| 1447 | struct buffer_head *dibh; | 1478 | struct buffer_head *dibh; |
| @@ -1457,7 +1488,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) | |||
| 1457 | goal = rgd->rd_last_alloc; | 1488 | goal = rgd->rd_last_alloc; |
| 1458 | 1489 | ||
| 1459 | blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); | 1490 | blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); |
| 1460 | BUG_ON(blk == BFITNOENT); | 1491 | |
| 1492 | /* Since all blocks are reserved in advance, this shouldn't happen */ | ||
| 1493 | if (blk == BFITNOENT) | ||
| 1494 | goto rgrp_error; | ||
| 1461 | 1495 | ||
| 1462 | rgd->rd_last_alloc = blk; | 1496 | rgd->rd_last_alloc = blk; |
| 1463 | block = rgd->rd_data0 + blk; | 1497 | block = rgd->rd_data0 + blk; |
| @@ -1469,7 +1503,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) | |||
| 1469 | di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); | 1503 | di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); |
| 1470 | brelse(dibh); | 1504 | brelse(dibh); |
| 1471 | } | 1505 | } |
| 1472 | gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); | 1506 | if (rgd->rd_free < *n) |
| 1507 | goto rgrp_error; | ||
| 1508 | |||
| 1473 | rgd->rd_free -= *n; | 1509 | rgd->rd_free -= *n; |
| 1474 | 1510 | ||
| 1475 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1511 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
| @@ -1484,7 +1520,16 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) | |||
| 1484 | rgd->rd_free_clone -= *n; | 1520 | rgd->rd_free_clone -= *n; |
| 1485 | spin_unlock(&sdp->sd_rindex_spin); | 1521 | spin_unlock(&sdp->sd_rindex_spin); |
| 1486 | 1522 | ||
| 1487 | return block; | 1523 | *bn = block; |
| 1524 | return 0; | ||
| 1525 | |||
| 1526 | rgrp_error: | ||
| 1527 | fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", | ||
| 1528 | (unsigned long long)rgd->rd_addr); | ||
| 1529 | fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); | ||
| 1530 | gfs2_rgrp_dump(NULL, rgd->rd_gl); | ||
| 1531 | rgd->rd_flags |= GFS2_RDF_ERROR; | ||
| 1532 | return -EIO; | ||
| 1488 | } | 1533 | } |
| 1489 | 1534 | ||
| 1490 | /** | 1535 | /** |
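Note the interface change above: gfs2_alloc_block() now returns 0 or an errno and hands the starting block back through a pointer, instead of returning the block number directly and hitting BUG_ON() when the bitmap search fails. A hedged sketch of a call site under the new convention (illustrative fragment, not taken from this patch):

	u64 block;
	unsigned int n = 1;	/* ask for an extent of (up to) one block */
	int error;

	error = gfs2_alloc_block(ip, &block, &n);
	if (error)		/* e.g. -EIO once the rgrp is flagged GFS2_RDF_ERROR */
		return error;
	/* on success, block holds the first allocated block and n the extent length */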
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3181c7e624bf..1e76ff0f3e00 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h | |||
| @@ -14,22 +14,22 @@ struct gfs2_rgrpd; | |||
| 14 | struct gfs2_sbd; | 14 | struct gfs2_sbd; |
| 15 | struct gfs2_holder; | 15 | struct gfs2_holder; |
| 16 | 16 | ||
| 17 | void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); | 17 | extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); |
| 18 | 18 | ||
| 19 | struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); | 19 | struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); |
| 20 | struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); | 20 | struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); |
| 21 | struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); | 21 | struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); |
| 22 | 22 | ||
| 23 | void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); | 23 | extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); |
| 24 | int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); | 24 | extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); |
| 25 | 25 | ||
| 26 | int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); | 26 | extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); |
| 27 | void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); | 27 | extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); |
| 28 | void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); | 28 | extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); |
| 29 | 29 | ||
| 30 | void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); | 30 | extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); |
| 31 | 31 | ||
| 32 | struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); | 32 | extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); |
| 33 | static inline void gfs2_alloc_put(struct gfs2_inode *ip) | 33 | static inline void gfs2_alloc_put(struct gfs2_inode *ip) |
| 34 | { | 34 | { |
| 35 | BUG_ON(ip->i_alloc == NULL); | 35 | BUG_ON(ip->i_alloc == NULL); |
| @@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) | |||
| 37 | ip->i_alloc = NULL; | 37 | ip->i_alloc = NULL; |
| 38 | } | 38 | } |
| 39 | 39 | ||
| 40 | int gfs2_inplace_reserve_i(struct gfs2_inode *ip, | 40 | extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, |
| 41 | char *file, unsigned int line); | 41 | unsigned int line); |
| 42 | #define gfs2_inplace_reserve(ip) \ | 42 | #define gfs2_inplace_reserve(ip) \ |
| 43 | gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) | 43 | gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) |
| 44 | 44 | ||
| 45 | void gfs2_inplace_release(struct gfs2_inode *ip); | 45 | extern void gfs2_inplace_release(struct gfs2_inode *ip); |
| 46 | 46 | ||
| 47 | unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); | 47 | extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); |
| 48 | 48 | ||
| 49 | u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); | 49 | extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); |
| 50 | u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); | 50 | extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); |
| 51 | 51 | ||
| 52 | void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); | 52 | extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); |
| 53 | void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); | 53 | extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); |
| 54 | void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); | 54 | extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); |
| 55 | void gfs2_unlink_di(struct inode *inode); | 55 | extern void gfs2_unlink_di(struct inode *inode); |
| 56 | 56 | ||
| 57 | struct gfs2_rgrp_list { | 57 | struct gfs2_rgrp_list { |
| 58 | unsigned int rl_rgrps; | 58 | unsigned int rl_rgrps; |
| @@ -61,10 +61,11 @@ struct gfs2_rgrp_list { | |||
| 61 | struct gfs2_holder *rl_ghs; | 61 | struct gfs2_holder *rl_ghs; |
| 62 | }; | 62 | }; |
| 63 | 63 | ||
| 64 | void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, | 64 | extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, |
| 65 | u64 block); | 65 | u64 block); |
| 66 | void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); | 66 | extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); |
| 67 | void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); | 67 | extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); |
| 68 | u64 gfs2_ri_total(struct gfs2_sbd *sdp); | 68 | extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); |
| 69 | extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); | ||
| 69 | 70 | ||
| 70 | #endif /* __RGRP_DOT_H__ */ | 71 | #endif /* __RGRP_DOT_H__ */ |
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 601913e0a482..40bcc37e5a70 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
| @@ -7,14 +7,20 @@ | |||
| 7 | * of the GNU General Public License version 2. | 7 | * of the GNU General Public License version 2. |
| 8 | */ | 8 | */ |
| 9 | 9 | ||
| 10 | #include <linux/bio.h> | ||
| 10 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
| 11 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| 12 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
| 13 | #include <linux/completion.h> | 14 | #include <linux/completion.h> |
| 14 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
| 15 | #include <linux/crc32.h> | 16 | #include <linux/statfs.h> |
| 17 | #include <linux/seq_file.h> | ||
| 18 | #include <linux/mount.h> | ||
| 19 | #include <linux/kthread.h> | ||
| 20 | #include <linux/delay.h> | ||
| 16 | #include <linux/gfs2_ondisk.h> | 21 | #include <linux/gfs2_ondisk.h> |
| 17 | #include <linux/bio.h> | 22 | #include <linux/crc32.h> |
| 23 | #include <linux/time.h> | ||
| 18 | 24 | ||
| 19 | #include "gfs2.h" | 25 | #include "gfs2.h" |
| 20 | #include "incore.h" | 26 | #include "incore.h" |
| @@ -31,6 +37,183 @@ | |||
| 31 | #include "super.h" | 37 | #include "super.h" |
| 32 | #include "trans.h" | 38 | #include "trans.h" |
| 33 | #include "util.h" | 39 | #include "util.h" |
| 40 | #include "sys.h" | ||
| 41 | #include "eattr.h" | ||
| 42 | |||
| 43 | #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) | ||
| 44 | |||
| 45 | enum { | ||
| 46 | Opt_lockproto, | ||
| 47 | Opt_locktable, | ||
| 48 | Opt_hostdata, | ||
| 49 | Opt_spectator, | ||
| 50 | Opt_ignore_local_fs, | ||
| 51 | Opt_localflocks, | ||
| 52 | Opt_localcaching, | ||
| 53 | Opt_debug, | ||
| 54 | Opt_nodebug, | ||
| 55 | Opt_upgrade, | ||
| 56 | Opt_acl, | ||
| 57 | Opt_noacl, | ||
| 58 | Opt_quota_off, | ||
| 59 | Opt_quota_account, | ||
| 60 | Opt_quota_on, | ||
| 61 | Opt_quota, | ||
| 62 | Opt_noquota, | ||
| 63 | Opt_suiddir, | ||
| 64 | Opt_nosuiddir, | ||
| 65 | Opt_data_writeback, | ||
| 66 | Opt_data_ordered, | ||
| 67 | Opt_meta, | ||
| 68 | Opt_discard, | ||
| 69 | Opt_nodiscard, | ||
| 70 | Opt_commit, | ||
| 71 | Opt_error, | ||
| 72 | }; | ||
| 73 | |||
| 74 | static const match_table_t tokens = { | ||
| 75 | {Opt_lockproto, "lockproto=%s"}, | ||
| 76 | {Opt_locktable, "locktable=%s"}, | ||
| 77 | {Opt_hostdata, "hostdata=%s"}, | ||
| 78 | {Opt_spectator, "spectator"}, | ||
| 79 | {Opt_ignore_local_fs, "ignore_local_fs"}, | ||
| 80 | {Opt_localflocks, "localflocks"}, | ||
| 81 | {Opt_localcaching, "localcaching"}, | ||
| 82 | {Opt_debug, "debug"}, | ||
| 83 | {Opt_nodebug, "nodebug"}, | ||
| 84 | {Opt_upgrade, "upgrade"}, | ||
| 85 | {Opt_acl, "acl"}, | ||
| 86 | {Opt_noacl, "noacl"}, | ||
| 87 | {Opt_quota_off, "quota=off"}, | ||
| 88 | {Opt_quota_account, "quota=account"}, | ||
| 89 | {Opt_quota_on, "quota=on"}, | ||
| 90 | {Opt_quota, "quota"}, | ||
| 91 | {Opt_noquota, "noquota"}, | ||
| 92 | {Opt_suiddir, "suiddir"}, | ||
| 93 | {Opt_nosuiddir, "nosuiddir"}, | ||
| 94 | {Opt_data_writeback, "data=writeback"}, | ||
| 95 | {Opt_data_ordered, "data=ordered"}, | ||
| 96 | {Opt_meta, "meta"}, | ||
| 97 | {Opt_discard, "discard"}, | ||
| 98 | {Opt_nodiscard, "nodiscard"}, | ||
| 99 | {Opt_commit, "commit=%d"}, | ||
| 100 | {Opt_error, NULL} | ||
| 101 | }; | ||
| 102 | |||
| 103 | /** | ||
| 104 | * gfs2_mount_args - Parse mount options | ||
| 105 |  * @sdp: the filesystem superblock | ||
| 106 |  * @args: the gfs2_args structure to fill in from @options | ||
| 107 |  * | ||
| 108 |  * Returns: errno | ||
| 109 | */ | ||
| 110 | |||
| 111 | int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) | ||
| 112 | { | ||
| 113 | char *o; | ||
| 114 | int token; | ||
| 115 | substring_t tmp[MAX_OPT_ARGS]; | ||
| 116 | int rv; | ||
| 117 | |||
| 118 | /* Split the options into tokens with the "," character and | ||
| 119 | process them */ | ||
| 120 | |||
| 121 | while (1) { | ||
| 122 | o = strsep(&options, ","); | ||
| 123 | if (o == NULL) | ||
| 124 | break; | ||
| 125 | if (*o == '\0') | ||
| 126 | continue; | ||
| 127 | |||
| 128 | token = match_token(o, tokens, tmp); | ||
| 129 | switch (token) { | ||
| 130 | case Opt_lockproto: | ||
| 131 | match_strlcpy(args->ar_lockproto, &tmp[0], | ||
| 132 | GFS2_LOCKNAME_LEN); | ||
| 133 | break; | ||
| 134 | case Opt_locktable: | ||
| 135 | match_strlcpy(args->ar_locktable, &tmp[0], | ||
| 136 | GFS2_LOCKNAME_LEN); | ||
| 137 | break; | ||
| 138 | case Opt_hostdata: | ||
| 139 | match_strlcpy(args->ar_hostdata, &tmp[0], | ||
| 140 | GFS2_LOCKNAME_LEN); | ||
| 141 | break; | ||
| 142 | case Opt_spectator: | ||
| 143 | args->ar_spectator = 1; | ||
| 144 | break; | ||
| 145 | case Opt_ignore_local_fs: | ||
| 146 | args->ar_ignore_local_fs = 1; | ||
| 147 | break; | ||
| 148 | case Opt_localflocks: | ||
| 149 | args->ar_localflocks = 1; | ||
| 150 | break; | ||
| 151 | case Opt_localcaching: | ||
| 152 | args->ar_localcaching = 1; | ||
| 153 | break; | ||
| 154 | case Opt_debug: | ||
| 155 | args->ar_debug = 1; | ||
| 156 | break; | ||
| 157 | case Opt_nodebug: | ||
| 158 | args->ar_debug = 0; | ||
| 159 | break; | ||
| 160 | case Opt_upgrade: | ||
| 161 | args->ar_upgrade = 1; | ||
| 162 | break; | ||
| 163 | case Opt_acl: | ||
| 164 | args->ar_posix_acl = 1; | ||
| 165 | break; | ||
| 166 | case Opt_noacl: | ||
| 167 | args->ar_posix_acl = 0; | ||
| 168 | break; | ||
| 169 | case Opt_quota_off: | ||
| 170 | case Opt_noquota: | ||
| 171 | args->ar_quota = GFS2_QUOTA_OFF; | ||
| 172 | break; | ||
| 173 | case Opt_quota_account: | ||
| 174 | args->ar_quota = GFS2_QUOTA_ACCOUNT; | ||
| 175 | break; | ||
| 176 | case Opt_quota_on: | ||
| 177 | case Opt_quota: | ||
| 178 | args->ar_quota = GFS2_QUOTA_ON; | ||
| 179 | break; | ||
| 180 | case Opt_suiddir: | ||
| 181 | args->ar_suiddir = 1; | ||
| 182 | break; | ||
| 183 | case Opt_nosuiddir: | ||
| 184 | args->ar_suiddir = 0; | ||
| 185 | break; | ||
| 186 | case Opt_data_writeback: | ||
| 187 | args->ar_data = GFS2_DATA_WRITEBACK; | ||
| 188 | break; | ||
| 189 | case Opt_data_ordered: | ||
| 190 | args->ar_data = GFS2_DATA_ORDERED; | ||
| 191 | break; | ||
| 192 | case Opt_meta: | ||
| 193 | args->ar_meta = 1; | ||
| 194 | break; | ||
| 195 | case Opt_discard: | ||
| 196 | args->ar_discard = 1; | ||
| 197 | break; | ||
| 198 | case Opt_nodiscard: | ||
| 199 | args->ar_discard = 0; | ||
| 200 | break; | ||
| 201 | case Opt_commit: | ||
| 202 | rv = match_int(&tmp[0], &args->ar_commit); | ||
| 203 | if (rv || args->ar_commit <= 0) { | ||
| 204 | fs_info(sdp, "commit mount option requires a positive numeric argument\n"); | ||
| 205 | return rv ? rv : -EINVAL; | ||
| 206 | } | ||
| 207 | break; | ||
| 208 | case Opt_error: | ||
| 209 | default: | ||
| 210 | fs_info(sdp, "invalid mount option: %s\n", o); | ||
| 211 | return -EINVAL; | ||
| 212 | } | ||
| 213 | } | ||
| 214 | |||
| 215 | return 0; | ||
| 216 | } | ||
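gfs2_mount_args() above follows the usual kernel option-parsing shape: strsep() splits the comma-separated string, match_token() classifies each piece against the tokens table, and the switch copies or converts the matched argument. The same loop as a self-contained userspace sketch, using plain string functions in place of the kernel's match_* helpers and handling just the debug and commit options for brevity:

/* Userspace analogue of the option loop above; illustrative only. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct args {
	int debug;
	int commit;	/* seconds between log flushes */
};

static int parse_options(char *options, struct args *a)
{
	char *save = NULL;

	for (char *o = strtok_r(options, ",", &save); o;
	     o = strtok_r(NULL, ",", &save)) {
		if (strcmp(o, "debug") == 0)
			a->debug = 1;
		else if (strcmp(o, "nodebug") == 0)
			a->debug = 0;
		else if (strncmp(o, "commit=", 7) == 0) {
			a->commit = atoi(o + 7);
			if (a->commit <= 0) {
				fprintf(stderr, "commit requires a positive value\n");
				return -1;
			}
		} else {
			fprintf(stderr, "invalid mount option: %s\n", o);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	char opts[] = "debug,commit=30";
	struct args a = { 0 };

	if (parse_options(opts, &a) == 0)
		printf("debug=%d commit=%d\n", a.debug, a.commit);
	return 0;
}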
| 34 | 217 | ||
| 35 | /** | 218 | /** |
| 36 | * gfs2_jindex_free - Clear all the journal index information | 219 | * gfs2_jindex_free - Clear all the journal index information |
| @@ -436,3 +619,719 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) | |||
| 436 | mutex_unlock(&sdp->sd_freeze_lock); | 619 | mutex_unlock(&sdp->sd_freeze_lock); |
| 437 | } | 620 | } |
| 438 | 621 | ||
| 622 | |||
| 623 | /** | ||
| 624 | * gfs2_write_inode - Make sure the inode is stable on the disk | ||
| 625 | * @inode: The inode | ||
| 626 | * @sync: synchronous write flag | ||
| 627 | * | ||
| 628 | * Returns: errno | ||
| 629 | */ | ||
| 630 | |||
| 631 | static int gfs2_write_inode(struct inode *inode, int sync) | ||
| 632 | { | ||
| 633 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 634 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
| 635 | struct gfs2_holder gh; | ||
| 636 | struct buffer_head *bh; | ||
| 637 | struct timespec atime; | ||
| 638 | struct gfs2_dinode *di; | ||
| 639 | int ret = 0; | ||
| 640 | |||
| 641 | /* Check this is a "normal" inode, etc */ | ||
| 642 | if (!test_bit(GIF_USER, &ip->i_flags) || | ||
| 643 | (current->flags & PF_MEMALLOC)) | ||
| 644 | return 0; | ||
| 645 | ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | ||
| 646 | if (ret) | ||
| 647 | goto do_flush; | ||
| 648 | ret = gfs2_trans_begin(sdp, RES_DINODE, 0); | ||
| 649 | if (ret) | ||
| 650 | goto do_unlock; | ||
| 651 | ret = gfs2_meta_inode_buffer(ip, &bh); | ||
| 652 | if (ret == 0) { | ||
| 653 | di = (struct gfs2_dinode *)bh->b_data; | ||
| 654 | atime.tv_sec = be64_to_cpu(di->di_atime); | ||
| 655 | atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); | ||
| 656 | if (timespec_compare(&inode->i_atime, &atime) > 0) { | ||
| 657 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | ||
| 658 | gfs2_dinode_out(ip, bh->b_data); | ||
| 659 | } | ||
| 660 | brelse(bh); | ||
| 661 | } | ||
| 662 | gfs2_trans_end(sdp); | ||
| 663 | do_unlock: | ||
| 664 | gfs2_glock_dq_uninit(&gh); | ||
| 665 | do_flush: | ||
| 666 | if (sync != 0) | ||
| 667 | gfs2_log_flush(GFS2_SB(inode), ip->i_gl); | ||
| 668 | return ret; | ||
| 669 | } | ||
| 670 | |||
| 671 | /** | ||
| 672 | * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one | ||
| 673 | * @sdp: the filesystem | ||
| 674 | * | ||
| 675 | * Returns: errno | ||
| 676 | */ | ||
| 677 | |||
| 678 | static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) | ||
| 679 | { | ||
| 680 | struct gfs2_holder t_gh; | ||
| 681 | int error; | ||
| 682 | |||
| 683 | gfs2_quota_sync(sdp); | ||
| 684 | gfs2_statfs_sync(sdp); | ||
| 685 | |||
| 686 | error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, | ||
| 687 | &t_gh); | ||
| 688 | if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
| 689 | return error; | ||
| 690 | |||
| 691 | gfs2_meta_syncfs(sdp); | ||
| 692 | gfs2_log_shutdown(sdp); | ||
| 693 | |||
| 694 | clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); | ||
| 695 | |||
| 696 | if (t_gh.gh_gl) | ||
| 697 | gfs2_glock_dq_uninit(&t_gh); | ||
| 698 | |||
| 699 | gfs2_quota_cleanup(sdp); | ||
| 700 | |||
| 701 | return error; | ||
| 702 | } | ||
| 703 | |||
| 704 | static int gfs2_umount_recovery_wait(void *word) | ||
| 705 | { | ||
| 706 | schedule(); | ||
| 707 | return 0; | ||
| 708 | } | ||
| 709 | |||
| 710 | /** | ||
| 711 | * gfs2_put_super - Unmount the filesystem | ||
| 712 | * @sb: The VFS superblock | ||
| 713 | * | ||
| 714 | */ | ||
| 715 | |||
| 716 | static void gfs2_put_super(struct super_block *sb) | ||
| 717 | { | ||
| 718 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 719 | int error; | ||
| 720 | struct gfs2_jdesc *jd; | ||
| 721 | |||
| 722 | /* Unfreeze the filesystem, if we need to */ | ||
| 723 | |||
| 724 | mutex_lock(&sdp->sd_freeze_lock); | ||
| 725 | if (sdp->sd_freeze_count) | ||
| 726 | gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); | ||
| 727 | mutex_unlock(&sdp->sd_freeze_lock); | ||
| 728 | |||
| 729 | /* No more recovery requests */ | ||
| 730 | set_bit(SDF_NORECOVERY, &sdp->sd_flags); | ||
| 731 | smp_mb(); | ||
| 732 | |||
| 733 | /* Wait on outstanding recovery */ | ||
| 734 | restart: | ||
| 735 | spin_lock(&sdp->sd_jindex_spin); | ||
| 736 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | ||
| 737 | if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) | ||
| 738 | continue; | ||
| 739 | spin_unlock(&sdp->sd_jindex_spin); | ||
| 740 | wait_on_bit(&jd->jd_flags, JDF_RECOVERY, | ||
| 741 | gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); | ||
| 742 | goto restart; | ||
| 743 | } | ||
| 744 | spin_unlock(&sdp->sd_jindex_spin); | ||
| 745 | |||
| 746 | kthread_stop(sdp->sd_quotad_process); | ||
| 747 | kthread_stop(sdp->sd_logd_process); | ||
| 748 | |||
| 749 | if (!(sb->s_flags & MS_RDONLY)) { | ||
| 750 | error = gfs2_make_fs_ro(sdp); | ||
| 751 | if (error) | ||
| 752 | gfs2_io_error(sdp); | ||
| 753 | } | ||
| 754 | /* At this point, we're through modifying the disk */ | ||
| 755 | |||
| 756 | /* Release stuff */ | ||
| 757 | |||
| 758 | iput(sdp->sd_jindex); | ||
| 759 | iput(sdp->sd_inum_inode); | ||
| 760 | iput(sdp->sd_statfs_inode); | ||
| 761 | iput(sdp->sd_rindex); | ||
| 762 | iput(sdp->sd_quota_inode); | ||
| 763 | |||
| 764 | gfs2_glock_put(sdp->sd_rename_gl); | ||
| 765 | gfs2_glock_put(sdp->sd_trans_gl); | ||
| 766 | |||
| 767 | if (!sdp->sd_args.ar_spectator) { | ||
| 768 | gfs2_glock_dq_uninit(&sdp->sd_journal_gh); | ||
| 769 | gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); | ||
| 770 | gfs2_glock_dq_uninit(&sdp->sd_ir_gh); | ||
| 771 | gfs2_glock_dq_uninit(&sdp->sd_sc_gh); | ||
| 772 | gfs2_glock_dq_uninit(&sdp->sd_qc_gh); | ||
| 773 | iput(sdp->sd_ir_inode); | ||
| 774 | iput(sdp->sd_sc_inode); | ||
| 775 | iput(sdp->sd_qc_inode); | ||
| 776 | } | ||
| 777 | |||
| 778 | gfs2_glock_dq_uninit(&sdp->sd_live_gh); | ||
| 779 | gfs2_clear_rgrpd(sdp); | ||
| 780 | gfs2_jindex_free(sdp); | ||
| 781 | /* Take apart glock structures and buffer lists */ | ||
| 782 | gfs2_gl_hash_clear(sdp); | ||
| 783 | /* Unmount the locking protocol */ | ||
| 784 | gfs2_lm_unmount(sdp); | ||
| 785 | |||
| 786 | /* At this point, we're through participating in the lockspace */ | ||
| 787 | gfs2_sys_fs_del(sdp); | ||
| 788 | } | ||
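gfs2_put_super() above has to drain any journal recovery still in flight before tearing the filesystem down: it sets SDF_NORECOVERY to refuse new requests, then walks the journal list and, whenever a journal still has JDF_RECOVERY set, drops the spinlock and sleeps on that bit before restarting the scan. The sleep uses the kernel's bit-wait API; a condensed sketch of the pattern:

/* Sketch of the wait_on_bit() usage above (kernel APIs of this era). */
static int recovery_wait(void *word)
{
	schedule();		/* just sleep until woken */
	return 0;		/* 0 = keep waiting */
}

static void wait_for_journal_recovery(struct gfs2_jdesc *jd)
{
	/* Blocks until JDF_RECOVERY is cleared; the recovery code is expected
	 * to clear the bit and call wake_up_bit(&jd->jd_flags, JDF_RECOVERY). */
	wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
		    recovery_wait, TASK_UNINTERRUPTIBLE);
}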
| 789 | |||
| 790 | /** | ||
| 791 | * gfs2_write_super | ||
| 792 | * @sb: the superblock | ||
| 793 | * | ||
| 794 | */ | ||
| 795 | |||
| 796 | static void gfs2_write_super(struct super_block *sb) | ||
| 797 | { | ||
| 798 | sb->s_dirt = 0; | ||
| 799 | } | ||
| 800 | |||
| 801 | /** | ||
| 802 | * gfs2_sync_fs - sync the filesystem | ||
| 803 | * @sb: the superblock | ||
| 804 | * | ||
| 805 | * Flushes the log to disk. | ||
| 806 | */ | ||
| 807 | |||
| 808 | static int gfs2_sync_fs(struct super_block *sb, int wait) | ||
| 809 | { | ||
| 810 | sb->s_dirt = 0; | ||
| 811 | if (wait && sb->s_fs_info) | ||
| 812 | gfs2_log_flush(sb->s_fs_info, NULL); | ||
| 813 | return 0; | ||
| 814 | } | ||
| 815 | |||
| 816 | /** | ||
| 817 | * gfs2_freeze - prevent further writes to the filesystem | ||
| 818 | * @sb: the VFS structure for the filesystem | ||
| 819 | * | ||
| 820 | */ | ||
| 821 | |||
| 822 | static int gfs2_freeze(struct super_block *sb) | ||
| 823 | { | ||
| 824 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 825 | int error; | ||
| 826 | |||
| 827 | if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
| 828 | return -EINVAL; | ||
| 829 | |||
| 830 | for (;;) { | ||
| 831 | error = gfs2_freeze_fs(sdp); | ||
| 832 | if (!error) | ||
| 833 | break; | ||
| 834 | |||
| 835 | switch (error) { | ||
| 836 | case -EBUSY: | ||
| 837 | fs_err(sdp, "waiting for recovery before freeze\n"); | ||
| 838 | break; | ||
| 839 | |||
| 840 | default: | ||
| 841 | fs_err(sdp, "error freezing FS: %d\n", error); | ||
| 842 | break; | ||
| 843 | } | ||
| 844 | |||
| 845 | fs_err(sdp, "retrying...\n"); | ||
| 846 | msleep(1000); | ||
| 847 | } | ||
| 848 | return 0; | ||
| 849 | } | ||
| 850 | |||
| 851 | /** | ||
| 852 | * gfs2_unfreeze - reallow writes to the filesystem | ||
| 853 | * @sb: the VFS structure for the filesystem | ||
| 854 | * | ||
| 855 | */ | ||
| 856 | |||
| 857 | static int gfs2_unfreeze(struct super_block *sb) | ||
| 858 | { | ||
| 859 | gfs2_unfreeze_fs(sb->s_fs_info); | ||
| 860 | return 0; | ||
| 861 | } | ||
| 862 | |||
| 863 | /** | ||
| 864 |  * statfs_slow_fill - fill in the statfs change info for a given rgrp | ||
| 865 |  * @rgd: the resource group | ||
| 866 |  * @sc: the statfs change structure to accumulate into | ||
| 867 |  * | ||
| 868 |  * Returns: 0 | ||
| 869 | */ | ||
| 870 | |||
| 871 | static int statfs_slow_fill(struct gfs2_rgrpd *rgd, | ||
| 872 | struct gfs2_statfs_change_host *sc) | ||
| 873 | { | ||
| 874 | gfs2_rgrp_verify(rgd); | ||
| 875 | sc->sc_total += rgd->rd_data; | ||
| 876 | sc->sc_free += rgd->rd_free; | ||
| 877 | sc->sc_dinodes += rgd->rd_dinodes; | ||
| 878 | return 0; | ||
| 879 | } | ||
| 880 | |||
| 881 | /** | ||
| 882 | * gfs2_statfs_slow - Stat a filesystem using asynchronous locking | ||
| 883 | * @sdp: the filesystem | ||
| 884 | * @sc: the sc info that will be returned | ||
| 885 | * | ||
| 886 | * Any error (other than a signal) will cause this routine to fall back | ||
| 887 | * to the synchronous version. | ||
| 888 | * | ||
| 889 | * FIXME: This really shouldn't busy wait like this. | ||
| 890 | * | ||
| 891 | * Returns: errno | ||
| 892 | */ | ||
| 893 | |||
| 894 | static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
| 895 | { | ||
| 896 | struct gfs2_holder ri_gh; | ||
| 897 | struct gfs2_rgrpd *rgd_next; | ||
| 898 | struct gfs2_holder *gha, *gh; | ||
| 899 | unsigned int slots = 64; | ||
| 900 | unsigned int x; | ||
| 901 | int done; | ||
| 902 | int error = 0, err; | ||
| 903 | |||
| 904 | memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); | ||
| 905 | gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); | ||
| 906 | if (!gha) | ||
| 907 | return -ENOMEM; | ||
| 908 | |||
| 909 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
| 910 | if (error) | ||
| 911 | goto out; | ||
| 912 | |||
| 913 | rgd_next = gfs2_rgrpd_get_first(sdp); | ||
| 914 | |||
| 915 | for (;;) { | ||
| 916 | done = 1; | ||
| 917 | |||
| 918 | for (x = 0; x < slots; x++) { | ||
| 919 | gh = gha + x; | ||
| 920 | |||
| 921 | if (gh->gh_gl && gfs2_glock_poll(gh)) { | ||
| 922 | err = gfs2_glock_wait(gh); | ||
| 923 | if (err) { | ||
| 924 | gfs2_holder_uninit(gh); | ||
| 925 | error = err; | ||
| 926 | } else { | ||
| 927 | if (!error) | ||
| 928 | error = statfs_slow_fill( | ||
| 929 | gh->gh_gl->gl_object, sc); | ||
| 930 | gfs2_glock_dq_uninit(gh); | ||
| 931 | } | ||
| 932 | } | ||
| 933 | |||
| 934 | if (gh->gh_gl) | ||
| 935 | done = 0; | ||
| 936 | else if (rgd_next && !error) { | ||
| 937 | error = gfs2_glock_nq_init(rgd_next->rd_gl, | ||
| 938 | LM_ST_SHARED, | ||
| 939 | GL_ASYNC, | ||
| 940 | gh); | ||
| 941 | rgd_next = gfs2_rgrpd_get_next(rgd_next); | ||
| 942 | done = 0; | ||
| 943 | } | ||
| 944 | |||
| 945 | if (signal_pending(current)) | ||
| 946 | error = -ERESTARTSYS; | ||
| 947 | } | ||
| 948 | |||
| 949 | if (done) | ||
| 950 | break; | ||
| 951 | |||
| 952 | yield(); | ||
| 953 | } | ||
| 954 | |||
| 955 | gfs2_glock_dq_uninit(&ri_gh); | ||
| 956 | |||
| 957 | out: | ||
| 958 | kfree(gha); | ||
| 959 | return error; | ||
| 960 | } | ||
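gfs2_statfs_slow() walks every resource group, but instead of taking the rgrp glocks one at a time it keeps a window of up to 64 asynchronous lock requests outstanding, polling each slot and refilling it as grants come back. The same windowed pattern in the abstract, with hypothetical helpers standing in for the GL_ASYNC glock calls:

/* Bounded-window async pattern; the helper functions are hypothetical. */
#define SLOTS 64

struct slot {
	int in_flight;
	/* ... per-request state ... */
};

int start_next_request(struct slot *s);	/* 1 if a request was started, 0 if none left */
int request_done(struct slot *s);	/* nonzero once the request has completed */
int collect_result(struct slot *s);	/* consume the result, 0 or -errno */

static int gather_all(void)
{
	struct slot slots[SLOTS] = { { 0 } };
	int error = 0;

	for (;;) {
		int busy = 0;

		for (int x = 0; x < SLOTS; x++) {
			struct slot *s = &slots[x];

			if (s->in_flight && request_done(s)) {
				int err = collect_result(s);
				if (err && !error)
					error = err;
				s->in_flight = 0;
			}
			if (s->in_flight)
				busy = 1;
			else if (!error && start_next_request(s))
				busy = s->in_flight = 1;
		}
		if (!busy)
			break;
		/* yield here so outstanding requests can make progress */
	}
	return error;
}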
| 961 | |||
| 962 | /** | ||
| 963 | * gfs2_statfs_i - Do a statfs | ||
| 964 | * @sdp: the filesystem | ||
| 965 |  * @sc: the statfs change structure to fill in | ||
| 966 | * | ||
| 967 | * Returns: errno | ||
| 968 | */ | ||
| 969 | |||
| 970 | static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
| 971 | { | ||
| 972 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; | ||
| 973 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; | ||
| 974 | |||
| 975 | spin_lock(&sdp->sd_statfs_spin); | ||
| 976 | |||
| 977 | *sc = *m_sc; | ||
| 978 | sc->sc_total += l_sc->sc_total; | ||
| 979 | sc->sc_free += l_sc->sc_free; | ||
| 980 | sc->sc_dinodes += l_sc->sc_dinodes; | ||
| 981 | |||
| 982 | spin_unlock(&sdp->sd_statfs_spin); | ||
| 983 | |||
| 984 | if (sc->sc_free < 0) | ||
| 985 | sc->sc_free = 0; | ||
| 986 | if (sc->sc_free > sc->sc_total) | ||
| 987 | sc->sc_free = sc->sc_total; | ||
| 988 | if (sc->sc_dinodes < 0) | ||
| 989 | sc->sc_dinodes = 0; | ||
| 990 | |||
| 991 | return 0; | ||
| 992 | } | ||
| 993 | |||
| 994 | /** | ||
| 995 | * gfs2_statfs - Gather and return stats about the filesystem | ||
| 996 |  * @dentry: The dentry of the path being statted | ||
| 997 |  * @buf: The kstatfs buffer to fill in | ||
| 998 | * | ||
| 999 | * Returns: 0 on success or error code | ||
| 1000 | */ | ||
| 1001 | |||
| 1002 | static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
| 1003 | { | ||
| 1004 | struct super_block *sb = dentry->d_inode->i_sb; | ||
| 1005 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 1006 | struct gfs2_statfs_change_host sc; | ||
| 1007 | int error; | ||
| 1008 | |||
| 1009 | if (gfs2_tune_get(sdp, gt_statfs_slow)) | ||
| 1010 | error = gfs2_statfs_slow(sdp, &sc); | ||
| 1011 | else | ||
| 1012 | error = gfs2_statfs_i(sdp, &sc); | ||
| 1013 | |||
| 1014 | if (error) | ||
| 1015 | return error; | ||
| 1016 | |||
| 1017 | buf->f_type = GFS2_MAGIC; | ||
| 1018 | buf->f_bsize = sdp->sd_sb.sb_bsize; | ||
| 1019 | buf->f_blocks = sc.sc_total; | ||
| 1020 | buf->f_bfree = sc.sc_free; | ||
| 1021 | buf->f_bavail = sc.sc_free; | ||
| 1022 | buf->f_files = sc.sc_dinodes + sc.sc_free; | ||
| 1023 | buf->f_ffree = sc.sc_free; | ||
| 1024 | buf->f_namelen = GFS2_FNAMESIZE; | ||
| 1025 | |||
| 1026 | return 0; | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | /** | ||
| 1030 | * gfs2_remount_fs - called when the FS is remounted | ||
| 1031 | * @sb: the filesystem | ||
| 1032 | * @flags: the remount flags | ||
| 1033 | * @data: extra data passed in (not used right now) | ||
| 1034 | * | ||
| 1035 | * Returns: errno | ||
| 1036 | */ | ||
| 1037 | |||
| 1038 | static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) | ||
| 1039 | { | ||
| 1040 | struct gfs2_sbd *sdp = sb->s_fs_info; | ||
| 1041 | struct gfs2_args args = sdp->sd_args; /* Default to current settings */ | ||
| 1042 | struct gfs2_tune *gt = &sdp->sd_tune; | ||
| 1043 | int error; | ||
| 1044 | |||
| 1045 | spin_lock(>->gt_spin); | ||
| 1046 | args.ar_commit = gt->gt_log_flush_secs; | ||
| 1047 | spin_unlock(>->gt_spin); | ||
| 1048 | error = gfs2_mount_args(sdp, &args, data); | ||
| 1049 | if (error) | ||
| 1050 | return error; | ||
| 1051 | |||
| 1052 | /* Not allowed to change locking details */ | ||
| 1053 | if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || | ||
| 1054 | strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || | ||
| 1055 | strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) | ||
| 1056 | return -EINVAL; | ||
| 1057 | |||
| 1058 | /* Some flags must not be changed */ | ||
| 1059 | if (args_neq(&args, &sdp->sd_args, spectator) || | ||
| 1060 | args_neq(&args, &sdp->sd_args, ignore_local_fs) || | ||
| 1061 | args_neq(&args, &sdp->sd_args, localflocks) || | ||
| 1062 | args_neq(&args, &sdp->sd_args, localcaching) || | ||
| 1063 | args_neq(&args, &sdp->sd_args, meta)) | ||
| 1064 | return -EINVAL; | ||
| 1065 | |||
| 1066 | if (sdp->sd_args.ar_spectator) | ||
| 1067 | *flags |= MS_RDONLY; | ||
| 1068 | |||
| 1069 | if ((sb->s_flags ^ *flags) & MS_RDONLY) { | ||
| 1070 | if (*flags & MS_RDONLY) | ||
| 1071 | error = gfs2_make_fs_ro(sdp); | ||
| 1072 | else | ||
| 1073 | error = gfs2_make_fs_rw(sdp); | ||
| 1074 | if (error) | ||
| 1075 | return error; | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | sdp->sd_args = args; | ||
| 1079 | if (sdp->sd_args.ar_posix_acl) | ||
| 1080 | sb->s_flags |= MS_POSIXACL; | ||
| 1081 | else | ||
| 1082 | sb->s_flags &= ~MS_POSIXACL; | ||
| 1083 | spin_lock(>->gt_spin); | ||
| 1084 | gt->gt_log_flush_secs = args.ar_commit; | ||
| 1085 | spin_unlock(>->gt_spin); | ||
| 1086 | |||
| 1087 | return 0; | ||
| 1088 | } | ||
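The 'not allowed to change' checks above rely on the args_neq() token-pasting macro defined near the top of this file: for a field name x it compares ar_x in the two argument structures. A tiny illustration of the expansion with a hypothetical two-field struct:

#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)

struct demo_args {
	int ar_spectator;
	int ar_localflocks;
};

/* args_neq(a, b, spectator) expands to ((a)->ar_spectator != (b)->ar_spectator) */
static int locking_args_changed(const struct demo_args *a, const struct demo_args *b)
{
	return args_neq(a, b, spectator) || args_neq(a, b, localflocks);
}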
| 1089 | |||
| 1090 | /** | ||
| 1091 | * gfs2_drop_inode - Drop an inode (test for remote unlink) | ||
| 1092 | * @inode: The inode to drop | ||
| 1093 | * | ||
| 1094 |  * If we've received a callback on an iopen lock then it's because a | ||
| 1095 | * remote node tried to deallocate the inode but failed due to this node | ||
| 1096 | * still having the inode open. Here we mark the link count zero | ||
| 1097 | * since we know that it must have reached zero if the GLF_DEMOTE flag | ||
| 1098 | * is set on the iopen glock. If we didn't do a disk read since the | ||
| 1099 | * remote node removed the final link then we might otherwise miss | ||
| 1100 | * this event. This check ensures that this node will deallocate the | ||
| 1101 | * inode's blocks, or alternatively pass the baton on to another | ||
| 1102 | * node for later deallocation. | ||
| 1103 | */ | ||
| 1104 | |||
| 1105 | static void gfs2_drop_inode(struct inode *inode) | ||
| 1106 | { | ||
| 1107 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 1108 | |||
| 1109 | if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { | ||
| 1110 | struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; | ||
| 1111 | if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) | ||
| 1112 | clear_nlink(inode); | ||
| 1113 | } | ||
| 1114 | generic_drop_inode(inode); | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | /** | ||
| 1118 | * gfs2_clear_inode - Deallocate an inode when VFS is done with it | ||
| 1119 | * @inode: The VFS inode | ||
| 1120 | * | ||
| 1121 | */ | ||
| 1122 | |||
| 1123 | static void gfs2_clear_inode(struct inode *inode) | ||
| 1124 | { | ||
| 1125 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 1126 | |||
| 1127 |  /* This tells us it's a "real" inode and not one which only | ||
| 1128 | * serves to contain an address space (see rgrp.c, meta_io.c) | ||
| 1129 | * which therefore doesn't have its own glocks. | ||
| 1130 | */ | ||
| 1131 | if (test_bit(GIF_USER, &ip->i_flags)) { | ||
| 1132 | ip->i_gl->gl_object = NULL; | ||
| 1133 | gfs2_glock_put(ip->i_gl); | ||
| 1134 | ip->i_gl = NULL; | ||
| 1135 | if (ip->i_iopen_gh.gh_gl) { | ||
| 1136 | ip->i_iopen_gh.gh_gl->gl_object = NULL; | ||
| 1137 | gfs2_glock_dq_uninit(&ip->i_iopen_gh); | ||
| 1138 | } | ||
| 1139 | } | ||
| 1140 | } | ||
| 1141 | |||
| 1142 | static int is_ancestor(const struct dentry *d1, const struct dentry *d2) | ||
| 1143 | { | ||
| 1144 | do { | ||
| 1145 | if (d1 == d2) | ||
| 1146 | return 1; | ||
| 1147 | d1 = d1->d_parent; | ||
| 1148 | } while (!IS_ROOT(d1)); | ||
| 1149 | return 0; | ||
| 1150 | } | ||
| 1151 | |||
| 1152 | /** | ||
| 1153 | * gfs2_show_options - Show mount options for /proc/mounts | ||
| 1154 | * @s: seq_file structure | ||
| 1155 | * @mnt: vfsmount | ||
| 1156 | * | ||
| 1157 | * Returns: 0 on success or error code | ||
| 1158 | */ | ||
| 1159 | |||
| 1160 | static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | ||
| 1161 | { | ||
| 1162 | struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; | ||
| 1163 | struct gfs2_args *args = &sdp->sd_args; | ||
| 1164 | int lfsecs; | ||
| 1165 | |||
| 1166 | if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) | ||
| 1167 | seq_printf(s, ",meta"); | ||
| 1168 | if (args->ar_lockproto[0]) | ||
| 1169 | seq_printf(s, ",lockproto=%s", args->ar_lockproto); | ||
| 1170 | if (args->ar_locktable[0]) | ||
| 1171 | seq_printf(s, ",locktable=%s", args->ar_locktable); | ||
| 1172 | if (args->ar_hostdata[0]) | ||
| 1173 | seq_printf(s, ",hostdata=%s", args->ar_hostdata); | ||
| 1174 | if (args->ar_spectator) | ||
| 1175 | seq_printf(s, ",spectator"); | ||
| 1176 | if (args->ar_ignore_local_fs) | ||
| 1177 | seq_printf(s, ",ignore_local_fs"); | ||
| 1178 | if (args->ar_localflocks) | ||
| 1179 | seq_printf(s, ",localflocks"); | ||
| 1180 | if (args->ar_localcaching) | ||
| 1181 | seq_printf(s, ",localcaching"); | ||
| 1182 | if (args->ar_debug) | ||
| 1183 | seq_printf(s, ",debug"); | ||
| 1184 | if (args->ar_upgrade) | ||
| 1185 | seq_printf(s, ",upgrade"); | ||
| 1186 | if (args->ar_posix_acl) | ||
| 1187 | seq_printf(s, ",acl"); | ||
| 1188 | if (args->ar_quota != GFS2_QUOTA_DEFAULT) { | ||
| 1189 | char *state; | ||
| 1190 | switch (args->ar_quota) { | ||
| 1191 | case GFS2_QUOTA_OFF: | ||
| 1192 | state = "off"; | ||
| 1193 | break; | ||
| 1194 | case GFS2_QUOTA_ACCOUNT: | ||
| 1195 | state = "account"; | ||
| 1196 | break; | ||
| 1197 | case GFS2_QUOTA_ON: | ||
| 1198 | state = "on"; | ||
| 1199 | break; | ||
| 1200 | default: | ||
| 1201 | state = "unknown"; | ||
| 1202 | break; | ||
| 1203 | } | ||
| 1204 | seq_printf(s, ",quota=%s", state); | ||
| 1205 | } | ||
| 1206 | if (args->ar_suiddir) | ||
| 1207 | seq_printf(s, ",suiddir"); | ||
| 1208 | if (args->ar_data != GFS2_DATA_DEFAULT) { | ||
| 1209 | char *state; | ||
| 1210 | switch (args->ar_data) { | ||
| 1211 | case GFS2_DATA_WRITEBACK: | ||
| 1212 | state = "writeback"; | ||
| 1213 | break; | ||
| 1214 | case GFS2_DATA_ORDERED: | ||
| 1215 | state = "ordered"; | ||
| 1216 | break; | ||
| 1217 | default: | ||
| 1218 | state = "unknown"; | ||
| 1219 | break; | ||
| 1220 | } | ||
| 1221 | seq_printf(s, ",data=%s", state); | ||
| 1222 | } | ||
| 1223 | if (args->ar_discard) | ||
| 1224 | seq_printf(s, ",discard"); | ||
| 1225 | lfsecs = sdp->sd_tune.gt_log_flush_secs; | ||
| 1226 | if (lfsecs != 60) | ||
| 1227 | seq_printf(s, ",commit=%d", lfsecs); | ||
| 1228 | return 0; | ||
| 1229 | } | ||
| 1230 | |||
| 1231 | /* | ||
| 1232 |  * We have to (at the moment) hold the inode's main lock to cover | ||
| 1233 | * the gap between unlocking the shared lock on the iopen lock and | ||
| 1234 | * taking the exclusive lock. I'd rather do a shared -> exclusive | ||
| 1235 | * conversion on the iopen lock, but we can change that later. This | ||
| 1236 | * is safe, just less efficient. | ||
| 1237 | */ | ||
| 1238 | |||
| 1239 | static void gfs2_delete_inode(struct inode *inode) | ||
| 1240 | { | ||
| 1241 | struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; | ||
| 1242 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 1243 | struct gfs2_holder gh; | ||
| 1244 | int error; | ||
| 1245 | |||
| 1246 | if (!test_bit(GIF_USER, &ip->i_flags)) | ||
| 1247 | goto out; | ||
| 1248 | |||
| 1249 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | ||
| 1250 | if (unlikely(error)) { | ||
| 1251 | gfs2_glock_dq_uninit(&ip->i_iopen_gh); | ||
| 1252 | goto out; | ||
| 1253 | } | ||
| 1254 | |||
| 1255 | gfs2_glock_dq_wait(&ip->i_iopen_gh); | ||
| 1256 | gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); | ||
| 1257 | error = gfs2_glock_nq(&ip->i_iopen_gh); | ||
| 1258 | if (error) | ||
| 1259 | goto out_truncate; | ||
| 1260 | |||
| 1261 | if (S_ISDIR(inode->i_mode) && | ||
| 1262 | (ip->i_diskflags & GFS2_DIF_EXHASH)) { | ||
| 1263 | error = gfs2_dir_exhash_dealloc(ip); | ||
| 1264 | if (error) | ||
| 1265 | goto out_unlock; | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | if (ip->i_eattr) { | ||
| 1269 | error = gfs2_ea_dealloc(ip); | ||
| 1270 | if (error) | ||
| 1271 | goto out_unlock; | ||
| 1272 | } | ||
| 1273 | |||
| 1274 | if (!gfs2_is_stuffed(ip)) { | ||
| 1275 | error = gfs2_file_dealloc(ip); | ||
| 1276 | if (error) | ||
| 1277 | goto out_unlock; | ||
| 1278 | } | ||
| 1279 | |||
| 1280 | error = gfs2_dinode_dealloc(ip); | ||
| 1281 | if (error) | ||
| 1282 | goto out_unlock; | ||
| 1283 | |||
| 1284 | out_truncate: | ||
| 1285 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); | ||
| 1286 | if (error) | ||
| 1287 | goto out_unlock; | ||
| 1288 | /* Needs to be done before glock release & also in a transaction */ | ||
| 1289 | truncate_inode_pages(&inode->i_data, 0); | ||
| 1290 | gfs2_trans_end(sdp); | ||
| 1291 | |||
| 1292 | out_unlock: | ||
| 1293 | if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) | ||
| 1294 | gfs2_glock_dq(&ip->i_iopen_gh); | ||
| 1295 | gfs2_holder_uninit(&ip->i_iopen_gh); | ||
| 1296 | gfs2_glock_dq_uninit(&gh); | ||
| 1297 | if (error && error != GLR_TRYFAILED && error != -EROFS) | ||
| 1298 | fs_warn(sdp, "gfs2_delete_inode: %d\n", error); | ||
| 1299 | out: | ||
| 1300 | truncate_inode_pages(&inode->i_data, 0); | ||
| 1301 | clear_inode(inode); | ||
| 1302 | } | ||
| 1303 | |||
| 1304 | static struct inode *gfs2_alloc_inode(struct super_block *sb) | ||
| 1305 | { | ||
| 1306 | struct gfs2_inode *ip; | ||
| 1307 | |||
| 1308 | ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); | ||
| 1309 | if (ip) { | ||
| 1310 | ip->i_flags = 0; | ||
| 1311 | ip->i_gl = NULL; | ||
| 1312 | } | ||
| 1313 | return &ip->i_inode; | ||
| 1314 | } | ||
| 1315 | |||
| 1316 | static void gfs2_destroy_inode(struct inode *inode) | ||
| 1317 | { | ||
| 1318 | kmem_cache_free(gfs2_inode_cachep, inode); | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | const struct super_operations gfs2_super_ops = { | ||
| 1322 | .alloc_inode = gfs2_alloc_inode, | ||
| 1323 | .destroy_inode = gfs2_destroy_inode, | ||
| 1324 | .write_inode = gfs2_write_inode, | ||
| 1325 | .delete_inode = gfs2_delete_inode, | ||
| 1326 | .put_super = gfs2_put_super, | ||
| 1327 | .write_super = gfs2_write_super, | ||
| 1328 | .sync_fs = gfs2_sync_fs, | ||
| 1329 | .freeze_fs = gfs2_freeze, | ||
| 1330 | .unfreeze_fs = gfs2_unfreeze, | ||
| 1331 | .statfs = gfs2_statfs, | ||
| 1332 | .remount_fs = gfs2_remount_fs, | ||
| 1333 | .clear_inode = gfs2_clear_inode, | ||
| 1334 | .drop_inode = gfs2_drop_inode, | ||
| 1335 | .show_options = gfs2_show_options, | ||
| 1336 | }; | ||
| 1337 | |||
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7655f5025fec..23419dc3027b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c | |||
| @@ -26,6 +26,36 @@ | |||
| 26 | #include "util.h" | 26 | #include "util.h" |
| 27 | #include "glops.h" | 27 | #include "glops.h" |
| 28 | 28 | ||
| 29 | struct gfs2_attr { | ||
| 30 | struct attribute attr; | ||
| 31 | ssize_t (*show)(struct gfs2_sbd *, char *); | ||
| 32 | ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); | ||
| 33 | }; | ||
| 34 | |||
| 35 | static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, | ||
| 36 | char *buf) | ||
| 37 | { | ||
| 38 | struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); | ||
| 39 | struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); | ||
| 40 | return a->show ? a->show(sdp, buf) : 0; | ||
| 41 | } | ||
| 42 | |||
| 43 | static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, | ||
| 44 | const char *buf, size_t len) | ||
| 45 | { | ||
| 46 | struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); | ||
| 47 | struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); | ||
| 48 | return a->store ? a->store(sdp, buf, len) : len; | ||
| 49 | } | ||
| 50 | |||
| 51 | static struct sysfs_ops gfs2_attr_ops = { | ||
| 52 | .show = gfs2_attr_show, | ||
| 53 | .store = gfs2_attr_store, | ||
| 54 | }; | ||
| 55 | |||
| 56 | |||
| 57 | static struct kset *gfs2_kset; | ||
| 58 | |||
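The block moved up here is the standard sysfs attribute idiom: a subsystem wrapper embeds a struct attribute together with typed show/store callbacks, and the kobject-level handlers use container_of() to recover both the gfs2_sbd (from the embedded kobject) and the wrapper (from the embedded attribute) before dispatching. A stripped-down sketch of the same idiom with hypothetical names:

/* Sketch of the container_of() dispatch used above; names are illustrative. */
struct demo_obj {
	struct kobject kobj;
	int value;
};

struct demo_attr {
	struct attribute attr;
	ssize_t (*show)(struct demo_obj *, char *);
};

static ssize_t demo_attr_show(struct kobject *kobj, struct attribute *attr,
			      char *buf)
{
	/* recover the containing structures from their embedded members */
	struct demo_obj *obj = container_of(kobj, struct demo_obj, kobj);
	struct demo_attr *a = container_of(attr, struct demo_attr, attr);

	return a->show ? a->show(obj, buf) : 0;
}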
| 29 | static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) | 59 | static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) |
| 30 | { | 60 | { |
| 31 | return snprintf(buf, PAGE_SIZE, "%u:%u\n", | 61 | return snprintf(buf, PAGE_SIZE, "%u:%u\n", |
| @@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len | |||
| 212 | return len; | 242 | return len; |
| 213 | } | 243 | } |
| 214 | 244 | ||
| 215 | struct gfs2_attr { | ||
| 216 | struct attribute attr; | ||
| 217 | ssize_t (*show)(struct gfs2_sbd *, char *); | ||
| 218 | ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); | ||
| 219 | }; | ||
| 220 | 245 | ||
| 221 | #define GFS2_ATTR(name, mode, show, store) \ | 246 | #define GFS2_ATTR(name, mode, show, store) \ |
| 222 | static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) | 247 | static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) |
| @@ -246,58 +271,11 @@ static struct attribute *gfs2_attrs[] = { | |||
| 246 | NULL, | 271 | NULL, |
| 247 | }; | 272 | }; |
| 248 | 273 | ||
| 249 | static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, | ||
| 250 | char *buf) | ||
| 251 | { | ||
| 252 | struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); | ||
| 253 | struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); | ||
| 254 | return a->show ? a->show(sdp, buf) : 0; | ||
| 255 | } | ||
| 256 | |||
| 257 | static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, | ||
| 258 | const char *buf, size_t len) | ||
| 259 | { | ||
| 260 | struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); | ||
| 261 | struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); | ||
| 262 | return a->store ? a->store(sdp, buf, len) : len; | ||
| 263 | } | ||
| 264 | |||
| 265 | static struct sysfs_ops gfs2_attr_ops = { | ||
| 266 | .show = gfs2_attr_show, | ||
| 267 | .store = gfs2_attr_store, | ||
| 268 | }; | ||
| 269 | |||
| 270 | static struct kobj_type gfs2_ktype = { | 274 | static struct kobj_type gfs2_ktype = { |
| 271 | .default_attrs = gfs2_attrs, | 275 | .default_attrs = gfs2_attrs, |
| 272 | .sysfs_ops = &gfs2_attr_ops, | 276 | .sysfs_ops = &gfs2_attr_ops, |
| 273 | }; | 277 | }; |
| 274 | 278 | ||
| 275 | static struct kset *gfs2_kset; | ||
| 276 | |||
| 277 | /* | ||
| 278 | * display struct lm_lockstruct fields | ||
| 279 | */ | ||
| 280 | |||
| 281 | struct lockstruct_attr { | ||
| 282 | struct attribute attr; | ||
| 283 | ssize_t (*show)(struct gfs2_sbd *, char *); | ||
| 284 | }; | ||
| 285 | |||
| 286 | #define LOCKSTRUCT_ATTR(name, fmt) \ | ||
| 287 | static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ | ||
| 288 | { \ | ||
| 289 | return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ | ||
| 290 | } \ | ||
| 291 | static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) | ||
| 292 | |||
| 293 | LOCKSTRUCT_ATTR(jid, "%u\n"); | ||
| 294 | LOCKSTRUCT_ATTR(first, "%u\n"); | ||
| 295 | |||
| 296 | static struct attribute *lockstruct_attrs[] = { | ||
| 297 | &lockstruct_attr_jid.attr, | ||
| 298 | &lockstruct_attr_first.attr, | ||
| 299 | NULL, | ||
| 300 | }; | ||
| 301 | 279 | ||
| 302 | /* | 280 | /* |
| 303 | * lock_module. Originally from lock_dlm | 281 | * lock_module. Originally from lock_dlm |
| @@ -359,34 +337,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) | |||
| 359 | return sprintf(buf, "%d\n", ls->ls_first_done); | 337 | return sprintf(buf, "%d\n", ls->ls_first_done); |
| 360 | } | 338 | } |
| 361 | 339 | ||
| 362 | static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf) | 340 | static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) |
| 363 | { | ||
| 364 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; | ||
| 365 | return sprintf(buf, "%d\n", ls->ls_recover_jid); | ||
| 366 | } | ||
| 367 | |||
| 368 | static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) | ||
| 369 | { | 341 | { |
| 342 | unsigned jid; | ||
| 370 | struct gfs2_jdesc *jd; | 343 | struct gfs2_jdesc *jd; |
| 344 | int rv; | ||
| 345 | |||
| 346 | rv = sscanf(buf, "%u", &jid); | ||
| 347 | if (rv != 1) | ||
| 348 | return -EINVAL; | ||
| 371 | 349 | ||
| 350 | rv = -ESHUTDOWN; | ||
| 372 | spin_lock(&sdp->sd_jindex_spin); | 351 | spin_lock(&sdp->sd_jindex_spin); |
| 352 | if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) | ||
| 353 | goto out; | ||
| 354 | rv = -EBUSY; | ||
| 355 | if (sdp->sd_jdesc->jd_jid == jid) | ||
| 356 | goto out; | ||
| 357 | rv = -ENOENT; | ||
| 373 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | 358 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { |
| 374 | if (jd->jd_jid != jid) | 359 | if (jd->jd_jid != jid) |
| 375 | continue; | 360 | continue; |
| 376 | jd->jd_dirty = 1; | 361 | rv = slow_work_enqueue(&jd->jd_work); |
| 377 | break; | 362 | break; |
| 378 | } | 363 | } |
| 364 | out: | ||
| 379 | spin_unlock(&sdp->sd_jindex_spin); | 365 | spin_unlock(&sdp->sd_jindex_spin); |
| 380 | } | 366 | return rv ? rv : len; |
| 381 | |||
| 382 | static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | ||
| 383 | { | ||
| 384 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; | ||
| 385 | ls->ls_recover_jid = simple_strtol(buf, NULL, 0); | ||
| 386 | gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid); | ||
| 387 | if (sdp->sd_recoverd_process) | ||
| 388 | wake_up_process(sdp->sd_recoverd_process); | ||
| 389 | return len; | ||
| 390 | } | 367 | } |
| 391 | 368 | ||
| 392 | static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) | 369 | static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) |
| @@ -401,31 +378,31 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) | |||
| 401 | return sprintf(buf, "%d\n", ls->ls_recover_jid_status); | 378 | return sprintf(buf, "%d\n", ls->ls_recover_jid_status); |
| 402 | } | 379 | } |
| 403 | 380 | ||
| 404 | struct gdlm_attr { | 381 | static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) |
| 405 | struct attribute attr; | 382 | { |
| 406 | ssize_t (*show)(struct gfs2_sbd *sdp, char *); | 383 | return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); |
| 407 | ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t); | 384 | } |
| 408 | }; | ||
| 409 | 385 | ||
| 410 | #define GDLM_ATTR(_name,_mode,_show,_store) \ | 386 | #define GDLM_ATTR(_name,_mode,_show,_store) \ |
| 411 | static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) | 387 | static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) |
| 412 | 388 | ||
| 413 | GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); | 389 | GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); |
| 414 | GDLM_ATTR(block, 0644, block_show, block_store); | 390 | GDLM_ATTR(block, 0644, block_show, block_store); |
| 415 | GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); | 391 | GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); |
| 416 | GDLM_ATTR(id, 0444, lkid_show, NULL); | 392 | GDLM_ATTR(id, 0444, lkid_show, NULL); |
| 417 | GDLM_ATTR(first, 0444, lkfirst_show, NULL); | 393 | GDLM_ATTR(jid, 0444, jid_show, NULL); |
| 418 | GDLM_ATTR(first_done, 0444, first_done_show, NULL); | 394 | GDLM_ATTR(first, 0444, lkfirst_show, NULL); |
| 419 | GDLM_ATTR(recover, 0644, recover_show, recover_store); | 395 | GDLM_ATTR(first_done, 0444, first_done_show, NULL); |
| 420 | GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); | 396 | GDLM_ATTR(recover, 0200, NULL, recover_store); |
| 421 | GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); | 397 | GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); |
| 398 | GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); | ||
| 422 | 399 | ||
| 423 | static struct attribute *lock_module_attrs[] = { | 400 | static struct attribute *lock_module_attrs[] = { |
| 424 | &gdlm_attr_proto_name.attr, | 401 | &gdlm_attr_proto_name.attr, |
| 425 | &gdlm_attr_block.attr, | 402 | &gdlm_attr_block.attr, |
| 426 | &gdlm_attr_withdraw.attr, | 403 | &gdlm_attr_withdraw.attr, |
| 427 | &gdlm_attr_id.attr, | 404 | &gdlm_attr_id.attr, |
| 428 | &lockstruct_attr_jid.attr, | 405 | &gdlm_attr_jid.attr, |
| 429 | &gdlm_attr_first.attr, | 406 | &gdlm_attr_first.attr, |
| 430 | &gdlm_attr_first_done.attr, | 407 | &gdlm_attr_first_done.attr, |
| 431 | &gdlm_attr_recover.attr, | 408 | &gdlm_attr_recover.attr, |
| @@ -435,53 +412,6 @@ static struct attribute *lock_module_attrs[] = { | |||
| 435 | }; | 412 | }; |
| 436 | 413 | ||
| 437 | /* | 414 | /* |
| 438 | * display struct gfs2_args fields | ||
| 439 | */ | ||
| 440 | |||
| 441 | struct args_attr { | ||
| 442 | struct attribute attr; | ||
| 443 | ssize_t (*show)(struct gfs2_sbd *, char *); | ||
| 444 | }; | ||
| 445 | |||
| 446 | #define ARGS_ATTR(name, fmt) \ | ||
| 447 | static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ | ||
| 448 | { \ | ||
| 449 | return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ | ||
| 450 | } \ | ||
| 451 | static struct args_attr args_attr_##name = __ATTR_RO(name) | ||
| 452 | |||
| 453 | ARGS_ATTR(lockproto, "%s\n"); | ||
| 454 | ARGS_ATTR(locktable, "%s\n"); | ||
| 455 | ARGS_ATTR(hostdata, "%s\n"); | ||
| 456 | ARGS_ATTR(spectator, "%d\n"); | ||
| 457 | ARGS_ATTR(ignore_local_fs, "%d\n"); | ||
| 458 | ARGS_ATTR(localcaching, "%d\n"); | ||
| 459 | ARGS_ATTR(localflocks, "%d\n"); | ||
| 460 | ARGS_ATTR(debug, "%d\n"); | ||
| 461 | ARGS_ATTR(upgrade, "%d\n"); | ||
| 462 | ARGS_ATTR(posix_acl, "%d\n"); | ||
| 463 | ARGS_ATTR(quota, "%u\n"); | ||
| 464 | ARGS_ATTR(suiddir, "%d\n"); | ||
| 465 | ARGS_ATTR(data, "%d\n"); | ||
| 466 | |||
| 467 | static struct attribute *args_attrs[] = { | ||
| 468 | &args_attr_lockproto.attr, | ||
| 469 | &args_attr_locktable.attr, | ||
| 470 | &args_attr_hostdata.attr, | ||
| 471 | &args_attr_spectator.attr, | ||
| 472 | &args_attr_ignore_local_fs.attr, | ||
| 473 | &args_attr_localcaching.attr, | ||
| 474 | &args_attr_localflocks.attr, | ||
| 475 | &args_attr_debug.attr, | ||
| 476 | &args_attr_upgrade.attr, | ||
| 477 | &args_attr_posix_acl.attr, | ||
| 478 | &args_attr_quota.attr, | ||
| 479 | &args_attr_suiddir.attr, | ||
| 480 | &args_attr_data.attr, | ||
| 481 | NULL, | ||
| 482 | }; | ||
| 483 | |||
| 484 | /* | ||
| 485 | * get and set struct gfs2_tune fields | 415 | * get and set struct gfs2_tune fields |
| 486 | */ | 416 | */ |
| 487 | 417 | ||
| @@ -531,14 +461,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, | |||
| 531 | return len; | 461 | return len; |
| 532 | } | 462 | } |
| 533 | 463 | ||
| 534 | struct tune_attr { | ||
| 535 | struct attribute attr; | ||
| 536 | ssize_t (*show)(struct gfs2_sbd *, char *); | ||
| 537 | ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); | ||
| 538 | }; | ||
| 539 | |||
| 540 | #define TUNE_ATTR_3(name, show, store) \ | 464 | #define TUNE_ATTR_3(name, show, store) \ |
| 541 | static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) | 465 | static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) |
| 542 | 466 | ||
| 543 | #define TUNE_ATTR_2(name, store) \ | 467 | #define TUNE_ATTR_2(name, store) \ |
| 544 | static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ | 468 | static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ |
| @@ -554,15 +478,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ | |||
| 554 | } \ | 478 | } \ |
| 555 | TUNE_ATTR_2(name, name##_store) | 479 | TUNE_ATTR_2(name, name##_store) |
| 556 | 480 | ||
| 557 | #define TUNE_ATTR_DAEMON(name, process) \ | ||
| 558 | static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ | ||
| 559 | { \ | ||
| 560 | ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \ | ||
| 561 | wake_up_process(sdp->sd_##process); \ | ||
| 562 | return r; \ | ||
| 563 | } \ | ||
| 564 | TUNE_ATTR_2(name, name##_store) | ||
| 565 | |||
| 566 | TUNE_ATTR(incore_log_blocks, 0); | 481 | TUNE_ATTR(incore_log_blocks, 0); |
| 567 | TUNE_ATTR(log_flush_secs, 0); | 482 | TUNE_ATTR(log_flush_secs, 0); |
| 568 | TUNE_ATTR(quota_warn_period, 0); | 483 | TUNE_ATTR(quota_warn_period, 0); |
| @@ -574,8 +489,6 @@ TUNE_ATTR(new_files_jdata, 0); | |||
| 574 | TUNE_ATTR(quota_simul_sync, 1); | 489 | TUNE_ATTR(quota_simul_sync, 1); |
| 575 | TUNE_ATTR(stall_secs, 1); | 490 | TUNE_ATTR(stall_secs, 1); |
| 576 | TUNE_ATTR(statfs_quantum, 1); | 491 | TUNE_ATTR(statfs_quantum, 1); |
| 577 | TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); | ||
| 578 | TUNE_ATTR_DAEMON(logd_secs, logd_process); | ||
| 579 | TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); | 492 | TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); |
| 580 | 493 | ||
| 581 | static struct attribute *tune_attrs[] = { | 494 | static struct attribute *tune_attrs[] = { |
| @@ -589,23 +502,11 @@ static struct attribute *tune_attrs[] = { | |||
| 589 | &tune_attr_quota_simul_sync.attr, | 502 | &tune_attr_quota_simul_sync.attr, |
| 590 | &tune_attr_stall_secs.attr, | 503 | &tune_attr_stall_secs.attr, |
| 591 | &tune_attr_statfs_quantum.attr, | 504 | &tune_attr_statfs_quantum.attr, |
| 592 | &tune_attr_recoverd_secs.attr, | ||
| 593 | &tune_attr_logd_secs.attr, | ||
| 594 | &tune_attr_quota_scale.attr, | 505 | &tune_attr_quota_scale.attr, |
| 595 | &tune_attr_new_files_jdata.attr, | 506 | &tune_attr_new_files_jdata.attr, |
| 596 | NULL, | 507 | NULL, |
| 597 | }; | 508 | }; |
| 598 | 509 | ||
| 599 | static struct attribute_group lockstruct_group = { | ||
| 600 | .name = "lockstruct", | ||
| 601 | .attrs = lockstruct_attrs, | ||
| 602 | }; | ||
| 603 | |||
| 604 | static struct attribute_group args_group = { | ||
| 605 | .name = "args", | ||
| 606 | .attrs = args_attrs, | ||
| 607 | }; | ||
| 608 | |||
| 609 | static struct attribute_group tune_group = { | 510 | static struct attribute_group tune_group = { |
| 610 | .name = "tune", | 511 | .name = "tune", |
| 611 | .attrs = tune_attrs, | 512 | .attrs = tune_attrs, |
| @@ -626,17 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) | |||
| 626 | if (error) | 527 | if (error) |
| 627 | goto fail; | 528 | goto fail; |
| 628 | 529 | ||
| 629 | error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group); | ||
| 630 | if (error) | ||
| 631 | goto fail_reg; | ||
| 632 | |||
| 633 | error = sysfs_create_group(&sdp->sd_kobj, &args_group); | ||
| 634 | if (error) | ||
| 635 | goto fail_lockstruct; | ||
| 636 | |||
| 637 | error = sysfs_create_group(&sdp->sd_kobj, &tune_group); | 530 | error = sysfs_create_group(&sdp->sd_kobj, &tune_group); |
| 638 | if (error) | 531 | if (error) |
| 639 | goto fail_args; | 532 | goto fail_reg; |
| 640 | 533 | ||
| 641 | error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); | 534 | error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); |
| 642 | if (error) | 535 | if (error) |
| @@ -647,10 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) | |||
| 647 | 540 | ||
| 648 | fail_tune: | 541 | fail_tune: |
| 649 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); | 542 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); |
| 650 | fail_args: | ||
| 651 | sysfs_remove_group(&sdp->sd_kobj, &args_group); | ||
| 652 | fail_lockstruct: | ||
| 653 | sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); | ||
| 654 | fail_reg: | 543 | fail_reg: |
| 655 | kobject_put(&sdp->sd_kobj); | 544 | kobject_put(&sdp->sd_kobj); |
| 656 | fail: | 545 | fail: |
| @@ -661,8 +550,6 @@ fail: | |||
| 661 | void gfs2_sys_fs_del(struct gfs2_sbd *sdp) | 550 | void gfs2_sys_fs_del(struct gfs2_sbd *sdp) |
| 662 | { | 551 | { |
| 663 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); | 552 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); |
| 664 | sysfs_remove_group(&sdp->sd_kobj, &args_group); | ||
| 665 | sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); | ||
| 666 | sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); | 553 | sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); |
| 667 | kobject_put(&sdp->sd_kobj); | 554 | kobject_put(&sdp->sd_kobj); |
| 668 | } | 555 | } |
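The tune attributes that survive this cleanup are still exported through sysfs_create_group(&sdp->sd_kobj, &tune_group), so each one remains a small text file under the per-filesystem gfs2 kobject. A hypothetical userspace sketch, assuming the conventional /sys/fs/gfs2/<lock table name>/tune/ layout (the path layout and the "mycluster:myfs" identifier are illustrative guesses, not taken from the patch):

#include <stdio.h>

/* read one gfs2 tunable; everything except the "tune/<name>" suffix is a guess */
static int read_tunable(const char *fsid, const char *name)
{
	char path[256], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/gfs2/%s/tune/%s", fsid, name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", name, buf);
	fclose(f);
	return 0;
}

int main(void)
{
	return read_tunable("mycluster:myfs", "quota_simul_sync") ? 1 : 0;
}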
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 053752d4b27f..4ef0e9fa3549 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c | |||
| @@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, | |||
| 33 | BUG_ON(current->journal_info); | 33 | BUG_ON(current->journal_info); |
| 34 | BUG_ON(blocks == 0 && revokes == 0); | 34 | BUG_ON(blocks == 0 && revokes == 0); |
| 35 | 35 | ||
| 36 | if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) | ||
| 37 | return -EROFS; | ||
| 38 | |||
| 36 | tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); | 39 | tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); |
| 37 | if (!tr) | 40 | if (!tr) |
| 38 | return -ENOMEM; | 41 | return -ENOMEM; |
| @@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, | |||
| 54 | if (error) | 57 | if (error) |
| 55 | goto fail_holder_uninit; | 58 | goto fail_holder_uninit; |
| 56 | 59 | ||
| 57 | if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { | ||
| 58 | tr->tr_t_gh.gh_flags |= GL_NOCACHE; | ||
| 59 | error = -EROFS; | ||
| 60 | goto fail_gunlock; | ||
| 61 | } | ||
| 62 | |||
| 63 | error = gfs2_log_reserve(sdp, tr->tr_reserved); | 60 | error = gfs2_log_reserve(sdp, tr->tr_reserved); |
| 64 | if (error) | 61 | if (error) |
| 65 | goto fail_gunlock; | 62 | goto fail_gunlock; |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c1462d43e721..941c8425c10b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/dnotify.h> | 30 | #include <linux/dnotify.h> |
| 31 | #include <linux/statfs.h> | 31 | #include <linux/statfs.h> |
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 33 | #include <linux/ima.h> | ||
| 33 | 34 | ||
| 34 | #include <asm/uaccess.h> | 35 | #include <asm/uaccess.h> |
| 35 | 36 | ||
| @@ -986,6 +987,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) | |||
| 986 | &hugetlbfs_file_operations); | 987 | &hugetlbfs_file_operations); |
| 987 | if (!file) | 988 | if (!file) |
| 988 | goto out_dentry; /* inode is already attached */ | 989 | goto out_dentry; /* inode is already attached */ |
| 990 | ima_counts_get(file); | ||
| 989 | 991 | ||
| 990 | return file; | 992 | return file; |
| 991 | 993 | ||
diff --git a/fs/inode.c b/fs/inode.c index bca0c618fdb3..ca337014ae29 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/cdev.h> | 22 | #include <linux/cdev.h> |
| 23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
| 24 | #include <linux/inotify.h> | 24 | #include <linux/inotify.h> |
| 25 | #include <linux/fsnotify.h> | ||
| 25 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
| 26 | #include <linux/async.h> | 27 | #include <linux/async.h> |
| 27 | 28 | ||
| @@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
| 189 | inode->i_private = NULL; | 190 | inode->i_private = NULL; |
| 190 | inode->i_mapping = mapping; | 191 | inode->i_mapping = mapping; |
| 191 | 192 | ||
| 193 | #ifdef CONFIG_FSNOTIFY | ||
| 194 | inode->i_fsnotify_mask = 0; | ||
| 195 | #endif | ||
| 196 | |||
| 192 | return inode; | 197 | return inode; |
| 193 | 198 | ||
| 194 | out_free_security: | 199 | out_free_security: |
| @@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode) | |||
| 221 | BUG_ON(inode_has_buffers(inode)); | 226 | BUG_ON(inode_has_buffers(inode)); |
| 222 | ima_inode_free(inode); | 227 | ima_inode_free(inode); |
| 223 | security_inode_free(inode); | 228 | security_inode_free(inode); |
| 229 | fsnotify_inode_delete(inode); | ||
| 224 | if (inode->i_sb->s_op->destroy_inode) | 230 | if (inode->i_sb->s_op->destroy_inode) |
| 225 | inode->i_sb->s_op->destroy_inode(inode); | 231 | inode->i_sb->s_op->destroy_inode(inode); |
| 226 | else | 232 | else |
| @@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode) | |||
| 252 | INIT_LIST_HEAD(&inode->inotify_watches); | 258 | INIT_LIST_HEAD(&inode->inotify_watches); |
| 253 | mutex_init(&inode->inotify_mutex); | 259 | mutex_init(&inode->inotify_mutex); |
| 254 | #endif | 260 | #endif |
| 261 | #ifdef CONFIG_FSNOTIFY | ||
| 262 | INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries); | ||
| 263 | #endif | ||
| 255 | } | 264 | } |
| 256 | EXPORT_SYMBOL(inode_init_once); | 265 | EXPORT_SYMBOL(inode_init_once); |
| 257 | 266 | ||
| @@ -398,6 +407,7 @@ int invalidate_inodes(struct super_block *sb) | |||
| 398 | mutex_lock(&iprune_mutex); | 407 | mutex_lock(&iprune_mutex); |
| 399 | spin_lock(&inode_lock); | 408 | spin_lock(&inode_lock); |
| 400 | inotify_unmount_inodes(&sb->s_inodes); | 409 | inotify_unmount_inodes(&sb->s_inodes); |
| 410 | fsnotify_unmount_inodes(&sb->s_inodes); | ||
| 401 | busy = invalidate_list(&sb->s_inodes, &throw_away); | 411 | busy = invalidate_list(&sb->s_inodes, &throw_away); |
| 402 | spin_unlock(&inode_lock); | 412 | spin_unlock(&inode_lock); |
| 403 | 413 | ||
diff --git a/fs/ioctl.c b/fs/ioctl.c index 82d9c42b8bac..286f38dfc6c0 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c | |||
| @@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd, | |||
| 414 | switch (cmd) { | 414 | switch (cmd) { |
| 415 | case FIBMAP: | 415 | case FIBMAP: |
| 416 | return ioctl_fibmap(filp, p); | 416 | return ioctl_fibmap(filp, p); |
| 417 | case FS_IOC_FIEMAP: | ||
| 418 | return ioctl_fiemap(filp, arg); | ||
| 419 | case FIGETBSZ: | ||
| 420 | return put_user(inode->i_sb->s_blocksize, p); | ||
| 421 | case FIONREAD: | 417 | case FIONREAD: |
| 422 | return put_user(i_size_read(inode) - filp->f_pos, p); | 418 | return put_user(i_size_read(inode) - filp->f_pos, p); |
| 423 | } | 419 | } |
| @@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, | |||
| 557 | error = ioctl_fsthaw(filp); | 553 | error = ioctl_fsthaw(filp); |
| 558 | break; | 554 | break; |
| 559 | 555 | ||
| 556 | case FS_IOC_FIEMAP: | ||
| 557 | return ioctl_fiemap(filp, arg); | ||
| 558 | |||
| 559 | case FIGETBSZ: | ||
| 560 | { | ||
| 561 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
| 562 | int __user *p = (int __user *)arg; | ||
| 563 | return put_user(inode->i_sb->s_blocksize, p); | ||
| 564 | } | ||
| 565 | |||
| 560 | default: | 566 | default: |
| 561 | if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) | 567 | if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) |
| 562 | error = file_ioctl(filp, cmd, arg); | 568 | error = file_ioctl(filp, cmd, arg); |
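The ioctl.c change above moves FS_IOC_FIEMAP and FIGETBSZ out of file_ioctl(), which is only reached for regular files, and into do_vfs_ioctl(), so both ioctls now work on directories and other file types as well. A hypothetical test program for FIGETBSZ (not part of the patch) that exercises the relocated put_user(inode->i_sb->s_blocksize, p) path:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIGETBSZ */

int main(int argc, char **argv)
{
	int fd, blocksize;

	/* after this patch a directory works too, not just a regular file */
	fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);
	if (fd < 0)
		return 1;
	if (ioctl(fd, FIGETBSZ, &blocksize) == 0)
		printf("filesystem block size: %d\n", blocksize);
	close(fd);
	return 0;
}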
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 58144102bf25..62be7d294ec2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
| @@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) | |||
| 1781 | * Journal abort has very specific semantics, which we describe | 1781 | * Journal abort has very specific semantics, which we describe |
| 1782 | * for journal abort. | 1782 | * for journal abort. |
| 1783 | * | 1783 | * |
| 1784 | * Two internal function, which provide abort to te jbd layer | 1784 | * Two internal functions, which provide abort to the jbd layer |
| 1785 | * itself are here. | 1785 | * itself are here. |
| 1786 | */ | 1786 | */ |
| 1787 | 1787 | ||
| @@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) | |||
| 1879 | * int jbd2_journal_errno () - returns the journal's error state. | 1879 | * int jbd2_journal_errno () - returns the journal's error state. |
| 1880 | * @journal: journal to examine. | 1880 | * @journal: journal to examine. |
| 1881 | * | 1881 | * |
| 1882 | * This is the errno numbet set with jbd2_journal_abort(), the last | 1882 | * This is the errno number set with jbd2_journal_abort(), the last |
| 1883 | * time the journal was mounted - if the journal was stopped | 1883 | * time the journal was mounted - if the journal was stopped |
| 1884 | * without calling abort this will be 0. | 1884 | * without calling abort this will be 0. |
| 1885 | * | 1885 | * |
| @@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal) | |||
| 1903 | * int jbd2_journal_clear_err () - clears the journal's error state | 1903 | * int jbd2_journal_clear_err () - clears the journal's error state |
| 1904 | * @journal: journal to act on. | 1904 | * @journal: journal to act on. |
| 1905 | * | 1905 | * |
| 1906 | * An error must be cleared or Acked to take a FS out of readonly | 1906 | * An error must be cleared or acked to take a FS out of readonly |
| 1907 | * mode. | 1907 | * mode. |
| 1908 | */ | 1908 | */ |
| 1909 | int jbd2_journal_clear_err(journal_t *journal) | 1909 | int jbd2_journal_clear_err(journal_t *journal) |
| @@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal) | |||
| 1923 | * void jbd2_journal_ack_err() - Ack journal err. | 1923 | * void jbd2_journal_ack_err() - Ack journal err. |
| 1924 | * @journal: journal to act on. | 1924 | * @journal: journal to act on. |
| 1925 | * | 1925 | * |
| 1926 | * An error must be cleared or Acked to take a FS out of readonly | 1926 | * An error must be cleared or acked to take a FS out of readonly |
| 1927 | * mode. | 1927 | * mode. |
| 1928 | */ | 1928 | */ |
| 1929 | void jbd2_journal_ack_err(journal_t *journal) | 1929 | void jbd2_journal_ack_err(journal_t *journal) |
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 346057218edc..0fc30407f039 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c | |||
| @@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) | |||
| 2571 | 2571 | ||
| 2572 | txAbort(tid, 0); | 2572 | txAbort(tid, 0); |
| 2573 | txEnd(tid); | 2573 | txEnd(tid); |
| 2574 | mutex_unlock(&JFS_IP(ipimap)->commit_mutex); | ||
| 2574 | 2575 | ||
| 2575 | /* release the inode map lock */ | 2576 | /* release the inode map lock */ |
| 2576 | IWRITE_UNLOCK(ipimap); | 2577 | IWRITE_UNLOCK(ipimap); |
diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6f21adf9479a..d9b0e92b3602 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c | |||
| @@ -720,8 +720,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, | |||
| 720 | blk++; | 720 | blk++; |
| 721 | } | 721 | } |
| 722 | out: | 722 | out: |
| 723 | if (len == towrite) | 723 | if (len == towrite) { |
| 724 | mutex_unlock(&inode->i_mutex); | ||
| 724 | return err; | 725 | return err; |
| 726 | } | ||
| 725 | if (inode->i_size < off+len-towrite) | 727 | if (inode->i_size < off+len-towrite) |
| 726 | i_size_write(inode, off+len-towrite); | 728 | i_size_write(inode, off+len-towrite); |
| 727 | inode->i_version++; | 729 | inode->i_version++; |
diff --git a/fs/mpage.c b/fs/mpage.c index 680ba60863ff..42381bd6543b 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
| @@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, | |||
| 379 | struct buffer_head map_bh; | 379 | struct buffer_head map_bh; |
| 380 | unsigned long first_logical_block = 0; | 380 | unsigned long first_logical_block = 0; |
| 381 | 381 | ||
| 382 | clear_buffer_mapped(&map_bh); | 382 | map_bh.b_state = 0; |
| 383 | map_bh.b_size = 0; | ||
| 383 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 384 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
| 384 | struct page *page = list_entry(pages->prev, struct page, lru); | 385 | struct page *page = list_entry(pages->prev, struct page, lru); |
| 385 | 386 | ||
| @@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block) | |||
| 412 | struct buffer_head map_bh; | 413 | struct buffer_head map_bh; |
| 413 | unsigned long first_logical_block = 0; | 414 | unsigned long first_logical_block = 0; |
| 414 | 415 | ||
| 415 | clear_buffer_mapped(&map_bh); | 416 | map_bh.b_state = 0; |
| 417 | map_bh.b_size = 0; | ||
| 416 | bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, | 418 | bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, |
| 417 | &map_bh, &first_logical_block, get_block); | 419 | &map_bh, &first_logical_block, get_block); |
| 418 | if (bio) | 420 | if (bio) |
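map_bh in both mpage hunks is an on-stack buffer_head, so every field starts out as stack garbage; clear_buffer_mapped() only clears one bit of that garbage b_state and never touches b_size, yet do_mpage_readpage() looks at both when deciding whether a block is already covered by a previous get_block() mapping. Zeroing the two fields gives it a clean starting point. A small illustrative helper capturing the same initialization (the helper name is made up; the patch simply open-codes the two assignments):

#include <linux/buffer_head.h>

/* illustrative only: prepare an on-stack buffer_head used as a get_block()
 * "mapping cache" by the mpage read paths */
static inline void mpage_init_map_bh(struct buffer_head *map_bh)
{
	/* the whole struct is uninitialized stack memory, so clearing a single
	 * bit is not enough: zero the state word and the cached mapping size */
	map_bh->b_state = 0;
	map_bh->b_size = 0;
}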
diff --git a/fs/namei.c b/fs/namei.c index 967c3db92724..c82805d088e1 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
| @@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) | |||
| 853 | err = inode_permission(nd->path.dentry->d_inode, | 853 | err = inode_permission(nd->path.dentry->d_inode, |
| 854 | MAY_EXEC); | 854 | MAY_EXEC); |
| 855 | if (!err) | 855 | if (!err) |
| 856 | err = ima_path_check(&nd->path, MAY_EXEC); | 856 | err = ima_path_check(&nd->path, MAY_EXEC, |
| 857 | IMA_COUNT_UPDATE); | ||
| 857 | if (err) | 858 | if (err) |
| 858 | break; | 859 | break; |
| 859 | 860 | ||
| @@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag) | |||
| 1515 | return error; | 1516 | return error; |
| 1516 | 1517 | ||
| 1517 | error = ima_path_check(path, | 1518 | error = ima_path_check(path, |
| 1518 | acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); | 1519 | acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), |
| 1520 | IMA_COUNT_UPDATE); | ||
| 1519 | if (error) | 1521 | if (error) |
| 1520 | return error; | 1522 | return error; |
| 1521 | /* | 1523 | /* |
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b660435978d2..bd584bcf1d9f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
| @@ -55,6 +55,7 @@ | |||
| 55 | #include <linux/security.h> | 55 | #include <linux/security.h> |
| 56 | #endif /* CONFIG_NFSD_V4 */ | 56 | #endif /* CONFIG_NFSD_V4 */ |
| 57 | #include <linux/jhash.h> | 57 | #include <linux/jhash.h> |
| 58 | #include <linux/ima.h> | ||
| 58 | 59 | ||
| 59 | #include <asm/uaccess.h> | 60 | #include <asm/uaccess.h> |
| 60 | 61 | ||
| @@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, | |||
| 735 | flags, cred); | 736 | flags, cred); |
| 736 | if (IS_ERR(*filp)) | 737 | if (IS_ERR(*filp)) |
| 737 | host_err = PTR_ERR(*filp); | 738 | host_err = PTR_ERR(*filp); |
| 739 | else | ||
| 740 | ima_counts_get(*filp); | ||
| 738 | out_nfserr: | 741 | out_nfserr: |
| 739 | err = nfserrno(host_err); | 742 | err = nfserrno(host_err); |
| 740 | out: | 743 | out: |
| @@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, | |||
| 2024 | struct dentry *dentry, int acc) | 2027 | struct dentry *dentry, int acc) |
| 2025 | { | 2028 | { |
| 2026 | struct inode *inode = dentry->d_inode; | 2029 | struct inode *inode = dentry->d_inode; |
| 2030 | struct path path; | ||
| 2027 | int err; | 2031 | int err; |
| 2028 | 2032 | ||
| 2029 | if (acc == NFSD_MAY_NOP) | 2033 | if (acc == NFSD_MAY_NOP) |
| @@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, | |||
| 2096 | if (err == -EACCES && S_ISREG(inode->i_mode) && | 2100 | if (err == -EACCES && S_ISREG(inode->i_mode) && |
| 2097 | acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) | 2101 | acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) |
| 2098 | err = inode_permission(inode, MAY_EXEC); | 2102 | err = inode_permission(inode, MAY_EXEC); |
| 2103 | if (err) | ||
| 2104 | goto nfsd_out; | ||
| 2099 | 2105 | ||
| 2106 | /* Do integrity (permission) checking now, but defer incrementing | ||
| 2107 | * IMA counts to the actual file open. | ||
| 2108 | */ | ||
| 2109 | path.mnt = exp->ex_path.mnt; | ||
| 2110 | path.dentry = dentry; | ||
| 2111 | err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC), | ||
| 2112 | IMA_COUNT_LEAVE); | ||
| 2113 | nfsd_out: | ||
| 2100 | return err? nfserrno(err) : 0; | 2114 | return err? nfserrno(err) : 0; |
| 2101 | } | 2115 | } |
| 2102 | 2116 | ||
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 7f65b3be4aa9..a91f15b8673c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c | |||
| @@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) | |||
| 515 | 515 | ||
| 516 | blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); | 516 | blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); |
| 517 | if (sb->s_blocksize != blocksize) { | 517 | if (sb->s_blocksize != blocksize) { |
| 518 | int hw_blocksize = bdev_hardsect_size(sb->s_bdev); | 518 | int hw_blocksize = bdev_logical_block_size(sb->s_bdev); |
| 519 | 519 | ||
| 520 | if (blocksize < hw_blocksize) { | 520 | if (blocksize < hw_blocksize) { |
| 521 | printk(KERN_ERR | 521 | printk(KERN_ERR |
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 50914d7303c6..31dac7e3b0f1 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig | |||
| @@ -1,2 +1,15 @@ | |||
| 1 | config FSNOTIFY | ||
| 2 | bool "Filesystem notification backend" | ||
| 3 | default y | ||
| 4 | ---help--- | ||
| 5 | fsnotify is a backend for filesystem notification. fsnotify does | ||
| 6 | not provide any userspace interface but does provide the basis | ||
| 7 | needed for other notification schemes such as dnotify, inotify, | ||
| 8 | and fanotify. | ||
| 9 | |||
| 10 | Say Y here to enable fsnotify support. | ||
| 11 | |||
| 12 | If unsure, say Y. | ||
| 13 | |||
| 1 | source "fs/notify/dnotify/Kconfig" | 14 | source "fs/notify/dnotify/Kconfig" |
| 2 | source "fs/notify/inotify/Kconfig" | 15 | source "fs/notify/inotify/Kconfig" |
diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 5a95b6010ce7..0922cc826c46 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile | |||
| @@ -1,2 +1,4 @@ | |||
| 1 | obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o | ||
| 2 | |||
| 1 | obj-y += dnotify/ | 3 | obj-y += dnotify/ |
| 2 | obj-y += inotify/ | 4 | obj-y += inotify/ |
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 26adf5dfa646..904ff8d5405a 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | config DNOTIFY | 1 | config DNOTIFY |
| 2 | bool "Dnotify support" | 2 | bool "Dnotify support" |
| 3 | depends on FSNOTIFY | ||
| 3 | default y | 4 | default y |
| 4 | help | 5 | help |
| 5 | Dnotify is a directory-based per-fd file change notification system | 6 | Dnotify is a directory-based per-fd file change notification system |
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index b0aa2cde80bd..828a889be909 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c | |||
| @@ -3,6 +3,9 @@ | |||
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2000,2001,2002 Stephen Rothwell | 4 | * Copyright (C) 2000,2001,2002 Stephen Rothwell |
| 5 | * | 5 | * |
| 6 | * Copyright (C) 2009 Eric Paris <Red Hat Inc> | ||
| 7 | * dnotify was largely rewritten to use the new fsnotify infrastructure | ||
| 8 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | 9 | * This program is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License as published by the | 10 | * under the terms of the GNU General Public License as published by the |
| 8 | * Free Software Foundation; either version 2, or (at your option) any | 11 | * Free Software Foundation; either version 2, or (at your option) any |
| @@ -21,24 +24,173 @@ | |||
| 21 | #include <linux/spinlock.h> | 24 | #include <linux/spinlock.h> |
| 22 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 23 | #include <linux/fdtable.h> | 26 | #include <linux/fdtable.h> |
| 27 | #include <linux/fsnotify_backend.h> | ||
| 24 | 28 | ||
| 25 | int dir_notify_enable __read_mostly = 1; | 29 | int dir_notify_enable __read_mostly = 1; |
| 26 | 30 | ||
| 27 | static struct kmem_cache *dn_cache __read_mostly; | 31 | static struct kmem_cache *dnotify_struct_cache __read_mostly; |
| 32 | static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; | ||
| 33 | static struct fsnotify_group *dnotify_group __read_mostly; | ||
| 34 | static DEFINE_MUTEX(dnotify_mark_mutex); | ||
| 35 | |||
| 36 | /* | ||
| 37 | * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which | ||
| 38 | * is being watched by dnotify. If multiple userspace applications are watching | ||
| 39 | * the same directory with dnotify, their information is chained in dn | ||
| 40 | */ | ||
| 41 | struct dnotify_mark_entry { | ||
| 42 | struct fsnotify_mark_entry fsn_entry; | ||
| 43 | struct dnotify_struct *dn; | ||
| 44 | }; | ||
| 28 | 45 | ||
| 29 | static void redo_inode_mask(struct inode *inode) | 46 | /* |
| 47 | * When a process starts or stops watching an inode the set of events which | ||
| 48 | * dnotify cares about for that inode may change. This function runs the | ||
| 49 | * list of everything receiving dnotify events about this directory and calculates | ||
| 50 | * the set of all those events. After it updates what dnotify is interested in | ||
| 51 | * it calls the fsnotify function so it can update the set of all events relevant | ||
| 52 | * to this inode. | ||
| 53 | */ | ||
| 54 | static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) | ||
| 30 | { | 55 | { |
| 31 | unsigned long new_mask; | 56 | __u32 new_mask, old_mask; |
| 32 | struct dnotify_struct *dn; | 57 | struct dnotify_struct *dn; |
| 58 | struct dnotify_mark_entry *dnentry = container_of(entry, | ||
| 59 | struct dnotify_mark_entry, | ||
| 60 | fsn_entry); | ||
| 61 | |||
| 62 | assert_spin_locked(&entry->lock); | ||
| 33 | 63 | ||
| 64 | old_mask = entry->mask; | ||
| 34 | new_mask = 0; | 65 | new_mask = 0; |
| 35 | for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) | 66 | for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) |
| 36 | new_mask |= dn->dn_mask & ~DN_MULTISHOT; | 67 | new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); |
| 37 | inode->i_dnotify_mask = new_mask; | 68 | entry->mask = new_mask; |
| 69 | |||
| 70 | if (old_mask == new_mask) | ||
| 71 | return; | ||
| 72 | |||
| 73 | if (entry->inode) | ||
| 74 | fsnotify_recalc_inode_mask(entry->inode); | ||
| 75 | } | ||
| 76 | |||
| 77 | /* | ||
| 78 | * Main fsnotify call where events are delivered to dnotify. | ||
| 79 | * Find the dnotify mark on the relevant inode, run the list of dnotify structs | ||
| 80 | * on that mark and determine which of them has expressed interest in receiving | ||
| 81 | * events of this type. When found send the correct process and signal and | ||
| 82 | * destroy the dnotify struct if it was not registered to receive multiple | ||
| 83 | * events. | ||
| 84 | */ | ||
| 85 | static int dnotify_handle_event(struct fsnotify_group *group, | ||
| 86 | struct fsnotify_event *event) | ||
| 87 | { | ||
| 88 | struct fsnotify_mark_entry *entry = NULL; | ||
| 89 | struct dnotify_mark_entry *dnentry; | ||
| 90 | struct inode *to_tell; | ||
| 91 | struct dnotify_struct *dn; | ||
| 92 | struct dnotify_struct **prev; | ||
| 93 | struct fown_struct *fown; | ||
| 94 | |||
| 95 | to_tell = event->to_tell; | ||
| 96 | |||
| 97 | spin_lock(&to_tell->i_lock); | ||
| 98 | entry = fsnotify_find_mark_entry(group, to_tell); | ||
| 99 | spin_unlock(&to_tell->i_lock); | ||
| 100 | |||
| 101 | /* unlikely since we already passed dnotify_should_send_event() */ | ||
| 102 | if (unlikely(!entry)) | ||
| 103 | return 0; | ||
| 104 | dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); | ||
| 105 | |||
| 106 | spin_lock(&entry->lock); | ||
| 107 | prev = &dnentry->dn; | ||
| 108 | while ((dn = *prev) != NULL) { | ||
| 109 | if ((dn->dn_mask & event->mask) == 0) { | ||
| 110 | prev = &dn->dn_next; | ||
| 111 | continue; | ||
| 112 | } | ||
| 113 | fown = &dn->dn_filp->f_owner; | ||
| 114 | send_sigio(fown, dn->dn_fd, POLL_MSG); | ||
| 115 | if (dn->dn_mask & FS_DN_MULTISHOT) | ||
| 116 | prev = &dn->dn_next; | ||
| 117 | else { | ||
| 118 | *prev = dn->dn_next; | ||
| 119 | kmem_cache_free(dnotify_struct_cache, dn); | ||
| 120 | dnotify_recalc_inode_mask(entry); | ||
| 121 | } | ||
| 122 | } | ||
| 123 | |||
| 124 | spin_unlock(&entry->lock); | ||
| 125 | fsnotify_put_mark(entry); | ||
| 126 | |||
| 127 | return 0; | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 131 | * Given an inode and mask determine if dnotify would be interested in sending | ||
| 132 | * userspace notification for that pair. | ||
| 133 | */ | ||
| 134 | static bool dnotify_should_send_event(struct fsnotify_group *group, | ||
| 135 | struct inode *inode, __u32 mask) | ||
| 136 | { | ||
| 137 | struct fsnotify_mark_entry *entry; | ||
| 138 | bool send; | ||
| 139 | |||
| 140 | /* !dir_notify_enable should never get here, don't waste time checking | ||
| 141 | if (!dir_notify_enable) | ||
| 142 | return 0; */ | ||
| 143 | |||
| 144 | /* not a dir, dnotify doesn't care */ | ||
| 145 | if (!S_ISDIR(inode->i_mode)) | ||
| 146 | return false; | ||
| 147 | |||
| 148 | spin_lock(&inode->i_lock); | ||
| 149 | entry = fsnotify_find_mark_entry(group, inode); | ||
| 150 | spin_unlock(&inode->i_lock); | ||
| 151 | |||
| 152 | /* no mark means no dnotify watch */ | ||
| 153 | if (!entry) | ||
| 154 | return false; | ||
| 155 | |||
| 156 | mask = (mask & ~FS_EVENT_ON_CHILD); | ||
| 157 | send = (mask & entry->mask); | ||
| 158 | |||
| 159 | fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ | ||
| 160 | |||
| 161 | return send; | ||
| 162 | } | ||
| 163 | |||
| 164 | static void dnotify_free_mark(struct fsnotify_mark_entry *entry) | ||
| 165 | { | ||
| 166 | struct dnotify_mark_entry *dnentry = container_of(entry, | ||
| 167 | struct dnotify_mark_entry, | ||
| 168 | fsn_entry); | ||
| 169 | |||
| 170 | BUG_ON(dnentry->dn); | ||
| 171 | |||
| 172 | kmem_cache_free(dnotify_mark_entry_cache, dnentry); | ||
| 38 | } | 173 | } |
| 39 | 174 | ||
| 175 | static struct fsnotify_ops dnotify_fsnotify_ops = { | ||
| 176 | .handle_event = dnotify_handle_event, | ||
| 177 | .should_send_event = dnotify_should_send_event, | ||
| 178 | .free_group_priv = NULL, | ||
| 179 | .freeing_mark = NULL, | ||
| 180 | .free_event_priv = NULL, | ||
| 181 | }; | ||
| 182 | |||
| 183 | /* | ||
| 184 | * Called every time a file is closed. Looks first for a dnotify mark on the | ||
| 185 | * inode. If one is found run all of the ->dn entries attached to that | ||
| 186 | * mark for one relevant to this process closing the file and remove that | ||
| 187 | * dnotify_struct. If that was the last dnotify_struct also remove the | ||
| 188 | * fsnotify_mark_entry. | ||
| 189 | */ | ||
| 40 | void dnotify_flush(struct file *filp, fl_owner_t id) | 190 | void dnotify_flush(struct file *filp, fl_owner_t id) |
| 41 | { | 191 | { |
| 192 | struct fsnotify_mark_entry *entry; | ||
| 193 | struct dnotify_mark_entry *dnentry; | ||
| 42 | struct dnotify_struct *dn; | 194 | struct dnotify_struct *dn; |
| 43 | struct dnotify_struct **prev; | 195 | struct dnotify_struct **prev; |
| 44 | struct inode *inode; | 196 | struct inode *inode; |
| @@ -46,145 +198,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id) | |||
| 46 | inode = filp->f_path.dentry->d_inode; | 198 | inode = filp->f_path.dentry->d_inode; |
| 47 | if (!S_ISDIR(inode->i_mode)) | 199 | if (!S_ISDIR(inode->i_mode)) |
| 48 | return; | 200 | return; |
| 201 | |||
| 49 | spin_lock(&inode->i_lock); | 202 | spin_lock(&inode->i_lock); |
| 50 | prev = &inode->i_dnotify; | 203 | entry = fsnotify_find_mark_entry(dnotify_group, inode); |
| 204 | spin_unlock(&inode->i_lock); | ||
| 205 | if (!entry) | ||
| 206 | return; | ||
| 207 | dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); | ||
| 208 | |||
| 209 | mutex_lock(&dnotify_mark_mutex); | ||
| 210 | |||
| 211 | spin_lock(&entry->lock); | ||
| 212 | prev = &dnentry->dn; | ||
| 51 | while ((dn = *prev) != NULL) { | 213 | while ((dn = *prev) != NULL) { |
| 52 | if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { | 214 | if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { |
| 53 | *prev = dn->dn_next; | 215 | *prev = dn->dn_next; |
| 54 | redo_inode_mask(inode); | 216 | kmem_cache_free(dnotify_struct_cache, dn); |
| 55 | kmem_cache_free(dn_cache, dn); | 217 | dnotify_recalc_inode_mask(entry); |
| 56 | break; | 218 | break; |
| 57 | } | 219 | } |
| 58 | prev = &dn->dn_next; | 220 | prev = &dn->dn_next; |
| 59 | } | 221 | } |
| 60 | spin_unlock(&inode->i_lock); | 222 | |
| 223 | spin_unlock(&entry->lock); | ||
| 224 | |||
| 225 | /* nothing else could have found us thanks to the dnotify_mark_mutex */ | ||
| 226 | if (dnentry->dn == NULL) | ||
| 227 | fsnotify_destroy_mark_by_entry(entry); | ||
| 228 | |||
| 229 | fsnotify_recalc_group_mask(dnotify_group); | ||
| 230 | |||
| 231 | mutex_unlock(&dnotify_mark_mutex); | ||
| 232 | |||
| 233 | fsnotify_put_mark(entry); | ||
| 234 | } | ||
| 235 | |||
| 236 | /* this conversion is done only at watch creation */ | ||
| 237 | static __u32 convert_arg(unsigned long arg) | ||
| 238 | { | ||
| 239 | __u32 new_mask = FS_EVENT_ON_CHILD; | ||
| 240 | |||
| 241 | if (arg & DN_MULTISHOT) | ||
| 242 | new_mask |= FS_DN_MULTISHOT; | ||
| 243 | if (arg & DN_DELETE) | ||
| 244 | new_mask |= (FS_DELETE | FS_MOVED_FROM); | ||
| 245 | if (arg & DN_MODIFY) | ||
| 246 | new_mask |= FS_MODIFY; | ||
| 247 | if (arg & DN_ACCESS) | ||
| 248 | new_mask |= FS_ACCESS; | ||
| 249 | if (arg & DN_ATTRIB) | ||
| 250 | new_mask |= FS_ATTRIB; | ||
| 251 | if (arg & DN_RENAME) | ||
| 252 | new_mask |= FS_DN_RENAME; | ||
| 253 | if (arg & DN_CREATE) | ||
| 254 | new_mask |= (FS_CREATE | FS_MOVED_TO); | ||
| 255 | |||
| 256 | return new_mask; | ||
| 61 | } | 257 | } |
| 62 | 258 | ||
| 259 | /* | ||
| 260 | * If multiple processes watch the same inode with dnotify there is only one | ||
| 261 | * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct | ||
| 262 | * onto that mark. This function either attaches the new dnotify_struct onto | ||
| 263 | * that list, or it |= the mask onto an existing dnotify_struct. | ||
| 264 | */ | ||
| 265 | static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, | ||
| 266 | fl_owner_t id, int fd, struct file *filp, __u32 mask) | ||
| 267 | { | ||
| 268 | struct dnotify_struct *odn; | ||
| 269 | |||
| 270 | odn = dnentry->dn; | ||
| 271 | while (odn != NULL) { | ||
| 272 | /* adding more events to existing dnotify_struct? */ | ||
| 273 | if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { | ||
| 274 | odn->dn_fd = fd; | ||
| 275 | odn->dn_mask |= mask; | ||
| 276 | return -EEXIST; | ||
| 277 | } | ||
| 278 | odn = odn->dn_next; | ||
| 279 | } | ||
| 280 | |||
| 281 | dn->dn_mask = mask; | ||
| 282 | dn->dn_fd = fd; | ||
| 283 | dn->dn_filp = filp; | ||
| 284 | dn->dn_owner = id; | ||
| 285 | dn->dn_next = dnentry->dn; | ||
| 286 | dnentry->dn = dn; | ||
| 287 | |||
| 288 | return 0; | ||
| 289 | } | ||
| 290 | |||
| 291 | /* | ||
| 292 | * When a process calls fcntl to attach a dnotify watch to a directory it ends | ||
| 293 | * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be | ||
| 294 | * attached to the fsnotify_mark. | ||
| 295 | */ | ||
| 63 | int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) | 296 | int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) |
| 64 | { | 297 | { |
| 298 | struct dnotify_mark_entry *new_dnentry, *dnentry; | ||
| 299 | struct fsnotify_mark_entry *new_entry, *entry; | ||
| 65 | struct dnotify_struct *dn; | 300 | struct dnotify_struct *dn; |
| 66 | struct dnotify_struct *odn; | ||
| 67 | struct dnotify_struct **prev; | ||
| 68 | struct inode *inode; | 301 | struct inode *inode; |
| 69 | fl_owner_t id = current->files; | 302 | fl_owner_t id = current->files; |
| 70 | struct file *f; | 303 | struct file *f; |
| 71 | int error = 0; | 304 | int destroy = 0, error = 0; |
| 305 | __u32 mask; | ||
| 306 | |||
| 307 | /* we use these to tell if we need to kfree */ | ||
| 308 | new_entry = NULL; | ||
| 309 | dn = NULL; | ||
| 310 | |||
| 311 | if (!dir_notify_enable) { | ||
| 312 | error = -EINVAL; | ||
| 313 | goto out_err; | ||
| 314 | } | ||
| 72 | 315 | ||
| 316 | /* a 0 mask means we are explicitly removing the watch */ | ||
| 73 | if ((arg & ~DN_MULTISHOT) == 0) { | 317 | if ((arg & ~DN_MULTISHOT) == 0) { |
| 74 | dnotify_flush(filp, id); | 318 | dnotify_flush(filp, id); |
| 75 | return 0; | 319 | error = 0; |
| 320 | goto out_err; | ||
| 76 | } | 321 | } |
| 77 | if (!dir_notify_enable) | 322 | |
| 78 | return -EINVAL; | 323 | /* dnotify only works on directories */ |
| 79 | inode = filp->f_path.dentry->d_inode; | 324 | inode = filp->f_path.dentry->d_inode; |
| 80 | if (!S_ISDIR(inode->i_mode)) | 325 | if (!S_ISDIR(inode->i_mode)) { |
| 81 | return -ENOTDIR; | 326 | error = -ENOTDIR; |
| 82 | dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); | 327 | goto out_err; |
| 83 | if (dn == NULL) | ||
| 84 | return -ENOMEM; | ||
| 85 | spin_lock(&inode->i_lock); | ||
| 86 | prev = &inode->i_dnotify; | ||
| 87 | while ((odn = *prev) != NULL) { | ||
| 88 | if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { | ||
| 89 | odn->dn_fd = fd; | ||
| 90 | odn->dn_mask |= arg; | ||
| 91 | inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; | ||
| 92 | goto out_free; | ||
| 93 | } | ||
| 94 | prev = &odn->dn_next; | ||
| 95 | } | 328 | } |
| 96 | 329 | ||
| 97 | rcu_read_lock(); | 330 | /* expect most fcntl to add new rather than augment old */ |
| 98 | f = fcheck(fd); | 331 | dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); |
| 99 | rcu_read_unlock(); | 332 | if (!dn) { |
| 100 | /* we'd lost the race with close(), sod off silently */ | 333 | error = -ENOMEM; |
| 101 | /* note that inode->i_lock prevents reordering problems | 334 | goto out_err; |
| 102 | * between accesses to descriptor table and ->i_dnotify */ | 335 | } |
| 103 | if (f != filp) | ||
| 104 | goto out_free; | ||
| 105 | 336 | ||
| 106 | error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); | 337 | /* new fsnotify mark, we expect most fcntl calls to add a new mark */ |
| 107 | if (error) | 338 | new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); |
| 108 | goto out_free; | 339 | if (!new_dnentry) { |
| 340 | error = -ENOMEM; | ||
| 341 | goto out_err; | ||
| 342 | } | ||
| 109 | 343 | ||
| 110 | dn->dn_mask = arg; | 344 | /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ |
| 111 | dn->dn_fd = fd; | 345 | mask = convert_arg(arg); |
| 112 | dn->dn_filp = filp; | ||
| 113 | dn->dn_owner = id; | ||
| 114 | inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; | ||
| 115 | dn->dn_next = inode->i_dnotify; | ||
| 116 | inode->i_dnotify = dn; | ||
| 117 | spin_unlock(&inode->i_lock); | ||
| 118 | return 0; | ||
| 119 | 346 | ||
| 120 | out_free: | 347 | /* set up the new_entry and new_dnentry */ |
| 121 | spin_unlock(&inode->i_lock); | 348 | new_entry = &new_dnentry->fsn_entry; |
| 122 | kmem_cache_free(dn_cache, dn); | 349 | fsnotify_init_mark(new_entry, dnotify_free_mark); |
| 123 | return error; | 350 | new_entry->mask = mask; |
| 124 | } | 351 | new_dnentry->dn = NULL; |
| 125 | 352 | ||
| 126 | void __inode_dir_notify(struct inode *inode, unsigned long event) | 353 | /* this is needed to prevent the fcntl/close race described below */ |
| 127 | { | 354 | mutex_lock(&dnotify_mark_mutex); |
| 128 | struct dnotify_struct * dn; | ||
| 129 | struct dnotify_struct **prev; | ||
| 130 | struct fown_struct * fown; | ||
| 131 | int changed = 0; | ||
| 132 | 355 | ||
| 356 | /* add the new_entry or find an old one. */ | ||
| 133 | spin_lock(&inode->i_lock); | 357 | spin_lock(&inode->i_lock); |
| 134 | prev = &inode->i_dnotify; | 358 | entry = fsnotify_find_mark_entry(dnotify_group, inode); |
| 135 | while ((dn = *prev) != NULL) { | ||
| 136 | if ((dn->dn_mask & event) == 0) { | ||
| 137 | prev = &dn->dn_next; | ||
| 138 | continue; | ||
| 139 | } | ||
| 140 | fown = &dn->dn_filp->f_owner; | ||
| 141 | send_sigio(fown, dn->dn_fd, POLL_MSG); | ||
| 142 | if (dn->dn_mask & DN_MULTISHOT) | ||
| 143 | prev = &dn->dn_next; | ||
| 144 | else { | ||
| 145 | *prev = dn->dn_next; | ||
| 146 | changed = 1; | ||
| 147 | kmem_cache_free(dn_cache, dn); | ||
| 148 | } | ||
| 149 | } | ||
| 150 | if (changed) | ||
| 151 | redo_inode_mask(inode); | ||
| 152 | spin_unlock(&inode->i_lock); | 359 | spin_unlock(&inode->i_lock); |
| 153 | } | 360 | if (entry) { |
| 154 | 361 | dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); | |
| 155 | EXPORT_SYMBOL(__inode_dir_notify); | 362 | spin_lock(&entry->lock); |
| 363 | } else { | ||
| 364 | fsnotify_add_mark(new_entry, dnotify_group, inode); | ||
| 365 | spin_lock(&new_entry->lock); | ||
| 366 | entry = new_entry; | ||
| 367 | dnentry = new_dnentry; | ||
| 368 | /* we used new_entry, so don't free it */ | ||
| 369 | new_entry = NULL; | ||
| 370 | } | ||
| 156 | 371 | ||
| 157 | /* | 372 | rcu_read_lock(); |
| 158 | * This is hopelessly wrong, but unfixable without API changes. At | 373 | f = fcheck(fd); |
| 159 | * least it doesn't oops the kernel... | 374 | rcu_read_unlock(); |
| 160 | * | ||
| 161 | * To safely access ->d_parent we need to keep d_move away from it. Use the | ||
| 162 | * dentry's d_lock for this. | ||
| 163 | */ | ||
| 164 | void dnotify_parent(struct dentry *dentry, unsigned long event) | ||
| 165 | { | ||
| 166 | struct dentry *parent; | ||
| 167 | 375 | ||
| 168 | if (!dir_notify_enable) | 376 | /* if (f != filp) means that we lost a race and another task/thread |
| 169 | return; | 377 | * actually closed the fd we are still playing with before we grabbed |
| 378 | * the dnotify_mark_mutex and entry->lock. Since closing the fd is the | ||
| 379 | * only time we clean up the mark entries we need to get our mark off | ||
| 380 | * the list. */ | ||
| 381 | if (f != filp) { | ||
| 382 | /* if we added ourselves, shoot ourselves, it's possible that | ||
| 383 | * the flush actually did shoot this entry. That's fine too | ||
| 385 | * since multiple calls to destroy_mark are perfectly safe, if | ||
| 385 | * we found a dnentry already attached to the inode, just sod | ||
| 386 | * off silently as the flush at close time dealt with it. | ||
| 387 | */ | ||
| 388 | if (dnentry == new_dnentry) | ||
| 389 | destroy = 1; | ||
| 390 | goto out; | ||
| 391 | } | ||
| 170 | 392 | ||
| 171 | spin_lock(&dentry->d_lock); | 393 | error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); |
| 172 | parent = dentry->d_parent; | 394 | if (error) { |
| 173 | if (parent->d_inode->i_dnotify_mask & event) { | 395 | /* if we added, we must shoot */ |
| 174 | dget(parent); | 396 | if (dnentry == new_dnentry) |
| 175 | spin_unlock(&dentry->d_lock); | 397 | destroy = 1; |
| 176 | __inode_dir_notify(parent->d_inode, event); | 398 | goto out; |
| 177 | dput(parent); | ||
| 178 | } else { | ||
| 179 | spin_unlock(&dentry->d_lock); | ||
| 180 | } | 399 | } |
| 400 | |||
| 401 | error = attach_dn(dn, dnentry, id, fd, filp, mask); | ||
| 402 | /* !error means that we attached the dn to the dnentry, so don't free it */ | ||
| 403 | if (!error) | ||
| 404 | dn = NULL; | ||
| 405 | /* -EEXIST means that we didn't add this new dn and used an old one. | ||
| 406 | * that isn't an error (and the unused dn should be freed) */ | ||
| 407 | else if (error == -EEXIST) | ||
| 408 | error = 0; | ||
| 409 | |||
| 410 | dnotify_recalc_inode_mask(entry); | ||
| 411 | out: | ||
| 412 | spin_unlock(&entry->lock); | ||
| 413 | |||
| 414 | if (destroy) | ||
| 415 | fsnotify_destroy_mark_by_entry(entry); | ||
| 416 | |||
| 417 | fsnotify_recalc_group_mask(dnotify_group); | ||
| 418 | |||
| 419 | mutex_unlock(&dnotify_mark_mutex); | ||
| 420 | fsnotify_put_mark(entry); | ||
| 421 | out_err: | ||
| 422 | if (new_entry) | ||
| 423 | fsnotify_put_mark(new_entry); | ||
| 424 | if (dn) | ||
| 425 | kmem_cache_free(dnotify_struct_cache, dn); | ||
| 426 | return error; | ||
| 181 | } | 427 | } |
| 182 | EXPORT_SYMBOL_GPL(dnotify_parent); | ||
| 183 | 428 | ||
| 184 | static int __init dnotify_init(void) | 429 | static int __init dnotify_init(void) |
| 185 | { | 430 | { |
| 186 | dn_cache = kmem_cache_create("dnotify_cache", | 431 | dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); |
| 187 | sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); | 432 | dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); |
| 433 | |||
| 434 | dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, | ||
| 435 | 0, &dnotify_fsnotify_ops); | ||
| 436 | if (IS_ERR(dnotify_group)) | ||
| 437 | panic("unable to allocate fsnotify group for dnotify\n"); | ||
| 188 | return 0; | 438 | return 0; |
| 189 | } | 439 | } |
| 190 | 440 | ||
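The userspace contract served by fcntl_dirnotify() is unchanged by the rewrite: a process requests directory notification with fcntl(F_NOTIFY) and receives a signal when a watched event occurs, with DN_MULTISHOT keeping the watch armed after the first event. A minimal hypothetical test program (not part of the patch) whose F_NOTIFY call is the one handled by fcntl_dirnotify() above:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <fcntl.h>
#include <unistd.h>

static volatile sig_atomic_t got_event;

static void handler(int sig, siginfo_t *si, void *ctx)
{
	(void)sig; (void)si; (void)ctx;	/* si->si_fd names the watched directory */
	got_event = 1;
}

int main(void)
{
	struct sigaction sa;
	int fd = open(".", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGRTMIN, &sa, NULL);
	fcntl(fd, F_SETSIG, SIGRTMIN);	/* use a queued RT signal instead of SIGIO */
	/* request notification; DN_MULTISHOT keeps the watch after the first hit */
	fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_RENAME | DN_MULTISHOT);

	while (!got_event)
		pause();
	printf("directory changed\n");
	return 0;
}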
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c new file mode 100644 index 000000000000..ec2f7bd76818 --- /dev/null +++ b/fs/notify/fsnotify.c | |||
| @@ -0,0 +1,186 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2, or (at your option) | ||
| 7 | * any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; see the file COPYING. If not, write to | ||
| 16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/dcache.h> | ||
| 20 | #include <linux/fs.h> | ||
| 21 | #include <linux/init.h> | ||
| 22 | #include <linux/module.h> | ||
| 23 | #include <linux/srcu.h> | ||
| 24 | |||
| 25 | #include <linux/fsnotify_backend.h> | ||
| 26 | #include "fsnotify.h" | ||
| 27 | |||
| 28 | /* | ||
| 29 | * Clear all of the marks on an inode when it is being evicted from core | ||
| 30 | */ | ||
| 31 | void __fsnotify_inode_delete(struct inode *inode) | ||
| 32 | { | ||
| 33 | fsnotify_clear_marks_by_inode(inode); | ||
| 34 | } | ||
| 35 | EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); | ||
| 36 | |||
| 37 | /* | ||
| 38 | * Given an inode, first check if we care what happens to our children. Inotify | ||
| 39 | * and dnotify both tell their parents about events. If we care about any event | ||
| 40 | * on a child we run all of our children and set a dentry flag saying that the | ||
| 41 | * parent cares. Thus when an event happens on a child it can quickly tell if | ||
| 42 | * if there is a need to find a parent and send the event to the parent. | ||
| 43 | */ | ||
| 44 | void __fsnotify_update_child_dentry_flags(struct inode *inode) | ||
| 45 | { | ||
| 46 | struct dentry *alias; | ||
| 47 | int watched; | ||
| 48 | |||
| 49 | if (!S_ISDIR(inode->i_mode)) | ||
| 50 | return; | ||
| 51 | |||
| 52 | /* determine if the children should tell inode about their events */ | ||
| 53 | watched = fsnotify_inode_watches_children(inode); | ||
| 54 | |||
| 55 | spin_lock(&dcache_lock); | ||
| 56 | /* run all of the dentries associated with this inode. Since this is a | ||
| 57 | * directory, there damn well better only be one item on this list */ | ||
| 58 | list_for_each_entry(alias, &inode->i_dentry, d_alias) { | ||
| 59 | struct dentry *child; | ||
| 60 | |||
| 61 | /* run all of the children of the original inode and fix their | ||
| 62 | * d_flags to indicate parental interest (their parent is the | ||
| 63 | * original inode) */ | ||
| 64 | list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { | ||
| 65 | if (!child->d_inode) | ||
| 66 | continue; | ||
| 67 | |||
| 68 | spin_lock(&child->d_lock); | ||
| 69 | if (watched) | ||
| 70 | child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; | ||
| 71 | else | ||
| 72 | child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; | ||
| 73 | spin_unlock(&child->d_lock); | ||
| 74 | } | ||
| 75 | } | ||
| 76 | spin_unlock(&dcache_lock); | ||
| 77 | } | ||
| 78 | |||
| 79 | /* Notify this dentry's parent about a child's events. */ | ||
| 80 | void __fsnotify_parent(struct dentry *dentry, __u32 mask) | ||
| 81 | { | ||
| 82 | struct dentry *parent; | ||
| 83 | struct inode *p_inode; | ||
| 84 | bool send = false; | ||
| 85 | bool should_update_children = false; | ||
| 86 | |||
| 87 | if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) | ||
| 88 | return; | ||
| 89 | |||
| 90 | spin_lock(&dentry->d_lock); | ||
| 91 | parent = dentry->d_parent; | ||
| 92 | p_inode = parent->d_inode; | ||
| 93 | |||
| 94 | if (fsnotify_inode_watches_children(p_inode)) { | ||
| 95 | if (p_inode->i_fsnotify_mask & mask) { | ||
| 96 | dget(parent); | ||
| 97 | send = true; | ||
| 98 | } | ||
| 99 | } else { | ||
| 100 | /* | ||
| 101 | * The parent doesn't care about events on its children but | ||
| 102 | * at least one child thought it did. We need to run all the | ||
| 103 | * children and update their d_flags to let them know p_inode | ||
| 104 | * doesn't care about them any more. | ||
| 105 | */ | ||
| 106 | dget(parent); | ||
| 107 | should_update_children = true; | ||
| 108 | } | ||
| 109 | |||
| 110 | spin_unlock(&dentry->d_lock); | ||
| 111 | |||
| 112 | if (send) { | ||
| 113 | /* we are notifying a parent so come up with the new mask which | ||
| 114 | * specifies these are events which came from a child. */ | ||
| 115 | mask |= FS_EVENT_ON_CHILD; | ||
| 116 | |||
| 117 | fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, | ||
| 118 | dentry->d_name.name, 0); | ||
| 119 | dput(parent); | ||
| 120 | } | ||
| 121 | |||
| 122 | if (unlikely(should_update_children)) { | ||
| 123 | __fsnotify_update_child_dentry_flags(p_inode); | ||
| 124 | dput(parent); | ||
| 125 | } | ||
| 126 | } | ||
| 127 | EXPORT_SYMBOL_GPL(__fsnotify_parent); | ||
| 128 | |||
| 129 | /* | ||
| 130 | * This is the main call to fsnotify. The VFS calls into hook specific functions | ||
| 131 | * in linux/fsnotify.h. Those functions then in turn call here. Here will call | ||
| 132 | * out to all of the registered fsnotify_group. Those groups can then use the | ||
| 133 | * notification event in whatever means they feel necessary. | ||
| 134 | */ | ||
| 135 | void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) | ||
| 136 | { | ||
| 137 | struct fsnotify_group *group; | ||
| 138 | struct fsnotify_event *event = NULL; | ||
| 139 | int idx; | ||
| 140 | /* global tests shouldn't care about events on child only the specific event */ | ||
| 141 | __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); | ||
| 142 | |||
| 143 | if (list_empty(&fsnotify_groups)) | ||
| 144 | return; | ||
| 145 | |||
| 146 | if (!(test_mask & fsnotify_mask)) | ||
| 147 | return; | ||
| 148 | |||
| 149 | if (!(test_mask & to_tell->i_fsnotify_mask)) | ||
| 150 | return; | ||
| 151 | /* | ||
| 152 | * SRCU!! the groups list is very very much read only and the path is | ||
| 153 | * very hot. The VAST majority of events are not going to need to do | ||
| 154 | * anything other than walk the list so it's crazy to pre-allocate. | ||
| 155 | */ | ||
| 156 | idx = srcu_read_lock(&fsnotify_grp_srcu); | ||
| 157 | list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { | ||
| 158 | if (test_mask & group->mask) { | ||
| 159 | if (!group->ops->should_send_event(group, to_tell, mask)) | ||
| 160 | continue; | ||
| 161 | if (!event) { | ||
| 162 | event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); | ||
| 163 | /* shit, we OOM'd and now we can't tell, maybe | ||
| 164 | * someday someone else will want to do something | ||
| 165 | * here */ | ||
| 166 | if (!event) | ||
| 167 | break; | ||
| 168 | } | ||
| 169 | group->ops->handle_event(group, event); | ||
| 170 | } | ||
| 171 | } | ||
| 172 | srcu_read_unlock(&fsnotify_grp_srcu, idx); | ||
| 173 | /* | ||
| 174 | * fsnotify_create_event() took a reference so the event can't be cleaned | ||
| 175 | * up while we are still trying to add it to lists, drop that one. | ||
| 176 | */ | ||
| 177 | if (event) | ||
| 178 | fsnotify_put_event(event); | ||
| 179 | } | ||
| 180 | EXPORT_SYMBOL_GPL(fsnotify); | ||
| 181 | |||
| 182 | static __init int fsnotify_init(void) | ||
| 183 | { | ||
| 184 | return init_srcu_struct(&fsnotify_grp_srcu); | ||
| 185 | } | ||
| 186 | subsys_initcall(fsnotify_init); | ||
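Backends plug into this core by registering an fsnotify_group whose fsnotify_ops callbacks are driven from the fsnotify() loop above, exactly as the rewritten dnotify does. Below is a hypothetical do-nothing backend assembled only from calls visible in this patch (fsnotify_obtain_group(), struct fsnotify_ops, handle_event() and should_send_event()); the group number and requested mask are made up for illustration, so treat it as a sketch rather than a real consumer:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/fsnotify_backend.h>

/* group number is invented for the example; real backends claim their own */
#define EXAMPLE_GROUP_NUM	0xdead

static struct fsnotify_group *example_group;

static int example_handle_event(struct fsnotify_group *group,
				struct fsnotify_event *event)
{
	/* a real backend would queue the event or signal userspace here */
	return 0;
}

static bool example_should_send_event(struct fsnotify_group *group,
				      struct inode *inode, __u32 mask)
{
	/* cheap pre-filter before fsnotify() pays for handle_event() */
	return (mask & group->mask) ? true : false;
}

static struct fsnotify_ops example_fsnotify_ops = {
	.handle_event		= example_handle_event,
	.should_send_event	= example_should_send_event,
	.free_group_priv	= NULL,
	.freeing_mark		= NULL,
	.free_event_priv	= NULL,
};

static int __init example_fsnotify_init(void)
{
	/* the mask is ORed into the global fsnotify_mask so fsnotify()
	 * knows somebody cares about creates and deletes */
	example_group = fsnotify_obtain_group(EXAMPLE_GROUP_NUM,
					      FS_CREATE | FS_DELETE,
					      &example_fsnotify_ops);
	return IS_ERR(example_group) ? PTR_ERR(example_group) : 0;
}
subsys_initcall(example_fsnotify_init);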
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h new file mode 100644 index 000000000000..4dc240824b2d --- /dev/null +++ b/fs/notify/fsnotify.h | |||
| @@ -0,0 +1,34 @@ | |||
| 1 | #ifndef __FS_NOTIFY_FSNOTIFY_H_ | ||
| 2 | #define __FS_NOTIFY_FSNOTIFY_H_ | ||
| 3 | |||
| 4 | #include <linux/list.h> | ||
| 5 | #include <linux/fsnotify.h> | ||
| 6 | #include <linux/srcu.h> | ||
| 7 | #include <linux/types.h> | ||
| 8 | |||
| 9 | /* protects reads of fsnotify_groups */ | ||
| 10 | extern struct srcu_struct fsnotify_grp_srcu; | ||
| 11 | /* all groups which receive fsnotify events */ | ||
| 12 | extern struct list_head fsnotify_groups; | ||
| 13 | /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ | ||
| 14 | extern __u32 fsnotify_mask; | ||
| 15 | |||
| 16 | /* destroy all events sitting in this groups notification queue */ | ||
| 17 | extern void fsnotify_flush_notify(struct fsnotify_group *group); | ||
| 18 | |||
| 19 | /* final kfree of a group */ | ||
| 20 | extern void fsnotify_final_destroy_group(struct fsnotify_group *group); | ||
| 21 | |||
| 22 | /* run the list of all marks associated with inode and flag them to be freed */ | ||
| 23 | extern void fsnotify_clear_marks_by_inode(struct inode *inode); | ||
| 24 | /* | ||
| 25 | * update the dentry->d_flags of all of inode's children to indicate if inode cares | ||
| 26 | * about events that happen to its children. | ||
| 27 | */ | ||
| 28 | extern void __fsnotify_update_child_dentry_flags(struct inode *inode); | ||
| 29 | |||
| 30 | /* allocate and destroy an event holder to attach events to notification/access queues */ | ||
| 31 | extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); | ||
| 32 | extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); | ||
| 33 | |||
| 34 | #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ | ||
diff --git a/fs/notify/group.c b/fs/notify/group.c new file mode 100644 index 000000000000..0e1677144bc5 --- /dev/null +++ b/fs/notify/group.c | |||
| @@ -0,0 +1,254 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2, or (at your option) | ||
| 7 | * any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; see the file COPYING. If not, write to | ||
| 16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/list.h> | ||
| 20 | #include <linux/mutex.h> | ||
| 21 | #include <linux/slab.h> | ||
| 22 | #include <linux/srcu.h> | ||
| 23 | #include <linux/rculist.h> | ||
| 24 | #include <linux/wait.h> | ||
| 25 | |||
| 26 | #include <linux/fsnotify_backend.h> | ||
| 27 | #include "fsnotify.h" | ||
| 28 | |||
| 29 | #include <asm/atomic.h> | ||
| 30 | |||
| 31 | /* protects writes to fsnotify_groups and fsnotify_mask */ | ||
| 32 | static DEFINE_MUTEX(fsnotify_grp_mutex); | ||
| 33 | /* protects reads while running the fsnotify_groups list */ | ||
| 34 | struct srcu_struct fsnotify_grp_srcu; | ||
| 35 | /* all groups registered to receive filesystem notifications */ | ||
| 36 | LIST_HEAD(fsnotify_groups); | ||
| 37 | /* bitwise OR of all events (FS_*) interesting to some group on this system */ | ||
| 38 | __u32 fsnotify_mask; | ||
| 39 | |||
| 40 | /* | ||
| 41 | * When a new group registers or changes its set of interesting events | ||
| 42 | * this function updates the fsnotify_mask to contain all interesting events | ||
| 43 | */ | ||
| 44 | void fsnotify_recalc_global_mask(void) | ||
| 45 | { | ||
| 46 | struct fsnotify_group *group; | ||
| 47 | __u32 mask = 0; | ||
| 48 | int idx; | ||
| 49 | |||
| 50 | idx = srcu_read_lock(&fsnotify_grp_srcu); | ||
| 51 | list_for_each_entry_rcu(group, &fsnotify_groups, group_list) | ||
| 52 | mask |= group->mask; | ||
| 53 | srcu_read_unlock(&fsnotify_grp_srcu, idx); | ||
| 54 | fsnotify_mask = mask; | ||
| 55 | } | ||
| 56 | |||
| 57 | /* | ||
| 58 | * Update the group->mask by running all of the marks associated with this | ||
| 59 | * group and finding the bitwise | of all of the mark->mask. If we change | ||
| 60 | * the group->mask we need to update the global mask of events interesting | ||
| 61 | * to the system. | ||
| 62 | */ | ||
| 63 | void fsnotify_recalc_group_mask(struct fsnotify_group *group) | ||
| 64 | { | ||
| 65 | __u32 mask = 0; | ||
| 66 | __u32 old_mask = group->mask; | ||
| 67 | struct fsnotify_mark_entry *entry; | ||
| 68 | |||
| 69 | spin_lock(&group->mark_lock); | ||
| 70 | list_for_each_entry(entry, &group->mark_entries, g_list) | ||
| 71 | mask |= entry->mask; | ||
| 72 | spin_unlock(&group->mark_lock); | ||
| 73 | |||
| 74 | group->mask = mask; | ||
| 75 | |||
| 76 | if (old_mask != mask) | ||
| 77 | fsnotify_recalc_global_mask(); | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Take a reference to a group so things found under the fsnotify_grp_mutex | ||
| 82 | * can't get freed under us | ||
| 83 | */ | ||
| 84 | static void fsnotify_get_group(struct fsnotify_group *group) | ||
| 85 | { | ||
| 86 | atomic_inc(&group->refcnt); | ||
| 87 | } | ||
| 88 | |||
| 89 | /* | ||
| 90 | * Final freeing of a group | ||
| 91 | */ | ||
| 92 | void fsnotify_final_destroy_group(struct fsnotify_group *group) | ||
| 93 | { | ||
| 94 | /* clear the notification queue of all events */ | ||
| 95 | fsnotify_flush_notify(group); | ||
| 96 | |||
| 97 | if (group->ops->free_group_priv) | ||
| 98 | group->ops->free_group_priv(group); | ||
| 99 | |||
| 100 | kfree(group); | ||
| 101 | } | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Trying to get rid of a group. We need to first get rid of any outstanding | ||
| 105 | * allocations and then free the group. Remember that fsnotify_clear_marks_by_group | ||
| 106 | * could miss marks that are being freed by inode and those marks could still | ||
| 107 | * hold a reference to this group (via group->num_marks). If we get into that | ||
| 108 | * situation, the fsnotify_final_destroy_group will get called when that final | ||
| 109 | * mark is freed. | ||
| 110 | */ | ||
| 111 | static void fsnotify_destroy_group(struct fsnotify_group *group) | ||
| 112 | { | ||
| 113 | /* clear all inode mark entries for this group */ | ||
| 114 | fsnotify_clear_marks_by_group(group); | ||
| 115 | |||
| 116 | /* past the point of no return, matches the initial value of 1 */ | ||
| 117 | if (atomic_dec_and_test(&group->num_marks)) | ||
| 118 | fsnotify_final_destroy_group(group); | ||
| 119 | } | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Remove this group from the global list of groups that will get events. | ||
| 123 | * This can be done even if there are still references and things still using | ||
| 124 | * this group. It just stops the group from getting new events. | ||
| 125 | */ | ||
| 126 | static void __fsnotify_evict_group(struct fsnotify_group *group) | ||
| 127 | { | ||
| 128 | BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); | ||
| 129 | |||
| 130 | if (group->on_group_list) | ||
| 131 | list_del_rcu(&group->group_list); | ||
| 132 | group->on_group_list = 0; | ||
| 133 | } | ||
| 134 | |||
| 135 | /* | ||
| 136 | * Called when a group is no longer interested in getting events. This can be | ||
| 137 | * used if a group is misbehaving or if for some reason a group should no longer | ||
| 138 | * get any filesystem events. | ||
| 139 | */ | ||
| 140 | void fsnotify_evict_group(struct fsnotify_group *group) | ||
| 141 | { | ||
| 142 | mutex_lock(&fsnotify_grp_mutex); | ||
| 143 | __fsnotify_evict_group(group); | ||
| 144 | mutex_unlock(&fsnotify_grp_mutex); | ||
| 145 | } | ||
| 146 | |||
| 147 | /* | ||
| 148 | * Drop a reference to a group. Free it if that was the last reference. | ||
| 149 | */ | ||
| 150 | void fsnotify_put_group(struct fsnotify_group *group) | ||
| 151 | { | ||
| 152 | if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) | ||
| 153 | return; | ||
| 154 | |||
| 155 | /* | ||
| 156 | * OK, now we know that there are no other users *and* we hold the mutex, | ||
| 157 | * so no new references will appear | ||
| 158 | */ | ||
| 159 | __fsnotify_evict_group(group); | ||
| 160 | |||
| 161 | /* | ||
| 162 | * now it's off the list, so the only thing we might care about is | ||
| 163 | * srcu access.... | ||
| 164 | */ | ||
| 165 | mutex_unlock(&fsnotify_grp_mutex); | ||
| 166 | synchronize_srcu(&fsnotify_grp_srcu); | ||
| 167 | |||
| 168 | /* and now it is really dead. _Nothing_ could be seeing it */ | ||
| 169 | fsnotify_recalc_global_mask(); | ||
| 170 | fsnotify_destroy_group(group); | ||
| 171 | } | ||
| 172 | |||
| 173 | /* | ||
| 174 | * Simply run the fsnotify_groups list and find a group which matches | ||
| 175 | * the given parameters. If a group is found we take a reference to that | ||
| 176 | * group. | ||
| 177 | */ | ||
| 178 | static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask, | ||
| 179 | const struct fsnotify_ops *ops) | ||
| 180 | { | ||
| 181 | struct fsnotify_group *group_iter; | ||
| 182 | struct fsnotify_group *group = NULL; | ||
| 183 | |||
| 184 | BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); | ||
| 185 | |||
| 186 | list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) { | ||
| 187 | if (group_iter->group_num == group_num) { | ||
| 188 | if ((group_iter->mask == mask) && | ||
| 189 | (group_iter->ops == ops)) { | ||
| 190 | fsnotify_get_group(group_iter); | ||
| 191 | group = group_iter; | ||
| 192 | } else | ||
| 193 | group = ERR_PTR(-EEXIST); | ||
| 194 | } | ||
| 195 | } | ||
| 196 | return group; | ||
| 197 | } | ||
| 198 | |||
| 199 | /* | ||
| 200 | * Either finds an existing group which matches the group_num, mask, and ops or | ||
| 201 | * creates a new group and adds it to the global group list. In either case we | ||
| 202 | * take a reference for the group returned. | ||
| 203 | */ | ||
| 204 | struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, | ||
| 205 | const struct fsnotify_ops *ops) | ||
| 206 | { | ||
| 207 | struct fsnotify_group *group, *tgroup; | ||
| 208 | |||
| 209 | /* very low use, simpler locking if we just always alloc */ | ||
| 210 | group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL); | ||
| 211 | if (!group) | ||
| 212 | return ERR_PTR(-ENOMEM); | ||
| 213 | |||
| 214 | atomic_set(&group->refcnt, 1); | ||
| 215 | |||
| 216 | group->on_group_list = 0; | ||
| 217 | group->group_num = group_num; | ||
| 218 | group->mask = mask; | ||
| 219 | |||
| 220 | mutex_init(&group->notification_mutex); | ||
| 221 | INIT_LIST_HEAD(&group->notification_list); | ||
| 222 | init_waitqueue_head(&group->notification_waitq); | ||
| 223 | group->q_len = 0; | ||
| 224 | group->max_events = UINT_MAX; | ||
| 225 | |||
| 226 | spin_lock_init(&group->mark_lock); | ||
| 227 | atomic_set(&group->num_marks, 0); | ||
| 228 | INIT_LIST_HEAD(&group->mark_entries); | ||
| 229 | |||
| 230 | group->ops = ops; | ||
| 231 | |||
| 232 | mutex_lock(&fsnotify_grp_mutex); | ||
| 233 | tgroup = fsnotify_find_group(group_num, mask, ops); | ||
| 234 | if (tgroup) { | ||
| 235 | /* group already exists */ | ||
| 236 | mutex_unlock(&fsnotify_grp_mutex); | ||
| 237 | /* destroy the new one we made */ | ||
| 238 | fsnotify_put_group(group); | ||
| 239 | return tgroup; | ||
| 240 | } | ||
| 241 | |||
| 242 | /* group not found, add a new one */ | ||
| 243 | list_add_rcu(&group->group_list, &fsnotify_groups); | ||
| 244 | group->on_group_list = 1; | ||
| 245 | /* being on the fsnotify_groups list holds one num_marks */ | ||
| 246 | atomic_inc(&group->num_marks); | ||
| 247 | |||
| 248 | mutex_unlock(&fsnotify_grp_mutex); | ||
| 249 | |||
| 250 | if (mask) | ||
| 251 | fsnotify_recalc_global_mask(); | ||
| 252 | |||
| 253 | return group; | ||
| 254 | } | ||
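
A minimal usage sketch of the group API above (not part of the patch; MY_GROUP_NUM and my_fsnotify_ops are hypothetical backend-side names):

	static const struct fsnotify_ops my_fsnotify_ops = {
		/* .handle_event, .should_send_event, ... filled in by the backend */
	};

	static int my_backend_init(void)
	{
		struct fsnotify_group *group;

		/* find-or-create; returns with a reference held for us */
		group = fsnotify_obtain_group(MY_GROUP_NUM, FS_MODIFY | FS_CREATE,
					      &my_fsnotify_ops);
		if (IS_ERR(group))
			return PTR_ERR(group);

		/* ... hand out watches, queue events ... */

		fsnotify_put_group(group);	/* drop the reference when done */
		return 0;
	}
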
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c new file mode 100644 index 000000000000..c8a07c65482b --- /dev/null +++ b/fs/notify/inode_mark.c | |||
| @@ -0,0 +1,426 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2, or (at your option) | ||
| 7 | * any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; see the file COPYING. If not, write to | ||
| 16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | /* | ||
| 20 | * fsnotify inode mark locking/lifetime/and refcnting | ||
| 21 | * | ||
| 22 | * REFCNT: | ||
| 23 | * The mark->refcnt tells how many "things" in the kernel currently are | ||
| 24 | * referencing this object. The object typically will live inside the kernel | ||
| 25 | * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task | ||
| 26 | * which can find this object while holding the appropriate locks can take a reference, | ||
| 27 | * and the object itself is guaranteed to survive until the reference is dropped. | ||
| 28 | * | ||
| 29 | * LOCKING: | ||
| 30 | * There are 3 spinlocks involved with fsnotify inode marks and they MUST | ||
| 31 | * be taken in order as follows: | ||
| 32 | * | ||
| 33 | * entry->lock | ||
| 34 | * group->mark_lock | ||
| 35 | * inode->i_lock | ||
| 36 | * | ||
| 37 | * entry->lock protects 2 things, entry->group and entry->inode. You must hold | ||
| 38 | * that lock to dereference either of these things (they could be NULL even with | ||
| 39 | * the lock) | ||
| 40 | * | ||
| 41 | * group->mark_lock protects the mark_entries list anchored inside a given group | ||
| 42 | * and each entry is hooked via the g_list. It also sorta protects the | ||
| 43 | * free_g_list, which when used is anchored by a private list on the stack of the | ||
| 44 | * task which held the group->mark_lock. | ||
| 45 | * | ||
| 46 | * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a | ||
| 47 | * given inode and each entry is hooked via the i_list. (and sorta the | ||
| 48 | * free_i_list) | ||
| 49 | * | ||
| 50 | * | ||
| 51 | * LIFETIME: | ||
| 52 | * Inode marks survive between when they are added to an inode and when their | ||
| 53 | * refcnt==0. | ||
| 54 | * | ||
| 55 | * The inode mark can be cleared for a number of different reasons including: | ||
| 56 | * - The inode is unlinked for the last time. (fsnotify_inode_remove) | ||
| 57 | * - The inode is being evicted from cache. (fsnotify_inode_delete) | ||
| 58 | * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) | ||
| 59 | * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry) | ||
| 60 | * - The fsnotify_group associated with the mark is going away and all such marks | ||
| 61 | * need to be cleaned up. (fsnotify_clear_marks_by_group) | ||
| 62 | * | ||
| 63 | * Worst case we are given an inode and need to clean up all the marks on that | ||
| 64 | * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each | ||
| 65 | * mark on the list we take a reference (so the mark can't disappear under us). | ||
| 66 | * We remove that mark from the inode's list of marks and we add this mark to a | ||
| 67 | * private list anchored on the stack using i_free_list. At this point we no | ||
| 68 | * longer fear anything finding the mark using the inode's list of marks. | ||
| 69 | * | ||
| 70 | * We can safely and locklessly run the private list on the stack of everything | ||
| 71 | * we just detached from the original inode. For each mark on the private list | ||
| 72 | * we grab the mark->lock and can thus dereference mark->group and mark->inode. If | ||
| 73 | * we see the group and inode are not NULL we take those locks. Now holding all | ||
| 74 | * 3 locks we can completely remove the mark from other tasks finding it in the | ||
| 75 | * future. Remember, 10 things might already be referencing this mark, but they | ||
| 76 | * had better be holding a ref. We then drop the reference we took before we | ||
| 77 | * unhooked it from the inode. When the refcnt hits 0 we can free the mark. | ||
| 78 | * | ||
| 79 | * Very similarly for freeing by group, except we use free_g_list. | ||
| 80 | * | ||
| 81 | * This has the very interesting property of being able to run concurrently with | ||
| 82 | * any (or all) of the other teardown paths. | ||
| 83 | */ | ||
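
Condensed into code, the lock ordering documented above looks like this (a sketch only, not part of the patch; real paths such as fsnotify_destroy_mark_by_entry() below follow it):

	/* Sketch: the required nesting when all three locks are needed. */
	static void example_lock_all_three(struct fsnotify_mark_entry *entry,
					   struct fsnotify_group *group,
					   struct inode *inode)
	{
		spin_lock(&entry->lock);
		spin_lock(&group->mark_lock);
		spin_lock(&inode->i_lock);
		/* ... safe to unhook entry from both i_list and g_list here ... */
		spin_unlock(&inode->i_lock);
		spin_unlock(&group->mark_lock);
		spin_unlock(&entry->lock);
	}
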
| 84 | |||
| 85 | #include <linux/fs.h> | ||
| 86 | #include <linux/init.h> | ||
| 87 | #include <linux/kernel.h> | ||
| 88 | #include <linux/module.h> | ||
| 89 | #include <linux/mutex.h> | ||
| 90 | #include <linux/slab.h> | ||
| 91 | #include <linux/spinlock.h> | ||
| 92 | #include <linux/writeback.h> /* for inode_lock */ | ||
| 93 | |||
| 94 | #include <asm/atomic.h> | ||
| 95 | |||
| 96 | #include <linux/fsnotify_backend.h> | ||
| 97 | #include "fsnotify.h" | ||
| 98 | |||
| 99 | void fsnotify_get_mark(struct fsnotify_mark_entry *entry) | ||
| 100 | { | ||
| 101 | atomic_inc(&entry->refcnt); | ||
| 102 | } | ||
| 103 | |||
| 104 | void fsnotify_put_mark(struct fsnotify_mark_entry *entry) | ||
| 105 | { | ||
| 106 | if (atomic_dec_and_test(&entry->refcnt)) | ||
| 107 | entry->free_mark(entry); | ||
| 108 | } | ||
| 109 | |||
| 110 | /* | ||
| 111 | * Recalculate the mask of events relevant to a given inode; caller holds inode->i_lock. | ||
| 112 | */ | ||
| 113 | static void fsnotify_recalc_inode_mask_locked(struct inode *inode) | ||
| 114 | { | ||
| 115 | struct fsnotify_mark_entry *entry; | ||
| 116 | struct hlist_node *pos; | ||
| 117 | __u32 new_mask = 0; | ||
| 118 | |||
| 119 | assert_spin_locked(&inode->i_lock); | ||
| 120 | |||
| 121 | hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) | ||
| 122 | new_mask |= entry->mask; | ||
| 123 | inode->i_fsnotify_mask = new_mask; | ||
| 124 | } | ||
| 125 | |||
| 126 | /* | ||
| 127 | * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types | ||
| 128 | * any notifier is interested in hearing for this inode. | ||
| 129 | */ | ||
| 130 | void fsnotify_recalc_inode_mask(struct inode *inode) | ||
| 131 | { | ||
| 132 | spin_lock(&inode->i_lock); | ||
| 133 | fsnotify_recalc_inode_mask_locked(inode); | ||
| 134 | spin_unlock(&inode->i_lock); | ||
| 135 | |||
| 136 | __fsnotify_update_child_dentry_flags(inode); | ||
| 137 | } | ||
| 138 | |||
| 139 | /* | ||
| 140 | * Any time a mark is getting freed we end up here. | ||
| 141 | * The caller had better be holding a reference to this mark so we don't actually | ||
| 142 | * do the final put under the entry->lock | ||
| 143 | */ | ||
| 144 | void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) | ||
| 145 | { | ||
| 146 | struct fsnotify_group *group; | ||
| 147 | struct inode *inode; | ||
| 148 | |||
| 149 | spin_lock(&entry->lock); | ||
| 150 | |||
| 151 | group = entry->group; | ||
| 152 | inode = entry->inode; | ||
| 153 | |||
| 154 | BUG_ON(group && !inode); | ||
| 155 | BUG_ON(!group && inode); | ||
| 156 | |||
| 157 | /* if !group something else already marked this to die */ | ||
| 158 | if (!group) { | ||
| 159 | spin_unlock(&entry->lock); | ||
| 160 | return; | ||
| 161 | } | ||
| 162 | |||
| 163 | /* 1 from caller and 1 for being on i_list/g_list */ | ||
| 164 | BUG_ON(atomic_read(&entry->refcnt) < 2); | ||
| 165 | |||
| 166 | spin_lock(&group->mark_lock); | ||
| 167 | spin_lock(&inode->i_lock); | ||
| 168 | |||
| 169 | hlist_del_init(&entry->i_list); | ||
| 170 | entry->inode = NULL; | ||
| 171 | |||
| 172 | list_del_init(&entry->g_list); | ||
| 173 | entry->group = NULL; | ||
| 174 | |||
| 175 | fsnotify_put_mark(entry); /* for i_list and g_list */ | ||
| 176 | |||
| 177 | /* | ||
| 178 | * this mark is now off the inode->i_fsnotify_mark_entries list and we | ||
| 179 | * hold the inode->i_lock, so this is the perfect time to update the | ||
| 180 | * inode->i_fsnotify_mask | ||
| 181 | */ | ||
| 182 | fsnotify_recalc_inode_mask_locked(inode); | ||
| 183 | |||
| 184 | spin_unlock(&inode->i_lock); | ||
| 185 | spin_unlock(&group->mark_lock); | ||
| 186 | spin_unlock(&entry->lock); | ||
| 187 | |||
| 188 | /* | ||
| 189 | * Some groups like to know that marks are being freed. This is a | ||
| 190 | * callback to the group function to let it know that this entry | ||
| 191 | * is being freed. | ||
| 192 | */ | ||
| 193 | if (group->ops->freeing_mark) | ||
| 194 | group->ops->freeing_mark(entry, group); | ||
| 195 | |||
| 196 | /* | ||
| 197 | * __fsnotify_update_child_dentry_flags(inode); | ||
| 198 | * | ||
| 199 | * I really want to call that, but we can't: we have no idea whether the inode | ||
| 200 | * still exists the second we drop the entry->lock. | ||
| 201 | * | ||
| 202 | * The next time an event arrives at this inode from one of its children, | ||
| 203 | * __fsnotify_parent will see that the inode doesn't care about its | ||
| 204 | * children and will update all of these flags then. So really this | ||
| 205 | * is just a lazy update (and could be a perf win...) | ||
| 206 | */ | ||
| 207 | |||
| 208 | |||
| 209 | iput(inode); | ||
| 210 | |||
| 211 | /* | ||
| 212 | * it's possible that this group tried to destroy itself, but this | ||
| 213 | * mark was simultaneously being freed by the inode. If that's the | ||
| 214 | * case, we finish freeing the group here. | ||
| 215 | */ | ||
| 216 | if (unlikely(atomic_dec_and_test(&group->num_marks))) | ||
| 217 | fsnotify_final_destroy_group(group); | ||
| 218 | } | ||
| 219 | |||
| 220 | /* | ||
| 221 | * Given a group, destroy all of the marks associated with that group. | ||
| 222 | */ | ||
| 223 | void fsnotify_clear_marks_by_group(struct fsnotify_group *group) | ||
| 224 | { | ||
| 225 | struct fsnotify_mark_entry *lentry, *entry; | ||
| 226 | LIST_HEAD(free_list); | ||
| 227 | |||
| 228 | spin_lock(&group->mark_lock); | ||
| 229 | list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) { | ||
| 230 | list_add(&entry->free_g_list, &free_list); | ||
| 231 | list_del_init(&entry->g_list); | ||
| 232 | fsnotify_get_mark(entry); | ||
| 233 | } | ||
| 234 | spin_unlock(&group->mark_lock); | ||
| 235 | |||
| 236 | list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) { | ||
| 237 | fsnotify_destroy_mark_by_entry(entry); | ||
| 238 | fsnotify_put_mark(entry); | ||
| 239 | } | ||
| 240 | } | ||
| 241 | |||
| 242 | /* | ||
| 243 | * Given an inode, destroy all of the marks associated with that inode. | ||
| 244 | */ | ||
| 245 | void fsnotify_clear_marks_by_inode(struct inode *inode) | ||
| 246 | { | ||
| 247 | struct fsnotify_mark_entry *entry, *lentry; | ||
| 248 | struct hlist_node *pos, *n; | ||
| 249 | LIST_HEAD(free_list); | ||
| 250 | |||
| 251 | spin_lock(&inode->i_lock); | ||
| 252 | hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { | ||
| 253 | list_add(&entry->free_i_list, &free_list); | ||
| 254 | hlist_del_init(&entry->i_list); | ||
| 255 | fsnotify_get_mark(entry); | ||
| 256 | } | ||
| 257 | spin_unlock(&inode->i_lock); | ||
| 258 | |||
| 259 | list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { | ||
| 260 | fsnotify_destroy_mark_by_entry(entry); | ||
| 261 | fsnotify_put_mark(entry); | ||
| 262 | } | ||
| 263 | } | ||
| 264 | |||
| 265 | /* | ||
| 266 | * Given a group and inode, find the mark associated with that combination. | ||
| 267 | * If found, take a reference to that mark and return it; else return NULL. | ||
| 268 | */ | ||
| 269 | struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, | ||
| 270 | struct inode *inode) | ||
| 271 | { | ||
| 272 | struct fsnotify_mark_entry *entry; | ||
| 273 | struct hlist_node *pos; | ||
| 274 | |||
| 275 | assert_spin_locked(&inode->i_lock); | ||
| 276 | |||
| 277 | hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { | ||
| 278 | if (entry->group == group) { | ||
| 279 | fsnotify_get_mark(entry); | ||
| 280 | return entry; | ||
| 281 | } | ||
| 282 | } | ||
| 283 | return NULL; | ||
| 284 | } | ||
| 285 | |||
| 286 | /* | ||
| 287 | * Nothing fancy, just initialize lists and locks and counters. | ||
| 288 | */ | ||
| 289 | void fsnotify_init_mark(struct fsnotify_mark_entry *entry, | ||
| 290 | void (*free_mark)(struct fsnotify_mark_entry *entry)) | ||
| 291 | |||
| 292 | { | ||
| 293 | spin_lock_init(&entry->lock); | ||
| 294 | atomic_set(&entry->refcnt, 1); | ||
| 295 | INIT_HLIST_NODE(&entry->i_list); | ||
| 296 | entry->group = NULL; | ||
| 297 | entry->mask = 0; | ||
| 298 | entry->inode = NULL; | ||
| 299 | entry->free_mark = free_mark; | ||
| 300 | } | ||
| 301 | |||
| 302 | /* | ||
| 303 | * Attach an initialized mark entry to a given group and inode. | ||
| 304 | * These marks are used by the fsnotify backend to determine which | ||
| 305 | * event types should be delivered to which group and for which inodes. | ||
| 306 | */ | ||
| 307 | int fsnotify_add_mark(struct fsnotify_mark_entry *entry, | ||
| 308 | struct fsnotify_group *group, struct inode *inode) | ||
| 309 | { | ||
| 310 | struct fsnotify_mark_entry *lentry; | ||
| 311 | int ret = 0; | ||
| 312 | |||
| 313 | inode = igrab(inode); | ||
| 314 | if (unlikely(!inode)) | ||
| 315 | return -EINVAL; | ||
| 316 | |||
| 317 | /* | ||
| 318 | * LOCKING ORDER!!!! | ||
| 319 | * entry->lock | ||
| 320 | * group->mark_lock | ||
| 321 | * inode->i_lock | ||
| 322 | */ | ||
| 323 | spin_lock(&entry->lock); | ||
| 324 | spin_lock(&group->mark_lock); | ||
| 325 | spin_lock(&inode->i_lock); | ||
| 326 | |||
| 327 | entry->group = group; | ||
| 328 | entry->inode = inode; | ||
| 329 | |||
| 330 | lentry = fsnotify_find_mark_entry(group, inode); | ||
| 331 | if (!lentry) { | ||
| 332 | hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); | ||
| 333 | list_add(&entry->g_list, &group->mark_entries); | ||
| 334 | |||
| 335 | fsnotify_get_mark(entry); /* for i_list and g_list */ | ||
| 336 | |||
| 337 | atomic_inc(&group->num_marks); | ||
| 338 | |||
| 339 | fsnotify_recalc_inode_mask_locked(inode); | ||
| 340 | } | ||
| 341 | |||
| 342 | spin_unlock(&inode->i_lock); | ||
| 343 | spin_unlock(&group->mark_lock); | ||
| 344 | spin_unlock(&entry->lock); | ||
| 345 | |||
| 346 | if (lentry) { | ||
| 347 | ret = -EEXIST; | ||
| 348 | iput(inode); | ||
| 349 | fsnotify_put_mark(lentry); | ||
| 350 | } else { | ||
| 351 | __fsnotify_update_child_dentry_flags(inode); | ||
| 352 | } | ||
| 353 | |||
| 354 | return ret; | ||
| 355 | } | ||
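
A brief sketch of how a backend would use fsnotify_init_mark()/fsnotify_add_mark() together (not part of the patch; my_free_mark() and my_watch_inode() are hypothetical names):

	/* Sketch only: attach a freshly allocated mark to an inode. */
	static void my_free_mark(struct fsnotify_mark_entry *entry)
	{
		kfree(entry);	/* assumes the backend kmalloc'ed the entry */
	}

	static int my_watch_inode(struct fsnotify_group *group, struct inode *inode,
				  __u32 mask)
	{
		struct fsnotify_mark_entry *entry;
		int ret;

		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			return -ENOMEM;

		fsnotify_init_mark(entry, my_free_mark);	/* refcnt starts at 1 */
		entry->mask = mask;

		ret = fsnotify_add_mark(entry, group, inode);	/* -EEXIST if already marked */
		if (ret)
			fsnotify_put_mark(entry);	/* drops to 0, calls my_free_mark */
		return ret;
	}
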
| 356 | |||
| 357 | /** | ||
| 358 | * fsnotify_unmount_inodes - an sb is unmounting; handle any watched inodes. | ||
| 359 | * @list: list of inodes being unmounted (sb->s_inodes) | ||
| 360 | * | ||
| 361 | * Called with inode_lock held, protecting the unmounting super block's list | ||
| 362 | * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. | ||
| 363 | * We temporarily drop inode_lock, however, and CAN block. | ||
| 364 | */ | ||
| 365 | void fsnotify_unmount_inodes(struct list_head *list) | ||
| 366 | { | ||
| 367 | struct inode *inode, *next_i, *need_iput = NULL; | ||
| 368 | |||
| 369 | list_for_each_entry_safe(inode, next_i, list, i_sb_list) { | ||
| 370 | struct inode *need_iput_tmp; | ||
| 371 | |||
| 372 | /* | ||
| 373 | * We cannot __iget() an inode in state I_CLEAR, I_FREEING, | ||
| 374 | * I_WILL_FREE, or I_NEW which is fine because by that point | ||
| 375 | * the inode cannot have any associated watches. | ||
| 376 | */ | ||
| 377 | if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) | ||
| 378 | continue; | ||
| 379 | |||
| 380 | /* | ||
| 381 | * If i_count is zero, the inode cannot have any watches and | ||
| 382 | * doing an __iget/iput with MS_ACTIVE clear would actually | ||
| 383 | * evict all inodes with zero i_count from icache which is | ||
| 384 | * unnecessarily violent and may in fact be illegal to do. | ||
| 385 | */ | ||
| 386 | if (!atomic_read(&inode->i_count)) | ||
| 387 | continue; | ||
| 388 | |||
| 389 | need_iput_tmp = need_iput; | ||
| 390 | need_iput = NULL; | ||
| 391 | |||
| 392 | /* In case fsnotify_inode_delete() drops a reference. */ | ||
| 393 | if (inode != need_iput_tmp) | ||
| 394 | __iget(inode); | ||
| 395 | else | ||
| 396 | need_iput_tmp = NULL; | ||
| 397 | |||
| 398 | /* In case the dropping of a reference would nuke next_i. */ | ||
| 399 | if ((&next_i->i_sb_list != list) && | ||
| 400 | atomic_read(&next_i->i_count) && | ||
| 401 | !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { | ||
| 402 | __iget(next_i); | ||
| 403 | need_iput = next_i; | ||
| 404 | } | ||
| 405 | |||
| 406 | /* | ||
| 407 | * We can safely drop inode_lock here because we hold | ||
| 408 | * references on both inode and next_i. Also no new inodes | ||
| 409 | * will be added since the umount has begun. Finally, | ||
| 410 | * iprune_mutex keeps shrink_icache_memory() away. | ||
| 411 | */ | ||
| 412 | spin_unlock(&inode_lock); | ||
| 413 | |||
| 414 | if (need_iput_tmp) | ||
| 415 | iput(need_iput_tmp); | ||
| 416 | |||
| 417 | /* for each watch, send FS_UNMOUNT and then remove it */ | ||
| 418 | fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); | ||
| 419 | |||
| 420 | fsnotify_inode_delete(inode); | ||
| 421 | |||
| 422 | iput(inode); | ||
| 423 | |||
| 424 | spin_lock(&inode_lock); | ||
| 425 | } | ||
| 426 | } | ||
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 446792841023..5356884289a1 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig | |||
| @@ -1,26 +1,30 @@ | |||
| 1 | config INOTIFY | 1 | config INOTIFY |
| 2 | bool "Inotify file change notification support" | 2 | bool "Inotify file change notification support" |
| 3 | default y | 3 | default n |
| 4 | ---help--- | 4 | ---help--- |
| 5 | Say Y here to enable inotify support. Inotify is a file change | 5 | Say Y here to enable legacy in-kernel inotify support. Inotify is a |
| 6 | notification system and a replacement for dnotify. Inotify fixes | 6 | file change notification system. It is a replacement for dnotify. |
| 7 | numerous shortcomings in dnotify and introduces several new features | 7 | This option only provides the legacy inotify in-kernel API. There |
| 8 | including multiple file events, one-shot support, and unmount | 8 | are no in-tree kernel users of this interface since it is deprecated. |
| 9 | notification. | 9 | You only need this if you are loading an out-of-tree kernel module |
| 10 | that uses inotify. | ||
| 10 | 11 | ||
| 11 | For more information, see <file:Documentation/filesystems/inotify.txt> | 12 | For more information, see <file:Documentation/filesystems/inotify.txt> |
| 12 | 13 | ||
| 13 | If unsure, say Y. | 14 | If unsure, say N. |
| 14 | 15 | ||
| 15 | config INOTIFY_USER | 16 | config INOTIFY_USER |
| 16 | bool "Inotify support for userspace" | 17 | bool "Inotify support for userspace" |
| 17 | depends on INOTIFY | 18 | depends on FSNOTIFY |
| 18 | default y | 19 | default y |
| 19 | ---help--- | 20 | ---help--- |
| 20 | Say Y here to enable inotify support for userspace, including the | 21 | Say Y here to enable inotify support for userspace, including the |
| 21 | associated system calls. Inotify allows monitoring of both files and | 22 | associated system calls. Inotify allows monitoring of both files and |
| 22 | directories via a single open fd. Events are read from the file | 23 | directories via a single open fd. Events are read from the file |
| 23 | descriptor, which is also select()- and poll()-able. | 24 | descriptor, which is also select()- and poll()-able. |
| 25 | Inotify fixes numerous shortcomings in dnotify and introduces several | ||
| 26 | new features including multiple file events, one-shot support, and | ||
| 27 | unmount notification. | ||
| 24 | 28 | ||
| 25 | For more information, see <file:Documentation/filesystems/inotify.txt> | 29 | For more information, see <file:Documentation/filesystems/inotify.txt> |
| 26 | 30 | ||
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile index e290f3bb9d8d..943828171362 100644 --- a/fs/notify/inotify/Makefile +++ b/fs/notify/inotify/Makefile | |||
| @@ -1,2 +1,2 @@ | |||
| 1 | obj-$(CONFIG_INOTIFY) += inotify.o | 1 | obj-$(CONFIG_INOTIFY) += inotify.o |
| 2 | obj-$(CONFIG_INOTIFY_USER) += inotify_user.o | 2 | obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o |
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 220c13f0d73d..40b1cf914ccb 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/list.h> | 32 | #include <linux/list.h> |
| 33 | #include <linux/writeback.h> | 33 | #include <linux/writeback.h> |
| 34 | #include <linux/inotify.h> | 34 | #include <linux/inotify.h> |
| 35 | #include <linux/fsnotify_backend.h> | ||
| 35 | 36 | ||
| 36 | static atomic_t inotify_cookie; | 37 | static atomic_t inotify_cookie; |
| 37 | 38 | ||
| @@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch); | |||
| 905 | */ | 906 | */ |
| 906 | static int __init inotify_setup(void) | 907 | static int __init inotify_setup(void) |
| 907 | { | 908 | { |
| 909 | BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); | ||
| 910 | BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); | ||
| 911 | BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); | ||
| 912 | BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); | ||
| 913 | BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); | ||
| 914 | BUILD_BUG_ON(IN_OPEN != FS_OPEN); | ||
| 915 | BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); | ||
| 916 | BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); | ||
| 917 | BUILD_BUG_ON(IN_CREATE != FS_CREATE); | ||
| 918 | BUILD_BUG_ON(IN_DELETE != FS_DELETE); | ||
| 919 | BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); | ||
| 920 | BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); | ||
| 921 | BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); | ||
| 922 | |||
| 923 | BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); | ||
| 924 | BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); | ||
| 925 | BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); | ||
| 926 | BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); | ||
| 927 | |||
| 908 | atomic_set(&inotify_cookie, 0); | 928 | atomic_set(&inotify_cookie, 0); |
| 909 | 929 | ||
| 910 | return 0; | 930 | return 0; |
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h new file mode 100644 index 000000000000..ea2605a58b8a --- /dev/null +++ b/fs/notify/inotify/inotify.h | |||
| @@ -0,0 +1,21 @@ | |||
| 1 | #include <linux/fsnotify_backend.h> | ||
| 2 | #include <linux/inotify.h> | ||
| 3 | #include <linux/slab.h> /* struct kmem_cache */ | ||
| 4 | |||
| 5 | extern struct kmem_cache *event_priv_cachep; | ||
| 6 | |||
| 7 | struct inotify_event_private_data { | ||
| 8 | struct fsnotify_event_private_data fsnotify_event_priv_data; | ||
| 9 | int wd; | ||
| 10 | }; | ||
| 11 | |||
| 12 | struct inotify_inode_mark_entry { | ||
| 13 | /* fsnotify_mark_entry MUST be the first thing */ | ||
| 14 | struct fsnotify_mark_entry fsn_entry; | ||
| 15 | int wd; | ||
| 16 | }; | ||
| 17 | |||
| 18 | extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); | ||
| 19 | extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); | ||
| 20 | |||
| 21 | extern const struct fsnotify_ops inotify_fsnotify_ops; | ||
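
Because fsnotify_mark_entry is embedded as the first member, the backend recovers its wrapper with container_of(), as inotify_handle_event() does below; a minimal sketch (not part of the patch):

	static int example_wd_from_entry(struct fsnotify_mark_entry *entry)
	{
		struct inotify_inode_mark_entry *ientry;

		/* map the generic mark back to the inotify-specific wrapper */
		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
		return ientry->wd;
	}
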
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c new file mode 100644 index 000000000000..7ef75b83247e --- /dev/null +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
| @@ -0,0 +1,138 @@ | |||
| 1 | /* | ||
| 2 | * fs/notify/inotify/inotify_fsnotify.c - inotify support for userspace | ||
| 3 | * | ||
| 4 | * Authors: | ||
| 5 | * John McCutchan <ttb@tentacle.dhs.org> | ||
| 6 | * Robert Love <rml@novell.com> | ||
| 7 | * | ||
| 8 | * Copyright (C) 2005 John McCutchan | ||
| 9 | * Copyright 2006 Hewlett-Packard Development Company, L.P. | ||
| 10 | * | ||
| 11 | * Copyright (C) 2009 Eric Paris <Red Hat Inc> | ||
| 12 | * inotify was largely rewritten to make use of the fsnotify infrastructure | ||
| 13 | * | ||
| 14 | * This program is free software; you can redistribute it and/or modify it | ||
| 15 | * under the terms of the GNU General Public License as published by the | ||
| 16 | * Free Software Foundation; either version 2, or (at your option) any | ||
| 17 | * later version. | ||
| 18 | * | ||
| 19 | * This program is distributed in the hope that it will be useful, but | ||
| 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 22 | * General Public License for more details. | ||
| 23 | */ | ||
| 24 | |||
| 25 | #include <linux/fs.h> /* struct inode */ | ||
| 26 | #include <linux/fsnotify_backend.h> | ||
| 27 | #include <linux/inotify.h> | ||
| 28 | #include <linux/path.h> /* struct path */ | ||
| 29 | #include <linux/slab.h> /* kmem_* */ | ||
| 30 | #include <linux/types.h> | ||
| 31 | |||
| 32 | #include "inotify.h" | ||
| 33 | |||
| 34 | static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) | ||
| 35 | { | ||
| 36 | struct fsnotify_mark_entry *entry; | ||
| 37 | struct inotify_inode_mark_entry *ientry; | ||
| 38 | struct inode *to_tell; | ||
| 39 | struct inotify_event_private_data *event_priv; | ||
| 40 | struct fsnotify_event_private_data *fsn_event_priv; | ||
| 41 | int wd, ret; | ||
| 42 | |||
| 43 | to_tell = event->to_tell; | ||
| 44 | |||
| 45 | spin_lock(&to_tell->i_lock); | ||
| 46 | entry = fsnotify_find_mark_entry(group, to_tell); | ||
| 47 | spin_unlock(&to_tell->i_lock); | ||
| 48 | /* race with watch removal? We already passed should_send */ | ||
| 49 | if (unlikely(!entry)) | ||
| 50 | return 0; | ||
| 51 | ientry = container_of(entry, struct inotify_inode_mark_entry, | ||
| 52 | fsn_entry); | ||
| 53 | wd = ientry->wd; | ||
| 54 | |||
| 55 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); | ||
| 56 | if (unlikely(!event_priv)) | ||
| 57 | return -ENOMEM; | ||
| 58 | |||
| 59 | fsn_event_priv = &event_priv->fsnotify_event_priv_data; | ||
| 60 | |||
| 61 | fsn_event_priv->group = group; | ||
| 62 | event_priv->wd = wd; | ||
| 63 | |||
| 64 | ret = fsnotify_add_notify_event(group, event, fsn_event_priv); | ||
| 65 | /* EEXIST is not an error */ | ||
| 66 | if (ret == -EEXIST) | ||
| 67 | ret = 0; | ||
| 68 | |||
| 69 | /* did event_priv get attached? */ | ||
| 70 | if (list_empty(&fsn_event_priv->event_list)) | ||
| 71 | inotify_free_event_priv(fsn_event_priv); | ||
| 72 | |||
| 73 | /* | ||
| 74 | * If we hold the entry until after the event is on the queue, | ||
| 75 | * IN_IGNORED won't be able to pass this event in the queue. | ||
| 76 | */ | ||
| 77 | fsnotify_put_mark(entry); | ||
| 78 | |||
| 79 | return ret; | ||
| 80 | } | ||
| 81 | |||
| 82 | static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) | ||
| 83 | { | ||
| 84 | inotify_destroy_mark_entry(entry, group); | ||
| 85 | } | ||
| 86 | |||
| 87 | static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) | ||
| 88 | { | ||
| 89 | struct fsnotify_mark_entry *entry; | ||
| 90 | bool send; | ||
| 91 | |||
| 92 | spin_lock(&inode->i_lock); | ||
| 93 | entry = fsnotify_find_mark_entry(group, inode); | ||
| 94 | spin_unlock(&inode->i_lock); | ||
| 95 | if (!entry) | ||
| 96 | return false; | ||
| 97 | |||
| 98 | mask = (mask & ~FS_EVENT_ON_CHILD); | ||
| 99 | send = (entry->mask & mask); | ||
| 100 | |||
| 101 | /* find took a reference */ | ||
| 102 | fsnotify_put_mark(entry); | ||
| 103 | |||
| 104 | return send; | ||
| 105 | } | ||
| 106 | |||
| 107 | static int idr_callback(int id, void *p, void *data) | ||
| 108 | { | ||
| 109 | BUG(); | ||
| 110 | return 0; | ||
| 111 | } | ||
| 112 | |||
| 113 | static void inotify_free_group_priv(struct fsnotify_group *group) | ||
| 114 | { | ||
| 115 | /* ideally the idr is empty and we won't hit the BUG in the callback */ | ||
| 116 | idr_for_each(&group->inotify_data.idr, idr_callback, NULL); | ||
| 117 | idr_remove_all(&group->inotify_data.idr); | ||
| 118 | idr_destroy(&group->inotify_data.idr); | ||
| 119 | } | ||
| 120 | |||
| 121 | void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) | ||
| 122 | { | ||
| 123 | struct inotify_event_private_data *event_priv; | ||
| 124 | |||
| 125 | |||
| 126 | event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, | ||
| 127 | fsnotify_event_priv_data); | ||
| 128 | |||
| 129 | kmem_cache_free(event_priv_cachep, event_priv); | ||
| 130 | } | ||
| 131 | |||
| 132 | const struct fsnotify_ops inotify_fsnotify_ops = { | ||
| 133 | .handle_event = inotify_handle_event, | ||
| 134 | .should_send_event = inotify_should_send_event, | ||
| 135 | .free_group_priv = inotify_free_group_priv, | ||
| 136 | .free_event_priv = inotify_free_event_priv, | ||
| 137 | .freeing_mark = inotify_freeing_mark, | ||
| 138 | }; | ||
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1634319e2404..982a412ac5bc 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
| @@ -8,6 +8,9 @@ | |||
| 8 | * Copyright (C) 2005 John McCutchan | 8 | * Copyright (C) 2005 John McCutchan |
| 9 | * Copyright 2006 Hewlett-Packard Development Company, L.P. | 9 | * Copyright 2006 Hewlett-Packard Development Company, L.P. |
| 10 | * | 10 | * |
| 11 | * Copyright (C) 2009 Eric Paris <Red Hat Inc> | ||
| 12 | * inotify was largely rewritten to make use of the fsnotify infrastructure | ||
| 13 | * | ||
| 11 | * This program is free software; you can redistribute it and/or modify it | 14 | * This program is free software; you can redistribute it and/or modify it |
| 12 | * under the terms of the GNU General Public License as published by the | 15 | * under the terms of the GNU General Public License as published by the |
| 13 | * Free Software Foundation; either version 2, or (at your option) any | 16 | * Free Software Foundation; either version 2, or (at your option) any |
| @@ -19,94 +22,48 @@ | |||
| 19 | * General Public License for more details. | 22 | * General Public License for more details. |
| 20 | */ | 23 | */ |
| 21 | 24 | ||
| 22 | #include <linux/kernel.h> | ||
| 23 | #include <linux/sched.h> | ||
| 24 | #include <linux/slab.h> | ||
| 25 | #include <linux/fs.h> | ||
| 26 | #include <linux/file.h> | 25 | #include <linux/file.h> |
| 27 | #include <linux/mount.h> | 26 | #include <linux/fs.h> /* struct inode */ |
| 28 | #include <linux/namei.h> | 27 | #include <linux/fsnotify_backend.h> |
| 29 | #include <linux/poll.h> | 28 | #include <linux/idr.h> |
| 30 | #include <linux/init.h> | 29 | #include <linux/init.h> /* module_init */ |
| 31 | #include <linux/list.h> | ||
| 32 | #include <linux/inotify.h> | 30 | #include <linux/inotify.h> |
| 31 | #include <linux/kernel.h> /* roundup() */ | ||
| 32 | #include <linux/magic.h> /* superblock magic number */ | ||
| 33 | #include <linux/mount.h> /* mntget */ | ||
| 34 | #include <linux/namei.h> /* LOOKUP_FOLLOW */ | ||
| 35 | #include <linux/path.h> /* struct path */ | ||
| 36 | #include <linux/sched.h> /* struct user */ | ||
| 37 | #include <linux/slab.h> /* struct kmem_cache */ | ||
| 33 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
| 34 | #include <linux/magic.h> | 39 | #include <linux/types.h> |
| 40 | #include <linux/uaccess.h> | ||
| 41 | #include <linux/poll.h> | ||
| 42 | #include <linux/wait.h> | ||
| 35 | 43 | ||
| 36 | #include <asm/ioctls.h> | 44 | #include "inotify.h" |
| 37 | 45 | ||
| 38 | static struct kmem_cache *watch_cachep __read_mostly; | 46 | #include <asm/ioctls.h> |
| 39 | static struct kmem_cache *event_cachep __read_mostly; | ||
| 40 | 47 | ||
| 41 | static struct vfsmount *inotify_mnt __read_mostly; | 48 | static struct vfsmount *inotify_mnt __read_mostly; |
| 42 | 49 | ||
| 50 | /* this just sits here and wastes global memory; it is used only to pad userspace messages with zeros */ | ||
| 51 | static struct inotify_event nul_inotify_event; | ||
| 52 | |||
| 43 | /* these are configurable via /proc/sys/fs/inotify/ */ | 53 | /* these are configurable via /proc/sys/fs/inotify/ */ |
| 44 | static int inotify_max_user_instances __read_mostly; | 54 | static int inotify_max_user_instances __read_mostly; |
| 45 | static int inotify_max_user_watches __read_mostly; | ||
| 46 | static int inotify_max_queued_events __read_mostly; | 55 | static int inotify_max_queued_events __read_mostly; |
| 56 | int inotify_max_user_watches __read_mostly; | ||
| 47 | 57 | ||
| 48 | /* | 58 | static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; |
| 49 | * Lock ordering: | 59 | struct kmem_cache *event_priv_cachep __read_mostly; |
| 50 | * | 60 | static struct fsnotify_event *inotify_ignored_event; |
| 51 | * inotify_dev->up_mutex (ensures we don't re-add the same watch) | ||
| 52 | * inode->inotify_mutex (protects inode's watch list) | ||
| 53 | * inotify_handle->mutex (protects inotify_handle's watch list) | ||
| 54 | * inotify_dev->ev_mutex (protects device's event queue) | ||
| 55 | */ | ||
| 56 | 61 | ||
| 57 | /* | 62 | /* |
| 58 | * Lifetimes of the main data structures: | 63 | * When inotify registers a new group it increments this and uses that |
| 59 | * | 64 | * value as an offset to set the fsnotify group "name" and priority. |
| 60 | * inotify_device: Lifetime is managed by reference count, from | ||
| 61 | * sys_inotify_init() until release. Additional references can bump the count | ||
| 62 | * via get_inotify_dev() and drop the count via put_inotify_dev(). | ||
| 63 | * | ||
| 64 | * inotify_user_watch: Lifetime is from create_watch() to the receipt of an | ||
| 65 | * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the | ||
| 66 | * first event, or to inotify_destroy(). | ||
| 67 | */ | 65 | */ |
| 68 | 66 | static atomic_t inotify_grp_num; | |
| 69 | /* | ||
| 70 | * struct inotify_device - represents an inotify instance | ||
| 71 | * | ||
| 72 | * This structure is protected by the mutex 'mutex'. | ||
| 73 | */ | ||
| 74 | struct inotify_device { | ||
| 75 | wait_queue_head_t wq; /* wait queue for i/o */ | ||
| 76 | struct mutex ev_mutex; /* protects event queue */ | ||
| 77 | struct mutex up_mutex; /* synchronizes watch updates */ | ||
| 78 | struct list_head events; /* list of queued events */ | ||
| 79 | struct user_struct *user; /* user who opened this dev */ | ||
| 80 | struct inotify_handle *ih; /* inotify handle */ | ||
| 81 | struct fasync_struct *fa; /* async notification */ | ||
| 82 | atomic_t count; /* reference count */ | ||
| 83 | unsigned int queue_size; /* size of the queue (bytes) */ | ||
| 84 | unsigned int event_count; /* number of pending events */ | ||
| 85 | unsigned int max_events; /* maximum number of events */ | ||
| 86 | }; | ||
| 87 | |||
| 88 | /* | ||
| 89 | * struct inotify_kernel_event - An inotify event, originating from a watch and | ||
| 90 | * queued for user-space. A list of these is attached to each instance of the | ||
| 91 | * device. In read(), this list is walked and all events that can fit in the | ||
| 92 | * buffer are returned. | ||
| 93 | * | ||
| 94 | * Protected by dev->ev_mutex of the device in which we are queued. | ||
| 95 | */ | ||
| 96 | struct inotify_kernel_event { | ||
| 97 | struct inotify_event event; /* the user-space event */ | ||
| 98 | struct list_head list; /* entry in inotify_device's list */ | ||
| 99 | char *name; /* filename, if any */ | ||
| 100 | }; | ||
| 101 | |||
| 102 | /* | ||
| 103 | * struct inotify_user_watch - our version of an inotify_watch, we add | ||
| 104 | * a reference to the associated inotify_device. | ||
| 105 | */ | ||
| 106 | struct inotify_user_watch { | ||
| 107 | struct inotify_device *dev; /* associated device */ | ||
| 108 | struct inotify_watch wdata; /* inotify watch data */ | ||
| 109 | }; | ||
| 110 | 67 | ||
| 111 | #ifdef CONFIG_SYSCTL | 68 | #ifdef CONFIG_SYSCTL |
| 112 | 69 | ||
| @@ -149,280 +106,36 @@ ctl_table inotify_table[] = { | |||
| 149 | }; | 106 | }; |
| 150 | #endif /* CONFIG_SYSCTL */ | 107 | #endif /* CONFIG_SYSCTL */ |
| 151 | 108 | ||
| 152 | static inline void get_inotify_dev(struct inotify_device *dev) | 109 | static inline __u32 inotify_arg_to_mask(u32 arg) |
| 153 | { | ||
| 154 | atomic_inc(&dev->count); | ||
| 155 | } | ||
| 156 | |||
| 157 | static inline void put_inotify_dev(struct inotify_device *dev) | ||
| 158 | { | ||
| 159 | if (atomic_dec_and_test(&dev->count)) { | ||
| 160 | atomic_dec(&dev->user->inotify_devs); | ||
| 161 | free_uid(dev->user); | ||
| 162 | kfree(dev); | ||
| 163 | } | ||
| 164 | } | ||
| 165 | |||
| 166 | /* | ||
| 167 | * free_inotify_user_watch - cleans up the watch and its references | ||
| 168 | */ | ||
| 169 | static void free_inotify_user_watch(struct inotify_watch *w) | ||
| 170 | { | ||
| 171 | struct inotify_user_watch *watch; | ||
| 172 | struct inotify_device *dev; | ||
| 173 | |||
| 174 | watch = container_of(w, struct inotify_user_watch, wdata); | ||
| 175 | dev = watch->dev; | ||
| 176 | |||
| 177 | atomic_dec(&dev->user->inotify_watches); | ||
| 178 | put_inotify_dev(dev); | ||
| 179 | kmem_cache_free(watch_cachep, watch); | ||
| 180 | } | ||
| 181 | |||
| 182 | /* | ||
| 183 | * kernel_event - create a new kernel event with the given parameters | ||
| 184 | * | ||
| 185 | * This function can sleep. | ||
| 186 | */ | ||
| 187 | static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, | ||
| 188 | const char *name) | ||
| 189 | { | ||
| 190 | struct inotify_kernel_event *kevent; | ||
| 191 | |||
| 192 | kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); | ||
| 193 | if (unlikely(!kevent)) | ||
| 194 | return NULL; | ||
| 195 | |||
| 196 | /* we hand this out to user-space, so zero it just in case */ | ||
| 197 | memset(&kevent->event, 0, sizeof(struct inotify_event)); | ||
| 198 | |||
| 199 | kevent->event.wd = wd; | ||
| 200 | kevent->event.mask = mask; | ||
| 201 | kevent->event.cookie = cookie; | ||
| 202 | |||
| 203 | INIT_LIST_HEAD(&kevent->list); | ||
| 204 | |||
| 205 | if (name) { | ||
| 206 | size_t len, rem, event_size = sizeof(struct inotify_event); | ||
| 207 | |||
| 208 | /* | ||
| 209 | * We need to pad the filename so as to properly align an | ||
| 210 | * array of inotify_event structures. Because the structure is | ||
| 211 | * small and the common case is a small filename, we just round | ||
| 212 | * up to the next multiple of the structure's sizeof. This is | ||
| 213 | * simple and safe for all architectures. | ||
| 214 | */ | ||
| 215 | len = strlen(name) + 1; | ||
| 216 | rem = event_size - len; | ||
| 217 | if (len > event_size) { | ||
| 218 | rem = event_size - (len % event_size); | ||
| 219 | if (len % event_size == 0) | ||
| 220 | rem = 0; | ||
| 221 | } | ||
| 222 | |||
| 223 | kevent->name = kmalloc(len + rem, GFP_NOFS); | ||
| 224 | if (unlikely(!kevent->name)) { | ||
| 225 | kmem_cache_free(event_cachep, kevent); | ||
| 226 | return NULL; | ||
| 227 | } | ||
| 228 | memcpy(kevent->name, name, len); | ||
| 229 | if (rem) | ||
| 230 | memset(kevent->name + len, 0, rem); | ||
| 231 | kevent->event.len = len + rem; | ||
| 232 | } else { | ||
| 233 | kevent->event.len = 0; | ||
| 234 | kevent->name = NULL; | ||
| 235 | } | ||
| 236 | |||
| 237 | return kevent; | ||
| 238 | } | ||
| 239 | |||
| 240 | /* | ||
| 241 | * inotify_dev_get_event - return the next event in the given dev's queue | ||
| 242 | * | ||
| 243 | * Caller must hold dev->ev_mutex. | ||
| 244 | */ | ||
| 245 | static inline struct inotify_kernel_event * | ||
| 246 | inotify_dev_get_event(struct inotify_device *dev) | ||
| 247 | { | ||
| 248 | return list_entry(dev->events.next, struct inotify_kernel_event, list); | ||
| 249 | } | ||
| 250 | |||
| 251 | /* | ||
| 252 | * inotify_dev_get_last_event - return the last event in the given dev's queue | ||
| 253 | * | ||
| 254 | * Caller must hold dev->ev_mutex. | ||
| 255 | */ | ||
| 256 | static inline struct inotify_kernel_event * | ||
| 257 | inotify_dev_get_last_event(struct inotify_device *dev) | ||
| 258 | { | 110 | { |
| 259 | if (list_empty(&dev->events)) | 111 | __u32 mask; |
| 260 | return NULL; | ||
| 261 | return list_entry(dev->events.prev, struct inotify_kernel_event, list); | ||
| 262 | } | ||
| 263 | 112 | ||
| 264 | /* | 113 | /* everything should accept their own ignored and cares about children */ |
| 265 | * inotify_dev_queue_event - event handler registered with core inotify, adds | 114 | mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); |
| 266 | * a new event to the given device | ||
| 267 | * | ||
| 268 | * Can sleep (calls kernel_event()). | ||
| 269 | */ | ||
| 270 | static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask, | ||
| 271 | u32 cookie, const char *name, | ||
| 272 | struct inode *ignored) | ||
| 273 | { | ||
| 274 | struct inotify_user_watch *watch; | ||
| 275 | struct inotify_device *dev; | ||
| 276 | struct inotify_kernel_event *kevent, *last; | ||
| 277 | 115 | ||
| 278 | watch = container_of(w, struct inotify_user_watch, wdata); | 116 | /* mask off the flags used to open the fd */ |
| 279 | dev = watch->dev; | 117 | mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); |
| 280 | 118 | ||
| 281 | mutex_lock(&dev->ev_mutex); | 119 | return mask; |
| 282 | |||
| 283 | /* we can safely put the watch as we don't reference it while | ||
| 284 | * generating the event | ||
| 285 | */ | ||
| 286 | if (mask & IN_IGNORED || w->mask & IN_ONESHOT) | ||
| 287 | put_inotify_watch(w); /* final put */ | ||
| 288 | |||
| 289 | /* coalescing: drop this event if it is a dupe of the previous */ | ||
| 290 | last = inotify_dev_get_last_event(dev); | ||
| 291 | if (last && last->event.mask == mask && last->event.wd == wd && | ||
| 292 | last->event.cookie == cookie) { | ||
| 293 | const char *lastname = last->name; | ||
| 294 | |||
| 295 | if (!name && !lastname) | ||
| 296 | goto out; | ||
| 297 | if (name && lastname && !strcmp(lastname, name)) | ||
| 298 | goto out; | ||
| 299 | } | ||
| 300 | |||
| 301 | /* the queue overflowed and we already sent the Q_OVERFLOW event */ | ||
| 302 | if (unlikely(dev->event_count > dev->max_events)) | ||
| 303 | goto out; | ||
| 304 | |||
| 305 | /* if the queue overflows, we need to notify user space */ | ||
| 306 | if (unlikely(dev->event_count == dev->max_events)) | ||
| 307 | kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); | ||
| 308 | else | ||
| 309 | kevent = kernel_event(wd, mask, cookie, name); | ||
| 310 | |||
| 311 | if (unlikely(!kevent)) | ||
| 312 | goto out; | ||
| 313 | |||
| 314 | /* queue the event and wake up anyone waiting */ | ||
| 315 | dev->event_count++; | ||
| 316 | dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; | ||
| 317 | list_add_tail(&kevent->list, &dev->events); | ||
| 318 | wake_up_interruptible(&dev->wq); | ||
| 319 | kill_fasync(&dev->fa, SIGIO, POLL_IN); | ||
| 320 | |||
| 321 | out: | ||
| 322 | mutex_unlock(&dev->ev_mutex); | ||
| 323 | } | ||
| 324 | |||
| 325 | /* | ||
| 326 | * remove_kevent - cleans up the given kevent | ||
| 327 | * | ||
| 328 | * Caller must hold dev->ev_mutex. | ||
| 329 | */ | ||
| 330 | static void remove_kevent(struct inotify_device *dev, | ||
| 331 | struct inotify_kernel_event *kevent) | ||
| 332 | { | ||
| 333 | list_del(&kevent->list); | ||
| 334 | |||
| 335 | dev->event_count--; | ||
| 336 | dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; | ||
| 337 | } | ||
| 338 | |||
| 339 | /* | ||
| 340 | * free_kevent - frees the given kevent. | ||
| 341 | */ | ||
| 342 | static void free_kevent(struct inotify_kernel_event *kevent) | ||
| 343 | { | ||
| 344 | kfree(kevent->name); | ||
| 345 | kmem_cache_free(event_cachep, kevent); | ||
| 346 | } | ||
| 347 | |||
| 348 | /* | ||
| 349 | * inotify_dev_event_dequeue - destroy an event on the given device | ||
| 350 | * | ||
| 351 | * Caller must hold dev->ev_mutex. | ||
| 352 | */ | ||
| 353 | static void inotify_dev_event_dequeue(struct inotify_device *dev) | ||
| 354 | { | ||
| 355 | if (!list_empty(&dev->events)) { | ||
| 356 | struct inotify_kernel_event *kevent; | ||
| 357 | kevent = inotify_dev_get_event(dev); | ||
| 358 | remove_kevent(dev, kevent); | ||
| 359 | free_kevent(kevent); | ||
| 360 | } | ||
| 361 | } | ||
| 362 | |||
| 363 | /* | ||
| 364 | * find_inode - resolve a user-given path to a specific inode | ||
| 365 | */ | ||
| 366 | static int find_inode(const char __user *dirname, struct path *path, | ||
| 367 | unsigned flags) | ||
| 368 | { | ||
| 369 | int error; | ||
| 370 | |||
| 371 | error = user_path_at(AT_FDCWD, dirname, flags, path); | ||
| 372 | if (error) | ||
| 373 | return error; | ||
| 374 | /* you can only watch an inode if you have read permissions on it */ | ||
| 375 | error = inode_permission(path->dentry->d_inode, MAY_READ); | ||
| 376 | if (error) | ||
| 377 | path_put(path); | ||
| 378 | return error; | ||
| 379 | } | 120 | } |
| 380 | 121 | ||
| 381 | /* | 122 | static inline u32 inotify_mask_to_arg(__u32 mask) |
| 382 | * create_watch - creates a watch on the given device. | ||
| 383 | * | ||
| 384 | * Callers must hold dev->up_mutex. | ||
| 385 | */ | ||
| 386 | static int create_watch(struct inotify_device *dev, struct inode *inode, | ||
| 387 | u32 mask) | ||
| 388 | { | 123 | { |
| 389 | struct inotify_user_watch *watch; | 124 | return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | |
| 390 | int ret; | 125 | IN_Q_OVERFLOW); |
| 391 | |||
| 392 | if (atomic_read(&dev->user->inotify_watches) >= | ||
| 393 | inotify_max_user_watches) | ||
| 394 | return -ENOSPC; | ||
| 395 | |||
| 396 | watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); | ||
| 397 | if (unlikely(!watch)) | ||
| 398 | return -ENOMEM; | ||
| 399 | |||
| 400 | /* save a reference to device and bump the count to make it official */ | ||
| 401 | get_inotify_dev(dev); | ||
| 402 | watch->dev = dev; | ||
| 403 | |||
| 404 | atomic_inc(&dev->user->inotify_watches); | ||
| 405 | |||
| 406 | inotify_init_watch(&watch->wdata); | ||
| 407 | ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); | ||
| 408 | if (ret < 0) | ||
| 409 | free_inotify_user_watch(&watch->wdata); | ||
| 410 | |||
| 411 | return ret; | ||
| 412 | } | 126 | } |
| 413 | 127 | ||
| 414 | /* Device Interface */ | 128 | /* inotify userspace file descriptor functions */ |
| 415 | |||
| 416 | static unsigned int inotify_poll(struct file *file, poll_table *wait) | 129 | static unsigned int inotify_poll(struct file *file, poll_table *wait) |
| 417 | { | 130 | { |
| 418 | struct inotify_device *dev = file->private_data; | 131 | struct fsnotify_group *group = file->private_data; |
| 419 | int ret = 0; | 132 | int ret = 0; |
| 420 | 133 | ||
| 421 | poll_wait(file, &dev->wq, wait); | 134 | poll_wait(file, &group->notification_waitq, wait); |
| 422 | mutex_lock(&dev->ev_mutex); | 135 | mutex_lock(&group->notification_mutex); |
| 423 | if (!list_empty(&dev->events)) | 136 | if (!fsnotify_notify_queue_is_empty(group)) |
| 424 | ret = POLLIN | POLLRDNORM; | 137 | ret = POLLIN | POLLRDNORM; |
| 425 | mutex_unlock(&dev->ev_mutex); | 138 | mutex_unlock(&group->notification_mutex); |
| 426 | 139 | ||
| 427 | return ret; | 140 | return ret; |
| 428 | } | 141 | } |
| @@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) | |||
| 432 | * enough to fit in "count". Return an error pointer if | 145 | * enough to fit in "count". Return an error pointer if |
| 433 | * not large enough. | 146 | * not large enough. |
| 434 | * | 147 | * |
| 435 | * Called with the device ev_mutex held. | 148 | * Called with the group->notification_mutex held. |
| 436 | */ | 149 | */ |
| 437 | static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, | 150 | static struct fsnotify_event *get_one_event(struct fsnotify_group *group, |
| 438 | size_t count) | 151 | size_t count) |
| 439 | { | 152 | { |
| 440 | size_t event_size = sizeof(struct inotify_event); | 153 | size_t event_size = sizeof(struct inotify_event); |
| 441 | struct inotify_kernel_event *kevent; | 154 | struct fsnotify_event *event; |
| 442 | 155 | ||
| 443 | if (list_empty(&dev->events)) | 156 | if (fsnotify_notify_queue_is_empty(group)) |
| 444 | return NULL; | 157 | return NULL; |
| 445 | 158 | ||
| 446 | kevent = inotify_dev_get_event(dev); | 159 | event = fsnotify_peek_notify_event(group); |
| 447 | if (kevent->name) | 160 | |
| 448 | event_size += kevent->event.len; | 161 | event_size += roundup(event->name_len, event_size); |
| 449 | 162 | ||
| 450 | if (event_size > count) | 163 | if (event_size > count) |
| 451 | return ERR_PTR(-EINVAL); | 164 | return ERR_PTR(-EINVAL); |
| 452 | 165 | ||
| 453 | remove_kevent(dev, kevent); | 166 | /* held the notification_mutex the whole time, so this is the |
| 454 | return kevent; | 167 | * same event we peeked above */ |
| 168 | fsnotify_remove_notify_event(group); | ||
| 169 | |||
| 170 | return event; | ||
| 455 | } | 171 | } |
| 456 | 172 | ||
| 457 | /* | 173 | /* |
| @@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, | |||
| 460 | * We already checked that the event size is smaller than the | 176 | * We already checked that the event size is smaller than the |
| 461 | * buffer we had in "get_one_event()" above. | 177 | * buffer we had in "get_one_event()" above. |
| 462 | */ | 178 | */ |
| 463 | static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, | 179 | static ssize_t copy_event_to_user(struct fsnotify_group *group, |
| 180 | struct fsnotify_event *event, | ||
| 464 | char __user *buf) | 181 | char __user *buf) |
| 465 | { | 182 | { |
| 183 | struct inotify_event inotify_event; | ||
| 184 | struct fsnotify_event_private_data *fsn_priv; | ||
| 185 | struct inotify_event_private_data *priv; | ||
| 466 | size_t event_size = sizeof(struct inotify_event); | 186 | size_t event_size = sizeof(struct inotify_event); |
| 187 | size_t name_len; | ||
| 188 | |||
| 189 | /* we get the inotify watch descriptor from the event private data */ | ||
| 190 | spin_lock(&event->lock); | ||
| 191 | fsn_priv = fsnotify_remove_priv_from_event(group, event); | ||
| 192 | spin_unlock(&event->lock); | ||
| 193 | |||
| 194 | if (!fsn_priv) | ||
| 195 | inotify_event.wd = -1; | ||
| 196 | else { | ||
| 197 | priv = container_of(fsn_priv, struct inotify_event_private_data, | ||
| 198 | fsnotify_event_priv_data); | ||
| 199 | inotify_event.wd = priv->wd; | ||
| 200 | inotify_free_event_priv(fsn_priv); | ||
| 201 | } | ||
| 202 | |||
| 203 | /* round up event->name_len so it is a multiple of event_size */ | ||
| 204 | name_len = roundup(event->name_len, event_size); | ||
| 205 | inotify_event.len = name_len; | ||
| 206 | |||
| 207 | inotify_event.mask = inotify_mask_to_arg(event->mask); | ||
| 208 | inotify_event.cookie = event->sync_cookie; | ||
| 467 | 209 | ||
| 468 | if (copy_to_user(buf, &kevent->event, event_size)) | 210 | /* send the main event */ |
| 211 | if (copy_to_user(buf, &inotify_event, event_size)) | ||
| 469 | return -EFAULT; | 212 | return -EFAULT; |
| 470 | 213 | ||
| 471 | if (kevent->name) { | 214 | buf += event_size; |
| 472 | buf += event_size; | ||
| 473 | 215 | ||
| 474 | if (copy_to_user(buf, kevent->name, kevent->event.len)) | 216 | /* |
| 217 | * fsnotify only stores the pathname, so here we have to send the pathname | ||
| 218 | * and then pad that pathname out to a multiple of sizeof(inotify_event) | ||
| 219 | * with zeros. I get my zeros from the nul_inotify_event. | ||
| 220 | */ | ||
| 221 | if (name_len) { | ||
| 222 | unsigned int len_to_zero = name_len - event->name_len; | ||
| 223 | /* copy the path name */ | ||
| 224 | if (copy_to_user(buf, event->file_name, event->name_len)) | ||
| 475 | return -EFAULT; | 225 | return -EFAULT; |
| 226 | buf += event->name_len; | ||
| 476 | 227 | ||
| 477 | event_size += kevent->event.len; | 228 | /* fill userspace with 0's from nul_inotify_event */ |
| 229 | if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) | ||
| 230 | return -EFAULT; | ||
| 231 | buf += len_to_zero; | ||
| 232 | event_size += name_len; | ||
| 478 | } | 233 | } |
| 234 | |||
| 479 | return event_size; | 235 | return event_size; |
| 480 | } | 236 | } |
| 481 | 237 | ||
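The padding logic above is exactly what userspace observes: inotify_event.len carries the name length rounded up to a multiple of sizeof(struct inotify_event) and zero-filled, so a reader walks the buffer in sizeof(struct inotify_event) + len steps. A minimal userspace sketch of that read loop follows; the watched path "/tmp" and the mask are assumptions for illustration only, not part of the patch.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/inotify.h>

	int main(void)
	{
		char buf[4096];
		int fd = inotify_init();

		if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE) < 0)
			return 1;

		ssize_t n = read(fd, buf, sizeof(buf));	/* blocks until events arrive */
		for (char *p = buf; p < buf + n; ) {
			struct inotify_event *ev = (struct inotify_event *)p;

			/* ev->len is the name length rounded up and zero-padded */
			printf("wd=%d mask=0x%x name=%.*s\n", ev->wd, ev->mask,
			       (int)ev->len, ev->len ? ev->name : "");
			p += sizeof(*ev) + ev->len;
		}
		close(fd);
		return 0;
	}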
| 482 | static ssize_t inotify_read(struct file *file, char __user *buf, | 238 | static ssize_t inotify_read(struct file *file, char __user *buf, |
| 483 | size_t count, loff_t *pos) | 239 | size_t count, loff_t *pos) |
| 484 | { | 240 | { |
| 485 | struct inotify_device *dev; | 241 | struct fsnotify_group *group; |
| 242 | struct fsnotify_event *kevent; | ||
| 486 | char __user *start; | 243 | char __user *start; |
| 487 | int ret; | 244 | int ret; |
| 488 | DEFINE_WAIT(wait); | 245 | DEFINE_WAIT(wait); |
| 489 | 246 | ||
| 490 | start = buf; | 247 | start = buf; |
| 491 | dev = file->private_data; | 248 | group = file->private_data; |
| 492 | 249 | ||
| 493 | while (1) { | 250 | while (1) { |
| 494 | struct inotify_kernel_event *kevent; | 251 | prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); |
| 495 | 252 | ||
| 496 | prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); | 253 | mutex_lock(&group->notification_mutex); |
| 497 | 254 | kevent = get_one_event(group, count); | |
| 498 | mutex_lock(&dev->ev_mutex); | 255 | mutex_unlock(&group->notification_mutex); |
| 499 | kevent = get_one_event(dev, count); | ||
| 500 | mutex_unlock(&dev->ev_mutex); | ||
| 501 | 256 | ||
| 502 | if (kevent) { | 257 | if (kevent) { |
| 503 | ret = PTR_ERR(kevent); | 258 | ret = PTR_ERR(kevent); |
| 504 | if (IS_ERR(kevent)) | 259 | if (IS_ERR(kevent)) |
| 505 | break; | 260 | break; |
| 506 | ret = copy_event_to_user(kevent, buf); | 261 | ret = copy_event_to_user(group, kevent, buf); |
| 507 | free_kevent(kevent); | 262 | fsnotify_put_event(kevent); |
| 508 | if (ret < 0) | 263 | if (ret < 0) |
| 509 | break; | 264 | break; |
| 510 | buf += ret; | 265 | buf += ret; |
| @@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, | |||
| 525 | schedule(); | 280 | schedule(); |
| 526 | } | 281 | } |
| 527 | 282 | ||
| 528 | finish_wait(&dev->wq, &wait); | 283 | finish_wait(&group->notification_waitq, &wait); |
| 529 | if (start != buf && ret != -EFAULT) | 284 | if (start != buf && ret != -EFAULT) |
| 530 | ret = buf - start; | 285 | ret = buf - start; |
| 531 | return ret; | 286 | return ret; |
| @@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf, | |||
| 533 | 288 | ||
| 534 | static int inotify_fasync(int fd, struct file *file, int on) | 289 | static int inotify_fasync(int fd, struct file *file, int on) |
| 535 | { | 290 | { |
| 536 | struct inotify_device *dev = file->private_data; | 291 | struct fsnotify_group *group = file->private_data; |
| 537 | 292 | ||
| 538 | return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; | 293 | return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO; |
| 539 | } | 294 | } |
| 540 | 295 | ||
| 541 | static int inotify_release(struct inode *ignored, struct file *file) | 296 | static int inotify_release(struct inode *ignored, struct file *file) |
| 542 | { | 297 | { |
| 543 | struct inotify_device *dev = file->private_data; | 298 | struct fsnotify_group *group = file->private_data; |
| 544 | |||
| 545 | inotify_destroy(dev->ih); | ||
| 546 | 299 | ||
| 547 | /* destroy all of the events on this device */ | 300 | fsnotify_clear_marks_by_group(group); |
| 548 | mutex_lock(&dev->ev_mutex); | ||
| 549 | while (!list_empty(&dev->events)) | ||
| 550 | inotify_dev_event_dequeue(dev); | ||
| 551 | mutex_unlock(&dev->ev_mutex); | ||
| 552 | 301 | ||
| 553 | /* free this device: the put matching the get in inotify_init() */ | 302 | /* free this group, matching get was inotify_init->fsnotify_obtain_group */ |
| 554 | put_inotify_dev(dev); | 303 | fsnotify_put_group(group); |
| 555 | 304 | ||
| 556 | return 0; | 305 | return 0; |
| 557 | } | 306 | } |
| @@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file) | |||
| 559 | static long inotify_ioctl(struct file *file, unsigned int cmd, | 308 | static long inotify_ioctl(struct file *file, unsigned int cmd, |
| 560 | unsigned long arg) | 309 | unsigned long arg) |
| 561 | { | 310 | { |
| 562 | struct inotify_device *dev; | 311 | struct fsnotify_group *group; |
| 312 | struct fsnotify_event_holder *holder; | ||
| 313 | struct fsnotify_event *event; | ||
| 563 | void __user *p; | 314 | void __user *p; |
| 564 | int ret = -ENOTTY; | 315 | int ret = -ENOTTY; |
| 316 | size_t send_len = 0; | ||
| 565 | 317 | ||
| 566 | dev = file->private_data; | 318 | group = file->private_data; |
| 567 | p = (void __user *) arg; | 319 | p = (void __user *) arg; |
| 568 | 320 | ||
| 569 | switch (cmd) { | 321 | switch (cmd) { |
| 570 | case FIONREAD: | 322 | case FIONREAD: |
| 571 | ret = put_user(dev->queue_size, (int __user *) p); | 323 | mutex_lock(&group->notification_mutex); |
| 324 | list_for_each_entry(holder, &group->notification_list, event_list) { | ||
| 325 | event = holder->event; | ||
| 326 | send_len += sizeof(struct inotify_event); | ||
| 327 | send_len += roundup(event->name_len, | ||
| 328 | sizeof(struct inotify_event)); | ||
| 329 | } | ||
| 330 | mutex_unlock(&group->notification_mutex); | ||
| 331 | ret = put_user(send_len, (int __user *) p); | ||
| 572 | break; | 332 | break; |
| 573 | } | 333 | } |
| 574 | 334 | ||
| @@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, | |||
| 576 | } | 336 | } |
| 577 | 337 | ||
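Note that FIONREAD is now computed by walking the notification list and summing each event header plus its padded name, instead of returning a cached dev->queue_size. A hedged sketch of how the value is typically consumed from userspace (helper name is made up for illustration):

	#include <sys/ioctl.h>
	#include <sys/inotify.h>

	/* Report how many bytes a read() on the inotify fd would currently return. */
	static int queued_bytes(int inotify_fd)
	{
		int pending = 0;

		if (ioctl(inotify_fd, FIONREAD, &pending) < 0)
			return -1;
		/* sum of sizeof(struct inotify_event) + padded name length per event */
		return pending;
	}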
| 578 | static const struct file_operations inotify_fops = { | 338 | static const struct file_operations inotify_fops = { |
| 579 | .poll = inotify_poll, | 339 | .poll = inotify_poll, |
| 580 | .read = inotify_read, | 340 | .read = inotify_read, |
| 581 | .fasync = inotify_fasync, | 341 | .fasync = inotify_fasync, |
| 582 | .release = inotify_release, | 342 | .release = inotify_release, |
| 583 | .unlocked_ioctl = inotify_ioctl, | 343 | .unlocked_ioctl = inotify_ioctl, |
| 584 | .compat_ioctl = inotify_ioctl, | 344 | .compat_ioctl = inotify_ioctl, |
| 585 | }; | 345 | }; |
| 586 | 346 | ||
| 587 | static const struct inotify_operations inotify_user_ops = { | ||
| 588 | .handle_event = inotify_dev_queue_event, | ||
| 589 | .destroy_watch = free_inotify_user_watch, | ||
| 590 | }; | ||
| 591 | 347 | ||
| 348 | /* | ||
| 349 | * inotify_find_inode - resolve a user-given path to a specific inode | ||
| 350 | */ | ||
| 351 | static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) | ||
| 352 | { | ||
| 353 | int error; | ||
| 354 | |||
| 355 | error = user_path_at(AT_FDCWD, dirname, flags, path); | ||
| 356 | if (error) | ||
| 357 | return error; | ||
| 358 | /* you can only watch an inode if you have read permissions on it */ | ||
| 359 | error = inode_permission(path->dentry->d_inode, MAY_READ); | ||
| 360 | if (error) | ||
| 361 | path_put(path); | ||
| 362 | return error; | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * When, for whatever reason, inotify is done with a mark (or what used to be a | ||
| 367 | * watch) we need to remove that watch from the idr and we need to send IN_IGNORED | ||
| 368 | * for the given wd. | ||
| 369 | * | ||
| 370 | * There is a bit of recursion here. The loop looks like: | ||
| 371 | * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry -> | ||
| 372 | * inotify_freeing_mark -> inotify_destroy_mark_entry -> restart | ||
| 373 | * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets | ||
| 374 | * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup) | ||
| 375 | * test below will not call back to fsnotify again. But even if that test wasn't | ||
| 376 | * there this would still be safe since fsnotify_destroy_mark_by_entry() is | ||
| 377 | * safe from recursion. | ||
| 378 | */ | ||
| 379 | void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) | ||
| 380 | { | ||
| 381 | struct inotify_inode_mark_entry *ientry; | ||
| 382 | struct inotify_event_private_data *event_priv; | ||
| 383 | struct fsnotify_event_private_data *fsn_event_priv; | ||
| 384 | struct fsnotify_group *egroup; | ||
| 385 | struct idr *idr; | ||
| 386 | |||
| 387 | spin_lock(&entry->lock); | ||
| 388 | egroup = entry->group; | ||
| 389 | |||
| 390 | /* if egroup we aren't really done and something might still send events | ||
| 391 | * for this inode, on the callback we'll send the IN_IGNORED */ | ||
| 392 | if (egroup) { | ||
| 393 | spin_unlock(&entry->lock); | ||
| 394 | fsnotify_destroy_mark_by_entry(entry); | ||
| 395 | return; | ||
| 396 | } | ||
| 397 | spin_unlock(&entry->lock); | ||
| 398 | |||
| 399 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | ||
| 400 | |||
| 401 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); | ||
| 402 | if (unlikely(!event_priv)) | ||
| 403 | goto skip_send_ignore; | ||
| 404 | |||
| 405 | fsn_event_priv = &event_priv->fsnotify_event_priv_data; | ||
| 406 | |||
| 407 | fsn_event_priv->group = group; | ||
| 408 | event_priv->wd = ientry->wd; | ||
| 409 | |||
| 410 | fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); | ||
| 411 | |||
| 412 | /* did the private data get added? */ | ||
| 413 | if (list_empty(&fsn_event_priv->event_list)) | ||
| 414 | inotify_free_event_priv(fsn_event_priv); | ||
| 415 | |||
| 416 | skip_send_ignore: | ||
| 417 | |||
| 418 | /* remove this entry from the idr */ | ||
| 419 | spin_lock(&group->inotify_data.idr_lock); | ||
| 420 | idr = &group->inotify_data.idr; | ||
| 421 | idr_remove(idr, ientry->wd); | ||
| 422 | spin_unlock(&group->inotify_data.idr_lock); | ||
| 423 | |||
| 424 | /* removed from idr, drop that reference */ | ||
| 425 | fsnotify_put_mark(entry); | ||
| 426 | } | ||
| 427 | |||
| 428 | /* ding dong the mark is dead */ | ||
| 429 | static void inotify_free_mark(struct fsnotify_mark_entry *entry) | ||
| 430 | { | ||
| 431 | struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; | ||
| 432 | |||
| 433 | kmem_cache_free(inotify_inode_mark_cachep, ientry); | ||
| 434 | } | ||
| 435 | |||
| 436 | static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) | ||
| 437 | { | ||
| 438 | struct fsnotify_mark_entry *entry = NULL; | ||
| 439 | struct inotify_inode_mark_entry *ientry; | ||
| 440 | int ret = 0; | ||
| 441 | int add = (arg & IN_MASK_ADD); | ||
| 442 | __u32 mask; | ||
| 443 | __u32 old_mask, new_mask; | ||
| 444 | |||
| 445 | /* don't allow invalid bits: we don't want flags set */ | ||
| 446 | mask = inotify_arg_to_mask(arg); | ||
| 447 | if (unlikely(!mask)) | ||
| 448 | return -EINVAL; | ||
| 449 | |||
| 450 | ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); | ||
| 451 | if (unlikely(!ientry)) | ||
| 452 | return -ENOMEM; | ||
| 453 | /* we set the mask at the end after attaching it */ | ||
| 454 | fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); | ||
| 455 | ientry->wd = 0; | ||
| 456 | |||
| 457 | find_entry: | ||
| 458 | spin_lock(&inode->i_lock); | ||
| 459 | entry = fsnotify_find_mark_entry(group, inode); | ||
| 460 | spin_unlock(&inode->i_lock); | ||
| 461 | if (entry) { | ||
| 462 | kmem_cache_free(inotify_inode_mark_cachep, ientry); | ||
| 463 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | ||
| 464 | } else { | ||
| 465 | if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { | ||
| 466 | ret = -ENOSPC; | ||
| 467 | goto out_err; | ||
| 468 | } | ||
| 469 | |||
| 470 | ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); | ||
| 471 | if (ret == -EEXIST) | ||
| 472 | goto find_entry; | ||
| 473 | else if (ret) | ||
| 474 | goto out_err; | ||
| 475 | |||
| 476 | entry = &ientry->fsn_entry; | ||
| 477 | retry: | ||
| 478 | ret = -ENOMEM; | ||
| 479 | if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) | ||
| 480 | goto out_err; | ||
| 481 | |||
| 482 | spin_lock(&group->inotify_data.idr_lock); | ||
| 483 | /* if entry is added to the idr we keep the reference obtained | ||
| 484 | * through fsnotify_add_mark. Remember to drop this reference | ||
| 485 | * when entry is removed from idr */ | ||
| 486 | ret = idr_get_new_above(&group->inotify_data.idr, entry, | ||
| 487 | ++group->inotify_data.last_wd, | ||
| 488 | &ientry->wd); | ||
| 489 | spin_unlock(&group->inotify_data.idr_lock); | ||
| 490 | if (ret) { | ||
| 491 | if (ret == -EAGAIN) | ||
| 492 | goto retry; | ||
| 493 | goto out_err; | ||
| 494 | } | ||
| 495 | atomic_inc(&group->inotify_data.user->inotify_watches); | ||
| 496 | } | ||
| 497 | |||
| 498 | spin_lock(&entry->lock); | ||
| 499 | |||
| 500 | old_mask = entry->mask; | ||
| 501 | if (add) { | ||
| 502 | entry->mask |= mask; | ||
| 503 | new_mask = entry->mask; | ||
| 504 | } else { | ||
| 505 | entry->mask = mask; | ||
| 506 | new_mask = entry->mask; | ||
| 507 | } | ||
| 508 | |||
| 509 | spin_unlock(&entry->lock); | ||
| 510 | |||
| 511 | if (old_mask != new_mask) { | ||
| 512 | /* more bits in old than in new? */ | ||
| 513 | int dropped = (old_mask & ~new_mask); | ||
| 514 | /* more bits in this entry than the inode's mask? */ | ||
| 515 | int do_inode = (new_mask & ~inode->i_fsnotify_mask); | ||
| 516 | /* more bits in this entry than the group? */ | ||
| 517 | int do_group = (new_mask & ~group->mask); | ||
| 518 | |||
| 519 | /* update the inode with this new entry */ | ||
| 520 | if (dropped || do_inode) | ||
| 521 | fsnotify_recalc_inode_mask(inode); | ||
| 522 | |||
| 523 | /* update the group mask with the new mask */ | ||
| 524 | if (dropped || do_group) | ||
| 525 | fsnotify_recalc_group_mask(group); | ||
| 526 | } | ||
| 527 | |||
| 528 | return ientry->wd; | ||
| 529 | |||
| 530 | out_err: | ||
| 531 | /* see this isn't supposed to happen, just kill the watch */ | ||
| 532 | if (entry) { | ||
| 533 | fsnotify_destroy_mark_by_entry(entry); | ||
| 534 | fsnotify_put_mark(entry); | ||
| 535 | } | ||
| 536 | return ret; | ||
| 537 | } | ||
| 538 | |||
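The add/replace branch on IN_MASK_ADD above is what defines the userspace semantics of repeated inotify_add_watch() calls on the same inode: by default the new mask replaces the old one, and with IN_MASK_ADD the new bits are OR-ed into the existing mask. A small illustrative sketch (the path "/tmp" is an assumption):

	#include <sys/inotify.h>

	/* Same inode, same fd: each call returns the same watch descriptor. */
	void watch_tmp(int fd)
	{
		int wd;

		wd = inotify_add_watch(fd, "/tmp", IN_CREATE);            /* mask = IN_CREATE */
		wd = inotify_add_watch(fd, "/tmp", IN_DELETE);            /* mask replaced: IN_DELETE */
		wd = inotify_add_watch(fd, "/tmp", IN_MASK_ADD | IN_MODIFY);
		/* mask is now IN_DELETE | IN_MODIFY */
		(void)wd;
	}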
| 539 | static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) | ||
| 540 | { | ||
| 541 | struct fsnotify_group *group; | ||
| 542 | unsigned int grp_num; | ||
| 543 | |||
| 544 | /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ | ||
| 545 | grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num)); | ||
| 546 | group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops); | ||
| 547 | if (IS_ERR(group)) | ||
| 548 | return group; | ||
| 549 | |||
| 550 | group->max_events = max_events; | ||
| 551 | |||
| 552 | spin_lock_init(&group->inotify_data.idr_lock); | ||
| 553 | idr_init(&group->inotify_data.idr); | ||
| 554 | group->inotify_data.last_wd = 0; | ||
| 555 | group->inotify_data.user = user; | ||
| 556 | group->inotify_data.fa = NULL; | ||
| 557 | |||
| 558 | return group; | ||
| 559 | } | ||
| 560 | |||
| 561 | |||
| 562 | /* inotify syscalls */ | ||
| 592 | SYSCALL_DEFINE1(inotify_init1, int, flags) | 563 | SYSCALL_DEFINE1(inotify_init1, int, flags) |
| 593 | { | 564 | { |
| 594 | struct inotify_device *dev; | 565 | struct fsnotify_group *group; |
| 595 | struct inotify_handle *ih; | ||
| 596 | struct user_struct *user; | 566 | struct user_struct *user; |
| 597 | struct file *filp; | 567 | struct file *filp; |
| 598 | int fd, ret; | 568 | int fd, ret; |
| @@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) | |||
| 621 | goto out_free_uid; | 591 | goto out_free_uid; |
| 622 | } | 592 | } |
| 623 | 593 | ||
| 624 | dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); | 594 | /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ |
| 625 | if (unlikely(!dev)) { | 595 | group = inotify_new_group(user, inotify_max_queued_events); |
| 626 | ret = -ENOMEM; | 596 | if (IS_ERR(group)) { |
| 597 | ret = PTR_ERR(group); | ||
| 627 | goto out_free_uid; | 598 | goto out_free_uid; |
| 628 | } | 599 | } |
| 629 | 600 | ||
| 630 | ih = inotify_init(&inotify_user_ops); | ||
| 631 | if (IS_ERR(ih)) { | ||
| 632 | ret = PTR_ERR(ih); | ||
| 633 | goto out_free_dev; | ||
| 634 | } | ||
| 635 | dev->ih = ih; | ||
| 636 | dev->fa = NULL; | ||
| 637 | |||
| 638 | filp->f_op = &inotify_fops; | 601 | filp->f_op = &inotify_fops; |
| 639 | filp->f_path.mnt = mntget(inotify_mnt); | 602 | filp->f_path.mnt = mntget(inotify_mnt); |
| 640 | filp->f_path.dentry = dget(inotify_mnt->mnt_root); | 603 | filp->f_path.dentry = dget(inotify_mnt->mnt_root); |
| 641 | filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; | 604 | filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; |
| 642 | filp->f_mode = FMODE_READ; | 605 | filp->f_mode = FMODE_READ; |
| 643 | filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); | 606 | filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); |
| 644 | filp->private_data = dev; | 607 | filp->private_data = group; |
| 645 | 608 | ||
| 646 | INIT_LIST_HEAD(&dev->events); | ||
| 647 | init_waitqueue_head(&dev->wq); | ||
| 648 | mutex_init(&dev->ev_mutex); | ||
| 649 | mutex_init(&dev->up_mutex); | ||
| 650 | dev->event_count = 0; | ||
| 651 | dev->queue_size = 0; | ||
| 652 | dev->max_events = inotify_max_queued_events; | ||
| 653 | dev->user = user; | ||
| 654 | atomic_set(&dev->count, 0); | ||
| 655 | |||
| 656 | get_inotify_dev(dev); | ||
| 657 | atomic_inc(&user->inotify_devs); | 609 | atomic_inc(&user->inotify_devs); |
| 610 | |||
| 658 | fd_install(fd, filp); | 611 | fd_install(fd, filp); |
| 659 | 612 | ||
| 660 | return fd; | 613 | return fd; |
| 661 | out_free_dev: | 614 | |
| 662 | kfree(dev); | ||
| 663 | out_free_uid: | 615 | out_free_uid: |
| 664 | free_uid(user); | 616 | free_uid(user); |
| 665 | put_filp(filp); | 617 | put_filp(filp); |
| @@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init) | |||
| 676 | SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, | 628 | SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, |
| 677 | u32, mask) | 629 | u32, mask) |
| 678 | { | 630 | { |
| 631 | struct fsnotify_group *group; | ||
| 679 | struct inode *inode; | 632 | struct inode *inode; |
| 680 | struct inotify_device *dev; | ||
| 681 | struct path path; | 633 | struct path path; |
| 682 | struct file *filp; | 634 | struct file *filp; |
| 683 | int ret, fput_needed; | 635 | int ret, fput_needed; |
| @@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, | |||
| 698 | if (mask & IN_ONLYDIR) | 650 | if (mask & IN_ONLYDIR) |
| 699 | flags |= LOOKUP_DIRECTORY; | 651 | flags |= LOOKUP_DIRECTORY; |
| 700 | 652 | ||
| 701 | ret = find_inode(pathname, &path, flags); | 653 | ret = inotify_find_inode(pathname, &path, flags); |
| 702 | if (unlikely(ret)) | 654 | if (ret) |
| 703 | goto fput_and_out; | 655 | goto fput_and_out; |
| 704 | 656 | ||
| 705 | /* inode held in place by reference to path; dev by fget on fd */ | 657 | /* inode held in place by reference to path; group by fget on fd */ |
| 706 | inode = path.dentry->d_inode; | 658 | inode = path.dentry->d_inode; |
| 707 | dev = filp->private_data; | 659 | group = filp->private_data; |
| 708 | 660 | ||
| 709 | mutex_lock(&dev->up_mutex); | 661 | /* create/update an inode mark */ |
| 710 | ret = inotify_find_update_watch(dev->ih, inode, mask); | 662 | ret = inotify_update_watch(group, inode, mask); |
| 711 | if (ret == -ENOENT) | 663 | if (unlikely(ret)) |
| 712 | ret = create_watch(dev, inode, mask); | 664 | goto path_put_and_out; |
| 713 | mutex_unlock(&dev->up_mutex); | ||
| 714 | 665 | ||
| 666 | path_put_and_out: | ||
| 715 | path_put(&path); | 667 | path_put(&path); |
| 716 | fput_and_out: | 668 | fput_and_out: |
| 717 | fput_light(filp, fput_needed); | 669 | fput_light(filp, fput_needed); |
| @@ -720,9 +672,10 @@ fput_and_out: | |||
| 720 | 672 | ||
| 721 | SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) | 673 | SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) |
| 722 | { | 674 | { |
| 675 | struct fsnotify_group *group; | ||
| 676 | struct fsnotify_mark_entry *entry; | ||
| 723 | struct file *filp; | 677 | struct file *filp; |
| 724 | struct inotify_device *dev; | 678 | int ret = 0, fput_needed; |
| 725 | int ret, fput_needed; | ||
| 726 | 679 | ||
| 727 | filp = fget_light(fd, &fput_needed); | 680 | filp = fget_light(fd, &fput_needed); |
| 728 | if (unlikely(!filp)) | 681 | if (unlikely(!filp)) |
| @@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) | |||
| 734 | goto out; | 687 | goto out; |
| 735 | } | 688 | } |
| 736 | 689 | ||
| 737 | dev = filp->private_data; | 690 | group = filp->private_data; |
| 738 | 691 | ||
| 739 | /* we free our watch data when we get IN_IGNORED */ | 692 | spin_lock(&group->inotify_data.idr_lock); |
| 740 | ret = inotify_rm_wd(dev->ih, wd); | 693 | entry = idr_find(&group->inotify_data.idr, wd); |
| 694 | if (unlikely(!entry)) { | ||
| 695 | spin_unlock(&group->inotify_data.idr_lock); | ||
| 696 | ret = -EINVAL; | ||
| 697 | goto out; | ||
| 698 | } | ||
| 699 | fsnotify_get_mark(entry); | ||
| 700 | spin_unlock(&group->inotify_data.idr_lock); | ||
| 701 | |||
| 702 | inotify_destroy_mark_entry(entry, group); | ||
| 703 | fsnotify_put_mark(entry); | ||
| 741 | 704 | ||
| 742 | out: | 705 | out: |
| 743 | fput_light(filp, fput_needed); | 706 | fput_light(filp, fput_needed); |
| @@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags, | |||
| 753 | } | 716 | } |
| 754 | 717 | ||
| 755 | static struct file_system_type inotify_fs_type = { | 718 | static struct file_system_type inotify_fs_type = { |
| 756 | .name = "inotifyfs", | 719 | .name = "inotifyfs", |
| 757 | .get_sb = inotify_get_sb, | 720 | .get_sb = inotify_get_sb, |
| 758 | .kill_sb = kill_anon_super, | 721 | .kill_sb = kill_anon_super, |
| 759 | }; | 722 | }; |
| 760 | 723 | ||
| 761 | /* | 724 | /* |
| @@ -775,18 +738,16 @@ static int __init inotify_user_setup(void) | |||
| 775 | if (IS_ERR(inotify_mnt)) | 738 | if (IS_ERR(inotify_mnt)) |
| 776 | panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); | 739 | panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); |
| 777 | 740 | ||
| 741 | inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); | ||
| 742 | event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); | ||
| 743 | inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); | ||
| 744 | if (!inotify_ignored_event) | ||
| 745 | panic("unable to allocate the inotify ignored event\n"); | ||
| 746 | |||
| 778 | inotify_max_queued_events = 16384; | 747 | inotify_max_queued_events = 16384; |
| 779 | inotify_max_user_instances = 128; | 748 | inotify_max_user_instances = 128; |
| 780 | inotify_max_user_watches = 8192; | 749 | inotify_max_user_watches = 8192; |
| 781 | 750 | ||
| 782 | watch_cachep = kmem_cache_create("inotify_watch_cache", | ||
| 783 | sizeof(struct inotify_user_watch), | ||
| 784 | 0, SLAB_PANIC, NULL); | ||
| 785 | event_cachep = kmem_cache_create("inotify_event_cache", | ||
| 786 | sizeof(struct inotify_kernel_event), | ||
| 787 | 0, SLAB_PANIC, NULL); | ||
| 788 | |||
| 789 | return 0; | 751 | return 0; |
| 790 | } | 752 | } |
| 791 | |||
| 792 | module_init(inotify_user_setup); | 753 | module_init(inotify_user_setup); |
diff --git a/fs/notify/notification.c b/fs/notify/notification.c new file mode 100644 index 000000000000..959b73e756fd --- /dev/null +++ b/fs/notify/notification.c | |||
| @@ -0,0 +1,411 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2, or (at your option) | ||
| 7 | * any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; see the file COPYING. If not, write to | ||
| 16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Basic idea behind the notification queue: An fsnotify group (like inotify) | ||
| 21 | * sends the userspace notification about events asynchronously some time after | ||
| 22 | * the event happened. When inotify gets an event it will need to add that | ||
| 23 | * event to the group notify queue. Since a single event might need to be on | ||
| 24 | * multiple group's notification queues we can't add the event directly to each | ||
| 25 | * queue and instead add a small "event_holder" to each queue. This event_holder | ||
| 26 | * has a pointer back to the original event. Since the majority of events are | ||
| 27 | * going to end up on one, and only one, notification queue we embed one | ||
| 28 | * event_holder into each event. This means we have a single allocation instead | ||
| 29 | * of always needing two. If the embedded event_holder is already in use by | ||
| 30 | * another group a new event_holder (from fsnotify_event_holder_cachep) will be | ||
| 31 | * allocated and used. | ||
| 32 | */ | ||
| 33 | |||
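As a rough sketch of the layout described in the comment above (illustrative structures only, not the real fsnotify_backend.h definitions): each event embeds one holder for the common case of landing on a single notification queue, and extra holders are allocated only when the same event must also sit on another group's list.

	#include <linux/list.h>
	#include <asm/atomic.h>

	/* Illustrative only -- field names are assumptions. */
	struct example_holder {
		struct example_event	*event;		/* back-pointer to the shared event */
		struct list_head	event_list;	/* links this holder into one group's queue */
	};

	struct example_event {
		atomic_t		refcnt;		/* one reference per queue the event sits on */
		struct example_holder	holder;		/* embedded holder: covers the common case */
		/* ... event payload: mask, inode/path, file name ... */
	};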
| 34 | #include <linux/fs.h> | ||
| 35 | #include <linux/init.h> | ||
| 36 | #include <linux/kernel.h> | ||
| 37 | #include <linux/list.h> | ||
| 38 | #include <linux/module.h> | ||
| 39 | #include <linux/mount.h> | ||
| 40 | #include <linux/mutex.h> | ||
| 41 | #include <linux/namei.h> | ||
| 42 | #include <linux/path.h> | ||
| 43 | #include <linux/slab.h> | ||
| 44 | #include <linux/spinlock.h> | ||
| 45 | |||
| 46 | #include <asm/atomic.h> | ||
| 47 | |||
| 48 | #include <linux/fsnotify_backend.h> | ||
| 49 | #include "fsnotify.h" | ||
| 50 | |||
| 51 | static struct kmem_cache *fsnotify_event_cachep; | ||
| 52 | static struct kmem_cache *fsnotify_event_holder_cachep; | ||
| 53 | /* | ||
| 54 | * This is a magic event we send when the q is too full. Since it doesn't | ||
| 55 | * hold real event information we just keep one system wide and use it any time | ||
| 56 | * it is needed. Its refcnt is set to 1 at kernel init time and will never | ||
| 57 | * get set to 0 so it will never get 'freed' | ||
| 58 | */ | ||
| 59 | static struct fsnotify_event q_overflow_event; | ||
| 60 | static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); | ||
| 61 | |||
| 62 | /** | ||
| 63 | * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. | ||
| 64 | * Called from fsnotify_move, which is inlined into filesystem modules. | ||
| 65 | */ | ||
| 66 | u32 fsnotify_get_cookie(void) | ||
| 67 | { | ||
| 68 | return atomic_inc_return(&fsnotify_sync_cookie); | ||
| 69 | } | ||
| 70 | EXPORT_SYMBOL_GPL(fsnotify_get_cookie); | ||
| 71 | |||
| 72 | /* return true if the notify queue is empty, false otherwise */ | ||
| 73 | bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) | ||
| 74 | { | ||
| 75 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | ||
| 76 | return list_empty(&group->notification_list) ? true : false; | ||
| 77 | } | ||
| 78 | |||
| 79 | void fsnotify_get_event(struct fsnotify_event *event) | ||
| 80 | { | ||
| 81 | atomic_inc(&event->refcnt); | ||
| 82 | } | ||
| 83 | |||
| 84 | void fsnotify_put_event(struct fsnotify_event *event) | ||
| 85 | { | ||
| 86 | if (!event) | ||
| 87 | return; | ||
| 88 | |||
| 89 | if (atomic_dec_and_test(&event->refcnt)) { | ||
| 90 | if (event->data_type == FSNOTIFY_EVENT_PATH) | ||
| 91 | path_put(&event->path); | ||
| 92 | |||
| 93 | BUG_ON(!list_empty(&event->private_data_list)); | ||
| 94 | |||
| 95 | kfree(event->file_name); | ||
| 96 | kmem_cache_free(fsnotify_event_cachep, event); | ||
| 97 | } | ||
| 98 | } | ||
| 99 | |||
| 100 | struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) | ||
| 101 | { | ||
| 102 | return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); | ||
| 103 | } | ||
| 104 | |||
| 105 | void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) | ||
| 106 | { | ||
| 107 | kmem_cache_free(fsnotify_event_holder_cachep, holder); | ||
| 108 | } | ||
| 109 | |||
| 110 | /* | ||
| 111 | * Find the private data that the group previously attached to this event when | ||
| 112 | * the group added the event to the notification queue (fsnotify_add_notify_event) | ||
| 113 | */ | ||
| 114 | struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) | ||
| 115 | { | ||
| 116 | struct fsnotify_event_private_data *lpriv; | ||
| 117 | struct fsnotify_event_private_data *priv = NULL; | ||
| 118 | |||
| 119 | assert_spin_locked(&event->lock); | ||
| 120 | |||
| 121 | list_for_each_entry(lpriv, &event->private_data_list, event_list) { | ||
| 122 | if (lpriv->group == group) { | ||
| 123 | priv = lpriv; | ||
| 124 | list_del(&priv->event_list); | ||
| 125 | break; | ||
| 126 | } | ||
| 127 | } | ||
| 128 | return priv; | ||
| 129 | } | ||
| 130 | |||
| 131 | /* | ||
| 132 | * Check if 2 events contain the same information. We do not compare private data | ||
| 133 | * but at this moment that isn't a problem for any known fsnotify listeners. | ||
| 134 | */ | ||
| 135 | static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) | ||
| 136 | { | ||
| 137 | if ((old->mask == new->mask) && | ||
| 138 | (old->to_tell == new->to_tell) && | ||
| 139 | (old->data_type == new->data_type)) { | ||
| 140 | switch (old->data_type) { | ||
| 141 | case (FSNOTIFY_EVENT_INODE): | ||
| 142 | if (old->inode == new->inode) | ||
| 143 | return true; | ||
| 144 | break; | ||
| 145 | case (FSNOTIFY_EVENT_PATH): | ||
| 146 | if ((old->path.mnt == new->path.mnt) && | ||
| 147 | (old->path.dentry == new->path.dentry)) | ||
| 148 | return true; | ||
| 149 | case (FSNOTIFY_EVENT_NONE): | ||
| 150 | return true; | ||
| 151 | }; | ||
| 152 | } | ||
| 153 | return false; | ||
| 154 | } | ||
| 155 | |||
| 156 | /* | ||
| 157 | * Add an event to the group notification queue. The group can later pull this | ||
| 158 | * event off the queue to deal with. If the event is successfully added to the | ||
| 159 | * group's notification queue, a reference is taken on event. | ||
| 160 | */ | ||
| 161 | int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, | ||
| 162 | struct fsnotify_event_private_data *priv) | ||
| 163 | { | ||
| 164 | struct fsnotify_event_holder *holder = NULL; | ||
| 165 | struct list_head *list = &group->notification_list; | ||
| 166 | struct fsnotify_event_holder *last_holder; | ||
| 167 | struct fsnotify_event *last_event; | ||
| 168 | |||
| 169 | /* easy to tell if priv was attached to the event */ | ||
| 170 | INIT_LIST_HEAD(&priv->event_list); | ||
| 171 | |||
| 172 | /* | ||
| 173 | * There is one fsnotify_event_holder embedded inside each fsnotify_event. | ||
| 174 | * Check if we expect to be able to use that holder. If not alloc a new | ||
| 175 | * holder. | ||
| 176 | * For the overflow event it's possible that something will use the in | ||
| 177 | * event holder before we get the lock so we may need to jump back and | ||
| 178 | * alloc a new holder, this can't happen for most events... | ||
| 179 | */ | ||
| 180 | if (!list_empty(&event->holder.event_list)) { | ||
| 181 | alloc_holder: | ||
| 182 | holder = fsnotify_alloc_event_holder(); | ||
| 183 | if (!holder) | ||
| 184 | return -ENOMEM; | ||
| 185 | } | ||
| 186 | |||
| 187 | mutex_lock(&group->notification_mutex); | ||
| 188 | |||
| 189 | if (group->q_len >= group->max_events) { | ||
| 190 | event = &q_overflow_event; | ||
| 191 | /* sorry, no private data on the overflow event */ | ||
| 192 | priv = NULL; | ||
| 193 | } | ||
| 194 | |||
| 195 | spin_lock(&event->lock); | ||
| 196 | |||
| 197 | if (list_empty(&event->holder.event_list)) { | ||
| 198 | if (unlikely(holder)) | ||
| 199 | fsnotify_destroy_event_holder(holder); | ||
| 200 | holder = &event->holder; | ||
| 201 | } else if (unlikely(!holder)) { | ||
| 202 | /* between the time we checked above and got the lock the in | ||
| 203 | * event holder was used, go back and get a new one */ | ||
| 204 | spin_unlock(&event->lock); | ||
| 205 | mutex_unlock(&group->notification_mutex); | ||
| 206 | goto alloc_holder; | ||
| 207 | } | ||
| 208 | |||
| 209 | if (!list_empty(list)) { | ||
| 210 | last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); | ||
| 211 | last_event = last_holder->event; | ||
| 212 | if (event_compare(last_event, event)) { | ||
| 213 | spin_unlock(&event->lock); | ||
| 214 | mutex_unlock(&group->notification_mutex); | ||
| 215 | if (holder != &event->holder) | ||
| 216 | fsnotify_destroy_event_holder(holder); | ||
| 217 | return -EEXIST; | ||
| 218 | } | ||
| 219 | } | ||
| 220 | |||
| 221 | group->q_len++; | ||
| 222 | holder->event = event; | ||
| 223 | |||
| 224 | fsnotify_get_event(event); | ||
| 225 | list_add_tail(&holder->event_list, list); | ||
| 226 | if (priv) | ||
| 227 | list_add_tail(&priv->event_list, &event->private_data_list); | ||
| 228 | spin_unlock(&event->lock); | ||
| 229 | mutex_unlock(&group->notification_mutex); | ||
| 230 | |||
| 231 | wake_up(&group->notification_waitq); | ||
| 232 | return 0; | ||
| 233 | } | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Remove and return the first event from the notification list. There is a | ||
| 237 | * reference held on this event since it was on the list. It is the responsibility | ||
| 238 | * of the caller to drop this reference. | ||
| 239 | */ | ||
| 240 | struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) | ||
| 241 | { | ||
| 242 | struct fsnotify_event *event; | ||
| 243 | struct fsnotify_event_holder *holder; | ||
| 244 | |||
| 245 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | ||
| 246 | |||
| 247 | holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); | ||
| 248 | |||
| 249 | event = holder->event; | ||
| 250 | |||
| 251 | spin_lock(&event->lock); | ||
| 252 | holder->event = NULL; | ||
| 253 | list_del_init(&holder->event_list); | ||
| 254 | spin_unlock(&event->lock); | ||
| 255 | |||
| 256 | /* event == holder means we are referenced through the in event holder */ | ||
| 257 | if (holder != &event->holder) | ||
| 258 | fsnotify_destroy_event_holder(holder); | ||
| 259 | |||
| 260 | group->q_len--; | ||
| 261 | |||
| 262 | return event; | ||
| 263 | } | ||
| 264 | |||
| 265 | /* | ||
| 266 | * This will not remove the event, that must be done with fsnotify_remove_notify_event() | ||
| 267 | */ | ||
| 268 | struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) | ||
| 269 | { | ||
| 270 | struct fsnotify_event *event; | ||
| 271 | struct fsnotify_event_holder *holder; | ||
| 272 | |||
| 273 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | ||
| 274 | |||
| 275 | holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); | ||
| 276 | event = holder->event; | ||
| 277 | |||
| 278 | return event; | ||
| 279 | } | ||
| 280 | |||
| 281 | /* | ||
| 282 | * Called when a group is being torn down to clean up any outstanding | ||
| 283 | * event notifications. | ||
| 284 | */ | ||
| 285 | void fsnotify_flush_notify(struct fsnotify_group *group) | ||
| 286 | { | ||
| 287 | struct fsnotify_event *event; | ||
| 288 | struct fsnotify_event_private_data *priv; | ||
| 289 | |||
| 290 | mutex_lock(&group->notification_mutex); | ||
| 291 | while (!fsnotify_notify_queue_is_empty(group)) { | ||
| 292 | event = fsnotify_remove_notify_event(group); | ||
| 293 | /* if they don't implement free_event_priv they better not have attached any */ | ||
| 294 | if (group->ops->free_event_priv) { | ||
| 295 | spin_lock(&event->lock); | ||
| 296 | priv = fsnotify_remove_priv_from_event(group, event); | ||
| 297 | spin_unlock(&event->lock); | ||
| 298 | if (priv) | ||
| 299 | group->ops->free_event_priv(priv); | ||
| 300 | } | ||
| 301 | fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ | ||
| 302 | } | ||
| 303 | mutex_unlock(&group->notification_mutex); | ||
| 304 | } | ||
| 305 | |||
| 306 | static void initialize_event(struct fsnotify_event *event) | ||
| 307 | { | ||
| 308 | event->holder.event = NULL; | ||
| 309 | INIT_LIST_HEAD(&event->holder.event_list); | ||
| 310 | atomic_set(&event->refcnt, 1); | ||
| 311 | |||
| 312 | spin_lock_init(&event->lock); | ||
| 313 | |||
| 314 | event->path.dentry = NULL; | ||
| 315 | event->path.mnt = NULL; | ||
| 316 | event->inode = NULL; | ||
| 317 | event->data_type = FSNOTIFY_EVENT_NONE; | ||
| 318 | |||
| 319 | INIT_LIST_HEAD(&event->private_data_list); | ||
| 320 | |||
| 321 | event->to_tell = NULL; | ||
| 322 | |||
| 323 | event->file_name = NULL; | ||
| 324 | event->name_len = 0; | ||
| 325 | |||
| 326 | event->sync_cookie = 0; | ||
| 327 | } | ||
| 328 | |||
| 329 | /* | ||
| 330 | * fsnotify_create_event - Allocate a new event which will be sent to each | ||
| 331 | * group's handle_event function if the group was interested in this | ||
| 332 | * particular event. | ||
| 333 | * | ||
| 334 | * @to_tell the inode which is supposed to receive the event (sometimes a | ||
| 335 | * parent of the inode to which the event happened). | ||
| 336 | * @mask what actually happened. | ||
| 337 | * @data pointer to the object which was actually affected | ||
| 338 | * @data_type flag indicating whether the data is a file, path, inode, nothing... | ||
| 339 | * @name the filename, if available | ||
| 340 | */ | ||
| 341 | struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, | ||
| 342 | int data_type, const char *name, u32 cookie) | ||
| 343 | { | ||
| 344 | struct fsnotify_event *event; | ||
| 345 | |||
| 346 | event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); | ||
| 347 | if (!event) | ||
| 348 | return NULL; | ||
| 349 | |||
| 350 | initialize_event(event); | ||
| 351 | |||
| 352 | if (name) { | ||
| 353 | event->file_name = kstrdup(name, GFP_KERNEL); | ||
| 354 | if (!event->file_name) { | ||
| 355 | kmem_cache_free(fsnotify_event_cachep, event); | ||
| 356 | return NULL; | ||
| 357 | } | ||
| 358 | event->name_len = strlen(event->file_name); | ||
| 359 | } | ||
| 360 | |||
| 361 | event->sync_cookie = cookie; | ||
| 362 | event->to_tell = to_tell; | ||
| 363 | |||
| 364 | switch (data_type) { | ||
| 365 | case FSNOTIFY_EVENT_FILE: { | ||
| 366 | struct file *file = data; | ||
| 367 | struct path *path = &file->f_path; | ||
| 368 | event->path.dentry = path->dentry; | ||
| 369 | event->path.mnt = path->mnt; | ||
| 370 | path_get(&event->path); | ||
| 371 | event->data_type = FSNOTIFY_EVENT_PATH; | ||
| 372 | break; | ||
| 373 | } | ||
| 374 | case FSNOTIFY_EVENT_PATH: { | ||
| 375 | struct path *path = data; | ||
| 376 | event->path.dentry = path->dentry; | ||
| 377 | event->path.mnt = path->mnt; | ||
| 378 | path_get(&event->path); | ||
| 379 | event->data_type = FSNOTIFY_EVENT_PATH; | ||
| 380 | break; | ||
| 381 | } | ||
| 382 | case FSNOTIFY_EVENT_INODE: | ||
| 383 | event->inode = data; | ||
| 384 | event->data_type = FSNOTIFY_EVENT_INODE; | ||
| 385 | break; | ||
| 386 | case FSNOTIFY_EVENT_NONE: | ||
| 387 | event->inode = NULL; | ||
| 388 | event->path.dentry = NULL; | ||
| 389 | event->path.mnt = NULL; | ||
| 390 | break; | ||
| 391 | default: | ||
| 392 | BUG(); | ||
| 393 | } | ||
| 394 | |||
| 395 | event->mask = mask; | ||
| 396 | |||
| 397 | return event; | ||
| 398 | } | ||
| 399 | |||
| 400 | __init int fsnotify_notification_init(void) | ||
| 401 | { | ||
| 402 | fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); | ||
| 403 | fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); | ||
| 404 | |||
| 405 | initialize_event(&q_overflow_event); | ||
| 406 | q_overflow_event.mask = FS_Q_OVERFLOW; | ||
| 407 | |||
| 408 | return 0; | ||
| 409 | } | ||
| 410 | subsys_initcall(fsnotify_notification_init); | ||
| 411 | |||
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index f76951dcd4a6..6aa7c4713536 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 26 | #include <linux/string.h> | 26 | #include <linux/string.h> |
| 27 | #include <linux/spinlock.h> | 27 | #include <linux/spinlock.h> |
| 28 | #include <linux/blkdev.h> /* For bdev_hardsect_size(). */ | 28 | #include <linux/blkdev.h> /* For bdev_logical_block_size(). */ |
| 29 | #include <linux/backing-dev.h> | 29 | #include <linux/backing-dev.h> |
| 30 | #include <linux/buffer_head.h> | 30 | #include <linux/buffer_head.h> |
| 31 | #include <linux/vfs.h> | 31 | #include <linux/vfs.h> |
| @@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) | |||
| 2785 | goto err_out_now; | 2785 | goto err_out_now; |
| 2786 | 2786 | ||
| 2787 | /* We support sector sizes up to the PAGE_CACHE_SIZE. */ | 2787 | /* We support sector sizes up to the PAGE_CACHE_SIZE. */ |
| 2788 | if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { | 2788 | if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { |
| 2789 | if (!silent) | 2789 | if (!silent) |
| 2790 | ntfs_error(sb, "Device has unsupported sector size " | 2790 | ntfs_error(sb, "Device has unsupported sector size " |
| 2791 | "(%i). The maximum supported sector " | 2791 | "(%i). The maximum supported sector " |
| 2792 | "size on this architecture is %lu " | 2792 | "size on this architecture is %lu " |
| 2793 | "bytes.", | 2793 | "bytes.", |
| 2794 | bdev_hardsect_size(sb->s_bdev), | 2794 | bdev_logical_block_size(sb->s_bdev), |
| 2795 | PAGE_CACHE_SIZE); | 2795 | PAGE_CACHE_SIZE); |
| 2796 | goto err_out_now; | 2796 | goto err_out_now; |
| 2797 | } | 2797 | } |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4f85eceab376..09cc25d04611 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
| @@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
| 1371 | 1371 | ||
| 1372 | bdevname(reg->hr_bdev, reg->hr_dev_name); | 1372 | bdevname(reg->hr_bdev, reg->hr_dev_name); |
| 1373 | 1373 | ||
| 1374 | sectsize = bdev_hardsect_size(reg->hr_bdev); | 1374 | sectsize = bdev_logical_block_size(reg->hr_bdev); |
| 1375 | if (sectsize != reg->hr_block_bytes) { | 1375 | if (sectsize != reg->hr_block_bytes) { |
| 1376 | mlog(ML_ERROR, | 1376 | mlog(ML_ERROR, |
| 1377 | "blocksize %u incorrect for device, expected %d", | 1377 | "blocksize %u incorrect for device, expected %d", |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 79ff8d9d37e0..5c6163f55039 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
| @@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb, | |||
| 713 | *bh = NULL; | 713 | *bh = NULL; |
| 714 | 714 | ||
| 715 | /* may be > 512 */ | 715 | /* may be > 512 */ |
| 716 | *sector_size = bdev_hardsect_size(sb->s_bdev); | 716 | *sector_size = bdev_logical_block_size(sb->s_bdev); |
| 717 | if (*sector_size > OCFS2_MAX_BLOCKSIZE) { | 717 | if (*sector_size > OCFS2_MAX_BLOCKSIZE) { |
| 718 | mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", | 718 | mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", |
| 719 | *sector_size, OCFS2_MAX_BLOCKSIZE); | 719 | *sector_size, OCFS2_MAX_BLOCKSIZE); |
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 99e33ef40be4..0af36085eb28 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c | |||
| @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev, | |||
| 219 | return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); | 219 | return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); |
| 220 | } | 220 | } |
| 221 | 221 | ||
| 222 | ssize_t part_alignment_offset_show(struct device *dev, | ||
| 223 | struct device_attribute *attr, char *buf) | ||
| 224 | { | ||
| 225 | struct hd_struct *p = dev_to_part(dev); | ||
| 226 | return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); | ||
| 227 | } | ||
| 228 | |||
| 222 | ssize_t part_stat_show(struct device *dev, | 229 | ssize_t part_stat_show(struct device *dev, |
| 223 | struct device_attribute *attr, char *buf) | 230 | struct device_attribute *attr, char *buf) |
| 224 | { | 231 | { |
| @@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev, | |||
| 272 | static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); | 279 | static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); |
| 273 | static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); | 280 | static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); |
| 274 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); | 281 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); |
| 282 | static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); | ||
| 275 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); | 283 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); |
| 276 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 284 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
| 277 | static struct device_attribute dev_attr_fail = | 285 | static struct device_attribute dev_attr_fail = |
| @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = { | |||
| 282 | &dev_attr_partition.attr, | 290 | &dev_attr_partition.attr, |
| 283 | &dev_attr_start.attr, | 291 | &dev_attr_start.attr, |
| 284 | &dev_attr_size.attr, | 292 | &dev_attr_size.attr, |
| 293 | &dev_attr_alignment_offset.attr, | ||
| 285 | &dev_attr_stat.attr, | 294 | &dev_attr_stat.attr, |
| 286 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 295 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
| 287 | &dev_attr_fail.attr, | 296 | &dev_attr_fail.attr, |
| @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, | |||
| 383 | pdev = part_to_dev(p); | 392 | pdev = part_to_dev(p); |
| 384 | 393 | ||
| 385 | p->start_sect = start; | 394 | p->start_sect = start; |
| 395 | p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); | ||
| 386 | p->nr_sects = len; | 396 | p->nr_sects = len; |
| 387 | p->partno = partno; | 397 | p->partno = partno; |
| 388 | p->policy = get_disk_ro(disk); | 398 | p->policy = get_disk_ro(disk); |
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 46297683cd34..fc71aab08460 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c | |||
| @@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
| 76 | Sector sect; | 76 | Sector sect; |
| 77 | 77 | ||
| 78 | res = 0; | 78 | res = 0; |
| 79 | blocksize = bdev_hardsect_size(bdev); | 79 | blocksize = bdev_logical_block_size(bdev); |
| 80 | if (blocksize <= 0) | 80 | if (blocksize <= 0) |
| 81 | goto out_exit; | 81 | goto out_exit; |
| 82 | i_size = i_size_read(bdev->bd_inode); | 82 | i_size = i_size_read(bdev->bd_inode); |
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 796511886f28..0028d2ef0662 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c | |||
| @@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, | |||
| 110 | Sector sect; | 110 | Sector sect; |
| 111 | unsigned char *data; | 111 | unsigned char *data; |
| 112 | u32 this_sector, this_size; | 112 | u32 this_sector, this_size; |
| 113 | int sector_size = bdev_hardsect_size(bdev) / 512; | 113 | int sector_size = bdev_logical_block_size(bdev) / 512; |
| 114 | int loopct = 0; /* number of links followed | 114 | int loopct = 0; /* number of links followed |
| 115 | without finding a data partition */ | 115 | without finding a data partition */ |
| 116 | int i; | 116 | int i; |
| @@ -415,7 +415,7 @@ static struct { | |||
| 415 | 415 | ||
| 416 | int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) | 416 | int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) |
| 417 | { | 417 | { |
| 418 | int sector_size = bdev_hardsect_size(bdev) / 512; | 418 | int sector_size = bdev_logical_block_size(bdev) / 512; |
| 419 | Sector sect; | 419 | Sector sect; |
| 420 | unsigned char *data; | 420 | unsigned char *data; |
| 421 | struct partition *p; | 421 | struct partition *p; |
| @@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info, | |||
| 302 | return 0; | 302 | return 0; |
| 303 | } | 303 | } |
| 304 | 304 | ||
| 305 | /** | ||
| 306 | * generic_pipe_buf_release - put a reference to a &struct pipe_buffer | ||
| 307 | * @pipe: the pipe that the buffer belongs to | ||
| 308 | * @buf: the buffer to put a reference to | ||
| 309 | * | ||
| 310 | * Description: | ||
| 311 | * This function releases a reference to @buf. | ||
| 312 | */ | ||
| 313 | void generic_pipe_buf_release(struct pipe_inode_info *pipe, | ||
| 314 | struct pipe_buffer *buf) | ||
| 315 | { | ||
| 316 | page_cache_release(buf->page); | ||
| 317 | } | ||
| 318 | |||
| 305 | static const struct pipe_buf_operations anon_pipe_buf_ops = { | 319 | static const struct pipe_buf_operations anon_pipe_buf_ops = { |
| 306 | .can_merge = 1, | 320 | .can_merge = 1, |
| 307 | .map = generic_pipe_buf_map, | 321 | .map = generic_pipe_buf_map, |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 3326bbf9ab95..1539e630c47d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
| @@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, | |||
| 2128 | if (copy_from_user(page, buf, count)) | 2128 | if (copy_from_user(page, buf, count)) |
| 2129 | goto out_free; | 2129 | goto out_free; |
| 2130 | 2130 | ||
| 2131 | /* Guard against adverse ptrace interaction */ | ||
| 2132 | length = mutex_lock_interruptible(&task->cred_guard_mutex); | ||
| 2133 | if (length < 0) | ||
| 2134 | goto out_free; | ||
| 2135 | |||
| 2131 | length = security_setprocattr(task, | 2136 | length = security_setprocattr(task, |
| 2132 | (char*)file->f_path.dentry->d_name.name, | 2137 | (char*)file->f_path.dentry->d_name.name, |
| 2133 | (void*)page, count); | 2138 | (void*)page, count); |
| 2139 | mutex_unlock(&task->cred_guard_mutex); | ||
| 2134 | out_free: | 2140 | out_free: |
| 2135 | free_page((unsigned long) page); | 2141 | free_page((unsigned long) page); |
| 2136 | out: | 2142 | out: |
diff --git a/fs/read_write.c b/fs/read_write.c index 9d1e76bb9ee1..6c8c55dec2bc 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
| @@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, | |||
| 805 | goto out; | 805 | goto out; |
| 806 | if (!(in_file->f_mode & FMODE_READ)) | 806 | if (!(in_file->f_mode & FMODE_READ)) |
| 807 | goto fput_in; | 807 | goto fput_in; |
| 808 | retval = -EINVAL; | ||
| 809 | in_inode = in_file->f_path.dentry->d_inode; | ||
| 810 | if (!in_inode) | ||
| 811 | goto fput_in; | ||
| 812 | if (!in_file->f_op || !in_file->f_op->splice_read) | ||
| 813 | goto fput_in; | ||
| 814 | retval = -ESPIPE; | 808 | retval = -ESPIPE; |
| 815 | if (!ppos) | 809 | if (!ppos) |
| 816 | ppos = &in_file->f_pos; | 810 | ppos = &in_file->f_pos; |
| @@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, | |||
| 834 | retval = -EINVAL; | 828 | retval = -EINVAL; |
| 835 | if (!out_file->f_op || !out_file->f_op->sendpage) | 829 | if (!out_file->f_op || !out_file->f_op->sendpage) |
| 836 | goto fput_out; | 830 | goto fput_out; |
| 831 | in_inode = in_file->f_path.dentry->d_inode; | ||
| 837 | out_inode = out_file->f_path.dentry->d_inode; | 832 | out_inode = out_file->f_path.dentry->d_inode; |
| 838 | retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); | 833 | retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); |
| 839 | if (retval < 0) | 834 | if (retval < 0) |
diff --git a/fs/splice.c b/fs/splice.c index 666953d59a35..73766d24f97b 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
| @@ -507,9 +507,131 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, | |||
| 507 | 507 | ||
| 508 | return ret; | 508 | return ret; |
| 509 | } | 509 | } |
| 510 | |||
| 511 | EXPORT_SYMBOL(generic_file_splice_read); | 510 | EXPORT_SYMBOL(generic_file_splice_read); |
| 512 | 511 | ||
| 512 | static const struct pipe_buf_operations default_pipe_buf_ops = { | ||
| 513 | .can_merge = 0, | ||
| 514 | .map = generic_pipe_buf_map, | ||
| 515 | .unmap = generic_pipe_buf_unmap, | ||
| 516 | .confirm = generic_pipe_buf_confirm, | ||
| 517 | .release = generic_pipe_buf_release, | ||
| 518 | .steal = generic_pipe_buf_steal, | ||
| 519 | .get = generic_pipe_buf_get, | ||
| 520 | }; | ||
| 521 | |||
| 522 | static ssize_t kernel_readv(struct file *file, const struct iovec *vec, | ||
| 523 | unsigned long vlen, loff_t offset) | ||
| 524 | { | ||
| 525 | mm_segment_t old_fs; | ||
| 526 | loff_t pos = offset; | ||
| 527 | ssize_t res; | ||
| 528 | |||
| 529 | old_fs = get_fs(); | ||
| 530 | set_fs(get_ds()); | ||
| 531 | /* The cast to a user pointer is valid due to the set_fs() */ | ||
| 532 | res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); | ||
| 533 | set_fs(old_fs); | ||
| 534 | |||
| 535 | return res; | ||
| 536 | } | ||
| 537 | |||
| 538 | static ssize_t kernel_write(struct file *file, const char *buf, size_t count, | ||
| 539 | loff_t pos) | ||
| 540 | { | ||
| 541 | mm_segment_t old_fs; | ||
| 542 | ssize_t res; | ||
| 543 | |||
| 544 | old_fs = get_fs(); | ||
| 545 | set_fs(get_ds()); | ||
| 546 | /* The cast to a user pointer is valid due to the set_fs() */ | ||
| 547 | res = vfs_write(file, (const char __user *)buf, count, &pos); | ||
| 548 | set_fs(old_fs); | ||
| 549 | |||
| 550 | return res; | ||
| 551 | } | ||
| 552 | |||
| 553 | ssize_t default_file_splice_read(struct file *in, loff_t *ppos, | ||
| 554 | struct pipe_inode_info *pipe, size_t len, | ||
| 555 | unsigned int flags) | ||
| 556 | { | ||
| 557 | unsigned int nr_pages; | ||
| 558 | unsigned int nr_freed; | ||
| 559 | size_t offset; | ||
| 560 | struct page *pages[PIPE_BUFFERS]; | ||
| 561 | struct partial_page partial[PIPE_BUFFERS]; | ||
| 562 | struct iovec vec[PIPE_BUFFERS]; | ||
| 563 | pgoff_t index; | ||
| 564 | ssize_t res; | ||
| 565 | size_t this_len; | ||
| 566 | int error; | ||
| 567 | int i; | ||
| 568 | struct splice_pipe_desc spd = { | ||
| 569 | .pages = pages, | ||
| 570 | .partial = partial, | ||
| 571 | .flags = flags, | ||
| 572 | .ops = &default_pipe_buf_ops, | ||
| 573 | .spd_release = spd_release_page, | ||
| 574 | }; | ||
| 575 | |||
| 576 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
| 577 | offset = *ppos & ~PAGE_CACHE_MASK; | ||
| 578 | nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 579 | |||
| 580 | for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { | ||
| 581 | struct page *page; | ||
| 582 | |||
| 583 | page = alloc_page(GFP_USER); | ||
| 584 | error = -ENOMEM; | ||
| 585 | if (!page) | ||
| 586 | goto err; | ||
| 587 | |||
| 588 | this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); | ||
| 589 | vec[i].iov_base = (void __user *) page_address(page); | ||
| 590 | vec[i].iov_len = this_len; | ||
| 591 | pages[i] = page; | ||
| 592 | spd.nr_pages++; | ||
| 593 | len -= this_len; | ||
| 594 | offset = 0; | ||
| 595 | } | ||
| 596 | |||
| 597 | res = kernel_readv(in, vec, spd.nr_pages, *ppos); | ||
| 598 | if (res < 0) { | ||
| 599 | error = res; | ||
| 600 | goto err; | ||
| 601 | } | ||
| 602 | |||
| 603 | error = 0; | ||
| 604 | if (!res) | ||
| 605 | goto err; | ||
| 606 | |||
| 607 | nr_freed = 0; | ||
| 608 | for (i = 0; i < spd.nr_pages; i++) { | ||
| 609 | this_len = min_t(size_t, vec[i].iov_len, res); | ||
| 610 | partial[i].offset = 0; | ||
| 611 | partial[i].len = this_len; | ||
| 612 | if (!this_len) { | ||
| 613 | __free_page(pages[i]); | ||
| 614 | pages[i] = NULL; | ||
| 615 | nr_freed++; | ||
| 616 | } | ||
| 617 | res -= this_len; | ||
| 618 | } | ||
| 619 | spd.nr_pages -= nr_freed; | ||
| 620 | |||
| 621 | res = splice_to_pipe(pipe, &spd); | ||
| 622 | if (res > 0) | ||
| 623 | *ppos += res; | ||
| 624 | |||
| 625 | return res; | ||
| 626 | |||
| 627 | err: | ||
| 628 | for (i = 0; i < spd.nr_pages; i++) | ||
| 629 | __free_page(pages[i]); | ||
| 630 | |||
| 631 | return error; | ||
| 632 | } | ||
| 633 | EXPORT_SYMBOL(default_file_splice_read); | ||
| 634 | |||
| 513 | /* | 635 | /* |
| 514 | * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' | 636 | * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' |
| 515 | * using sendpage(). Return the number of bytes sent. | 637 | * using sendpage(). Return the number of bytes sent. |
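A quick userspace check of the new default read path; a minimal sketch, assuming the source device (here /dev/zero) has no ->splice_read of its own, so the request now falls through to default_file_splice_read() instead of failing:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        int fd = open("/dev/zero", O_RDONLY);
        ssize_t n;

        if (fd < 0 || pipe(pfd) < 0)
                return 1;

        /* Served by default_file_splice_read(): pages are allocated,
         * filled via kernel_readv() and handed to the pipe. */
        n = splice(fd, NULL, pfd[1], NULL, 4096, 0);
        printf("splice returned %zd\n", n);
        return n <= 0;
}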
| @@ -881,6 +1003,36 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | |||
| 881 | 1003 | ||
| 882 | EXPORT_SYMBOL(generic_file_splice_write); | 1004 | EXPORT_SYMBOL(generic_file_splice_write); |
| 883 | 1005 | ||
| 1006 | static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | ||
| 1007 | struct splice_desc *sd) | ||
| 1008 | { | ||
| 1009 | int ret; | ||
| 1010 | void *data; | ||
| 1011 | |||
| 1012 | ret = buf->ops->confirm(pipe, buf); | ||
| 1013 | if (ret) | ||
| 1014 | return ret; | ||
| 1015 | |||
| 1016 | data = buf->ops->map(pipe, buf, 0); | ||
| 1017 | ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); | ||
| 1018 | buf->ops->unmap(pipe, buf, data); | ||
| 1019 | |||
| 1020 | return ret; | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, | ||
| 1024 | struct file *out, loff_t *ppos, | ||
| 1025 | size_t len, unsigned int flags) | ||
| 1026 | { | ||
| 1027 | ssize_t ret; | ||
| 1028 | |||
| 1029 | ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); | ||
| 1030 | if (ret > 0) | ||
| 1031 | *ppos += ret; | ||
| 1032 | |||
| 1033 | return ret; | ||
| 1034 | } | ||
| 1035 | |||
| 884 | /** | 1036 | /** |
| 885 | * generic_splice_sendpage - splice data from a pipe to a socket | 1037 | * generic_splice_sendpage - splice data from a pipe to a socket |
| 886 | * @pipe: pipe to splice from | 1038 | * @pipe: pipe to splice from |
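The write-side fallback can be checked the same way; a small sketch, assuming the destination (here /dev/null) provided no ->splice_write at the time, so the data is pushed through default_file_splice_write() and write_pipe_buf():

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        int null = open("/dev/null", O_WRONLY);

        if (null < 0 || pipe(pfd) < 0)
                return 1;
        if (write(pfd[1], "data", 4) != 4)
                return 1;

        /* Each pipe buffer is mapped and written with kernel_write(). */
        return splice(pfd[0], NULL, null, NULL, 4, 0) != 4;
}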
| @@ -908,11 +1060,10 @@ EXPORT_SYMBOL(generic_splice_sendpage); | |||
| 908 | static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, | 1060 | static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, |
| 909 | loff_t *ppos, size_t len, unsigned int flags) | 1061 | loff_t *ppos, size_t len, unsigned int flags) |
| 910 | { | 1062 | { |
| 1063 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, | ||
| 1064 | loff_t *, size_t, unsigned int); | ||
| 911 | int ret; | 1065 | int ret; |
| 912 | 1066 | ||
| 913 | if (unlikely(!out->f_op || !out->f_op->splice_write)) | ||
| 914 | return -EINVAL; | ||
| 915 | |||
| 916 | if (unlikely(!(out->f_mode & FMODE_WRITE))) | 1067 | if (unlikely(!(out->f_mode & FMODE_WRITE))) |
| 917 | return -EBADF; | 1068 | return -EBADF; |
| 918 | 1069 | ||
| @@ -923,7 +1074,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, | |||
| 923 | if (unlikely(ret < 0)) | 1074 | if (unlikely(ret < 0)) |
| 924 | return ret; | 1075 | return ret; |
| 925 | 1076 | ||
| 926 | return out->f_op->splice_write(pipe, out, ppos, len, flags); | 1077 | splice_write = out->f_op->splice_write; |
| 1078 | if (!splice_write) | ||
| 1079 | splice_write = default_file_splice_write; | ||
| 1080 | |||
| 1081 | return splice_write(pipe, out, ppos, len, flags); | ||
| 927 | } | 1082 | } |
| 928 | 1083 | ||
| 929 | /* | 1084 | /* |
| @@ -933,11 +1088,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, | |||
| 933 | struct pipe_inode_info *pipe, size_t len, | 1088 | struct pipe_inode_info *pipe, size_t len, |
| 934 | unsigned int flags) | 1089 | unsigned int flags) |
| 935 | { | 1090 | { |
| 1091 | ssize_t (*splice_read)(struct file *, loff_t *, | ||
| 1092 | struct pipe_inode_info *, size_t, unsigned int); | ||
| 936 | int ret; | 1093 | int ret; |
| 937 | 1094 | ||
| 938 | if (unlikely(!in->f_op || !in->f_op->splice_read)) | ||
| 939 | return -EINVAL; | ||
| 940 | |||
| 941 | if (unlikely(!(in->f_mode & FMODE_READ))) | 1095 | if (unlikely(!(in->f_mode & FMODE_READ))) |
| 942 | return -EBADF; | 1096 | return -EBADF; |
| 943 | 1097 | ||
| @@ -945,7 +1099,11 @@ static long do_splice_to(struct file *in, loff_t *ppos, | |||
| 945 | if (unlikely(ret < 0)) | 1099 | if (unlikely(ret < 0)) |
| 946 | return ret; | 1100 | return ret; |
| 947 | 1101 | ||
| 948 | return in->f_op->splice_read(in, ppos, pipe, len, flags); | 1102 | splice_read = in->f_op->splice_read; |
| 1103 | if (!splice_read) | ||
| 1104 | splice_read = default_file_splice_read; | ||
| 1105 | |||
| 1106 | return splice_read(in, ppos, pipe, len, flags); | ||
| 949 | } | 1107 | } |
| 950 | 1108 | ||
| 951 | /** | 1109 | /** |
| @@ -1112,6 +1270,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | |||
| 1112 | return ret; | 1270 | return ret; |
| 1113 | } | 1271 | } |
| 1114 | 1272 | ||
| 1273 | static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, | ||
| 1274 | struct pipe_inode_info *opipe, | ||
| 1275 | size_t len, unsigned int flags); | ||
| 1115 | /* | 1276 | /* |
| 1116 | * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same | 1277 | * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same |
| 1117 | * location, so checking ->i_pipe is not enough to verify that this is a | 1278 | * location, so checking ->i_pipe is not enough to verify that this is a |
| @@ -1132,12 +1293,32 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1132 | struct file *out, loff_t __user *off_out, | 1293 | struct file *out, loff_t __user *off_out, |
| 1133 | size_t len, unsigned int flags) | 1294 | size_t len, unsigned int flags) |
| 1134 | { | 1295 | { |
| 1135 | struct pipe_inode_info *pipe; | 1296 | struct pipe_inode_info *ipipe; |
| 1297 | struct pipe_inode_info *opipe; | ||
| 1136 | loff_t offset, *off; | 1298 | loff_t offset, *off; |
| 1137 | long ret; | 1299 | long ret; |
| 1138 | 1300 | ||
| 1139 | pipe = pipe_info(in->f_path.dentry->d_inode); | 1301 | ipipe = pipe_info(in->f_path.dentry->d_inode); |
| 1140 | if (pipe) { | 1302 | opipe = pipe_info(out->f_path.dentry->d_inode); |
| 1303 | |||
| 1304 | if (ipipe && opipe) { | ||
| 1305 | if (off_in || off_out) | ||
| 1306 | return -ESPIPE; | ||
| 1307 | |||
| 1308 | if (!(in->f_mode & FMODE_READ)) | ||
| 1309 | return -EBADF; | ||
| 1310 | |||
| 1311 | if (!(out->f_mode & FMODE_WRITE)) | ||
| 1312 | return -EBADF; | ||
| 1313 | |||
| 1314 | /* Splicing to self would be fun, but... */ | ||
| 1315 | if (ipipe == opipe) | ||
| 1316 | return -EINVAL; | ||
| 1317 | |||
| 1318 | return splice_pipe_to_pipe(ipipe, opipe, len, flags); | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | if (ipipe) { | ||
| 1141 | if (off_in) | 1322 | if (off_in) |
| 1142 | return -ESPIPE; | 1323 | return -ESPIPE; |
| 1143 | if (off_out) { | 1324 | if (off_out) { |
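With the new ipipe && opipe branch, splice(2) between two pipes is dispatched to splice_pipe_to_pipe(); a minimal userspace sketch (offsets must be NULL for pipes, matching the ESPIPE checks above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int a[2], b[2];
        char buf[8];

        if (pipe(a) < 0 || pipe(b) < 0)
                return 1;
        if (write(a[1], "hello", 5) != 5)
                return 1;

        /* Both fds are pipes, so the kernel moves (or references) the
         * pipe buffers directly instead of copying through pages. */
        if (splice(a[0], NULL, b[1], NULL, 5, 0) != 5)
                return 1;

        return read(b[0], buf, sizeof(buf)) != 5 || memcmp(buf, "hello", 5) != 0;
}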
| @@ -1149,7 +1330,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1149 | } else | 1330 | } else |
| 1150 | off = &out->f_pos; | 1331 | off = &out->f_pos; |
| 1151 | 1332 | ||
| 1152 | ret = do_splice_from(pipe, out, off, len, flags); | 1333 | ret = do_splice_from(ipipe, out, off, len, flags); |
| 1153 | 1334 | ||
| 1154 | if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) | 1335 | if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) |
| 1155 | ret = -EFAULT; | 1336 | ret = -EFAULT; |
| @@ -1157,8 +1338,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1157 | return ret; | 1338 | return ret; |
| 1158 | } | 1339 | } |
| 1159 | 1340 | ||
| 1160 | pipe = pipe_info(out->f_path.dentry->d_inode); | 1341 | if (opipe) { |
| 1161 | if (pipe) { | ||
| 1162 | if (off_out) | 1342 | if (off_out) |
| 1163 | return -ESPIPE; | 1343 | return -ESPIPE; |
| 1164 | if (off_in) { | 1344 | if (off_in) { |
| @@ -1170,7 +1350,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1170 | } else | 1350 | } else |
| 1171 | off = &in->f_pos; | 1351 | off = &in->f_pos; |
| 1172 | 1352 | ||
| 1173 | ret = do_splice_to(in, off, pipe, len, flags); | 1353 | ret = do_splice_to(in, off, opipe, len, flags); |
| 1174 | 1354 | ||
| 1175 | if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) | 1355 | if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) |
| 1176 | ret = -EFAULT; | 1356 | ret = -EFAULT; |
| @@ -1511,7 +1691,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, | |||
| 1511 | * Make sure there's data to read. Wait for input if we can, otherwise | 1691 | * Make sure there's data to read. Wait for input if we can, otherwise |
| 1512 | * return an appropriate error. | 1692 | * return an appropriate error. |
| 1513 | */ | 1693 | */ |
| 1514 | static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | 1694 | static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) |
| 1515 | { | 1695 | { |
| 1516 | int ret; | 1696 | int ret; |
| 1517 | 1697 | ||
| @@ -1549,7 +1729,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | |||
| 1549 | * Make sure there's writeable room. Wait for room if we can, otherwise | 1729 | * Make sure there's writeable room. Wait for room if we can, otherwise |
| 1550 | * return an appropriate error. | 1730 | * return an appropriate error. |
| 1551 | */ | 1731 | */ |
| 1552 | static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | 1732 | static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) |
| 1553 | { | 1733 | { |
| 1554 | int ret; | 1734 | int ret; |
| 1555 | 1735 | ||
| @@ -1587,6 +1767,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | |||
| 1587 | } | 1767 | } |
| 1588 | 1768 | ||
| 1589 | /* | 1769 | /* |
| 1770 | * Splice contents of ipipe to opipe. | ||
| 1771 | */ | ||
| 1772 | static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, | ||
| 1773 | struct pipe_inode_info *opipe, | ||
| 1774 | size_t len, unsigned int flags) | ||
| 1775 | { | ||
| 1776 | struct pipe_buffer *ibuf, *obuf; | ||
| 1777 | int ret = 0, nbuf; | ||
| 1778 | bool input_wakeup = false; | ||
| 1779 | |||
| 1780 | |||
| 1781 | retry: | ||
| 1782 | ret = ipipe_prep(ipipe, flags); | ||
| 1783 | if (ret) | ||
| 1784 | return ret; | ||
| 1785 | |||
| 1786 | ret = opipe_prep(opipe, flags); | ||
| 1787 | if (ret) | ||
| 1788 | return ret; | ||
| 1789 | |||
| 1790 | /* | ||
| 1791 | * Potential ABBA deadlock, work around it by ordering lock | ||
| 1792 | * grabbing by pipe info address. Otherwise two different processes | ||
| 1793 | * could deadlock (one doing tee from A -> B, the other from B -> A). | ||
| 1794 | */ | ||
| 1795 | pipe_double_lock(ipipe, opipe); | ||
| 1796 | |||
| 1797 | do { | ||
| 1798 | if (!opipe->readers) { | ||
| 1799 | send_sig(SIGPIPE, current, 0); | ||
| 1800 | if (!ret) | ||
| 1801 | ret = -EPIPE; | ||
| 1802 | break; | ||
| 1803 | } | ||
| 1804 | |||
| 1805 | if (!ipipe->nrbufs && !ipipe->writers) | ||
| 1806 | break; | ||
| 1807 | |||
| 1808 | /* | ||
| 1809 | * Cannot make any progress, because either the input | ||
| 1810 | * pipe is empty or the output pipe is full. | ||
| 1811 | */ | ||
| 1812 | if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { | ||
| 1813 | /* Already processed some buffers, break */ | ||
| 1814 | if (ret) | ||
| 1815 | break; | ||
| 1816 | |||
| 1817 | if (flags & SPLICE_F_NONBLOCK) { | ||
| 1818 | ret = -EAGAIN; | ||
| 1819 | break; | ||
| 1820 | } | ||
| 1821 | |||
| 1822 | /* | ||
| 1823 | * We raced with another reader/writer and haven't | ||
| 1824 | * managed to process any buffers. A zero return | ||
| 1825 | * value means EOF, so retry instead. | ||
| 1826 | */ | ||
| 1827 | pipe_unlock(ipipe); | ||
| 1828 | pipe_unlock(opipe); | ||
| 1829 | goto retry; | ||
| 1830 | } | ||
| 1831 | |||
| 1832 | ibuf = ipipe->bufs + ipipe->curbuf; | ||
| 1833 | nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; | ||
| 1834 | obuf = opipe->bufs + nbuf; | ||
| 1835 | |||
| 1836 | if (len >= ibuf->len) { | ||
| 1837 | /* | ||
| 1838 | * Simply move the whole buffer from ipipe to opipe | ||
| 1839 | */ | ||
| 1840 | *obuf = *ibuf; | ||
| 1841 | ibuf->ops = NULL; | ||
| 1842 | opipe->nrbufs++; | ||
| 1843 | ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; | ||
| 1844 | ipipe->nrbufs--; | ||
| 1845 | input_wakeup = true; | ||
| 1846 | } else { | ||
| 1847 | /* | ||
| 1848 | * Get a reference to this pipe buffer, | ||
| 1849 | * so we can copy the contents over. | ||
| 1850 | */ | ||
| 1851 | ibuf->ops->get(ipipe, ibuf); | ||
| 1852 | *obuf = *ibuf; | ||
| 1853 | |||
| 1854 | /* | ||
| 1855 | * Don't inherit the gift flag, we need to | ||
| 1856 | * prevent multiple steals of this page. | ||
| 1857 | */ | ||
| 1858 | obuf->flags &= ~PIPE_BUF_FLAG_GIFT; | ||
| 1859 | |||
| 1860 | obuf->len = len; | ||
| 1861 | opipe->nrbufs++; | ||
| 1862 | ibuf->offset += obuf->len; | ||
| 1863 | ibuf->len -= obuf->len; | ||
| 1864 | } | ||
| 1865 | ret += obuf->len; | ||
| 1866 | len -= obuf->len; | ||
| 1867 | } while (len); | ||
| 1868 | |||
| 1869 | pipe_unlock(ipipe); | ||
| 1870 | pipe_unlock(opipe); | ||
| 1871 | |||
| 1872 | /* | ||
| 1873 | * If we put data in the output pipe, wakeup any potential readers. | ||
| 1874 | */ | ||
| 1875 | if (ret > 0) { | ||
| 1876 | smp_mb(); | ||
| 1877 | if (waitqueue_active(&opipe->wait)) | ||
| 1878 | wake_up_interruptible(&opipe->wait); | ||
| 1879 | kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); | ||
| 1880 | } | ||
| 1881 | if (input_wakeup) | ||
| 1882 | wakeup_pipe_writers(ipipe); | ||
| 1883 | |||
| 1884 | return ret; | ||
| 1885 | } | ||
| 1886 | |||
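A minimal sketch of the address-ordered locking idea behind the pipe_double_lock() call above; this is just the pattern under the assumption of two mutexes of the same lock class, not the kernel helper itself:

#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>

/* Always take the lower-addressed lock first so that concurrent
 * A -> B and B -> A splices agree on the ordering and cannot deadlock;
 * the nesting annotation keeps lockdep happy about holding two locks
 * of the same class at once. */
static void double_lock_by_address(struct mutex *a, struct mutex *b)
{
        if (a > b)
                swap(a, b);
        mutex_lock(a);
        mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}

The calling code already rejects ipipe == opipe, so the equal-address case does not arise here.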
| 1887 | /* | ||
| 1590 | * Link contents of ipipe to opipe. | 1888 | * Link contents of ipipe to opipe. |
| 1591 | */ | 1889 | */ |
| 1592 | static int link_pipe(struct pipe_inode_info *ipipe, | 1890 | static int link_pipe(struct pipe_inode_info *ipipe, |
| @@ -1690,9 +1988,9 @@ static long do_tee(struct file *in, struct file *out, size_t len, | |||
| 1690 | * Keep going, unless we encounter an error. The ipipe/opipe | 1988 | * Keep going, unless we encounter an error. The ipipe/opipe |
| 1691 | * ordering doesn't really matter. | 1989 | * ordering doesn't really matter. |
| 1692 | */ | 1990 | */ |
| 1693 | ret = link_ipipe_prep(ipipe, flags); | 1991 | ret = ipipe_prep(ipipe, flags); |
| 1694 | if (!ret) { | 1992 | if (!ret) { |
| 1695 | ret = link_opipe_prep(opipe, flags); | 1993 | ret = opipe_prep(opipe, flags); |
| 1696 | if (!ret) | 1994 | if (!ret) |
| 1697 | ret = link_pipe(ipipe, opipe, len, flags); | 1995 | ret = link_pipe(ipipe, opipe, len, flags); |
| 1698 | } | 1996 | } |
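tee(2) goes through the same (renamed) ipipe_prep()/opipe_prep() helpers but duplicates buffers rather than moving them; a small userspace sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int a[2], b[2];
        char buf[8];

        if (pipe(a) < 0 || pipe(b) < 0)
                return 1;
        if (write(a[1], "abc", 3) != 3)
                return 1;

        /* Duplicate a's buffers into b without consuming them. */
        if (tee(a[0], b[1], 3, 0) != 3)
                return 1;

        /* The original data is still readable from a[0]. */
        return read(a[0], buf, sizeof(buf)) != 3;
}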
diff --git a/fs/udf/super.c b/fs/udf/super.c index 72348cc855a4..0ba44107d8f1 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c | |||
| @@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) | |||
| 1915 | if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { | 1915 | if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { |
| 1916 | ret = udf_load_vrs(sb, &uopt, silent, &fileset); | 1916 | ret = udf_load_vrs(sb, &uopt, silent, &fileset); |
| 1917 | } else { | 1917 | } else { |
| 1918 | uopt.blocksize = bdev_hardsect_size(sb->s_bdev); | 1918 | uopt.blocksize = bdev_logical_block_size(sb->s_bdev); |
| 1919 | ret = udf_load_vrs(sb, &uopt, silent, &fileset); | 1919 | ret = udf_load_vrs(sb, &uopt, silent, &fileset); |
| 1920 | if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { | 1920 | if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { |
| 1921 | if (!silent) | 1921 | if (!silent) |
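bdev_hardsect_size() is replaced by bdev_logical_block_size() here and in the xfs hunk below; a short sketch of the typical use, assuming a mount path with a struct super_block *sb in scope:

/* Pick a block size from the device's logical block size; most
 * filesystems get the same effect via sb_min_blocksize(). */
unsigned int bsize = bdev_logical_block_size(sb->s_bdev);

if (bsize < 512)        /* defensive lower bound */
        bsize = 512;
sb_set_blocksize(sb, bsize);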
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e28800a9f2b5..1418b916fc27 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
| @@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early( | |||
| 1501 | struct block_device *bdev) | 1501 | struct block_device *bdev) |
| 1502 | { | 1502 | { |
| 1503 | return xfs_setsize_buftarg_flags(btp, | 1503 | return xfs_setsize_buftarg_flags(btp, |
| 1504 | PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); | 1504 | PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); |
| 1505 | } | 1505 | } |
| 1506 | 1506 | ||
| 1507 | int | 1507 | int |
