Diffstat (limited to 'fs')
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 8
-rw-r--r--  fs/block_dev.c | 15
-rw-r--r--  fs/btrfs/Makefile | 3
-rw-r--r--  fs/btrfs/acl.c | 17
-rw-r--r--  fs/btrfs/backref.c | 776
-rw-r--r--  fs/btrfs/backref.h | 62
-rw-r--r--  fs/btrfs/btrfs_inode.h | 21
-rw-r--r--  fs/btrfs/compression.c | 3
-rw-r--r--  fs/btrfs/ctree.c | 27
-rw-r--r--  fs/btrfs/ctree.h | 206
-rw-r--r--  fs/btrfs/delayed-inode.c | 108
-rw-r--r--  fs/btrfs/disk-io.c | 630
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 1108
-rw-r--r--  fs/btrfs/extent_io.c | 640
-rw-r--r--  fs/btrfs/extent_io.h | 23
-rw-r--r--  fs/btrfs/file-item.c | 17
-rw-r--r--  fs/btrfs/file.c | 25
-rw-r--r--  fs/btrfs/free-space-cache.c | 994
-rw-r--r--  fs/btrfs/inode-map.c | 34
-rw-r--r--  fs/btrfs/inode.c | 545
-rw-r--r--  fs/btrfs/ioctl.c | 238
-rw-r--r--  fs/btrfs/ioctl.h | 29
-rw-r--r--  fs/btrfs/print-tree.c | 8
-rw-r--r--  fs/btrfs/reada.c | 951
-rw-r--r--  fs/btrfs/relocation.c | 26
-rw-r--r--  fs/btrfs/scrub.c | 660
-rw-r--r--  fs/btrfs/super.c | 315
-rw-r--r--  fs/btrfs/transaction.c | 156
-rw-r--r--  fs/btrfs/tree-log.c | 19
-rw-r--r--  fs/btrfs/volumes.c | 214
-rw-r--r--  fs/btrfs/volumes.h | 24
-rw-r--r--  fs/btrfs/xattr.c | 11
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/cifs/cifsencrypt.c | 8
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsproto.h | 8
-rw-r--r--  fs/cifs/connect.c | 5
-rw-r--r--  fs/cifs/file.c | 142
-rw-r--r--  fs/cifs/readdir.c | 10
-rw-r--r--  fs/cifs/sess.c | 2
-rw-r--r--  fs/cifs/smbencrypt.c | 65
-rw-r--r--  fs/dcache.c | 88
-rw-r--r--  fs/ecryptfs/crypto.c | 26
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 5
-rw-r--r--  fs/ecryptfs/file.c | 23
-rw-r--r--  fs/ecryptfs/inode.c | 52
-rw-r--r--  fs/exofs/Kconfig | 2
-rw-r--r--  fs/exofs/ore.c | 1
-rw-r--r--  fs/exofs/super.c | 1
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/inode.c | 5
-rw-r--r--  fs/ext4/super.c | 6
-rw-r--r--  fs/fs-writeback.c | 89
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/gfs2/ops_fstype.c | 1
-rw-r--r--  fs/hfs/trans.c | 2
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/jffs2/compr.c | 128
-rw-r--r--  fs/jffs2/compr.h | 2
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 6
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/super.c | 119
-rw-r--r--  fs/jffs2/wbuf.c | 9
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/logfs/super.c | 1
-rw-r--r--  fs/minix/bitmap.c | 55
-rw-r--r--  fs/minix/inode.c | 25
-rw-r--r--  fs/minix/minix.h | 11
-rw-r--r--  fs/namei.c | 16
-rw-r--r--  fs/namespace.c | 52
-rw-r--r--  fs/nfs/callback_xdr.c | 12
-rw-r--r--  fs/nfs/dir.c | 2
-rw-r--r--  fs/nfs/file.c | 100
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs4filelayout.c | 8
-rw-r--r--  fs/nfs/nfs4proc.c | 10
-rw-r--r--  fs/nfs/nfs4xdr.c | 2
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 872
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 209
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 48
-rw-r--r--  fs/nfs/pagelist.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 52
-rw-r--r--  fs/nfs/pnfs_dev.c | 1
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/read.c | 14
-rw-r--r--  fs/nfs/super.c | 37
-rw-r--r--  fs/nfs/write.c | 4
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/nfssvc.c | 3
-rw-r--r--  fs/ocfs2/alloc.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 69
-rw-r--r--  fs/ocfs2/aops.h | 14
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 194
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 102
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 139
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 2
-rw-r--r--  fs/ocfs2/dir.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 56
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 44
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 54
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 175
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 164
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 16
-rw-r--r--  fs/ocfs2/dlmglue.c | 21
-rw-r--r--  fs/ocfs2/extent_map.c | 96
-rw-r--r--  fs/ocfs2/extent_map.h | 2
-rw-r--r--  fs/ocfs2/file.c | 96
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/inode.h | 3
-rw-r--r--  fs/ocfs2/ioctl.c | 11
-rw-r--r--  fs/ocfs2/journal.c | 23
-rw-r--r--  fs/ocfs2/journal.h | 5
-rw-r--r--  fs/ocfs2/mmap.c | 53
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 51
-rw-r--r--  fs/ocfs2/quota_local.c | 23
-rw-r--r--  fs/ocfs2/slot_map.c | 4
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 71
-rw-r--r--  fs/ocfs2/super.c | 25
-rw-r--r--  fs/ocfs2/xattr.c | 10
-rw-r--r--  fs/proc/base.c | 146
-rw-r--r--  fs/proc/meminfo.c | 7
-rw-r--r--  fs/proc/stat.c | 4
-rw-r--r--  fs/proc/vmcore.c | 1
-rw-r--r--  fs/pstore/platform.c | 13
-rw-r--r--  fs/quota/quota.c | 2
-rw-r--r--  fs/seq_file.c | 6
-rw-r--r--  fs/squashfs/Kconfig | 22
-rw-r--r--  fs/squashfs/squashfs_fs.h | 7
-rw-r--r--  fs/squashfs/super.c | 2
-rw-r--r--  fs/statfs.c | 2
-rw-r--r--  fs/sync.c | 4
-rw-r--r--  fs/ubifs/budget.c | 2
-rw-r--r--  fs/ubifs/debug.c | 16
-rw-r--r--  fs/ubifs/debug.h | 5
-rw-r--r--  fs/ubifs/recovery.c | 2
-rw-r--r--  fs/ubifs/sb.c | 2
-rw-r--r--  fs/xfs/xfs_acl.c | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 2
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 64
-rw-r--r--  fs/xfs/xfs_bmap.c | 20
-rw-r--r--  fs/xfs/xfs_buf_item.c | 2
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 6
-rw-r--r--  fs/xfs/xfs_export.c | 8
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 4
-rw-r--r--  fs/xfs/xfs_inode.c | 21
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_inode_item.c | 2
-rw-r--r--  fs/xfs/xfs_log.c | 350
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_qm.c | 3
-rw-r--r--  fs/xfs/xfs_sync.c | 11
-rw-r--r--  fs/xfs/xfs_trace.h | 12
-rw-r--r--  fs/xfs/xfs_trans.h | 6
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 14
163 files changed, 8550 insertions, 4041 deletions
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11..c2183f3917c 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
+#include <linux/export.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8a609..b1fe82cf88c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
  * RETURNS:
  * Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 
@@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
  * %__GFP_WAIT, the allocation is guaranteed to succeed.
  *
  **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio;
 
@@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+static struct bio_map_data *bio_alloc_map_data(int nr_segs,
+					       unsigned int iov_count,
 					       gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f0..b07f1da1de4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 
 	if (!bdev->bd_disk)
 		return;
-	if (disk_partitionable(bdev->bd_disk))
+	if (disk_part_scan_enabled(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 }
 
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
+	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
+	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				bdev->bd_disk = NULL;
 				mutex_unlock(&bdev->bd_mutex);
 				disk_unblock_events(disk);
-				module_put(disk->fops->owner);
 				put_disk(disk);
+				module_put(owner);
 				goto restart;
 			}
 		}
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		module_put(disk->fops->owner);
 		put_disk(disk);
+		module_put(owner);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	module_put(disk->fops->owner);
 	put_disk(disk);
+	module_put(owner);
  out:
 	bdput(bdev);
 
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
 
-		put_disk(disk);
-		module_put(owner);
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
+
+		put_disk(disk);
+		module_put(owner);
 	}
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
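In all of the __blkdev_get()/__blkdev_put() hunks above, disk->fops->owner is copied into a local variable before put_disk() is called, and module_put() is moved after put_disk(). A plausible reading of the pattern (sketch only, not taken from the commit message) is that put_disk() may drop the last reference to the gendisk, after which disk->fops must no longer be dereferenced:

	struct module *owner;

	owner = disk->fops->owner;	/* take the copy while 'disk' is still valid */
	...
	put_disk(disk);			/* may free 'disk' and the structure holding its fops */
	module_put(owner);		/* uses the saved pointer, not disk->fops->owner */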
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21..c0ddfd29c5e 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a1..89b156d85d6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 00000000000..22c64fff1bd
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * is has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
704 * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
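The iteration helpers above are deliberately two-phase: references are first collected into lists while the extent-tree path is held, the path is released, and only then is the caller's iterator invoked, so the iterator itself may perform tree lookups. A minimal sketch of an iterate_extent_inodes_t callback and how it could be driven (illustrative only; the callback body and error handling are made up, not part of this commit):

static int note_extent_user(u64 inum, u64 offset, u64 root, void *ctx)
{
	/* called once per inode reference found for the extent */
	printk(KERN_INFO "btrfs: extent referenced by ino %llu off %llu root %llu\n",
	       (unsigned long long)inum, (unsigned long long)offset,
	       (unsigned long long)root);
	return 0;	/* returning non-zero stops the iteration */
}

	/* with an fs_info and an allocated btrfs_path in hand: */
	ret = iterate_inodes_from_logical(logical, fs_info, path,
					  note_extent_user, NULL);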
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 00000000000..92618837cb8
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
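A sketch of how the ipath half of this interface fits together, based only on the declarations above and the comments in backref.c (the 4096-byte container size and the error code used on truncation are illustrative assumptions, not part of the interface):

static int print_inode_paths(struct btrfs_root *fs_root, u64 inum)
{
	struct btrfs_path *path;
	struct inode_fs_paths *ipath;
	int i, ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ipath = init_ipath(4096, fs_root, path);	/* room for pointers + names */
	if (IS_ERR(ipath)) {
		btrfs_free_path(path);
		return PTR_ERR(ipath);
	}

	ret = paths_from_inode(inum, ipath);
	for (i = 0; !ret && i < ipath->fspath->elem_cnt; ++i)
		printk(KERN_INFO "btrfs: path: %s\n",
		       (char *)(unsigned long)ipath->fspath->val[i]);

	if (!ret && ipath->fspath->elem_missed)
		/* container was too small; bytes_missing says by how much */
		ret = -ENAMETOOLONG;

	free_ipath(ipath);
	btrfs_free_path(path);
	return ret;
}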
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd..634608d2a6d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums. This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f173..14f1c5a0b2d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 		((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8..dede441bdee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root,
 				    struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
@@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f4..50634abef9b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
 #define BTRFS_LABEL_SIZE 256
 
 /*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unsed_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
 	/* future expansion */
 	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
-	atomic_t usage;
-	unsigned int priority:8;
-	unsigned int durable:1;
-	unsigned int refill_used:1;
 	unsigned int full:1;
 };
 
@@ -811,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
 	BTRFS_CACHE_NO		= 0,
 	BTRFS_CACHE_STARTED	= 1,
-	BTRFS_CACHE_FINISHED	= 2,
+	BTRFS_CACHE_FAST	= 2,
+	BTRFS_CACHE_FINISHED	= 3,
 };
 
 enum btrfs_disk_cache_state {
@@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
+	u64 cache_generation;
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -899,6 +937,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
@@ -916,14 +958,11 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -942,8 +981,8 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
-	struct btrfs_super_block super_copy;
-	struct btrfs_super_block super_for_commit;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	struct btrfs_workers caching_workers;
+	struct btrfs_workers readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
 };
 
 /*
@@ -1225,6 +1272,8 @@ struct btrfs_root {
 	 * for stat. It may be used for more later
 	 */
 	dev_t anon_dev;
+
+	int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
 }
 
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
 }
 
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
@@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 		3 * num_items;
 }
 
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		num_items;
+}
+
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2358struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2359void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2360 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2361int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2362 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2363 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2364int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2365 struct btrfs_block_rsv *block_rsv,
2366 u64 num_bytes);
2367int btrfs_block_rsv_check(struct btrfs_root *root,
2368 struct btrfs_block_rsv *block_rsv, int min_factor);
2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2371 u64 min_reserved);
2372int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2373 struct btrfs_block_rsv *block_rsv,
2374 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2375int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2376 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2377 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2378void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2379 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2380 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2381int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2382 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2383int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2498 smp_mb();
2380 return fs_info->closing; 2499 return fs_info->closing;
2381} 2500}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{
2503 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root);
2506 kfree(fs_info->chunk_root);
2507 kfree(fs_info->dev_root);
2508 kfree(fs_info->csum_root);
2509 kfree(fs_info->super_copy);
2510 kfree(fs_info->super_for_commit);
2511 kfree(fs_info);
2512}
2382 2513
2383/* root-item.c */ 2514/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2515int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2710,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2710int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2711int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2712int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2713void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2714 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2715int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2823,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2823int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2824 struct btrfs_scrub_progress *progress);
2699 2825
2826/* reada.c */
2827struct reada_control {
2828 struct btrfs_root *root; /* tree to prefetch */
2829 struct btrfs_key key_start;
2830 struct btrfs_key key_end; /* exclusive */
2831 atomic_t elems;
2832 struct kref refcnt;
2833 wait_queue_head_t wait;
2834};
2835struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2836 struct btrfs_key *start, struct btrfs_key *end);
2837int btrfs_reada_wait(void *handle);
2838void btrfs_reada_detach(void *handle);
2839int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2840 u64 start, int err);
2841
2700#endif 2842#endif
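The readahead declarations just above are consumed roughly as in the sketch below; the key range, error handling and surrounding code are illustrative assumptions rather than lines from this patch (scrub is the in-tree caller).

	/* minimal sketch of the reada API declared in ctree.h above */
	struct btrfs_key start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key end = { .objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1 };
	struct reada_control *rc;

	rc = btrfs_reada_add(root, &start, &end);   /* queue background prefetch of the range */
	if (!IS_ERR(rc))
		btrfs_reada_wait(rc);               /* block until the elems count drops to zero */
	/* btrfs_reada_detach(rc) would instead drop our ref and let it finish on its own */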
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ae4d9cd1096..5b163572e0c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
626 627 int release = false;
627 if (!trans->bytes_reserved)
628 return 0;
629 628
630 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
632 631
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 632 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
633
634 /*
635 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
636 * which doesn't reserve space for speed. This is a problem since we
637 * still need to reserve space for this update, so try to reserve the
638 * space.
639 *
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for.
642 */
643 if (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /*
647 * Since we're under a transaction reserve_metadata_bytes could
648 * try to commit the transaction which will make it return
649 * EAGAIN to make us stop the transaction we have, so return
650 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
651 */
652 if (ret == -EAGAIN)
653 ret = -ENOSPC;
654 if (!ret)
655 node->bytes_reserved = num_bytes;
656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
672 * reserve something strictly for us. If not be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
684 * Ok this is a problem, let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
691 }
692
693migrate:
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
695
696out:
697 /*
698 * Migrate only takes a reservation, it doesn't touch the size of the
 699	 * block_rsv. This keeps things simple for callers that don't normally have
 700	 * anything migrated from their block rsv. If they go to release their

701 * reservation, that will decrease the size as well, so if migrate
702 * reduced size we'd end up with a negative size. But for the
703 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
704 * but we could in fact do this reserve/migrate dance several times
705 * between the time we did the original reservation and we'd clean it
706 * up. So to take care of this, release the space for the meta
707 * reservation here. I think it may be time for a documentation page on
 708	 * how block rsvs work.
709 */
635 if (!ret) 710 if (!ret)
636 node->bytes_reserved = num_bytes; 711 node->bytes_reserved = num_bytes;
637 712
713 if (release)
714 btrfs_block_rsv_release(root, src_rsv, num_bytes);
715
638 return ret; 716 return ret;
639} 717}
640 718
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 724 if (!node->bytes_reserved)
647 return; 725 return;
648 726
649 rsv = &root->fs_info->global_block_rsv; 727 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 728 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 729 node->bytes_reserved);
652 node->bytes_reserved = 0; 730 node->bytes_reserved = 0;
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1104 path->leave_spinning = 1;
1027 1105
1028 block_rsv = trans->block_rsv; 1106 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1107 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1108
1031 delayed_root = btrfs_get_delayed_root(root); 1109 delayed_root = btrfs_get_delayed_root(root);
1032 1110
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1147 path->leave_spinning = 1;
1070 1148
1071 block_rsv = trans->block_rsv; 1149 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1150 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1151
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1152 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1153 if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1227 goto free_path;
1150 1228
1151 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1230 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1231
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1232 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1233 if (!ret)
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1685 goto release_node; 1763 goto release_node;
1686 } 1764 }
1687 1765
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1766 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1689 /* 1767 delayed_node);
1690 * we must reserve enough space when we start a new transaction, 1768 if (ret)
1691 * so reserving metadata failure is impossible 1769 goto release_node;
1692 */
1693 BUG_ON(ret);
1694 1770
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1771 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1772 delayed_node->inode_dirty = 1;
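The -ENOSPC propagated above is only useful if the caller reacts to it. A sketch of the fallback the comment alludes to in btrfs_dirty_inode(); the real code lives in inode.c and is not part of this hunk, so treat the flow and names here as assumptions:

	/* sketch: retry the inode update with a real reservation on -ENOSPC */
	trans = btrfs_join_transaction(root);          /* fast path, nothing reserved */
	ret = btrfs_update_inode(trans, root, inode);
	if (ret == -ENOSPC) {
		btrfs_end_transaction(trans, root);
		trans = btrfs_start_transaction(root, 1); /* reserve space for one item */
		if (!IS_ERR(trans))
			ret = btrfs_update_inode(trans, root, inode);
	}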
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07ea91879a9..632f8f3cc9d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 int mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
 1653 * root backups. The index of the newest backup is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
 1720	 * just overwrite the last backup if we're at the same generation;
 1721	 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
 1842	 * fixme: the total bytes and num_devices need to match or we should
 1843	 * require a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
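The backup slots managed by the helpers above behave as a small ring buffer; the index arithmetic reduces to the sketch below. BTRFS_NUM_BACKUP_ROOTS is assumed to be 4 here, the slot numbers are only an example, and info/gen stand in for the usual open_ctree context.

	/* illustrative ring walk, assuming BTRFS_NUM_BACKUP_ROOTS == 4 */
	int newest = find_newest_super_backup(info, gen);       /* e.g. slot 2          */
	int next   = (newest + 1) % BTRFS_NUM_BACKUP_ROOTS;      /* 3: next commit slot  */
	int older  = (newest + BTRFS_NUM_BACKUP_ROOTS - 1)
		     % BTRFS_NUM_BACKUP_ROOTS;                    /* 1: recovery fallback */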
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1590 u64 features; 1890 u64 features;
1591 struct btrfs_key location; 1891 struct btrfs_key location;
1592 struct buffer_head *bh; 1892 struct buffer_head *bh;
1593 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1893 struct btrfs_super_block *disk_super;
1594 GFP_NOFS);
1595 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1596 GFP_NOFS);
1597 struct btrfs_root *tree_root = btrfs_sb(sb); 1894 struct btrfs_root *tree_root = btrfs_sb(sb);
1598 struct btrfs_fs_info *fs_info = NULL; 1895 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1599 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1896 struct btrfs_root *extent_root;
1600 GFP_NOFS); 1897 struct btrfs_root *csum_root;
1601 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1898 struct btrfs_root *chunk_root;
1602 GFP_NOFS); 1899 struct btrfs_root *dev_root;
1603 struct btrfs_root *log_tree_root; 1900 struct btrfs_root *log_tree_root;
1604
1605 int ret; 1901 int ret;
1606 int err = -EINVAL; 1902 int err = -EINVAL;
1607 1903 int num_backups_tried = 0;
1608 struct btrfs_super_block *disk_super; 1904 int backup_index = 0;
1609 1905
1610 if (!extent_root || !tree_root || !tree_root->fs_info || 1906 extent_root = fs_info->extent_root =
1611 !chunk_root || !dev_root || !csum_root) { 1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 csum_root = fs_info->csum_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 chunk_root = fs_info->chunk_root =
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1612 err = -ENOMEM; 1916 err = -ENOMEM;
1613 goto fail; 1917 goto fail;
1614 } 1918 }
1615 fs_info = tree_root->fs_info;
1616 1919
1617 ret = init_srcu_struct(&fs_info->subvol_srcu); 1920 ret = init_srcu_struct(&fs_info->subvol_srcu);
1618 if (ret) { 1921 if (ret) {
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1951 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1952 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1953 spin_lock_init(&fs_info->defrag_inodes_lock);
1954 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1955 mutex_init(&fs_info->reloc_mutex);
1652 1956
1653 init_completion(&fs_info->kobj_unregister); 1957 init_completion(&fs_info->kobj_unregister);
1654 fs_info->tree_root = tree_root;
1655 fs_info->extent_root = extent_root;
1656 fs_info->csum_root = csum_root;
1657 fs_info->chunk_root = chunk_root;
1658 fs_info->dev_root = dev_root;
1659 fs_info->fs_devices = fs_devices;
1660 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1958 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1661 INIT_LIST_HEAD(&fs_info->space_info); 1959 INIT_LIST_HEAD(&fs_info->space_info);
1662 btrfs_mapping_init(&fs_info->mapping_tree); 1960 btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1963 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1964 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1965 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1966 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1967 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1968 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1969 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1974 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1975 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1976 fs_info->trans_no_join = 0;
1977 fs_info->free_chunk_space = 0;
1978
1979 /* readahead state */
1980 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981 spin_lock_init(&fs_info->reada_lock);
1680 1982
1681 fs_info->thread_pool_size = min_t(unsigned long, 1983 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1984 num_online_cpus() + 2, 8);
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2068 goto fail_alloc;
1767 } 2069 }
1768 2070
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2071 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2072 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2073 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2074 brelse(bh);
1773 2075
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2076 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2077
1776 disk_super = &fs_info->super_copy; 2078 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2079 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2080 goto fail_alloc;
1779 2081
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2085 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2086
1785 /* 2087 /*
 2088	 * run through our array of backup supers and set up
2089 * our ring pointer to the oldest one
2090 */
2091 generation = btrfs_super_generation(disk_super);
2092 find_oldest_super_backup(fs_info, generation);
2093
2094 /*
1786 * In the long term, we'll store the compression type in the super 2095 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2096 * block, and it'll be used for per file compression control.
1788 */ 2097 */
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2179 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2180 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2181 &fs_info->generic_worker);
2182 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183 fs_info->thread_pool_size,
2184 &fs_info->generic_worker);
1873 2185
1874 /* 2186 /*
1875 * endios are largely parallel and should have a very 2187 * endios are largely parallel and should have a very
@@ -1880,6 +2192,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2192
1881 fs_info->endio_write_workers.idle_thresh = 2; 2193 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2;
1883 2196
1884 btrfs_start_workers(&fs_info->workers, 1); 2197 btrfs_start_workers(&fs_info->workers, 1);
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2206,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 btrfs_start_workers(&fs_info->caching_workers, 1);
2209 btrfs_start_workers(&fs_info->readahead_workers, 1);
1896 2210
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2211 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2212 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2253,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2253 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2254 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2255 sb->s_id);
1942 goto fail_chunk_root; 2256 goto fail_tree_roots;
1943 } 2257 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2258 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2259 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2268,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2268 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2269 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2270 sb->s_id);
1957 goto fail_chunk_root; 2271 goto fail_tree_roots;
1958 } 2272 }
1959 2273
1960 btrfs_close_extra_devices(fs_devices); 2274 btrfs_close_extra_devices(fs_devices);
1961 2275
2276retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2277 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2278 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2279 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2281,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2281 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2282 btrfs_super_root(disk_super),
1968 blocksize, generation); 2283 blocksize, generation);
1969 if (!tree_root->node) 2284 if (!tree_root->node ||
1970 goto fail_chunk_root; 2285 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2286 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2287 sb->s_id);
1974 goto fail_tree_root; 2288
2289 goto recovery_tree_root;
1975 } 2290 }
2291
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2292 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2293 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2294
1979 ret = find_and_setup_root(tree_root, fs_info, 2295 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2296 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2297 if (ret)
1982 goto fail_tree_root; 2298 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2299 extent_root->track_dirty = 1;
1984 2300
1985 ret = find_and_setup_root(tree_root, fs_info, 2301 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2302 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2303 if (ret)
1988 goto fail_extent_root; 2304 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2305 dev_root->track_dirty = 1;
1990 2306
1991 ret = find_and_setup_root(tree_root, fs_info, 2307 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2308 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2309 if (ret)
1994 goto fail_dev_root; 2310 goto recovery_tree_root;
1995 2311
1996 csum_root->track_dirty = 1; 2312 csum_root->track_dirty = 1;
1997 2313
@@ -2124,22 +2440,13 @@ fail_cleaner:
2124 2440
2125fail_block_groups: 2441fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2442 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2443
2128 free_extent_buffer(csum_root->commit_root); 2444fail_tree_roots:
2129fail_dev_root: 2445 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2446
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2447fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2448 btrfs_stop_workers(&fs_info->generic_worker);
2449 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2450 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2451 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2452 btrfs_stop_workers(&fs_info->workers);
@@ -2152,25 +2459,37 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2459 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2460 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2461fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2462fail_iput:
2463 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2464
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2465 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2466 iput(fs_info->btree_inode);
2159
2160 btrfs_close_devices(fs_info->fs_devices);
2161 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2162fail_bdi: 2467fail_bdi:
2163 bdi_destroy(&fs_info->bdi); 2468 bdi_destroy(&fs_info->bdi);
2164fail_srcu: 2469fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2470 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2471fail:
2167 kfree(extent_root); 2472 btrfs_close_devices(fs_info->fs_devices);
2168 kfree(tree_root); 2473 free_fs_info(fs_info);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2474 return ERR_PTR(err);
2475
2476recovery_tree_root:
2477 if (!btrfs_test_opt(tree_root, RECOVERY))
2478 goto fail_tree_roots;
2479
2480 free_root_pointers(fs_info, 0);
2481
2482 /* don't use the log in recovery mode, it won't be valid */
2483 btrfs_set_super_log_root(disk_super, 0);
2484
2485 /* we can't trust the free space cache either */
2486 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2487
2488 ret = next_root_backup(fs_info, fs_info->super_copy,
2489 &num_backups_tried, &backup_index);
2490 if (ret == -1)
2491 goto fail_block_groups;
2492 goto retry_root_backup;
2174} 2493}
2175 2494
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2495static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2254,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,
2254 int errors = 0; 2573 int errors = 0;
2255 u32 crc; 2574 u32 crc;
2256 u64 bytenr; 2575 u64 bytenr;
2257 int last_barrier = 0;
2258 2576
2259 if (max_mirrors == 0) 2577 if (max_mirrors == 0)
2260 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2578 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2261 2579
2262 /* make sure only the last submit_bh does a barrier */
2263 if (do_barriers) {
2264 for (i = 0; i < max_mirrors; i++) {
2265 bytenr = btrfs_sb_offset(i);
2266 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2267 device->total_bytes)
2268 break;
2269 last_barrier = i;
2270 }
2271 }
2272
2273 for (i = 0; i < max_mirrors; i++) { 2580 for (i = 0; i < max_mirrors; i++) {
2274 bytenr = btrfs_sb_offset(i); 2581 bytenr = btrfs_sb_offset(i);
2275 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2582 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2315,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,
2315 bh->b_end_io = btrfs_end_buffer_write_sync; 2622 bh->b_end_io = btrfs_end_buffer_write_sync;
2316 } 2623 }
2317 2624
2318 if (i == last_barrier && do_barriers) 2625 /*
2319 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2626 * we fua the first super. The others we allow
2320 else 2627 * to go down lazy.
2321 ret = submit_bh(WRITE_SYNC, bh); 2628 */
2322 2629 ret = submit_bh(WRITE_FUA, bh);
2323 if (ret) 2630 if (ret)
2324 errors++; 2631 errors++;
2325 } 2632 }
2326 return errors < i ? 0 : -1; 2633 return errors < i ? 0 : -1;
2327} 2634}
2328 2635
2636/*
 2637 * endio for write_dev_flush; this will wake anyone waiting
2638 * for the barrier when it is done
2639 */
2640static void btrfs_end_empty_barrier(struct bio *bio, int err)
2641{
2642 if (err) {
2643 if (err == -EOPNOTSUPP)
2644 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2645 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2646 }
2647 if (bio->bi_private)
2648 complete(bio->bi_private);
2649 bio_put(bio);
2650}
2651
2652/*
 2653 * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
2654 * sent down. With wait == 1, it waits for the previous flush.
2655 *
 2656 * any device where the flush fails with eopnotsupp is flagged as not-barrier
2657 * capable
2658 */
2659static int write_dev_flush(struct btrfs_device *device, int wait)
2660{
2661 struct bio *bio;
2662 int ret = 0;
2663
2664 if (device->nobarriers)
2665 return 0;
2666
2667 if (wait) {
2668 bio = device->flush_bio;
2669 if (!bio)
2670 return 0;
2671
2672 wait_for_completion(&device->flush_wait);
2673
2674 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2675 printk("btrfs: disabling barriers on dev %s\n",
2676 device->name);
2677 device->nobarriers = 1;
2678 }
2679 if (!bio_flagged(bio, BIO_UPTODATE)) {
2680 ret = -EIO;
2681 }
2682
2683 /* drop the reference from the wait == 0 run */
2684 bio_put(bio);
2685 device->flush_bio = NULL;
2686
2687 return ret;
2688 }
2689
2690 /*
2691 * one reference for us, and we leave it for the
2692 * caller
2693 */
 2694	device->flush_bio = NULL;
2695 bio = bio_alloc(GFP_NOFS, 0);
2696 if (!bio)
2697 return -ENOMEM;
2698
2699 bio->bi_end_io = btrfs_end_empty_barrier;
2700 bio->bi_bdev = device->bdev;
2701 init_completion(&device->flush_wait);
2702 bio->bi_private = &device->flush_wait;
2703 device->flush_bio = bio;
2704
2705 bio_get(bio);
2706 submit_bio(WRITE_FLUSH, bio);
2707
2708 return 0;
2709}
2710
2711/*
2712 * send an empty flush down to each device in parallel,
2713 * then wait for them
2714 */
2715static int barrier_all_devices(struct btrfs_fs_info *info)
2716{
2717 struct list_head *head;
2718 struct btrfs_device *dev;
2719 int errors = 0;
2720 int ret;
2721
2722 /* send down all the barriers */
2723 head = &info->fs_devices->devices;
2724 list_for_each_entry_rcu(dev, head, dev_list) {
2725 if (!dev->bdev) {
2726 errors++;
2727 continue;
2728 }
2729 if (!dev->in_fs_metadata || !dev->writeable)
2730 continue;
2731
2732 ret = write_dev_flush(dev, 0);
2733 if (ret)
2734 errors++;
2735 }
2736
2737 /* wait for all the barriers */
2738 list_for_each_entry_rcu(dev, head, dev_list) {
2739 if (!dev->bdev) {
2740 errors++;
2741 continue;
2742 }
2743 if (!dev->in_fs_metadata || !dev->writeable)
2744 continue;
2745
2746 ret = write_dev_flush(dev, 1);
2747 if (ret)
2748 errors++;
2749 }
2750 if (errors)
2751 return -EIO;
2752 return 0;
2753}
2754
2329int write_all_supers(struct btrfs_root *root, int max_mirrors) 2755int write_all_supers(struct btrfs_root *root, int max_mirrors)
2330{ 2756{
2331 struct list_head *head; 2757 struct list_head *head;
@@ -2338,14 +2764,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2764 int total_errors = 0;
2339 u64 flags; 2765 u64 flags;
2340 2766
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2767 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2768 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2769 backup_super_roots(root->fs_info);
2343 2770
2344 sb = &root->fs_info->super_for_commit; 2771 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2772 dev_item = &sb->dev_item;
2346 2773
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2774 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2348 head = &root->fs_info->fs_devices->devices; 2775 head = &root->fs_info->fs_devices->devices;
2776
2777 if (do_barriers)
2778 barrier_all_devices(root->fs_info);
2779
2349 list_for_each_entry_rcu(dev, head, dev_list) { 2780 list_for_each_entry_rcu(dev, head, dev_list) {
2350 if (!dev->bdev) { 2781 if (!dev->bdev) {
2351 total_errors++; 2782 total_errors++;
@@ -2545,8 +2976,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2976 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2977 btrfs_run_defrag_inodes(root->fs_info);
2547 2978
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2979 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2980 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2981 *
@@ -2572,6 +3001,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3001 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 3002 }
2574 3003
3004 btrfs_put_block_group_cache(fs_info);
3005
2575 kthread_stop(root->fs_info->transaction_kthread); 3006 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 3007 kthread_stop(root->fs_info->cleaner_kthread);
2577 3008
@@ -2603,7 +3034,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 3034 del_fs_roots(fs_info);
2604 3035
2605 iput(fs_info->btree_inode); 3036 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 3037
2608 btrfs_stop_workers(&fs_info->generic_worker); 3038 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 3039 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +3047,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 3047 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 3048 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 3049 btrfs_stop_workers(&fs_info->caching_workers);
3050 btrfs_stop_workers(&fs_info->readahead_workers);
2620 3051
2621 btrfs_close_devices(fs_info->fs_devices); 3052 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3053 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +3055,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 3055 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 3056 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 3057
2627 kfree(fs_info->extent_root); 3058 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 3059
2634 return 0; 3060 return 0;
2635} 3061}
@@ -2735,7 +3161,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3161 return ret;
2736} 3162}
2737 3163
2738int btree_lock_page_hook(struct page *page) 3164static int btree_lock_page_hook(struct page *page, void *data,
3165 void (*flush_fn)(void *))
2739{ 3166{
2740 struct inode *inode = page->mapping->host; 3167 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3168 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3179,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3179 if (!eb)
2753 goto out; 3180 goto out;
2754 3181
2755 btrfs_tree_lock(eb); 3182 if (!btrfs_try_tree_write_lock(eb)) {
3183 flush_fn(data);
3184 btrfs_tree_lock(eb);
3185 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3186 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3187
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3188 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3197,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3197 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3198 free_extent_buffer(eb);
2769out: 3199out:
2770 lock_page(page); 3200 if (!trylock_page(page)) {
3201 flush_fn(data);
3202 lock_page(page);
3203 }
2771 return 0; 3204 return 0;
2772} 3205}
2773 3206
@@ -3123,6 +3556,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3556static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3557 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3558 .readpage_end_io_hook = btree_readpage_end_io_hook,
3559 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3560 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3561 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3562 .merge_bio_hook = btrfs_merge_bio_hook,
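The WAIT_COMPLETE, WAIT_NONE and WAIT_PAGE_LOCK values passed to read_extent_buffer_pages() throughout this file come from extent_io.h, which is not shown in this section; the definitions below are a sketch of the assumed meaning rather than a quote of that header.

	/* assumed wait modes for read_extent_buffer_pages() (see extent_io.h) */
	#define WAIT_NONE	0	/* fire and forget, plain readahead              */
	#define WAIT_COMPLETE	1	/* wait for the read to finish (old behaviour)   */
	#define WAIT_PAGE_LOCK	2	/* only wait for the page locks, used by reada.c */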
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67..c99d0a8f13f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
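Putting reada_tree_block_flagged() together with the readahead hooks added to disk-io.c above, a minimal sketch of how the reada worker is expected to drive a single block; the caller shape, and the logical/blocksize/mirror_num inputs, are assumptions, and the real driver sits in reada.c.

	/* sketch: one readahead request as issued by the reada worker */
	struct extent_buffer *eb = NULL;
	int ret;

	ret = reada_tree_block_flagged(root, logical, blocksize, mirror_num, &eb);
	if (ret == 0 && eb) {
		/* buffer was already uptodate, so no end_io will fire; report it */
		btree_readahead_hook(root, eb, eb->start, 0);
		free_extent_buffer(eb);
	}
	/* otherwise the end_io/failed hooks see EXTENT_BUFFER_READAHEAD and
	 * forward completion (or -EIO) to btree_readahead_hook() */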
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462..2ad813674d7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
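btrfs_update_reserved_bytes() (declared static a few lines below) is the consumer of these RESERVE_* values. The following is a minimal sketch of the intended accounting, with the read-only block-group handling left out, so treat it as an outline rather than the function from the patch:

	/* sketch: how the RESERVE_* values steer the space accounting */
	static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
					       u64 num_bytes, int reserve)
	{
		struct btrfs_space_info *sinfo = cache->space_info;

		spin_lock(&sinfo->lock);
		spin_lock(&cache->lock);
		if (reserve != RESERVE_FREE) {
			cache->reserved += num_bytes;
			sinfo->bytes_reserved += num_bytes;
			if (reserve == RESERVE_ALLOC)	/* ENOSPC accounting done here */
				sinfo->bytes_may_use -= num_bytes;
		} else {
			cache->reserved -= num_bytes;
			sinfo->bytes_reserved -= num_bytes;
		}
		spin_unlock(&cache->lock);
		spin_unlock(&sinfo->lock);
		return 0;
	}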
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
450 struct btrfs_root *root, 467 struct btrfs_root *root,
451 int load_cache_only) 468 int load_cache_only)
452{ 469{
470 DEFINE_WAIT(wait);
453 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
454 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
455 int ret = 0; 473 int ret = 0;
456 474
457 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
458 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
488 * This should be a rare occasion, but this could happen I think in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
496 * from a block group whose cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
459 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
460 523
461 /* 524 /*
462 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
@@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 528 * we likely hold important locks.
466 */ 529 */
467 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 531 (root && root != root->fs_info->tree_root) &&
469 spin_lock(&cache->lock); 532 btrfs_test_opt(root, SPACE_CACHE)) {
470 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock);
472 return 0;
473 }
474 cache->cached = BTRFS_CACHE_STARTED;
475 spin_unlock(&cache->lock);
476
477 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
478 534
479 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
480 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
481 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
482 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
483 } else { 540 } else {
484 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
485 } 547 }
486 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
487 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
488 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
489 return 0; 553 return 0;
490 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
558 * appropriate value and wake up any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
491 } 569 }
492 570
493 if (load_cache_only) 571 if (load_cache_only) {
494 return 0; 572 put_caching_control(caching_ctl);
495
496 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
497 BUG_ON(!caching_ctl);
498
499 INIT_LIST_HEAD(&caching_ctl->list);
500 mutex_init(&caching_ctl->mutex);
501 init_waitqueue_head(&caching_ctl->wait);
502 caching_ctl->block_group = cache;
503 caching_ctl->progress = cache->key.objectid;
504 /* one for caching kthread, one for caching block group list */
505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
507
508 spin_lock(&cache->lock);
509 if (cache->cached != BTRFS_CACHE_NO) {
510 spin_unlock(&cache->lock);
511 kfree(caching_ctl);
512 return 0; 573 return 0;
513 } 574 }
514 cache->caching_ctl = caching_ctl;
515 cache->cached = BTRFS_CACHE_STARTED;
516 spin_unlock(&cache->lock);
517 575
518 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
520 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
521 580
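
The wait loop added above for the new BTRFS_CACHE_FAST state has a classic shape: drop the block group lock, sleep on caching_ctl->wait until the fast loader finishes, then retake the lock and re-check. A minimal user-space sketch of that shape, assuming a condition variable in place of the kernel waitqueue (all names below are illustrative, not the kernel's):

#include <pthread.h>

enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FAST, CACHE_FINISHED };

struct group_model {
	pthread_mutex_t lock;		/* stands in for cache->lock */
	pthread_cond_t fast_done;	/* stands in for caching_ctl->wait */
	enum cache_state cached;
};

/* Returns with lock held and cached guaranteed not to be CACHE_FAST. */
static void wait_for_fast_load(struct group_model *g)
{
	pthread_mutex_lock(&g->lock);
	while (g->cached == CACHE_FAST)
		pthread_cond_wait(&g->fast_done, &g->lock);
	/* caller now decides: already cached, or take over caching itself */
}
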
@@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1829{
1771 int ret; 1830 int ret;
1772 u64 discarded_bytes = 0; 1831 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1832 struct btrfs_bio *bbio = NULL;
1774 1833
1775 1834
1776 /* Tell the block device(s) that the sectors can be discarded */ 1835 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1836 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1837 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1838 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1839 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1840 int i;
1782 1841
1783 1842
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1843 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1844 if (!stripe->dev->can_discard)
1786 continue; 1845 continue;
1787 1846
@@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1859 */
1801 ret = 0; 1860 ret = 0;
1802 } 1861 }
1803 kfree(multi); 1862 kfree(bbio);
1804 } 1863 }
1805 1864
1806 if (actual_bytes) 1865 if (actual_bytes)
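
btrfs_discard_extent() now walks bbio->stripes (btrfs_multi_bio renamed to btrfs_bio) and skips devices that cannot discard; the per-device discard call itself sits between the lines shown. A simplified stand-alone model of that walk, with a hypothetical per-stripe length field standing in for the real stripe geometry:

#include <stdint.h>

struct stripe_model {
	int can_discard;	/* mirrors stripe->dev->can_discard */
	uint64_t length;	/* hypothetical per-stripe byte count */
};

static uint64_t discard_stripes(const struct stripe_model *stripes, int n)
{
	uint64_t discarded_bytes = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (!stripes[i].can_discard)
			continue;
		/* the kernel issues the discard to this stripe's device here */
		discarded_bytes += stripes[i].length;
	}
	return discarded_bytes;
}
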
@@ -2700,6 +2759,13 @@ again:
2700 goto again; 2759 goto again;
2701 } 2760 }
2702 2761
2762 /* We've already setup this transaction, go ahead and exit */
2763 if (block_group->cache_generation == trans->transid &&
2764 i_size_read(inode)) {
2765 dcs = BTRFS_DC_SETUP;
2766 goto out_put;
2767 }
2768
2703 /* 2769 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2770 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2771 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2815,15 @@ again:
2749 if (!ret) 2815 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2816 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2817 btrfs_free_reserved_data_space(inode, num_pages);
2818
2752out_put: 2819out_put:
2753 iput(inode); 2820 iput(inode);
2754out_free: 2821out_free:
2755 btrfs_release_path(path); 2822 btrfs_release_path(path);
2756out: 2823out:
2757 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2825 if (!ret)
2826 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
2760 2829
@@ -3122,16 +3191,13 @@ commit_trans:
3122 return -ENOSPC; 3191 return -ENOSPC;
3123 } 3192 }
3124 data_sinfo->bytes_may_use += bytes; 3193 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3194 spin_unlock(&data_sinfo->lock);
3127 3195
3128 return 0; 3196 return 0;
3129} 3197}
3130 3198
3131/* 3199/*
3132 * called when we are clearing an delalloc extent from the 3200 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3201 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3203{
@@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3210 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3211 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3212 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3213 spin_unlock(&data_sinfo->lock);
3149} 3214}
3150 3215
@@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3231 int force)
3167{ 3232{
3233 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3234 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3235 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3236 u64 thresh;
@@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3239 return 1;
3174 3240
3175 /* 3241 /*
3242 * We need to take into account the global rsv because for all intents
3243 * and purposes it's used space. Don't worry about locking the
3244 * global_rsv, it doesn't change except when the transaction commits.
3245 */
3246 num_allocated += global_rsv->size;
3247
3248 /*
3176 * in limited mode, we want to have some free space up to 3249 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3250 * about 1% of the FS size.
3178 */ 3251 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3252 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3253 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3254 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3255 div_factor_fine(thresh, 1));
3183 3256
@@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3273 return 0;
3201 3274
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3276
3204 /* 256MB or 5% of the FS */ 3277 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
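
should_alloc_chunk() now counts the global block reservation as allocated space before applying its thresholds. A small stand-alone approximation of the roughly-80% occupancy test visible above (div_factor(x, 8) is about x * 8 / 10); the limited-mode 64MB/1% test and the later 256MB/5% threshold are handled separately in the kernel:

#include <stdint.h>

/* num_bytes is total_bytes - bytes_readonly for the space_info */
static int should_alloc_chunk_model(uint64_t num_bytes, uint64_t bytes_used,
				    uint64_t bytes_reserved,
				    uint64_t global_rsv_size,
				    uint64_t alloc_bytes)
{
	uint64_t num_allocated = bytes_used + bytes_reserved;

	/* new in this change: the global rsv is effectively used space */
	num_allocated += global_rsv_size;

	/* below roughly 80% occupancy there is no need for a new chunk yet */
	if (num_allocated + alloc_bytes < num_bytes / 10 * 8)
		return 0;

	return 1;	/* further size-based thresholds apply in the kernel */
}
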
@@ -3302,24 +3375,26 @@ out:
3302/* 3375/*
3303 * shrink metadata reservation for delalloc 3376 * shrink metadata reservation for delalloc
3304 */ 3377 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3379 bool wait_ordered)
3307{ 3380{
3308 struct btrfs_block_rsv *block_rsv; 3381 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3382 struct btrfs_space_info *space_info;
3383 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3384 u64 reserved;
3311 u64 max_reclaim; 3385 u64 max_reclaim;
3312 u64 reclaimed = 0; 3386 u64 reclaimed = 0;
3313 long time_left; 3387 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3388 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3389 int loops = 0;
3316 unsigned long progress; 3390 unsigned long progress;
3317 3391
3392 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3393 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3394 space_info = block_rsv->space_info;
3320 3395
3321 smp_mb(); 3396 smp_mb();
3322 reserved = space_info->bytes_reserved; 3397 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3398 progress = space_info->reservation_progress;
3324 3399
3325 if (reserved == 0) 3400 if (reserved == 0)
@@ -3334,18 +3409,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3409 }
3335 3410
3336 max_reclaim = min(reserved, to_reclaim); 3411 max_reclaim = min(reserved, to_reclaim);
3337 3412 nr_pages = max_t(unsigned long, nr_pages,
3413 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3414 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3415 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3416 smp_mb();
3341 nr_pages = min_t(unsigned long, nr_pages, 3417 nr_pages = min_t(unsigned long, nr_pages,
3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3418 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3419 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3420 WB_REASON_FS_FREE_SPACE);
3344 3421
3345 spin_lock(&space_info->lock); 3422 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3423 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3424 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3425 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3426 spin_unlock(&space_info->lock);
3350 3427
3351 loops++; 3428 loops++;
@@ -3356,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3433 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3434 return -EAGAIN;
3358 3435
3359 time_left = schedule_timeout_interruptible(1); 3436 if (wait_ordered && !trans) {
3437 btrfs_wait_ordered_extents(root, 0, 0);
3438 } else {
3439 time_left = schedule_timeout_interruptible(1);
3360 3440
3361 /* We were interrupted, exit */ 3441 /* We were interrupted, exit */
3362 if (time_left) 3442 if (time_left)
3363 break; 3443 break;
3444 }
3364 3445
3365 /* we've kicked the IO a few times, if anything has been freed, 3446 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3447 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3456,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3456 }
3376 3457
3377 } 3458 }
3378 if (reclaimed >= to_reclaim && !trans) 3459
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3460 return reclaimed >= to_reclaim;
3381} 3461}
3382 3462
3383/* 3463/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3464 * maybe_commit_transaction - possibly commit the transaction if it's ok to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3465 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3466 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3467 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3468 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3469 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3470 * get us somewhere and then commit the transaction if it does. Otherwise it
3471 * will return -ENOSPC.
3393 */ 3472 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3473static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3474 struct btrfs_space_info *space_info,
3475 u64 bytes, int force)
3476{
3477 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3478 struct btrfs_trans_handle *trans;
3479
3480 trans = (struct btrfs_trans_handle *)current->journal_info;
3481 if (trans)
3482 return -EAGAIN;
3483
3484 if (force)
3485 goto commit;
3486
3487 /* See if there is enough pinned space to make this reservation */
3488 spin_lock(&space_info->lock);
3489 if (space_info->bytes_pinned >= bytes) {
3490 spin_unlock(&space_info->lock);
3491 goto commit;
3492 }
3493 spin_unlock(&space_info->lock);
3494
3495 /*
3496 * See if there is some space in the delayed insertion reservation for
3497 * this reservation.
3498 */
3499 if (space_info != delayed_rsv->space_info)
3500 return -ENOSPC;
3501
3502 spin_lock(&delayed_rsv->lock);
3503 if (delayed_rsv->size < bytes) {
3504 spin_unlock(&delayed_rsv->lock);
3505 return -ENOSPC;
3506 }
3507 spin_unlock(&delayed_rsv->lock);
3508
3509commit:
3510 trans = btrfs_join_transaction(root);
3511 if (IS_ERR(trans))
3512 return -ENOSPC;
3513
3514 return btrfs_commit_transaction(trans, root);
3515}
3516
3517/**
3518 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3519 * @root - the root we're allocating for
3520 * @block_rsv - the block_rsv we're allocating for
3521 * @orig_bytes - the number of bytes we want
3522 * @flush - whether or not we can flush to make our reservation
3523 *
3524 * This will reserve orig_bytes number of bytes from the space info associated
3525 * with the block_rsv. If there is not enough space it will make an attempt to
3526 * flush out space to make room. It will do this by flushing delalloc if
3527 * possible or committing the transaction. If flush is 0 then no attempts to
3528 * regain reservations will be made and this will fail if there is not enough
3529 * space already.
3530 */
3531static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3532 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3533 u64 orig_bytes, int flush)
3398{ 3534{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3535 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3536 u64 used;
3401 u64 num_bytes = orig_bytes; 3537 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3538 int retries = 0;
3403 int ret = 0; 3539 int ret = 0;
3404 bool committed = false; 3540 bool committed = false;
3405 bool flushing = false; 3541 bool flushing = false;
3542 bool wait_ordered = false;
3406 3543
3407again: 3544again:
3408 ret = 0; 3545 ret = 0;
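
may_commit_transaction() added above commits only when it can argue that the commit will actually free enough space. A stand-alone restatement of its decision order, assuming simplified fields in place of the space_info, delayed_rsv and current->journal_info checks:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct commit_ctx {
	bool already_in_transaction;	/* current->journal_info != NULL */
	uint64_t bytes_pinned;		/* space_info->bytes_pinned */
	bool same_space_as_delayed_rsv;	/* space_info == delayed_rsv->space_info */
	uint64_t delayed_rsv_size;	/* delayed_rsv->size */
};

static int may_commit_model(const struct commit_ctx *c, uint64_t bytes, int force)
{
	if (c->already_in_transaction)
		return -EAGAIN;		/* can't commit from inside ourselves */
	if (force)
		goto commit;
	if (c->bytes_pinned >= bytes)
		goto commit;		/* the commit will unpin enough space */
	if (!c->same_space_as_delayed_rsv || c->delayed_rsv_size < bytes)
		return -ENOSPC;		/* committing would not help */
commit:
	return 0;			/* kernel: join and commit the transaction */
}
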
@@ -3419,7 +3556,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3556 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3557 * hold the current transaction open.
3421 */ 3558 */
3422 if (trans) 3559 if (current->journal_info)
3423 return -EAGAIN; 3560 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3561 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3562 !space_info->flush);
@@ -3431,9 +3568,9 @@ again:
3431 } 3568 }
3432 3569
3433 ret = -ENOSPC; 3570 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3571 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3572 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3573 space_info->bytes_may_use;
3437 3574
3438 /* 3575 /*
3439 * The idea here is that we've not already over-reserved the block group 3576 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3579,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3579 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3580 * our reservation.
3444 */ 3581 */
3445 if (unused <= space_info->total_bytes) { 3582 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3583 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3584 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3585 ret = 0;
3450 } else { 3586 } else {
3451 /* 3587 /*
@@ -3461,10 +3597,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3597 * amount plus the amount of bytes that we need for this
3462 * reservation. 3598 * reservation.
3463 */ 3599 */
3464 num_bytes = unused - space_info->total_bytes + 3600 wait_ordered = true;
3601 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3602 (orig_bytes * (retries + 1));
3466 } 3603 }
3467 3604
3605 if (ret) {
3606 u64 profile = btrfs_get_alloc_profile(root, 0);
3607 u64 avail;
3608
3609 /*
3610 * If we have a lot of space that's pinned, don't bother doing
3611 * the overcommit dance yet and just commit the transaction.
3612 */
3613 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3614 do_div(avail, 10);
3615 if (space_info->bytes_pinned >= avail && flush && !committed) {
3616 space_info->flush = 1;
3617 flushing = true;
3618 spin_unlock(&space_info->lock);
3619 ret = may_commit_transaction(root, space_info,
3620 orig_bytes, 1);
3621 if (ret)
3622 goto out;
3623 committed = true;
3624 goto again;
3625 }
3626
3627 spin_lock(&root->fs_info->free_chunk_lock);
3628 avail = root->fs_info->free_chunk_space;
3629
3630 /*
3631 * If we have dup, raid1 or raid10 then only half of the free
3632 * space is actually useable.
3633 */
3634 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3635 BTRFS_BLOCK_GROUP_RAID1 |
3636 BTRFS_BLOCK_GROUP_RAID10))
3637 avail >>= 1;
3638
3639 /*
3640 * If we aren't flushing don't let us overcommit too much, say
3641 * 1/8th of the space. If we can flush, let it overcommit up to
3642 * 1/2 of the space.
3643 */
3644 if (flush)
3645 avail >>= 3;
3646 else
3647 avail >>= 1;
3648 spin_unlock(&root->fs_info->free_chunk_lock);
3649
3650 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes;
3652 ret = 0;
3653 } else {
3654 wait_ordered = true;
3655 }
3656 }
3657
3468 /* 3658 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3659 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3660 * to reclaim space we can actually use it instead of somebody else
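
reserve_metadata_bytes() may now overcommit against still-unallocated chunk space rather than failing immediately. A stand-alone restatement of the window computed above, where mirrored_profile stands in for the DUP/RAID1/RAID10 test and the shifts mirror the flush/no-flush split in the hunk:

#include <stdbool.h>
#include <stdint.h>

static bool can_overcommit_model(uint64_t used, uint64_t num_bytes,
				 uint64_t total_bytes,
				 uint64_t free_chunk_space,
				 bool mirrored_profile, bool flush)
{
	uint64_t avail = free_chunk_space;

	if (mirrored_profile)	/* DUP/RAID1/RAID10: only half is usable */
		avail >>= 1;
	if (flush)		/* shift amounts as in the hunk above */
		avail >>= 3;
	else
		avail >>= 1;

	return used + num_bytes < total_bytes + avail;
}
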
@@ -3484,7 +3674,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3674 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3675 * metadata until after the IO is completed.
3486 */ 3676 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3677 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3678 if (ret < 0)
3489 goto out; 3679 goto out;
3490 3680
@@ -3496,35 +3686,17 @@ again:
3496 * so go back around and try again. 3686 * so go back around and try again.
3497 */ 3687 */
3498 if (retries < 2) { 3688 if (retries < 2) {
3689 wait_ordered = true;
3499 retries++; 3690 retries++;
3500 goto again; 3691 goto again;
3501 } 3692 }
3502 3693
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3694 ret = -ENOSPC;
3519 if (committed) 3695 if (committed)
3520 goto out; 3696 goto out;
3521 3697
3522 trans = btrfs_join_transaction(root); 3698 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3699 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3700 committed = true;
3529 goto again; 3701 goto again;
3530 } 3702 }
@@ -3542,10 +3714,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3714static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3715 struct btrfs_root *root)
3544{ 3716{
3545 struct btrfs_block_rsv *block_rsv; 3717 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3718
3719 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3720 block_rsv = trans->block_rsv;
3548 else 3721
3722 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3723 block_rsv = root->block_rsv;
3550 3724
3551 if (!block_rsv) 3725 if (!block_rsv)
@@ -3616,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3790 }
3617 if (num_bytes) { 3791 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3792 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3793 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3794 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3795 spin_unlock(&space_info->lock);
3622 } 3796 }
@@ -3640,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3814{
3641 memset(rsv, 0, sizeof(*rsv)); 3815 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3816 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3817}
3647 3818
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3819struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3834void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3835 struct btrfs_block_rsv *rsv)
3665{ 3836{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3837 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3838 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671}
3672
3673/*
3674 * make the block_rsv struct be able to capture freed space.
3675 * the captured space will re-add to the the block_rsv struct
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{
3681 block_rsv->durable = 1;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3685} 3839}
3686 3840
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3841static inline int __block_rsv_add(struct btrfs_root *root,
3688 struct btrfs_root *root, 3842 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3843 u64 num_bytes, int flush)
3690 u64 num_bytes)
3691{ 3844{
3692 int ret; 3845 int ret;
3693 3846
3694 if (num_bytes == 0) 3847 if (num_bytes == 0)
3695 return 0; 3848 return 0;
3696 3849
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3850 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3698 if (!ret) { 3851 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3852 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3853 return 0;
@@ -3703,55 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3856 return ret;
3704} 3857}
3705 3858
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3859int btrfs_block_rsv_add(struct btrfs_root *root,
3707 struct btrfs_root *root, 3860 struct btrfs_block_rsv *block_rsv,
3708 struct btrfs_block_rsv *block_rsv, 3861 u64 num_bytes)
3709 u64 min_reserved, int min_factor) 3862{
3863 return __block_rsv_add(root, block_rsv, num_bytes, 1);
3864}
3865
3866int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3867 struct btrfs_block_rsv *block_rsv,
3868 u64 num_bytes)
3869{
3870 return __block_rsv_add(root, block_rsv, num_bytes, 0);
3871}
3872
3873int btrfs_block_rsv_check(struct btrfs_root *root,
3874 struct btrfs_block_rsv *block_rsv, int min_factor)
3710{ 3875{
3711 u64 num_bytes = 0; 3876 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3877 int ret = -ENOSPC;
3714 3878
3715 if (!block_rsv) 3879 if (!block_rsv)
3716 return 0; 3880 return 0;
3717 3881
3718 spin_lock(&block_rsv->lock); 3882 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3883 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3884 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3885 ret = 0;
3722 num_bytes = min_reserved; 3886 spin_unlock(&block_rsv->lock);
3723 3887
3724 if (block_rsv->reserved >= num_bytes) { 3888 return ret;
3889}
3890
3891static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3892 struct btrfs_block_rsv *block_rsv,
3893 u64 min_reserved, int flush)
3894{
3895 u64 num_bytes = 0;
3896 int ret = -ENOSPC;
3897
3898 if (!block_rsv)
3899 return 0;
3900
3901 spin_lock(&block_rsv->lock);
3902 num_bytes = min_reserved;
3903 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3904 ret = 0;
3726 } else { 3905 else
3727 num_bytes -= block_rsv->reserved; 3906 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3907 spin_unlock(&block_rsv->lock);
3908
3733 if (!ret) 3909 if (!ret)
3734 return 0; 3910 return 0;
3735 3911
3736 if (block_rsv->refill_used) { 3912 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3913 if (!ret) {
3738 num_bytes, 0); 3914 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3915 return 0;
3752 } 3916 }
3753 3917
3754 return -ENOSPC; 3918 return ret;
3919}
3920
3921int btrfs_block_rsv_refill(struct btrfs_root *root,
3922 struct btrfs_block_rsv *block_rsv,
3923 u64 min_reserved)
3924{
3925 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3926}
3927
3928int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3929 struct btrfs_block_rsv *block_rsv,
3930 u64 min_reserved)
3931{
3932 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3755} 3933}
3756 3934
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
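
btrfs_block_rsv_check() is reduced to a pure size-factor check while the top-up work moves into __btrfs_block_rsv_refill(), again with flush and noflush flavours. A tiny stand-alone model of the refill step, with a function pointer standing in for reserve_metadata_bytes(), which is not reproduced here:

#include <stdint.h>

/* reserved: what the rsv currently holds; min_reserved: what the caller needs */
static int block_rsv_refill_model(uint64_t reserved, uint64_t min_reserved,
				  int flush,
				  int (*reserve_more)(uint64_t bytes, int flush))
{
	if (reserved >= min_reserved)
		return 0;			/* already topped up */
	/* only ask for the shortfall, optionally allowing flushing */
	return reserve_more(min_reserved - reserved, flush);
}
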
@@ -3783,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3961 u64 num_bytes;
3784 u64 meta_used; 3962 u64 meta_used;
3785 u64 data_used; 3963 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3964 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3965
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3966 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3967 spin_lock(&sinfo->lock);
@@ -3827,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 4005 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 4006 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 4007 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 4008 sinfo->bytes_may_use += num_bytes;
3831 } 4009 }
3832 4010
3833 if (block_rsv->reserved >= block_rsv->size) { 4011 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 4012 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 4013 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 4014 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 4015 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 4016 block_rsv->full = 1;
@@ -3848,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 4026
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4027 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 4028 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 4029
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4030 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 4031 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 4032 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 4033 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 4034 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 4035 fs_info->delayed_block_rsv.space_info = space_info;
3861 4036
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4037 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4038 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4040 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4041 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 4042
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 4043 update_global_block_rsv(fs_info);
3873} 4044}
3874 4045
@@ -3881,37 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4052 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4053 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4054 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4055 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4056 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4057}
3916 4058
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4059void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4062 if (!trans->bytes_reserved)
3921 return; 4063 return;
3922 4064
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4066 trans->bytes_reserved = 0;
3927} 4067}
3928 4068
@@ -3964,33 +4104,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4104 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4105}
3966 4106
4107/**
4108 * drop_outstanding_extent - drop an outstanding extent
4109 * @inode: the inode we're dropping the extent for
4110 *
4111 * This is called when we are freeing up an outstanding extent, either called
4112 * after an error or after an extent is written. This will return the number of
4113 * reserved extents that need to be freed. This must be called with
4114 * BTRFS_I(inode)->lock held.
4115 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4116static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4117{
4118 unsigned drop_inode_space = 0;
3969 unsigned dropped_extents = 0; 4119 unsigned dropped_extents = 0;
3970 4120
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4121 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4122 BTRFS_I(inode)->outstanding_extents--;
3974 4123
4124 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4125 BTRFS_I(inode)->delalloc_meta_reserved) {
4126 drop_inode_space = 1;
4127 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4128 }
4129
3975 /* 4130 /*
3976 * If we have more or the same amount of outstanding extents than we have 4131 * If we have more or the same amount of outstanding extents than we have
3977 * reserved then we need to leave the reserved extents count alone. 4132 * reserved then we need to leave the reserved extents count alone.
3978 */ 4133 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4134 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4135 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4136 return drop_inode_space;
3982 4137
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4138 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4139 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4140 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out: 4141 return dropped_extents + drop_inode_space;
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989} 4142}
3990 4143
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4144/**
4145 * calc_csum_metadata_size - return the amount of metadata space that must be
4146 * reserved/free'd for the given bytes.
4147 * @inode: the inode we're manipulating
4148 * @num_bytes: the number of bytes in question
4149 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4150 *
4151 * This adjusts the number of csum_bytes in the inode and then returns the
4152 * correct amount of metadata that must either be reserved or freed. We
4153 * calculate how many checksums we can fit into one leaf and then divide the
4154 * number of bytes that will need to be checksummed by this value to figure out
4155 * how many checksums will be required. If we are adding bytes then the number
4156 * may go up and we will return the number of additional bytes that must be
4157 * reserved. If it is going down we will return the number of bytes that must
4158 * be freed.
4159 *
4160 * This must be called with BTRFS_I(inode)->lock held.
4161 */
4162static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4163 int reserve)
3992{ 4164{
3993 return num_bytes >>= 3; 4165 struct btrfs_root *root = BTRFS_I(inode)->root;
4166 u64 csum_size;
4167 int num_csums_per_leaf;
4168 int num_csums;
4169 int old_csums;
4170
4171 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4172 BTRFS_I(inode)->csum_bytes == 0)
4173 return 0;
4174
4175 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4176 if (reserve)
4177 BTRFS_I(inode)->csum_bytes += num_bytes;
4178 else
4179 BTRFS_I(inode)->csum_bytes -= num_bytes;
4180 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4181 num_csums_per_leaf = (int)div64_u64(csum_size,
4182 sizeof(struct btrfs_csum_item) +
4183 sizeof(struct btrfs_disk_key));
4184 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4185 num_csums = num_csums + num_csums_per_leaf - 1;
4186 num_csums = num_csums / num_csums_per_leaf;
4187
4188 old_csums = old_csums + num_csums_per_leaf - 1;
4189 old_csums = old_csums / num_csums_per_leaf;
4190
4191 /* No change, no need to reserve more */
4192 if (old_csums == num_csums)
4193 return 0;
4194
4195 if (reserve)
4196 return btrfs_calc_trans_metadata_size(root,
4197 num_csums - old_csums);
4198
4199 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4200}
3995 4201
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4202int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4205,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4205 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4206 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4207 unsigned nr_extents = 0;
4208 int flush = 1;
4002 int ret; 4209 int ret;
4003 4210
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4211 if (btrfs_is_free_space_inode(root, inode))
4212 flush = 0;
4213
4214 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4215 schedule_timeout(1);
4006 4216
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4217 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4014,21 +4224,41 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4014 nr_extents = BTRFS_I(inode)->outstanding_extents - 4224 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents; 4225 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents; 4226 BTRFS_I(inode)->reserved_extents += nr_extents;
4227 }
4017 4228
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4229 /*
4230 * Add an item to reserve for updating the inode when we complete the
4231 * delalloc io.
4232 */
4233 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4234 nr_extents++;
4235 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4019 } 4236 }
4237
4238 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4239 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4240 spin_unlock(&BTRFS_I(inode)->lock);
4021 4241
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4242 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4243 if (ret) {
4244 u64 to_free = 0;
4025 unsigned dropped; 4245 unsigned dropped;
4246
4247 spin_lock(&BTRFS_I(inode)->lock);
4248 dropped = drop_outstanding_extent(inode);
4249 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4250 spin_unlock(&BTRFS_I(inode)->lock);
4251 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4252
4026 /* 4253 /*
4027 * We don't need the return value since our reservation failed, 4254 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4255 * reservation, so if we have to free more than we would have
4256 * reserved from this reservation go ahead and release those
4257 * bytes.
4029 */ 4258 */
4030 dropped = drop_outstanding_extent(inode); 4259 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4260 if (to_free)
4261 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4262 return ret;
4033 } 4263 }
4034 4264
@@ -4037,6 +4267,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4267 return 0;
4038} 4268}
4039 4269
4270/**
4271 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4272 * @inode: the inode to release the reservation for
4273 * @num_bytes: the number of bytes we're releasing
4274 *
4275 * This will release the metadata reservation for an inode. This can be called
4276 * once we complete IO for a given set of bytes to release their metadata
4277 * reservations.
4278 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4279void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4280{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4281 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4283,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4283 unsigned dropped;
4045 4284
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4285 num_bytes = ALIGN(num_bytes, root->sectorsize);
4286 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4287 dropped = drop_outstanding_extent(inode);
4048 4288
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4289 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4290 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4291 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4292 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4293
@@ -4054,6 +4295,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4295 to_free);
4055} 4296}
4056 4297
4298/**
4299 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4300 * @inode: inode we're writing to
4301 * @num_bytes: the number of bytes we want to allocate
4302 *
4303 * This will do the following things
4304 *
4305 * o reserve space in the data space info for num_bytes
4306 * o reserve space in the metadata space info based on number of outstanding
4307 * extents and how much csums will be needed
4308 * o add to the inodes ->delalloc_bytes
4309 * o add it to the fs_info's delalloc inodes list.
4310 *
4311 * This will return 0 for success and -ENOSPC if there is no space left.
4312 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4313int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4314{
4059 int ret; 4315 int ret;
@@ -4071,6 +4327,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4327 return 0;
4072} 4328}
4073 4329
4330/**
4331 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4332 * @inode: inode we're releasing space for
4333 * @num_bytes: the number of bytes we want to free up
4334 *
4335 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4336 * called in the case that we don't need the metadata AND data reservations
4337 * anymore, for example if there is an error or we insert an inline extent.
4338 *
4339 * This function will release the metadata space that was not used and will
4340 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4341 * list if there are no delalloc bytes left.
4342 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4343void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4344{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4345 btrfs_delalloc_release_metadata(inode, num_bytes);
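
The reserve/release pair documented above is meant to bracket a delalloc write. A minimal stand-alone model of that caller shape, with function pointers standing in for btrfs_delalloc_reserve_space(), btrfs_delalloc_release_space() and the page-dirtying work:

#include <stdint.h>

static int write_range_model(uint64_t len,
			     int (*reserve_space)(uint64_t),
			     void (*release_space)(uint64_t),
			     int (*copy_and_dirty)(uint64_t))
{
	int ret = reserve_space(len);		/* data + metadata reservation */
	if (ret)
		return ret;			/* nothing reserved, nothing to undo */

	ret = copy_and_dirty(len);
	if (ret)
		release_space(len);		/* undo both halves on error */
	return ret;
}
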
@@ -4090,12 +4359,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4359
4091 /* block accounting for super block */ 4360 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4361 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4362 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4363 if (alloc)
4095 old_val += num_bytes; 4364 old_val += num_bytes;
4096 else 4365 else
4097 old_val -= num_bytes; 4366 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4367 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4368 spin_unlock(&info->delalloc_lock);
4100 4369
4101 while (total) { 4370 while (total) {
@@ -4123,7 +4392,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4392 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4393 spin_lock(&cache->lock);
4125 4394
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4395 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4396 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4397 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4398
@@ -4135,7 +4404,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4404 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4405 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4406 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4407 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4408 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4409 spin_unlock(&cache->lock);
@@ -4187,7 +4455,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4455 if (reserved) {
4188 cache->reserved -= num_bytes; 4456 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4457 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4458 }
4192 spin_unlock(&cache->lock); 4459 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4460 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4482,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4482}
4216 4483
4217/* 4484/*
4218 * update size of reserved extents. this function may return -EAGAIN 4485 * this function must be called within transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4486 */
4487int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4488 struct btrfs_root *root,
4489 u64 bytenr, u64 num_bytes)
4490{
4491 struct btrfs_block_group_cache *cache;
4492
4493 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4494 BUG_ON(!cache);
4495
4496 /*
4497 * pull in the free space cache (if any) so that our pin
4498 * removes the free space from the cache. We have load_only set
4499 * to one because the slow code to read in the free extents does check
4500 * the pinned extents.
4501 */
4502 cache_block_group(cache, trans, root, 1);
4503
4504 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4505
4506 /* remove us from the free space cache (if we're there at all) */
4507 btrfs_remove_free_space(cache, bytenr, num_bytes);
4508 btrfs_put_block_group(cache);
4509 return 0;
4510}
4511
4512/**
4513 * btrfs_update_reserved_bytes - update the block_group and space info counters
4514 * @cache: The cache we are manipulating
4515 * @num_bytes: The number of bytes in question
4516 * @reserve: One of the reservation enums
4517 *
4518 * This is called by the allocator when it reserves space, or by somebody who is
4519 * freeing space that was never actually used on disk. For example if you
4520 * reserve some space for a new leaf in transaction A and before transaction A
4521 * commits you free that leaf, you call this with reserve set to 0 in order to
4522 * clear the reservation.
4523 *
4524 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4525 * ENOSPC accounting. For data we handle the reservation through clearing the
4526 * delalloc bits in the io_tree. We have to do this since we could end up
4527 * allocating less disk space for the amount of data we have reserved in the
4528 * case of compression.
4529 *
4530 * If this is a reservation and the block group has become read only we cannot
4531 * make the reservation and return -EAGAIN, otherwise this function always
4532 * succeeds.
4220 */ 4533 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4534static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4535 u64 num_bytes, int reserve)
4223{ 4536{
4537 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4538 int ret = 0;
4225 if (sinfo) { 4539 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4540 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4541 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4542 if (cache->ro) {
4248 ret = -EAGAIN; 4543 ret = -EAGAIN;
4249 } else { 4544 } else {
4250 if (reserve) 4545 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4546 space_info->bytes_reserved += num_bytes;
4252 else 4547 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4548 BUG_ON(space_info->bytes_may_use < num_bytes);
4549 space_info->bytes_may_use -= num_bytes;
4550 }
4254 } 4551 }
4255 spin_unlock(&cache->lock); 4552 } else {
4553 if (cache->ro)
4554 space_info->bytes_readonly += num_bytes;
4555 cache->reserved -= num_bytes;
4556 space_info->bytes_reserved -= num_bytes;
4557 space_info->reservation_progress++;
4256 } 4558 }
4559 spin_unlock(&cache->lock);
4560 spin_unlock(&space_info->lock);
4257 return ret; 4561 return ret;
4258} 4562}
4259 4563
@@ -4319,13 +4623,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4623 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4624 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4625 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4626 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4627 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4628 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4629 spin_unlock(&cache->space_info->lock);
4331 } 4630 }
@@ -4340,11 +4639,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4639{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4640 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4641 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4642 u64 start;
4346 u64 end; 4643 u64 end;
4347 int idx;
4348 int ret; 4644 int ret;
4349 4645
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4646 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4663,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4663 cond_resched();
4368 } 4664 }
4369 4665
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4666 return 0;
4395} 4667}
4396 4668
@@ -4668,7 +4940,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4940 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4941 u64 parent, int last_ref)
4670{ 4942{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4943 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4944 int ret;
4674 4945
@@ -4683,64 +4954,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4954 if (!last_ref)
4684 return; 4955 return;
4685 4956
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4957 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4958
4691 if (btrfs_header_generation(buf) == trans->transid) { 4959 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4960 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4961 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4962 if (!ret)
4695 goto pin; 4963 goto out;
4696 } 4964 }
4697 4965
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4966 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4967 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4968 goto out;
4701 } 4969 }
4702 4970
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4971 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4972
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4973 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4974 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4975 }
4745out: 4976out:
4746 /* 4977 /*
@@ -4876,17 +5107,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4876 struct btrfs_root *root = orig_root->fs_info->extent_root; 5107 struct btrfs_root *root = orig_root->fs_info->extent_root;
4877 struct btrfs_free_cluster *last_ptr = NULL; 5108 struct btrfs_free_cluster *last_ptr = NULL;
4878 struct btrfs_block_group_cache *block_group = NULL; 5109 struct btrfs_block_group_cache *block_group = NULL;
5110 struct btrfs_block_group_cache *used_block_group;
4879 int empty_cluster = 2 * 1024 * 1024; 5111 int empty_cluster = 2 * 1024 * 1024;
4880 int allowed_chunk_alloc = 0; 5112 int allowed_chunk_alloc = 0;
4881 int done_chunk_alloc = 0; 5113 int done_chunk_alloc = 0;
4882 struct btrfs_space_info *space_info; 5114 struct btrfs_space_info *space_info;
4883 int last_ptr_loop = 0;
4884 int loop = 0; 5115 int loop = 0;
4885 int index = 0; 5116 int index = 0;
5117 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5118 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5119 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5120 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5121 bool failed_alloc = false;
4889 bool use_cluster = true; 5122 bool use_cluster = true;
5123 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5124 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5125 u64 ideal_cache_offset = 0;
4892 5126
@@ -4939,6 +5173,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4939ideal_cache: 5173ideal_cache:
4940 block_group = btrfs_lookup_block_group(root->fs_info, 5174 block_group = btrfs_lookup_block_group(root->fs_info,
4941 search_start); 5175 search_start);
5176 used_block_group = block_group;
4942 /* 5177 /*
4943 * we don't want to use the block group if it doesn't match our 5178 * we don't want to use the block group if it doesn't match our
4944 * allocation bits, or if its not cached. 5179 * allocation bits, or if its not cached.
@@ -4969,12 +5204,14 @@ ideal_cache:
4969 } 5204 }
4970 } 5205 }
4971search: 5206search:
5207 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5208 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5209 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5210 list) {
4975 u64 offset; 5211 u64 offset;
4976 int cached; 5212 int cached;
4977 5213
5214 used_block_group = block_group;
4978 btrfs_get_block_group(block_group); 5215 btrfs_get_block_group(block_group);
4979 search_start = block_group->key.objectid; 5216 search_start = block_group->key.objectid;
4980 5217
@@ -4998,13 +5235,15 @@ search:
4998 } 5235 }
4999 5236
5000have_block_group: 5237have_block_group:
5001 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5238 cached = block_group_cache_done(block_group);
5239 if (unlikely(!cached)) {
5002 u64 free_percent; 5240 u64 free_percent;
5003 5241
5242 found_uncached_bg = true;
5004 ret = cache_block_group(block_group, trans, 5243 ret = cache_block_group(block_group, trans,
5005 orig_root, 1); 5244 orig_root, 1);
5006 if (block_group->cached == BTRFS_CACHE_FINISHED) 5245 if (block_group->cached == BTRFS_CACHE_FINISHED)
5007 goto have_block_group; 5246 goto alloc;
5008 5247
5009 free_percent = btrfs_block_group_used(&block_group->item); 5248 free_percent = btrfs_block_group_used(&block_group->item);
5010 free_percent *= 100; 5249 free_percent *= 100;
@@ -5026,7 +5265,6 @@ have_block_group:
5026 orig_root, 0); 5265 orig_root, 0);
5027 BUG_ON(ret); 5266 BUG_ON(ret);
5028 } 5267 }
5029 found_uncached_bg = true;
5030 5268
5031 /* 5269 /*
5032 * If loop is set for cached only, try the next block 5270 * If loop is set for cached only, try the next block
@@ -5036,94 +5274,80 @@ have_block_group:
5036 goto loop; 5274 goto loop;
5037 } 5275 }
5038 5276
5039 cached = block_group_cache_done(block_group); 5277alloc:
5040 if (unlikely(!cached))
5041 found_uncached_bg = true;
5042
5043 if (unlikely(block_group->ro)) 5278 if (unlikely(block_group->ro))
5044 goto loop; 5279 goto loop;
5045 5280
5046 spin_lock(&block_group->free_space_ctl->tree_lock); 5281 spin_lock(&block_group->free_space_ctl->tree_lock);
5047 if (cached && 5282 if (cached &&
5048 block_group->free_space_ctl->free_space < 5283 block_group->free_space_ctl->free_space <
5049 num_bytes + empty_size) { 5284 num_bytes + empty_cluster + empty_size) {
5050 spin_unlock(&block_group->free_space_ctl->tree_lock); 5285 spin_unlock(&block_group->free_space_ctl->tree_lock);
5051 goto loop; 5286 goto loop;
5052 } 5287 }
5053 spin_unlock(&block_group->free_space_ctl->tree_lock); 5288 spin_unlock(&block_group->free_space_ctl->tree_lock);
5054 5289
5055 /* 5290 /*
5056 * Ok we want to try and use the cluster allocator, so lets look 5291 * Ok we want to try and use the cluster allocator, so
5057 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5292 * lets look there
5058 * have tried the cluster allocator plenty of times at this
5059 * point and not have found anything, so we are likely way too
5060 * fragmented for the clustering stuff to find anything, so lets
5061 * just skip it and let the allocator find whatever block it can
5062 * find
5063 */ 5293 */
5064 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 5294 if (last_ptr) {
5065 /* 5295 /*
5066 * the refill lock keeps out other 5296 * the refill lock keeps out other
5067 * people trying to start a new cluster 5297 * people trying to start a new cluster
5068 */ 5298 */
5069 spin_lock(&last_ptr->refill_lock); 5299 spin_lock(&last_ptr->refill_lock);
5070 if (last_ptr->block_group && 5300 used_block_group = last_ptr->block_group;
5071 (last_ptr->block_group->ro || 5301 if (used_block_group != block_group &&
5072 !block_group_bits(last_ptr->block_group, data))) { 5302 (!used_block_group ||
5073 offset = 0; 5303 used_block_group->ro ||
5304 !block_group_bits(used_block_group, data))) {
5305 used_block_group = block_group;
5074 goto refill_cluster; 5306 goto refill_cluster;
5075 } 5307 }
5076 5308
5077 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5309 if (used_block_group != block_group)
5078 num_bytes, search_start); 5310 btrfs_get_block_group(used_block_group);
5311
5312 offset = btrfs_alloc_from_cluster(used_block_group,
5313 last_ptr, num_bytes, used_block_group->key.objectid);
5079 if (offset) { 5314 if (offset) {
5080 /* we have a block, we're done */ 5315 /* we have a block, we're done */
5081 spin_unlock(&last_ptr->refill_lock); 5316 spin_unlock(&last_ptr->refill_lock);
5082 goto checks; 5317 goto checks;
5083 } 5318 }
5084 5319
5085 spin_lock(&last_ptr->lock); 5320 WARN_ON(last_ptr->block_group != used_block_group);
5086 /* 5321 if (used_block_group != block_group) {
5087 * whoops, this cluster doesn't actually point to 5322 btrfs_put_block_group(used_block_group);
5088 * this block group. Get a ref on the block 5323 used_block_group = block_group;
5089 * group is does point to and try again
5090 */
5091 if (!last_ptr_loop && last_ptr->block_group &&
5092 last_ptr->block_group != block_group &&
5093 index <=
5094 get_block_group_index(last_ptr->block_group)) {
5095
5096 btrfs_put_block_group(block_group);
5097 block_group = last_ptr->block_group;
5098 btrfs_get_block_group(block_group);
5099 spin_unlock(&last_ptr->lock);
5100 spin_unlock(&last_ptr->refill_lock);
5101
5102 last_ptr_loop = 1;
5103 search_start = block_group->key.objectid;
5104 /*
5105 * we know this block group is properly
5106 * in the list because
5107 * btrfs_remove_block_group, drops the
5108 * cluster before it removes the block
5109 * group from the list
5110 */
5111 goto have_block_group;
5112 } 5324 }
5113 spin_unlock(&last_ptr->lock);
5114refill_cluster: 5325refill_cluster:
5326 BUG_ON(used_block_group != block_group);
5327 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5328 * set up a new clusters, so lets just skip it
5329 * and let the allocator find whatever block
5330 * it can find. If we reach this point, we
5331 * will have tried the cluster allocator
5332 * plenty of times and not have found
5333 * anything, so we are likely way too
5334 * fragmented for the clustering stuff to find
5335 * anything. */
5336 if (loop >= LOOP_NO_EMPTY_SIZE) {
5337 spin_unlock(&last_ptr->refill_lock);
5338 goto unclustered_alloc;
5339 }
5340
5115 /* 5341 /*
5116 * this cluster didn't work out, free it and 5342 * this cluster didn't work out, free it and
5117 * start over 5343 * start over
5118 */ 5344 */
5119 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5345 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5120 5346
5121 last_ptr_loop = 0;
5122
5123 /* allocate a cluster in this block group */ 5347 /* allocate a cluster in this block group */
5124 ret = btrfs_find_space_cluster(trans, root, 5348 ret = btrfs_find_space_cluster(trans, root,
5125 block_group, last_ptr, 5349 block_group, last_ptr,
5126 offset, num_bytes, 5350 search_start, num_bytes,
5127 empty_cluster + empty_size); 5351 empty_cluster + empty_size);
5128 if (ret == 0) { 5352 if (ret == 0) {
5129 /* 5353 /*
@@ -5159,6 +5383,7 @@ refill_cluster:
5159 goto loop; 5383 goto loop;
5160 } 5384 }
5161 5385
5386unclustered_alloc:
5162 offset = btrfs_find_space_for_alloc(block_group, search_start, 5387 offset = btrfs_find_space_for_alloc(block_group, search_start,
5163 num_bytes, empty_size); 5388 num_bytes, empty_size);
5164 /* 5389 /*
@@ -5177,20 +5402,22 @@ refill_cluster:
5177 failed_alloc = true; 5402 failed_alloc = true;
5178 goto have_block_group; 5403 goto have_block_group;
5179 } else if (!offset) { 5404 } else if (!offset) {
5405 if (!cached)
5406 have_caching_bg = true;
5180 goto loop; 5407 goto loop;
5181 } 5408 }
5182checks: 5409checks:
5183 search_start = stripe_align(root, offset); 5410 search_start = stripe_align(root, offset);
5184 /* move on to the next group */ 5411 /* move on to the next group */
5185 if (search_start + num_bytes >= search_end) { 5412 if (search_start + num_bytes >= search_end) {
5186 btrfs_add_free_space(block_group, offset, num_bytes); 5413 btrfs_add_free_space(used_block_group, offset, num_bytes);
5187 goto loop; 5414 goto loop;
5188 } 5415 }
5189 5416
5190 /* move on to the next group */ 5417 /* move on to the next group */
5191 if (search_start + num_bytes > 5418 if (search_start + num_bytes >
5192 block_group->key.objectid + block_group->key.offset) { 5419 used_block_group->key.objectid + used_block_group->key.offset) {
5193 btrfs_add_free_space(block_group, offset, num_bytes); 5420 btrfs_add_free_space(used_block_group, offset, num_bytes);
5194 goto loop; 5421 goto loop;
5195 } 5422 }
5196 5423
@@ -5198,14 +5425,14 @@ checks:
5198 ins->offset = num_bytes; 5425 ins->offset = num_bytes;
5199 5426
5200 if (offset < search_start) 5427 if (offset < search_start)
5201 btrfs_add_free_space(block_group, offset, 5428 btrfs_add_free_space(used_block_group, offset,
5202 search_start - offset); 5429 search_start - offset);
5203 BUG_ON(offset > search_start); 5430 BUG_ON(offset > search_start);
5204 5431
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5432 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5433 alloc_type);
5207 if (ret == -EAGAIN) { 5434 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5435 btrfs_add_free_space(used_block_group, offset, num_bytes);
5209 goto loop; 5436 goto loop;
5210 } 5437 }
5211 5438
@@ -5214,19 +5441,26 @@ checks:
5214 ins->offset = num_bytes; 5441 ins->offset = num_bytes;
5215 5442
5216 if (offset < search_start) 5443 if (offset < search_start)
5217 btrfs_add_free_space(block_group, offset, 5444 btrfs_add_free_space(used_block_group, offset,
5218 search_start - offset); 5445 search_start - offset);
5219 BUG_ON(offset > search_start); 5446 BUG_ON(offset > search_start);
5447 if (used_block_group != block_group)
5448 btrfs_put_block_group(used_block_group);
5220 btrfs_put_block_group(block_group); 5449 btrfs_put_block_group(block_group);
5221 break; 5450 break;
5222loop: 5451loop:
5223 failed_cluster_refill = false; 5452 failed_cluster_refill = false;
5224 failed_alloc = false; 5453 failed_alloc = false;
5225 BUG_ON(index != get_block_group_index(block_group)); 5454 BUG_ON(index != get_block_group_index(block_group));
5455 if (used_block_group != block_group)
5456 btrfs_put_block_group(used_block_group);
5226 btrfs_put_block_group(block_group); 5457 btrfs_put_block_group(block_group);
5227 } 5458 }
5228 up_read(&space_info->groups_sem); 5459 up_read(&space_info->groups_sem);
5229 5460
5461 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5462 goto search;
5463
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5464 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5465 goto search;
5232 5466
@@ -5325,7 +5559,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5559 int index = 0;
5326 5560
5327 spin_lock(&info->lock); 5561 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5562 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5563 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5564 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5565 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5566 info->bytes_readonly),
@@ -5411,7 +5646,8 @@ again:
5411 return ret; 5646 return ret;
5412} 5647}
5413 5648
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5649static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5650 u64 start, u64 len, int pin)
5415{ 5651{
5416 struct btrfs_block_group_cache *cache; 5652 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5653 int ret = 0;
@@ -5426,8 +5662,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5662 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5663 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5664
5429 btrfs_add_free_space(cache, start, len); 5665 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5666 pin_down_extent(root, cache, start, len, 1);
5667 else {
5668 btrfs_add_free_space(cache, start, len);
5669 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5670 }
5431 btrfs_put_block_group(cache); 5671 btrfs_put_block_group(cache);
5432 5672
5433 trace_btrfs_reserved_extent_free(root, start, len); 5673 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5675,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5675 return ret;
5436} 5676}
5437 5677
5678int btrfs_free_reserved_extent(struct btrfs_root *root,
5679 u64 start, u64 len)
5680{
5681 return __btrfs_free_reserved_extent(root, start, len, 0);
5682}
5683
5684int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5685 u64 start, u64 len)
5686{
5687 return __btrfs_free_reserved_extent(root, start, len, 1);
5688}
5689
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5690static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5691 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5692 u64 parent, u64 root_objectid,
@@ -5630,7 +5882,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5882 put_caching_control(caching_ctl);
5631 } 5883 }
5632 5884
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5885 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5886 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5887 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5888 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5889 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5940,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5940 block_rsv = get_block_rsv(trans, root);
5688 5941
5689 if (block_rsv->size == 0) { 5942 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5943 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5944 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5945 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5946 * the global reserve.
@@ -5708,13 +5960,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5960 if (!ret)
5709 return block_rsv; 5961 return block_rsv;
5710 if (ret) { 5962 if (ret) {
5711 WARN_ON(1); 5963 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5964 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5965 /*DEFAULT_RATELIMIT_BURST*/ 2);
5966 if (__ratelimit(&_rs)) {
5967 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5968 WARN_ON(1);
5969 }
5970 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5971 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5972 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5973 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5974 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6846,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6846 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6847
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6848 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6849 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6850 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6851 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6852 cache->ro = 1;
6602 ret = 0; 6853 ret = 0;
6603 } 6854 }
@@ -6964,7 +7215,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7215 struct btrfs_space_info,
6965 list); 7216 list);
6966 if (space_info->bytes_pinned > 0 || 7217 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7218 space_info->bytes_reserved > 0 ||
7219 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7220 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7221 dump_space_info(space_info, 0, 0);
6970 } 7222 }
@@ -7006,14 +7258,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7258 return -ENOMEM;
7007 path->reada = 1; 7259 path->reada = 1;
7008 7260
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7261 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7262 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7263 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7264 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7265 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7266 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7267
7018 while (1) { 7268 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7269 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7502,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7502 goto out;
7253 } 7503 }
7254 7504
7255 inode = lookup_free_space_inode(root, block_group, path); 7505 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7506 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7507 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7508 BUG_ON(ret);
@@ -7268,7 +7518,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7518 spin_unlock(&block_group->lock);
7269 } 7519 }
7270 /* One for our lookup ref */ 7520 /* One for our lookup ref */
7271 iput(inode); 7521 btrfs_add_delayed_iput(inode);
7272 } 7522 }
7273 7523
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7524 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7589 int mixed = 0;
7340 int ret; 7590 int ret;
7341 7591
7342 disk_super = &fs_info->super_copy; 7592 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7593 if (!btrfs_super_root(disk_super))
7344 return 1; 7594 return 1;
7345 7595
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f..49f3c9dc09f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,202 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) {
939 err = -ENOMEM;
940 goto out;
941 }
942 err = insert_state(tree, prealloc, start, end, &bits);
943 prealloc = NULL;
944 BUG_ON(err == -EEXIST);
945 goto out;
946 }
947 state = rb_entry(node, struct extent_state, rb_node);
948hit_next:
949 last_start = state->start;
950 last_end = state->end;
951
952 /*
953 * | ---- desired range ---- |
954 * | state |
955 *
956 * Just lock what we found and keep going
957 */
958 if (state->start == start && state->end <= end) {
959 struct rb_node *next_node;
960
961 set_state_bits(tree, state, &bits);
962 clear_state_bit(tree, state, &clear_bits, 0);
963
964 merge_state(tree, state);
965 if (last_end == (u64)-1)
966 goto out;
967
968 start = last_end + 1;
969 next_node = rb_next(&state->rb_node);
970 if (next_node && start < end && prealloc && !need_resched()) {
971 state = rb_entry(next_node, struct extent_state,
972 rb_node);
973 if (state->start == start)
974 goto hit_next;
975 }
976 goto search_again;
977 }
978
979 /*
980 * | ---- desired range ---- |
981 * | state |
982 * or
983 * | ------------- state -------------- |
984 *
985 * We need to split the extent we found, and may flip bits on
986 * second half.
987 *
988 * If the extent we found extends past our
989 * range, we just split and search again. It'll get split
990 * again the next time though.
991 *
992 * If the extent we found is inside our range, we set the
993 * desired bit on it.
994 */
995 if (state->start < start) {
996 prealloc = alloc_extent_state_atomic(prealloc);
997 if (!prealloc) {
998 err = -ENOMEM;
999 goto out;
1000 }
1001 err = split_state(tree, state, prealloc, start);
1002 BUG_ON(err == -EEXIST);
1003 prealloc = NULL;
1004 if (err)
1005 goto out;
1006 if (state->end <= end) {
1007 set_state_bits(tree, state, &bits);
1008 clear_state_bit(tree, state, &clear_bits, 0);
1009 merge_state(tree, state);
1010 if (last_end == (u64)-1)
1011 goto out;
1012 start = last_end + 1;
1013 }
1014 goto search_again;
1015 }
1016 /*
1017 * | ---- desired range ---- |
1018 * | state | or | state |
1019 *
1020 * There's a hole, we need to insert something in it and
1021 * ignore the extent we found.
1022 */
1023 if (state->start > start) {
1024 u64 this_end;
1025 if (end < last_start)
1026 this_end = end;
1027 else
1028 this_end = last_start - 1;
1029
1030 prealloc = alloc_extent_state_atomic(prealloc);
1031 if (!prealloc) {
1032 err = -ENOMEM;
1033 goto out;
1034 }
1035
1036 /*
 1037 * Avoid freeing 'prealloc' if it can be merged with
1038 * the later extent.
1039 */
1040 err = insert_state(tree, prealloc, start, this_end,
1041 &bits);
1042 BUG_ON(err == -EEXIST);
1043 if (err) {
1044 free_extent_state(prealloc);
1045 prealloc = NULL;
1046 goto out;
1047 }
1048 prealloc = NULL;
1049 start = this_end + 1;
1050 goto search_again;
1051 }
1052 /*
1053 * | ---- desired range ---- |
1054 * | state |
1055 * We need to split the extent, and set the bit
1056 * on the first half
1057 */
1058 if (state->start <= end && state->end > end) {
1059 prealloc = alloc_extent_state_atomic(prealloc);
1060 if (!prealloc) {
1061 err = -ENOMEM;
1062 goto out;
1063 }
1064
1065 err = split_state(tree, state, prealloc, end + 1);
1066 BUG_ON(err == -EEXIST);
1067
1068 set_state_bits(tree, prealloc, &bits);
1069 clear_state_bit(tree, prealloc, &clear_bits, 0);
1070
1071 merge_state(tree, prealloc);
1072 prealloc = NULL;
1073 goto out;
1074 }
1075
1076 goto search_again;
1077
1078out:
1079 spin_unlock(&tree->lock);
1080 if (prealloc)
1081 free_extent_state(prealloc);
1082
1083 return err;
1084
1085search_again:
1086 if (start > end)
1087 goto out;
1088 spin_unlock(&tree->lock);
1089 if (mask & __GFP_WAIT)
1090 cond_resched();
1091 goto again;
1092}
1093
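A minimal usage sketch for the new helper, assuming a caller that wants to flip a mergeable range from DELALLOC to DIRTY; the wrapper name and the idea of converting exactly these two bits are illustrative, the signature is the one added above:

	static int example_convert_range(struct extent_io_tree *tree,
					 u64 start, u64 end)
	{
		/* set EXTENT_DIRTY and clear EXTENT_DELALLOC over [start, end] */
		return convert_extent_bit(tree, start, end,
					  EXTENT_DIRTY, EXTENT_DELALLOC, GFP_NOFS);
	}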
897/* wrappers around set/clear extent bit */ 1094/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1095int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1096 gfp_t mask)
@@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1116 struct extent_state **cached_state, gfp_t mask)
920{ 1117{
921 return set_extent_bit(tree, start, end, 1118 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1119 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1120 0, NULL, cached_state, mask);
924} 1121}
925 1122
@@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1796 return 0;
1600} 1797}
1601 1798
1799/*
1800 * When IO fails, either with EIO or csum verification fails, we
1801 * try other mirrors that might have a good copy of the data. This
1802 * io_failure_record is used to record state as we go through all the
1803 * mirrors. If another mirror has good data, the page is set up to date
1804 * and things continue. If a good mirror can't be found, the original
1805 * bio end_io callback is called to indicate things have failed.
1806 */
1807struct io_failure_record {
1808 struct page *page;
1809 u64 start;
1810 u64 len;
1811 u64 logical;
1812 unsigned long bio_flags;
1813 int this_mirror;
1814 int failed_mirror;
1815 int in_validation;
1816};
1817
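The record above is stashed in the per-inode io_failure_tree by storing its pointer as the u64 "private" of the matching extent state, and found again the same way. A small sketch of the lookup side, mirroring the set_state_private()/get_state_private() calls further down (the helper name is illustrative):

	static struct io_failure_record *
	example_lookup_failrec(struct extent_io_tree *failure_tree, u64 start)
	{
		u64 private;

		if (get_state_private(failure_tree, start, &private))
			return NULL;	/* no failure recorded at this offset */
		return (struct io_failure_record *)(unsigned long)private;
	}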
1818static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1819 int did_repair)
1820{
1821 int ret;
1822 int err = 0;
1823 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1824
1825 set_state_private(failure_tree, rec->start, 0);
1826 ret = clear_extent_bits(failure_tree, rec->start,
1827 rec->start + rec->len - 1,
1828 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1829 if (ret)
1830 err = ret;
1831
1832 if (did_repair) {
1833 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1834 rec->start + rec->len - 1,
1835 EXTENT_DAMAGED, GFP_NOFS);
1836 if (ret && !err)
1837 err = ret;
1838 }
1839
1840 kfree(rec);
1841 return err;
1842}
1843
1844static void repair_io_failure_callback(struct bio *bio, int err)
1845{
1846 complete(bio->bi_private);
1847}
1848
1849/*
1850 * this bypasses the standard btrfs submit functions deliberately, as
1851 * the standard behavior is to write all copies in a raid setup. here we only
1852 * want to write the one bad copy. so we do the mapping for ourselves and issue
1853 * submit_bio directly.
 1854 * to avoid any synchronization issues, wait for the data after writing, which
1855 * actually prevents the read that triggered the error from finishing.
1856 * currently, there can be no more than two copies of every data bit. thus,
1857 * exactly one rewrite is required.
1858 */
1859int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1860 u64 length, u64 logical, struct page *page,
1861 int mirror_num)
1862{
1863 struct bio *bio;
1864 struct btrfs_device *dev;
1865 DECLARE_COMPLETION_ONSTACK(compl);
1866 u64 map_length = 0;
1867 u64 sector;
1868 struct btrfs_bio *bbio = NULL;
1869 int ret;
1870
1871 BUG_ON(!mirror_num);
1872
1873 bio = bio_alloc(GFP_NOFS, 1);
1874 if (!bio)
1875 return -EIO;
1876 bio->bi_private = &compl;
1877 bio->bi_end_io = repair_io_failure_callback;
1878 bio->bi_size = 0;
1879 map_length = length;
1880
1881 ret = btrfs_map_block(map_tree, WRITE, logical,
1882 &map_length, &bbio, mirror_num);
1883 if (ret) {
1884 bio_put(bio);
1885 return -EIO;
1886 }
1887 BUG_ON(mirror_num != bbio->mirror_num);
1888 sector = bbio->stripes[mirror_num-1].physical >> 9;
1889 bio->bi_sector = sector;
1890 dev = bbio->stripes[mirror_num-1].dev;
1891 kfree(bbio);
1892 if (!dev || !dev->bdev || !dev->writeable) {
1893 bio_put(bio);
1894 return -EIO;
1895 }
1896 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl);
1900
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1902 /* try to remap that extent elsewhere? */
1903 bio_put(bio);
1904 return -EIO;
1905 }
1906
1907 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1908 "sector %llu)\n", page->mapping->host->i_ino, start,
1909 dev->name, sector);
1910
1911 bio_put(bio);
1912 return 0;
1913}
1914
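A hedged sketch of the intended caller: given a filled-in io_failure_record and a page that now holds good data, rewrite only the mirror that failed. This mirrors the call made from clean_io_failure() below; the wrapper name is illustrative and error handling is omitted:

	static int example_repair_from_record(struct btrfs_mapping_tree *map_tree,
					      struct io_failure_record *rec,
					      struct page *good_page)
	{
		return repair_io_failure(map_tree, rec->start, rec->len,
					 rec->logical, good_page,
					 rec->failed_mirror);
	}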
1915/*
1916 * each time an IO finishes, we do a fast check in the IO failure tree
1917 * to see if we need to process or clean up an io_failure_record
1918 */
1919static int clean_io_failure(u64 start, struct page *page)
1920{
1921 u64 private;
1922 u64 private_failure;
1923 struct io_failure_record *failrec;
1924 struct btrfs_mapping_tree *map_tree;
1925 struct extent_state *state;
1926 int num_copies;
1927 int did_repair = 0;
1928 int ret;
1929 struct inode *inode = page->mapping->host;
1930
1931 private = 0;
1932 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1933 (u64)-1, 1, EXTENT_DIRTY, 0);
1934 if (!ret)
1935 return 0;
1936
1937 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1938 &private_failure);
1939 if (ret)
1940 return 0;
1941
1942 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1943 BUG_ON(!failrec->this_mirror);
1944
1945 if (failrec->in_validation) {
1946 /* there was no real error, just free the record */
1947 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1948 failrec->start);
1949 did_repair = 1;
1950 goto out;
1951 }
1952
1953 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1954 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1955 failrec->start,
1956 EXTENT_LOCKED);
1957 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1958
1959 if (state && state->start == failrec->start) {
1960 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1961 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1962 failrec->len);
1963 if (num_copies > 1) {
1964 ret = repair_io_failure(map_tree, start, failrec->len,
1965 failrec->logical, page,
1966 failrec->failed_mirror);
1967 did_repair = !ret;
1968 }
1969 }
1970
1971out:
1972 if (!ret)
1973 ret = free_io_failure(inode, failrec, did_repair);
1974
1975 return ret;
1976}
1977
1978/*
1979 * this is a generic handler for readpage errors (default
1980 * readpage_io_failed_hook). if other copies exist, read those and write back
 1981 * good data to the failed position. It does not try to remap the
1982 * failed extent elsewhere, hoping the device will be smart enough to do this as
1983 * needed
1984 */
1985
1986static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1987 u64 start, u64 end, int failed_mirror,
1988 struct extent_state *state)
1989{
1990 struct io_failure_record *failrec = NULL;
1991 u64 private;
1992 struct extent_map *em;
1993 struct inode *inode = page->mapping->host;
1994 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1995 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1996 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1997 struct bio *bio;
1998 int num_copies;
1999 int ret;
2000 int read_mode;
2001 u64 logical;
2002
2003 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2004
2005 ret = get_state_private(failure_tree, start, &private);
2006 if (ret) {
2007 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2008 if (!failrec)
2009 return -ENOMEM;
2010 failrec->start = start;
2011 failrec->len = end - start + 1;
2012 failrec->this_mirror = 0;
2013 failrec->bio_flags = 0;
2014 failrec->in_validation = 0;
2015
2016 read_lock(&em_tree->lock);
2017 em = lookup_extent_mapping(em_tree, start, failrec->len);
2018 if (!em) {
2019 read_unlock(&em_tree->lock);
2020 kfree(failrec);
2021 return -EIO;
2022 }
2023
2024 if (em->start > start || em->start + em->len < start) {
2025 free_extent_map(em);
2026 em = NULL;
2027 }
2028 read_unlock(&em_tree->lock);
2029
2030 if (!em || IS_ERR(em)) {
2031 kfree(failrec);
2032 return -EIO;
2033 }
2034 logical = start - em->start;
2035 logical = em->block_start + logical;
2036 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2037 logical = em->block_start;
2038 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2039 extent_set_compress_type(&failrec->bio_flags,
2040 em->compress_type);
2041 }
2042 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2043 "len=%llu\n", logical, start, failrec->len);
2044 failrec->logical = logical;
2045 free_extent_map(em);
2046
2047 /* set the bits in the private failure tree */
2048 ret = set_extent_bits(failure_tree, start, end,
2049 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2050 if (ret >= 0)
2051 ret = set_state_private(failure_tree, start,
2052 (u64)(unsigned long)failrec);
2053 /* set the bits in the inode's tree */
2054 if (ret >= 0)
2055 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2056 GFP_NOFS);
2057 if (ret < 0) {
2058 kfree(failrec);
2059 return ret;
2060 }
2061 } else {
2062 failrec = (struct io_failure_record *)(unsigned long)private;
2063 pr_debug("bio_readpage_error: (found) logical=%llu, "
2064 "start=%llu, len=%llu, validation=%d\n",
2065 failrec->logical, failrec->start, failrec->len,
2066 failrec->in_validation);
2067 /*
2068 * when data can be on disk more than twice, add to failrec here
2069 * (e.g. with a list for failed_mirror) to make
2070 * clean_io_failure() clean all those errors at once.
2071 */
2072 }
2073 num_copies = btrfs_num_copies(
2074 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2075 failrec->logical, failrec->len);
2076 if (num_copies == 1) {
2077 /*
2078 * we only have a single copy of the data, so don't bother with
2079 * all the retry and error correction code that follows. no
2080 * matter what the error is, it is very likely to persist.
2081 */
2082 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2083 "state=%p, num_copies=%d, next_mirror %d, "
2084 "failed_mirror %d\n", state, num_copies,
2085 failrec->this_mirror, failed_mirror);
2086 free_io_failure(inode, failrec, 0);
2087 return -EIO;
2088 }
2089
2090 if (!state) {
2091 spin_lock(&tree->lock);
2092 state = find_first_extent_bit_state(tree, failrec->start,
2093 EXTENT_LOCKED);
2094 if (state && state->start != failrec->start)
2095 state = NULL;
2096 spin_unlock(&tree->lock);
2097 }
2098
2099 /*
2100 * there are two premises:
2101 * a) deliver good data to the caller
2102 * b) correct the bad sectors on disk
2103 */
2104 if (failed_bio->bi_vcnt > 1) {
2105 /*
2106 * to fulfill b), we need to know the exact failing sectors, as
2107 * we don't want to rewrite any more than the failed ones. thus,
2108 * we need separate read requests for the failed bio
2109 *
2110 * if the following BUG_ON triggers, our validation request got
2111 * merged. we need separate requests for our algorithm to work.
2112 */
2113 BUG_ON(failrec->in_validation);
2114 failrec->in_validation = 1;
2115 failrec->this_mirror = failed_mirror;
2116 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2117 } else {
2118 /*
2119 * we're ready to fulfill a) and b) alongside. get a good copy
2120 * of the failed sector and if we succeed, we have setup
2121 * everything for repair_io_failure to do the rest for us.
2122 */
2123 if (failrec->in_validation) {
2124 BUG_ON(failrec->this_mirror != failed_mirror);
2125 failrec->in_validation = 0;
2126 failrec->this_mirror = 0;
2127 }
2128 failrec->failed_mirror = failed_mirror;
2129 failrec->this_mirror++;
2130 if (failrec->this_mirror == failed_mirror)
2131 failrec->this_mirror++;
2132 read_mode = READ_SYNC;
2133 }
2134
2135 if (!state || failrec->this_mirror > num_copies) {
2136 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2137 "next_mirror %d, failed_mirror %d\n", state,
2138 num_copies, failrec->this_mirror, failed_mirror);
2139 free_io_failure(inode, failrec, 0);
2140 return -EIO;
2141 }
2142
2143 bio = bio_alloc(GFP_NOFS, 1);
2144 bio->bi_private = state;
2145 bio->bi_end_io = failed_bio->bi_end_io;
2146 bio->bi_sector = failrec->logical >> 9;
2147 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2148 bio->bi_size = 0;
2149
2150 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2151
2152 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2153 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2154 failrec->this_mirror, num_copies, failrec->in_validation);
2155
2156 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2157 failrec->bio_flags, 0);
2158 return 0;
2159}
2160
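The mirror rotation above reduces to a simple rule: advance to the next mirror, skip the one that already failed, and give up once every copy has been tried. A standalone sketch of that rule (not btrfs code); with failed_mirror = 1 and num_copies = 2 it retries mirror 2 exactly once and then gives up:

	static int next_mirror_to_try(int this_mirror, int failed_mirror,
				      int num_copies)
	{
		this_mirror++;
		if (this_mirror == failed_mirror)
			this_mirror++;
		return this_mirror > num_copies ? -1 : this_mirror; /* -1: give up */
	}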
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2161/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2162
1604/* 2163/*
@@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2256 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2257 struct extent_state *state;
1699 2258
2259 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2260 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2261 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2262 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2263
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2264 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2289 state);
1728 if (ret) 2290 if (ret)
1729 uptodate = 0; 2291 uptodate = 0;
2292 else
2293 clean_io_failure(start, page);
1730 } 2294 }
1731 if (!uptodate && tree->ops && 2295 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2296 int failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
1734 start, end, NULL); 2298 /*
2299 * The generic bio_readpage_error handles errors the
2300 * following way: If possible, new read requests are
2301 * created and submitted and will end up in
2302 * end_bio_extent_readpage as well (if we're lucky, not
2303 * in the !uptodate case). In that case it returns 0 and
2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
1735 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
1736 uptodate = 2312 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
1738 if (err) 2314 if (err)
@@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1740 uncache_state(&cached); 2316 uncache_state(&cached);
1741 continue; 2317 continue;
1742 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
1743 } 2326 }
1744 2327
1745 if (uptodate) { 2328 if (uptodate) {
@@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2394 mirror_num, bio_flags, start);
1812 else 2395 else
1813 submit_bio(rw, bio); 2396 submit_bio(rw, bio);
2397
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2398 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2399 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2400 bio_put(bio);
@@ -2076,16 +2660,16 @@ out:
2076} 2660}
2077 2661
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2662int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2663 get_extent_t *get_extent, int mirror_num)
2080{ 2664{
2081 struct bio *bio = NULL; 2665 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2666 unsigned long bio_flags = 0;
2083 int ret; 2667 int ret;
2084 2668
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2669 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2670 &bio_flags);
2087 if (bio) 2671 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2672 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2673 return ret;
2090} 2674}
2091 2675
@@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2720 int compressed;
2137 int write_flags; 2721 int write_flags;
2138 unsigned long nr_written = 0; 2722 unsigned long nr_written = 0;
2723 bool fill_delalloc = true;
2139 2724
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2725 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2726 write_flags = WRITE_SYNC;
@@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2730 trace___extent_writepage(page, inode, wbc);
2146 2731
2147 WARN_ON(!PageLocked(page)); 2732 WARN_ON(!PageLocked(page));
2733
2734 ClearPageError(page);
2735
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2736 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2737 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2738 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2754
2167 set_page_extent_mapped(page); 2755 set_page_extent_mapped(page);
2168 2756
2757 if (!tree->ops || !tree->ops->fill_delalloc)
2758 fill_delalloc = false;
2759
2169 delalloc_start = start; 2760 delalloc_start = start;
2170 delalloc_end = 0; 2761 delalloc_end = 0;
2171 page_started = 0; 2762 page_started = 0;
2172 if (!epd->extent_locked) { 2763 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2764 u64 delalloc_to_write = 0;
2174 /* 2765 /*
2175 * make sure the wbc mapping index is at least updated 2766 * make sure the wbc mapping index is at least updated
@@ -2421,10 +3012,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 3012 * swizzled back from swapper_space to tmpfs file
2422 * mapping 3013 * mapping
2423 */ 3014 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 3015 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 3016 tree->ops->write_cache_pages_lock_hook) {
2426 else 3017 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 3018 data, flush_fn);
3019 } else {
3020 if (!trylock_page(page)) {
3021 flush_fn(data);
3022 lock_page(page);
3023 }
3024 }
2428 3025
2429 if (unlikely(page->mapping != mapping)) { 3026 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3027 unlock_page(page);
@@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2790 return -ENOMEM; 3387 return -ENOMEM;
2791 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
2792 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
2793 /* 3393 /*
2794 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
2795 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
@@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2837 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2838 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
2839 3439
2840 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
2841 get_extent); 3441 get_extent);
2842 if (!em) 3442 if (!em)
2843 goto out; 3443 goto out;
@@ -2926,7 +3526,7 @@ out:
2926 return ret; 3526 return ret;
2927} 3527}
2928 3528
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3529inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3530 unsigned long i)
2931{ 3531{
2932 struct page *p; 3532 struct page *p;
@@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3551 return p;
2952} 3552}
2953 3553
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3554inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3555{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3556 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3557 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3804 PAGECACHE_TAG_DIRTY);
3205 } 3805 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3806 spin_unlock_irq(&page->mapping->tree_lock);
3807 ClearPageError(page);
3207 unlock_page(page); 3808 unlock_page(page);
3208 } 3809 }
3209 return 0; 3810 return 0;
@@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3950}
3350 3951
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3952int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3953 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3954 get_extent_t *get_extent, int mirror_num)
3355{ 3955{
3356 unsigned long i; 3956 unsigned long i;
@@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3986 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3987 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3988 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3989 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3990 if (!trylock_page(page))
3391 goto unlock_exit; 3991 goto unlock_exit;
3392 } else { 3992 } else {
@@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4030 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4031 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4032
3433 if (ret || !wait) 4033 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4034 return ret;
3435 4035
3436 for (i = start_i; i < num_pages; i++) { 4036 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e792..7604c300132 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, int failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
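
The WAIT_* defines above replace the old boolean wait argument of read_extent_buffer_pages(): WAIT_NONE only trylocks the pages and never waits for the read, WAIT_PAGE_LOCK blocks on the page locks but returns before the I/O completes, and only WAIT_COMPLETE waits for the pages to become uptodate. A rough standalone C sketch of that control flow; only the WAIT_* names come from the patch, everything else is illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    #define WAIT_NONE      0        /* trylock pages; never block, never wait for I/O */
    #define WAIT_COMPLETE  1        /* lock pages and wait until the read finishes    */
    #define WAIT_PAGE_LOCK 2        /* lock pages (may block) but return before I/O   */

    /* Illustrative stand-in for read_extent_buffer_pages(); 'busy' models a
     * page that trylock_page() would fail on. */
    static int read_pages_model(int wait, bool busy)
    {
            if (wait == WAIT_NONE && busy)
                    return 0;       /* give up quietly, as the trylock path does */

            /* ...pages are locked and the read bio is submitted here... */

            if (wait != WAIT_COMPLETE)
                    return 0;       /* WAIT_NONE and WAIT_PAGE_LOCK return early */

            /* only WAIT_COMPLETE waits for every page to become uptodate */
            return 0;
    }

    int main(void)
    {
            printf("%d %d\n", read_pages_model(WAIT_NONE, true),
                   read_pages_model(WAIT_COMPLETE, false));
            return 0;
    }
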
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821bec..c7fb3a4247d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1266f6e9cdb..dafdfa059bf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
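
The fallocate hunks above move the data-space reservation: instead of one btrfs_check_data_free_space() call for the whole range up front, space is now reserved per hole right before btrfs_prealloc_file_range() and released again immediately afterwards, so already-allocated ranges never hold a reservation. A hedged userspace sketch of that per-chunk pattern; reserve(), release() and prealloc() are placeholders, not btrfs functions:

    #include <stdint.h>
    #include <stdio.h>

    static int reserve(uint64_t bytes)  { printf("reserve %llu\n", (unsigned long long)bytes); return 0; }
    static void release(uint64_t bytes) { printf("release %llu\n", (unsigned long long)bytes); }
    static int prealloc(uint64_t off, uint64_t len) { (void)off; (void)len; return 0; }

    static int fallocate_model(uint64_t start, uint64_t end, uint64_t step)
    {
            uint64_t cur;

            for (cur = start; cur < end; cur += step) {
                    uint64_t len = (end - cur < step) ? end - cur : step;
                    int ret;

                    ret = reserve(len);     /* reserve only this chunk, not the whole range */
                    if (ret)
                            return ret;
                    ret = prealloc(cur, len);
                    release(len);           /* let go of the reservation right away */
                    if (ret < 0)
                            return ret;
            }
            return 0;
    }

    int main(void)
    {
            return fallocate_model(0, 1 << 20, 1 << 18);
    }
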
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d..ec23d43d0c3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
359 return 0;
360}
361
362static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
363{
364 u64 *val;
365
366 io_ctl_map_page(io_ctl, 1);
367
368 /*
369 * Skip the csum areas. If we don't check crcs then we just have a
370 * 64bit chunk at the front of the first page.
371 */
372 if (io_ctl->check_crcs) {
373 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
374 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
375 } else {
376 io_ctl->cur += sizeof(u64);
377 io_ctl->size -= sizeof(u64) * 2;
378 }
379
380 val = io_ctl->cur;
381 *val = cpu_to_le64(generation);
382 io_ctl->cur += sizeof(u64);
383}
384
385static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
386{
387 u64 *gen;
388
389 /*
390 * Skip the crc area. If we don't check crcs then we just have a 64bit
391 * chunk at the front of the first page.
392 */
393 if (io_ctl->check_crcs) {
394 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
395 io_ctl->size -= sizeof(u64) +
396 (sizeof(u32) * io_ctl->num_pages);
397 } else {
398 io_ctl->cur += sizeof(u64);
399 io_ctl->size -= sizeof(u64) * 2;
400 }
401
402 gen = io_ctl->cur;
403 if (le64_to_cpu(*gen) != generation) {
404 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
405 "(%Lu) does not match inode (%Lu)\n", *gen,
406 generation);
407 io_ctl_unmap_page(io_ctl);
408 return -EIO;
409 }
410 io_ctl->cur += sizeof(u64);
411 return 0;
412}
413
414static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
415{
416 u32 *tmp;
417 u32 crc = ~(u32)0;
418 unsigned offset = 0;
419
420 if (!io_ctl->check_crcs) {
421 io_ctl_unmap_page(io_ctl);
422 return;
423 }
424
425 if (index == 0)
 426 offset = sizeof(u32) * io_ctl->num_pages;
427
428 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
429 PAGE_CACHE_SIZE - offset);
430 btrfs_csum_final(crc, (char *)&crc);
431 io_ctl_unmap_page(io_ctl);
432 tmp = kmap(io_ctl->pages[0]);
433 tmp += index;
434 *tmp = crc;
435 kunmap(io_ctl->pages[0]);
436}
437
438static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
439{
440 u32 *tmp, val;
441 u32 crc = ~(u32)0;
442 unsigned offset = 0;
443
444 if (!io_ctl->check_crcs) {
445 io_ctl_map_page(io_ctl, 0);
446 return 0;
447 }
448
449 if (index == 0)
450 offset = sizeof(u32) * io_ctl->num_pages;
451
452 tmp = kmap(io_ctl->pages[0]);
453 tmp += index;
454 val = *tmp;
455 kunmap(io_ctl->pages[0]);
456
457 io_ctl_map_page(io_ctl, 0);
458 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
459 PAGE_CACHE_SIZE - offset);
460 btrfs_csum_final(crc, (char *)&crc);
461 if (val != crc) {
462 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
463 "space cache\n");
464 io_ctl_unmap_page(io_ctl);
465 return -EIO;
466 }
467
468 return 0;
469}
470
471static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
472 void *bitmap)
473{
474 struct btrfs_free_space_entry *entry;
475
476 if (!io_ctl->cur)
477 return -ENOSPC;
478
479 entry = io_ctl->cur;
480 entry->offset = cpu_to_le64(offset);
481 entry->bytes = cpu_to_le64(bytes);
482 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
483 BTRFS_FREE_SPACE_EXTENT;
484 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
485 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
486
487 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
488 return 0;
489
490 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
491
492 /* No more pages to map */
493 if (io_ctl->index >= io_ctl->num_pages)
494 return 0;
495
496 /* map the next page */
497 io_ctl_map_page(io_ctl, 1);
498 return 0;
499}
500
501static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
502{
503 if (!io_ctl->cur)
504 return -ENOSPC;
505
506 /*
507 * If we aren't at the start of the current page, unmap this one and
508 * map the next one if there is any left.
509 */
510 if (io_ctl->cur != io_ctl->orig) {
511 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
512 if (io_ctl->index >= io_ctl->num_pages)
513 return -ENOSPC;
514 io_ctl_map_page(io_ctl, 0);
515 }
516
517 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
518 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
519 if (io_ctl->index < io_ctl->num_pages)
520 io_ctl_map_page(io_ctl, 0);
521 return 0;
522}
523
524static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
525{
526 /*
527 * If we're not on the boundary we know we've modified the page and we
528 * need to crc the page.
529 */
530 if (io_ctl->cur != io_ctl->orig)
531 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
532 else
533 io_ctl_unmap_page(io_ctl);
534
535 while (io_ctl->index < io_ctl->num_pages) {
536 io_ctl_map_page(io_ctl, 1);
537 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
538 }
539}
540
541static int io_ctl_read_entry(struct io_ctl *io_ctl,
542 struct btrfs_free_space *entry, u8 *type)
543{
544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
552
553 e = io_ctl->cur;
554 entry->offset = le64_to_cpu(e->offset);
555 entry->bytes = le64_to_cpu(e->bytes);
556 *type = e->type;
557 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
558 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
559
560 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
561 return 0;
562
563 io_ctl_unmap_page(io_ctl);
564
565 return 0;
566}
567
568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
569 struct btrfs_free_space *entry)
570{
571 int ret;
572
573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
574 if (ret)
575 return ret;
576
577 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
578 io_ctl_unmap_page(io_ctl);
579
580 return 0;
581}
582
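
The io_ctl helpers above define the on-disk header of the rewritten cache file: with check_crcs set, page 0 begins with one u32 checksum slot per cache page followed by the u64 generation, while the crc-less free-ino cache keeps the old layout of a u64 placeholder plus the generation (a later hunk refuses to write the cache at all if the crc array alone would fill the first page). A small standalone sketch of that header arithmetic; the 4096-byte page size is an assumption for the example:

    #include <stdint.h>
    #include <stdio.h>

    #define CACHE_PAGE_SIZE 4096u   /* assumed page size for the example */

    static unsigned first_entry_offset(unsigned num_pages, int check_crcs)
    {
            if (check_crcs)
                    /* one u32 crc per cache page, then the u64 generation */
                    return (unsigned)(num_pages * sizeof(uint32_t) + sizeof(uint64_t));
            /* legacy layout: u64 placeholder (the old bogus crc) + generation */
            return (unsigned)(2 * sizeof(uint64_t));
    }

    int main(void)
    {
            unsigned num_pages = 8;

            printf("entries start at %u (crc) / %u (no crc); %u bytes left on page 0\n",
                   first_entry_offset(num_pages, 1),
                   first_entry_offset(num_pages, 0),
                   CACHE_PAGE_SIZE - first_entry_offset(num_pages, 1));
            return 0;
    }
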
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 583int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 584 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 585 struct btrfs_path *path, u64 offset)
248{ 586{
249 struct btrfs_free_space_header *header; 587 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 588 struct extent_buffer *leaf;
251 struct page *page; 589 struct io_ctl io_ctl;
252 struct btrfs_key key; 590 struct btrfs_key key;
591 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 592 struct list_head bitmaps;
254 u64 num_entries; 593 u64 num_entries;
255 u64 num_bitmaps; 594 u64 num_bitmaps;
256 u64 generation; 595 u64 generation;
257 pgoff_t index = 0; 596 u8 type;
258 int ret = 0; 597 int ret = 0;
259 598
260 INIT_LIST_HEAD(&bitmaps); 599 INIT_LIST_HEAD(&bitmaps);
261 600
262 /* Nothing in the space cache, goodbye */ 601 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 602 if (!i_size_read(inode))
264 goto out; 603 return 0;
265 604
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 605 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 606 key.offset = offset;
@@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 608
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 609 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 610 if (ret < 0)
272 goto out; 611 return 0;
273 else if (ret > 0) { 612 else if (ret > 0) {
274 btrfs_release_path(path); 613 btrfs_release_path(path);
275 ret = 0; 614 return 0;
276 goto out;
277 } 615 }
278 616
279 ret = -1; 617 ret = -1;
@@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 629 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 630 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 631 (unsigned long long)generation);
294 goto out; 632 return 0;
295 } 633 }
296 634
297 if (!num_entries) 635 if (!num_entries)
298 goto out; 636 return 0;
299 637
638 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 639 ret = readahead_cache(inode);
301 if (ret) 640 if (ret)
302 goto out; 641 goto out;
303 642
304 while (1) { 643 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 644 if (ret)
306 struct btrfs_free_space *e; 645 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 646
311 if (!num_entries && !num_bitmaps) 647 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 648 if (ret)
649 goto free_cache;
650
651 ret = io_ctl_check_generation(&io_ctl, generation);
652 if (ret)
653 goto free_cache;
313 654
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 655 while (num_entries) {
315 if (!page) 656 e = kmem_cache_zalloc(btrfs_free_space_cachep,
657 GFP_NOFS);
658 if (!e)
316 goto free_cache; 659 goto free_cache;
317 660
318 if (!PageUptodate(page)) { 661 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 662 if (ret) {
320 lock_page(page); 663 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 664 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 665 }
329 addr = kmap(page);
330 666
331 if (index == 0) { 667 if (!e->bytes) {
332 u64 *gen; 668 kmem_cache_free(btrfs_free_space_cachep, e);
669 goto free_cache;
670 }
333 671
334 /* 672 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 673 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 674 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 675 spin_unlock(&ctl->tree_lock);
338 */ 676 if (ret) {
339 addr += sizeof(u64); 677 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 678 "free space cache, dumping\n");
341 679 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 680 goto free_cache;
353 } 681 }
354 addr += sizeof(u64); 682 } else {
355 offset += sizeof(u64); 683 BUG_ON(!num_bitmaps);
356 } 684 num_bitmaps--;
357 entry = addr; 685 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 686 if (!e->bitmap) {
359 while (1) { 687 kmem_cache_free(
360 if (!num_entries) 688 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 689 goto free_cache;
371 } 690 }
372 691 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 692 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 693 ctl->total_bitmaps++;
375 if (!e->bytes) { 694 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 695 spin_unlock(&ctl->tree_lock);
696 if (ret) {
697 printk(KERN_ERR "Duplicate entries in "
698 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 699 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 700 goto free_cache;
381 } 701 }
382 702 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 703 }
428 704
429 /* 705 num_entries--;
430 * We read an entry out of this page, we need to move on to the 706 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 707
438 /* 708 io_ctl_unmap_page(&io_ctl);
439 * We add the bitmaps at the end of the entries in order that 709
440 * the bitmap entries are added to the cache. 710 /*
441 */ 711 * We add the bitmaps at the end of the entries in order that
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 712 * the bitmap entries are added to the cache.
713 */
714 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 715 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 716 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 717 if (ret)
446 num_bitmaps--; 718 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 719 }
452 720
721 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 722 ret = 1;
454out: 723out:
724 io_ctl_free(&io_ctl);
455 return ret; 725 return ret;
456free_cache: 726free_cache:
727 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 728 __btrfs_remove_free_space_cache(ctl);
458 goto out; 729 goto out;
459} 730}
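
The rewritten __load_free_space_cache() above now validates the first-page crc and the stored generation, then pulls in num_entries fixed-size entries in a single pass; bitmap entries are linked into the rbtree immediately but their page-sized bitmaps are queued on a list and read afterwards, in the same order, once all entries have been consumed. A compact model of that ordering; standalone C, with printf calls standing in for io_ctl_read_entry()/io_ctl_read_bitmap():

    #include <stdio.h>

    struct entry { int is_bitmap; int id; };

    int main(void)
    {
            struct entry cache[] = { {0, 1}, {1, 2}, {0, 3}, {1, 4} };
            int deferred[4];
            unsigned i, n_deferred = 0;

            /* pass 1: consume the packed entries; extents are complete,
             * bitmap entries are linked but their data is read later */
            for (i = 0; i < sizeof(cache) / sizeof(cache[0]); i++) {
                    if (cache[i].is_bitmap) {
                            printf("link bitmap entry %d (data read later)\n", cache[i].id);
                            deferred[n_deferred++] = cache[i].id;
                    } else {
                            printf("link extent entry %d\n", cache[i].id);
                    }
            }

            /* pass 2: read the page-sized bitmaps in the order they were queued */
            for (i = 0; i < n_deferred; i++)
                    printf("read bitmap for entry %d\n", deferred[i]);

            return 0;
    }
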
@@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 736 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 737 struct inode *inode;
467 struct btrfs_path *path; 738 struct btrfs_path *path;
468 int ret; 739 int ret = 0;
469 bool matched; 740 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 741 u64 used = btrfs_block_group_used(&block_group->item);
471 742
@@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 768 return 0;
498 } 769 }
499 770
771 /* We may have converted the inode and made the cache invalid. */
772 spin_lock(&block_group->lock);
773 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
774 spin_unlock(&block_group->lock);
775 goto out;
776 }
777 spin_unlock(&block_group->lock);
778
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 779 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 780 path, block_group->key.objectid);
502 btrfs_free_path(path); 781 btrfs_free_path(path);
@@ -530,6 +809,19 @@ out:
530 return ret; 809 return ret;
531} 810}
532 811
812/**
813 * __btrfs_write_out_cache - write out cached info to an inode
814 * @root - the root the inode belongs to
815 * @ctl - the free space cache we are going to write out
816 * @block_group - the block_group for this cache if it belongs to a block_group
817 * @trans - the trans handle
818 * @path - the path to use
819 * @offset - the offset for the key we'll insert
820 *
821 * This function writes out a free space cache struct to disk for quick recovery
 822 * on mount. This will return 0 if it was successful in writing the cache out,
823 * and -1 if it was not.
824 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 825int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 826 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 827 struct btrfs_block_group_cache *block_group,
@@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 832 struct extent_buffer *leaf;
541 struct rb_node *node; 833 struct rb_node *node;
542 struct list_head *pos, *n; 834 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 835 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 836 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 837 struct extent_io_tree *unpin = NULL;
838 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 839 struct list_head bitmap_list;
549 struct btrfs_key key; 840 struct btrfs_key key;
550 u64 start, end, len; 841 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 842 int entries = 0;
555 int bitmaps = 0; 843 int bitmaps = 0;
556 int ret = -1; 844 int ret;
557 bool next_page = false; 845 int err = -1;
558 bool out_of_space = false;
559 846
560 INIT_LIST_HEAD(&bitmap_list); 847 INIT_LIST_HEAD(&bitmap_list);
561 848
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 849 if (!i_size_read(inode))
567 return -1; 850 return -1;
568 851
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 852 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 853
580 /* Get the cluster for this block_group if it exists */ 854 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 855 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 863 */
590 unpin = root->fs_info->pinned_extents; 864 unpin = root->fs_info->pinned_extents;
591 865
592 /* 866 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 867 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604 868
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 870 0, &cached_state, GFP_NOFS);
618 871
@@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 876 if (block_group)
624 start = block_group->key.objectid; 877 start = block_group->key.objectid;
625 878
626 /* Write out the extent entries */ 879 node = rb_first(&ctl->free_space_offset);
627 do { 880 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 881 node = rb_first(&cluster->root);
629 void *addr, *orig; 882 cluster = NULL;
630 unsigned long offset = 0; 883 }
631 884
632 next_page = false; 885 /* Make sure we can fit our crcs into the first page */
886 if (io_ctl.check_crcs &&
887 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
888 WARN_ON(1);
889 goto out_nospc;
890 }
633 891
634 if (index >= num_pages) { 892 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 893
639 page = pages[index]; 894 /* Write out the extent entries */
895 while (node) {
896 struct btrfs_free_space *e;
640 897
641 orig = addr = kmap(page); 898 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 899 entries++;
643 u64 *gen;
644 900
645 /* 901 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 902 e->bitmap);
647 * make sure that old kernels who aren't aware of this 903 if (ret)
648 * format will be sure to discard the cache. 904 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 905
653 gen = addr; 906 if (e->bitmap) {
654 *gen = trans->transid; 907 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 908 bitmaps++;
656 offset += sizeof(u64);
657 } 909 }
658 entry = addr; 910 node = rb_next(node);
659 911 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 912 node = rb_first(&cluster->root);
661 while (node && !next_page) { 913 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 914 }
915 }
687 916
688 /* 917 /*
689 * We want to add any pinned extents to our free space cache 918 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 919 * so we don't leak the space
691 */ 920 */
692 while (block_group && !next_page && 921 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 922 block_group->key.offset)) {
694 block_group->key.offset)) { 923 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 924 EXTENT_DIRTY);
696 EXTENT_DIRTY); 925 if (ret) {
697 if (ret) { 926 ret = 0;
698 ret = 0; 927 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 928 }
723 929
724 /* Generate bogus crc value */ 930 /* This pinned extent is out of our range */
725 if (index == 0) { 931 if (start >= block_group->key.objectid +
726 u32 *tmp; 932 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 933 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 934
735 kunmap(page); 935 len = block_group->key.objectid +
936 block_group->key.offset - start;
937 len = min(len, end + 1 - start);
736 938
737 bytes += PAGE_CACHE_SIZE; 939 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
941 if (ret)
942 goto out_nospc;
738 943
739 index++; 944 start = end + 1;
740 } while (node || next_page); 945 }
741 946
742 /* Write out the bitmaps */ 947 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 948 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 949 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 950 list_entry(pos, struct btrfs_free_space, list);
747 951
748 if (index >= num_pages) { 952 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 953 if (ret)
750 break; 954 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 955 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 956 }
771 957
772 /* Zero out the rest of the pages just to make sure */ 958 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 959 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775 960
776 page = pages[index]; 961 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
777 addr = kmap(page); 962 0, i_size_read(inode), &cached_state);
778 memset(addr, 0, PAGE_CACHE_SIZE); 963 io_ctl_drop_pages(&io_ctl);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
785 bytes, &cached_state);
786 btrfs_drop_pages(pages, num_pages);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 964 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 965 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 966
790 if (ret) { 967 if (ret)
791 ret = 0;
792 goto out; 968 goto out;
793 }
794 969
795 BTRFS_I(inode)->generation = trans->transid;
796 970
797 filemap_write_and_wait(inode->i_mapping); 971 ret = filemap_write_and_wait(inode->i_mapping);
972 if (ret)
973 goto out;
798 974
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 975 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 976 key.offset = offset;
801 key.type = 0; 977 key.type = 0;
802 978
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 979 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 980 if (ret < 0) {
805 ret = -1; 981 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 982 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 983 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 984 goto out;
810 } 985 }
811 leaf = path->nodes[0]; 986 leaf = path->nodes[0];
@@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 992 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 993 found_key.offset != offset) {
819 ret = -1; 994 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 995 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 996 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 997 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 998 btrfs_release_path(path);
825 goto out; 999 goto out;
826 } 1000 }
827 } 1001 }
1002
1003 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 1004 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 1005 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 1006 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1009 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1010 btrfs_release_path(path);
835 1011
836 ret = 1; 1012 err = 0;
837
838out: 1013out:
839 kfree(pages); 1014 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1015 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1016 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1017 BTRFS_I(inode)->generation = 0;
843 } 1018 }
844 btrfs_update_inode(trans, root, inode); 1019 btrfs_update_inode(trans, root, inode);
845 return ret; 1020 return err;
1021
1022out_nospc:
1023 list_for_each_safe(pos, n, &bitmap_list) {
1024 struct btrfs_free_space *entry =
1025 list_entry(pos, struct btrfs_free_space, list);
1026 list_del_init(&entry->list);
1027 }
1028 io_ctl_drop_pages(&io_ctl);
1029 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1030 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1031 goto out;
846} 1032}
847 1033
848int btrfs_write_out_cache(struct btrfs_root *root, 1034int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1055
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1056 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1057 path, block_group->key.objectid);
872 if (ret < 0) { 1058 if (ret) {
873 spin_lock(&block_group->lock); 1059 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1060 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1061 spin_unlock(&block_group->lock);
876 ret = 0; 1062 ret = 0;
877 1063#ifdef DEBUG
 878 printk(KERN_ERR "btrfs: failed to write free space cache " 1064 printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1065 "for block group %llu\n", block_group->key.objectid);
1066#endif
880 } 1067 }
881 1068
882 iput(inode); 1069 iput(inode);
@@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1283{ 1470{
1284 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1285 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1286 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1287 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1288 1476
@@ -1662,7 +1850,13 @@ again:
1662 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1663 1, 0); 1851 1, 0);
1664 if (!info) { 1852 if (!info) {
1665 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1666 goto out_lock; 1860 goto out_lock;
1667 } 1861 }
1668 } 1862 }
@@ -1701,6 +1895,7 @@ again:
1701 ctl->total_bitmaps--; 1895 ctl->total_bitmaps--;
1702 } 1896 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1897 kmem_cache_free(btrfs_free_space_cachep, info);
1898 ret = 0;
1704 goto out_lock; 1899 goto out_lock;
1705 } 1900 }
1706 1901
@@ -1708,7 +1903,8 @@ again:
1708 unlink_free_space(ctl, info); 1903 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1904 info->offset += bytes;
1710 info->bytes -= bytes; 1905 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1906 ret = link_free_space(ctl, info);
1907 WARN_ON(ret);
1712 goto out_lock; 1908 goto out_lock;
1713 } 1909 }
1714 1910
@@ -2124,6 +2320,7 @@ again:
2124 2320
2125 if (!found) { 2321 if (!found) {
2126 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2127 found = true; 2324 found = true;
2128 } 2325 }
2129 2326
@@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2267{ 2464{
2268 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2269 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2270 struct rb_node *node;
2271 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2272 2469
2273 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2274 return -ENOSPC; 2471 return -ENOSPC;
2275 2472
2276 /* 2473 /*
2277 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2278 * here that will work. 2475 * is just its start offset.
2279 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2280 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2281 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2282 continue; 2486 continue;
@@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2287 } 2491 }
2288 2492
2289 /* 2493 /*
2290 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2291 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2292 * this list and start the search from there.
2293 */ 2496 */
2294 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2295 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2296 list);
2297 node = rb_next(&entry->offset_index);
2298 if (!node)
2299 return -ENOSPC;
2300 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2301 goto search;
2302 }
2303
2304 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2305 if (!entry)
2306 return -ENOSPC;
2307
2308search:
2309 node = &entry->offset_index;
2310 do {
2311 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2312 node = rb_next(&entry->offset_index);
2313 if (!entry->bitmap)
2314 continue;
2315 if (entry->bytes < min_bytes)
2316 continue;
2317 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2318 bytes, min_bytes);
2319 } while (ret && node);
2320
2321 return ret;
2322} 2498}
2323 2499
2324/* 2500/*
@@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2336 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2337{ 2513{
2338 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2339 struct list_head bitmaps;
2340 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2341 u64 min_bytes; 2517 u64 min_bytes;
2342 int ret; 2518 int ret;
2343 2519
@@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2376 goto out; 2552 goto out;
2377 } 2553 }
2378 2554
2379 INIT_LIST_HEAD(&bitmaps);
2380 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2381 bytes, min_bytes); 2556 bytes, min_bytes);
2382 if (ret) 2557 if (ret)
@@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2647 spin_unlock(&ctl->tree_lock);
2473 2648
2474 if (bytes >= minlen) { 2649 if (bytes >= minlen) {
2475 int update_ret; 2650 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2651 int update = 0;
2477 bytes, 1, 1); 2652
2653 space_info = block_group->space_info;
2654 spin_lock(&space_info->lock);
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2478 2663
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2665 start,
@@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2667 &actually_trimmed);
2483 2668
2484 btrfs_add_free_space(block_group, start, bytes); 2669 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2670 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2671 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2488 2680
2489 if (ret) 2681 if (ret)
2490 break; 2682 break;
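
The trim hunks above drop btrfs_update_reserved_bytes() in favour of open-coded accounting: the range about to be discarded is added to block_group->reserved and space_info->bytes_reserved with both locks held (space_info first, then the block group), and is given back after the discard, crediting bytes_readonly instead if the group went read-only in the meantime. A hedged userspace model of the reserve side, with pthread mutexes standing in for the spinlocks; nothing here is btrfs code:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct space_info {
            pthread_mutex_t lock;
            uint64_t bytes_reserved;
            uint64_t bytes_readonly;
    };

    struct block_group {
            pthread_mutex_t lock;
            int ro;
            uint64_t reserved;
            struct space_info *si;
    };

    /* Reserve the range being trimmed so the allocator cannot hand it out
     * while the discard is in flight; returns 1 if the caller must undo it. */
    static int trim_reserve(struct block_group *bg, uint64_t bytes)
    {
            int update = 0;

            pthread_mutex_lock(&bg->si->lock);      /* space_info first...     */
            pthread_mutex_lock(&bg->lock);          /* ...then the block group */
            if (!bg->ro) {
                    bg->reserved += bytes;
                    bg->si->bytes_reserved += bytes;
                    update = 1;
            }
            pthread_mutex_unlock(&bg->lock);
            pthread_mutex_unlock(&bg->si->lock);
            return update;
    }

    int main(void)
    {
            struct space_info si = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
            struct block_group bg = { PTHREAD_MUTEX_INITIALIZER, 0, 0, &si };

            printf("update=%d reserved=%llu\n", trim_reserve(&bg, 4096),
                   (unsigned long long)si.bytes_reserved);
            return 0;
    }
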
@@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2835 return 0;
2644 2836
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2837 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2838 if (ret) {
2839 btrfs_delalloc_release_metadata(inode, inode->i_size);
2840#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2841 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2842 "for root %llu\n", root->root_key.objectid);
2843#endif
2844 }
2649 2845
2650 iput(inode); 2846 iput(inode);
2651 return ret; 2847 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa87..f8962a957d6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
 431 * 1 item for inode item insertion if needed
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -465,21 +482,26 @@ again:
465 /* Just to make sure we have enough space */ 482 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 483 prealloc += 8 * PAGE_CACHE_SIZE;
467 484
468 ret = btrfs_check_data_free_space(inode, prealloc); 485 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 486 if (ret)
470 goto out_put; 487 goto out_put;
471 488
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 490 prealloc, prealloc, &alloc_hint);
474 if (ret) 491 if (ret) {
492 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 493 goto out_put;
494 }
476 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
477 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
478out_put: 498out_put:
479 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
480out: 502out:
481 if (ret == 0) 503 trans->block_rsv = rsv;
482 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
483 505
484 btrfs_free_path(path); 506 btrfs_free_path(path);
485 return ret; 507 return ret;
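
The 8 passed to btrfs_calc_trans_metadata_size() in btrfs_save_ino_cache above is simply the sum of the worst cases listed in the comment. A trivial sanity check of that arithmetic (names are illustrative, not kernel API):

#include <assert.h>

enum {
        INODE_ITEM_INSERT = 1,  /* inode item insertion, if needed */
        INODE_ITEM_UPDATE = 3,  /* inode item update, worst case */
        FREE_SPACE_OBJECT = 1,  /* free space object */
        PREALLOC_ITEMS    = 3,  /* pre-allocation */
};

int main(void)
{
        assert(INODE_ITEM_INSERT + INODE_ITEM_UPDATE +
               FREE_SPACE_OBJECT + PREALLOC_ITEMS == 8);
        return 0;
}
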
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 75686a61bd4..2c984f7d4c2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 93 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 94 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 95 unsigned long *nr_written, int unlock);
96static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode);
96 98
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 99static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 100 struct inode *inode, struct inode *dir,
@@ -393,7 +395,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 395 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 396 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 397 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 398 if (!pages) {
399 /* just bail out to the uncompressed code */
400 goto cont;
401 }
397 402
398 if (BTRFS_I(inode)->force_compress) 403 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 404 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +429,7 @@ again:
424 will_compress = 1; 429 will_compress = 1;
425 } 430 }
426 } 431 }
432cont:
427 if (start == 0) { 433 if (start == 0) {
428 trans = btrfs_join_transaction(root); 434 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 435 BUG_ON(IS_ERR(trans));
@@ -820,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 826 }
821 827
822 BUG_ON(disk_num_bytes > 828 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 829 btrfs_super_total_bytes(root->fs_info->super_copy));
824 830
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 831 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 832 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1737,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1737 trans = btrfs_join_transaction(root); 1743 trans = btrfs_join_transaction(root);
1738 BUG_ON(IS_ERR(trans)); 1744 BUG_ON(IS_ERR(trans));
1739 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1745 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1740 ret = btrfs_update_inode(trans, root, inode); 1746 ret = btrfs_update_inode_fallback(trans, root, inode);
1741 BUG_ON(ret); 1747 BUG_ON(ret);
1742 } 1748 }
1743 goto out; 1749 goto out;
@@ -1787,17 +1793,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 1793
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1794 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1795 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1796 ret = btrfs_update_inode_fallback(trans, root, inode);
1791 BUG_ON(ret); 1797 BUG_ON(ret);
1792 } 1798 }
1793 ret = 0; 1799 ret = 0;
1794out: 1800out:
1795 if (nolock) { 1801 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1802 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1803 if (trans) {
1804 if (nolock)
1805 btrfs_end_transaction_nolock(trans, root);
1806 else
1801 btrfs_end_transaction(trans, root); 1807 btrfs_end_transaction(trans, root);
1802 } 1808 }
1803 1809
@@ -1819,153 +1825,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1825}
1820 1826
1821/* 1827/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1828 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1829 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1830 * extent_io.c will try to find good copies for us.
1969 */ 1831 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1832static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1833 struct extent_state *state)
@@ -2011,10 +1873,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1873
2012 kunmap_atomic(kaddr, KM_USER0); 1874 kunmap_atomic(kaddr, KM_USER0);
2013good: 1875good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1876 return 0;
2019 1877
2020zeroit: 1878zeroit:
@@ -2079,89 +1937,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1937 up_read(&root->fs_info->cleanup_work_sem);
2080} 1938}
2081 1939
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1940enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1941 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1942 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2022,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2022 }
2248 spin_unlock(&root->orphan_lock); 2023 spin_unlock(&root->orphan_lock);
2249 2024
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2025 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2026 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2027 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2088,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2088 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2089 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2090 struct inode *inode;
2091 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2092 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2093
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2094 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2140,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2140 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2141 * offset of the orphan item.
2369 */ 2142 */
2143
2144 if (found_key.offset == last_objectid) {
2145 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2146 "stopping orphan cleanup\n");
2147 ret = -EINVAL;
2148 goto out;
2149 }
2150
2151 last_objectid = found_key.offset;
2152
2370 found_key.objectid = found_key.offset; 2153 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2154 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2155 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2156 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2157 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2158 if (ret && ret != -ESTALE)
2376 goto out; 2159 goto out;
2377 }
2378 2160
2379 /* 2161 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does 2162 * Inode is already gone but the orphan item is still there,
2381 * the proper thing when we hit it 2163 * kill the orphan item.
2382 */ 2164 */
2383 spin_lock(&root->orphan_lock); 2165 if (ret == -ESTALE) {
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2166 trans = btrfs_start_transaction(root, 1);
2385 spin_unlock(&root->orphan_lock);
2386
2387 /*
2388 * if this is a bad inode, means we actually succeeded in
2389 * removing the inode, but not the orphan record, which means
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */
2393 if (is_bad_inode(inode)) {
2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) { 2167 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2168 ret = PTR_ERR(trans);
2397 goto out; 2169 goto out;
2398 } 2170 }
2399 btrfs_orphan_del(trans, inode); 2171 ret = btrfs_del_orphan_item(trans, root,
2172 found_key.objectid);
2173 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2174 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2175 continue;
2403 } 2176 }
2404 2177
2178 /*
2179 * add this inode to the orphan list so btrfs_orphan_del does
2180 * the proper thing when we hit it
2181 */
2182 spin_lock(&root->orphan_lock);
2183 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2184 spin_unlock(&root->orphan_lock);
2185
 2405		/* if we have links, this was a truncate, let's do that */	 2186		/* if we have links, this was a truncate, let's do that */
2406 if (inode->i_nlink) { 2187 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2188 if (!S_ISREG(inode->i_mode)) {
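
The last_objectid check added above is a no-progress guard: if btrfs_del_orphan_item() ever fails to remove an entry, the next tree search returns the same key, and the old loop would spin forever; now the cleanup bails out with -EINVAL instead. A stripped-down, self-contained sketch of the same guard (scan_next() and handle_orphan() are made-up stand-ins for the tree search and the per-orphan work):

#include <errno.h>
#include <stdint.h>

static uint64_t orphans[] = { 257, 260, 260, 0 };  /* 260 repeats: removal failed */
static unsigned pos;

static uint64_t scan_next(void) { return orphans[pos++]; }
static int handle_orphan(uint64_t objectid) { (void)objectid; return 0; }

static int cleanup_orphans(void)
{
        uint64_t last_objectid = 0;
        uint64_t objectid;

        while ((objectid = scan_next()) != 0) {
                /* same key as last round: we made no progress, stop */
                if (objectid == last_objectid)
                        return -EINVAL;
                last_objectid = objectid;

                if (handle_orphan(objectid))
                        break;
        }
        return 0;
}
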
@@ -2420,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2420 if (ret) 2201 if (ret)
2421 goto out; 2202 goto out;
2422 } 2203 }
2204 /* release the path since we're done with it */
2205 btrfs_release_path(path);
2206
2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2207 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2424 2208
2425 if (root->orphan_block_rsv) 2209 if (root->orphan_block_rsv)
@@ -2647,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647/* 2431/*
2648 * copy everything in the in-memory inode into the btree. 2432 * copy everything in the in-memory inode into the btree.
2649 */ 2433 */
2650noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2434static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_root *root, struct inode *inode) 2435 struct btrfs_root *root, struct inode *inode)
2652{ 2436{
2653 struct btrfs_inode_item *inode_item; 2437 struct btrfs_inode_item *inode_item;
@@ -2655,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2655 struct extent_buffer *leaf; 2439 struct extent_buffer *leaf;
2656 int ret; 2440 int ret;
2657 2441
2658 /*
2659 * If the inode is a free space inode, we can deadlock during commit
2660 * if we put it into the delayed code.
2661 *
2662 * The data relocation inode should also be directly updated
2663 * without delay
2664 */
2665 if (!btrfs_is_free_space_inode(root, inode)
2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2668 if (!ret)
2669 btrfs_set_inode_last_trans(trans, inode);
2670 return ret;
2671 }
2672
2673 path = btrfs_alloc_path(); 2442 path = btrfs_alloc_path();
2674 if (!path) 2443 if (!path)
2675 return -ENOMEM; 2444 return -ENOMEM;
@@ -2698,6 +2467,43 @@ failed:
2698} 2467}
2699 2468
2700/* 2469/*
2470 * copy everything in the in-memory inode into the btree.
2471 */
2472noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2473 struct btrfs_root *root, struct inode *inode)
2474{
2475 int ret;
2476
2477 /*
2478 * If the inode is a free space inode, we can deadlock during commit
2479 * if we put it into the delayed code.
2480 *
2481 * The data relocation inode should also be directly updated
2482 * without delay
2483 */
2484 if (!btrfs_is_free_space_inode(root, inode)
2485 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2486 ret = btrfs_delayed_update_inode(trans, root, inode);
2487 if (!ret)
2488 btrfs_set_inode_last_trans(trans, inode);
2489 return ret;
2490 }
2491
2492 return btrfs_update_inode_item(trans, root, inode);
2493}
2494
2495static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, struct inode *inode)
2497{
2498 int ret;
2499
2500 ret = btrfs_update_inode(trans, root, inode);
2501 if (ret == -ENOSPC)
2502 return btrfs_update_inode_item(trans, root, inode);
2503 return ret;
2504}
2505
2506/*
2701 * unlink helper that gets used here in inode.c and in the tree logging 2507 * unlink helper that gets used here in inode.c and in the tree logging
 2702 * recovery code. It removes a link in a directory with a given name, and	 2508 * recovery code. It removes a link in a directory with a given name, and
2703 * also drops the back refs in the inode to the directory 2509 * also drops the back refs in the inode to the directory
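
btrfs_update_inode_fallback() above retries with the direct item update only when the delayed path reports -ENOSPC; the ordered-IO completion paths are switched to it presumably because they already carry a delalloc reservation the plain item update can consume, while the delayed-item path needs its own. The shape of the wrapper, reduced to its essence (stub functions, not kernel code):

#include <errno.h>

static int update_delayed(void) { return -ENOSPC; }  /* fast path, may lack space */
static int update_item(void)    { return 0; }        /* direct, already reserved */

static int update_with_fallback(void)
{
        int ret = update_delayed();

        if (ret == -ENOSPC)       /* only ENOSPC triggers the fallback */
                ret = update_item();
        return ret;
}
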
@@ -2835,7 +2641,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2641 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2642 u64 dir_ino = btrfs_ino(dir);
2837 2643
2838 trans = btrfs_start_transaction(root, 10); 2644 /*
2645 * 1 for the possible orphan item
2646 * 1 for the dir item
2647 * 1 for the dir index
2648 * 1 for the inode ref
2649 * 1 for the inode ref in the tree log
2650 * 2 for the dir entries in the log
2651 * 1 for the inode
2652 */
2653 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2654 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2655 return trans;
2841 2656
@@ -2858,7 +2673,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2673 return ERR_PTR(-ENOMEM);
2859 } 2674 }
2860 2675
2861 trans = btrfs_start_transaction(root, 0); 2676 /* 1 for the orphan item */
2677 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2678 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2679 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2680 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2779,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2779 err = 0;
2964out: 2780out:
2965 btrfs_free_path(path); 2781 btrfs_free_path(path);
2782 /* Migrate the orphan reservation over */
2783 if (!err)
2784 err = btrfs_block_rsv_migrate(trans->block_rsv,
2785 &root->fs_info->global_block_rsv,
2786 trans->bytes_reserved);
2787
2966 if (err) { 2788 if (err) {
2967 btrfs_end_transaction(trans, root); 2789 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2790 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2799,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2799 struct btrfs_root *root)
2978{ 2800{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2801 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2802 btrfs_block_rsv_release(root, trans->block_rsv,
2803 trans->bytes_reserved);
2804 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2805 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2806 root->fs_info->enospc_unlink = 0;
2982 } 2807 }
@@ -3368,6 +3193,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3193 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3194 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3195 struct page *page;
3196 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3197 int ret = 0;
3372 u64 page_start; 3198 u64 page_start;
3373 u64 page_end; 3199 u64 page_end;
@@ -3380,7 +3206,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3206
3381 ret = -ENOMEM; 3207 ret = -ENOMEM;
3382again: 3208again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3209 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3210 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3211 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3212 goto out;
@@ -3613,6 +3439,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3439{
3614 struct btrfs_trans_handle *trans; 3440 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3441 struct btrfs_root *root = BTRFS_I(inode)->root;
3442 struct btrfs_block_rsv *rsv, *global_rsv;
3443 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3444 unsigned long nr;
3617 int ret; 3445 int ret;
3618 3446
@@ -3640,22 +3468,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3468 goto no_delete;
3641 } 3469 }
3642 3470
3471 rsv = btrfs_alloc_block_rsv(root);
3472 if (!rsv) {
3473 btrfs_orphan_del(NULL, inode);
3474 goto no_delete;
3475 }
3476 rsv->size = min_size;
3477 global_rsv = &root->fs_info->global_block_rsv;
3478
3643 btrfs_i_size_write(inode, 0); 3479 btrfs_i_size_write(inode, 0);
3644 3480
3481 /*
3482 * This is a bit simpler than btrfs_truncate since
3483 *
3484 * 1) We've already reserved our space for our orphan item in the
3485 * unlink.
3486 * 2) We're going to delete the inode item, so we don't need to update
3487 * it at all.
3488 *
3489 * So we just need to reserve some slack space in case we add bytes when
3490 * doing the truncate.
3491 */
3645 while (1) { 3492 while (1) {
3646 trans = btrfs_join_transaction(root); 3493 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3494
3648 trans->block_rsv = root->orphan_block_rsv; 3495 /*
3496 * Try and steal from the global reserve since we will
3497 * likely not use this space anyway, we want to try as
3498 * hard as possible to get this to work.
3499 */
3500 if (ret)
3501 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3502
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3503 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3504 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3505 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3506 btrfs_orphan_del(NULL, inode);
3656 continue; 3507 btrfs_free_block_rsv(root, rsv);
3508 goto no_delete;
3509 }
3510
3511 trans = btrfs_start_transaction(root, 0);
3512 if (IS_ERR(trans)) {
3513 btrfs_orphan_del(NULL, inode);
3514 btrfs_free_block_rsv(root, rsv);
3515 goto no_delete;
3657 } 3516 }
3658 3517
3518 trans->block_rsv = rsv;
3519
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3520 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3521 if (ret != -EAGAIN)
3661 break; 3522 break;
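
The eviction loop above now reserves its own slack per pass: refill the local rsv to min_size without flushing, fall back to migrating the bytes out of the global reserve, and only if both fail give up and leave the orphan item for the next mount to finish the truncate. A condensed userspace sketch of that decision (illustrative names, no real reservations behind them):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t reserved; };

static int migrate(struct rsv *from, struct rsv *to, uint64_t bytes)
{
        if (from->reserved < bytes)
                return -ENOSPC;
        from->reserved -= bytes;
        to->reserved   += bytes;
        return 0;
}

static int reserve_slack(struct rsv *rsv, struct rsv *global,
                         uint64_t min_size, int refill_ret)
{
        int ret = refill_ret;            /* result of the no-flush refill */

        if (ret)                         /* steal from the global reserve */
                ret = migrate(global, rsv, min_size);
        if (ret)
                fprintf(stderr, "no space for delete, truncate deferred to next mount\n");
        return ret;
}
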
@@ -3664,14 +3525,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3525 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3526 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3527 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3528 }
3669 3529
3530 btrfs_free_block_rsv(root, rsv);
3531
3670 if (ret == 0) { 3532 if (ret == 0) {
3533 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3534 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3535 BUG_ON(ret);
3673 } 3536 }
3674 3537
3538 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3539 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3540 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3541 btrfs_return_ino(root, btrfs_ino(inode));
@@ -5795,8 +5659,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5659 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5660 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5661 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5662 err = btrfs_update_inode_fallback(trans, root, inode);
5799 err = ret;
5800 goto out; 5663 goto out;
5801 } 5664 }
5802 5665
@@ -5834,7 +5697,7 @@ again:
5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5697 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5698 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5699 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5837 btrfs_update_inode(trans, root, inode); 5700 btrfs_update_inode_fallback(trans, root, inode);
5838 ret = 0; 5701 ret = 0;
5839out_unlock: 5702out_unlock:
5840 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5703 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6289,7 +6152,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6152{
6290 struct extent_io_tree *tree; 6153 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6154 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6155 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6156}
6294 6157
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6158static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6541,6 +6404,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6404 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6405 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6406 u64 mask = root->sectorsize - 1;
6407 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6408
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6409 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6410 if (ret)
@@ -6588,19 +6452,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6452 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6453 if (!rsv)
6590 return -ENOMEM; 6454 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6455 rsv->size = min_size;
6592 6456
6457 /*
6458 * 1 for the truncate slack space
6459 * 1 for the orphan item we're going to add
6460 * 1 for the orphan item deletion
6461 * 1 for updating the inode.
6462 */
6593 trans = btrfs_start_transaction(root, 4); 6463 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6464 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6465 err = PTR_ERR(trans);
6596 goto out; 6466 goto out;
6597 } 6467 }
6598 6468
6599 /* 6469 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6470 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6471 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6472 BUG_ON(ret);
6605 6473
6606 ret = btrfs_orphan_add(trans, inode); 6474 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6477,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6477 goto out;
6610 } 6478 }
6611 6479
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6480 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6481 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6482 * but that is only tested during the last file release. That
@@ -6645,20 +6498,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6498 btrfs_add_ordered_operation(trans, root, inode);
6646 6499
6647 while (1) { 6500 while (1) {
6501 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6502 if (ret) {
6503 /*
6504 * This can only happen with the original transaction we
6505 * started above, every other time we shouldn't have a
6506 * transaction started yet.
6507 */
6508 if (ret == -EAGAIN)
6509 goto end_trans;
6510 err = ret;
6511 break;
6512 }
6513
6648 if (!trans) { 6514 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6515 /* Just need the 1 for updating the inode */
6516 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6517 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6518 err = PTR_ERR(trans);
6652 goto out; 6519 goto out;
6653 } 6520 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6521 }
6661 6522
6523 trans->block_rsv = rsv;
6524
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6525 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6526 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6527 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6536,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6536 err = ret;
6674 break; 6537 break;
6675 } 6538 }
6676 6539end_trans:
6677 nr = trans->blocks_used; 6540 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6541 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6542 trans = NULL;
@@ -6693,14 +6556,16 @@ static int btrfs_truncate(struct inode *inode)
6693 ret = btrfs_orphan_del(NULL, inode); 6556 ret = btrfs_orphan_del(NULL, inode);
6694 } 6557 }
6695 6558
6696 trans->block_rsv = &root->fs_info->trans_block_rsv; 6559 if (trans) {
6697 ret = btrfs_update_inode(trans, root, inode); 6560 trans->block_rsv = &root->fs_info->trans_block_rsv;
6698 if (ret && !err) 6561 ret = btrfs_update_inode(trans, root, inode);
6699 err = ret; 6562 if (ret && !err)
6563 err = ret;
6700 6564
6701 nr = trans->blocks_used; 6565 nr = trans->blocks_used;
6702 ret = btrfs_end_transaction_throttle(trans, root); 6566 ret = btrfs_end_transaction_throttle(trans, root);
6703 btrfs_btree_balance_dirty(root, nr); 6567 btrfs_btree_balance_dirty(root, nr);
6568 }
6704 6569
6705out: 6570out:
6706 btrfs_free_block_rsv(root, rsv); 6571 btrfs_free_block_rsv(root, rsv);
@@ -6755,9 +6620,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6620 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6621 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6622 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6623 ei->disk_i_size = 0;
6760 ei->flags = 0; 6624 ei->flags = 0;
6625 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6626 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6627 ei->last_unlink_trans = 0;
6763 6628
@@ -6769,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6769 ei->orphan_meta_reserved = 0; 6634 ei->orphan_meta_reserved = 0;
6770 ei->dummy_inode = 0; 6635 ei->dummy_inode = 0;
6771 ei->in_defrag = 0; 6636 ei->in_defrag = 0;
6637 ei->delalloc_meta_reserved = 0;
6772 ei->force_compress = BTRFS_COMPRESS_NONE; 6638 ei->force_compress = BTRFS_COMPRESS_NONE;
6773 6639
6774 ei->delayed_node = NULL; 6640 ei->delayed_node = NULL;
@@ -6803,6 +6669,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6669 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6670 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6671 WARN_ON(BTRFS_I(inode)->reserved_extents);
6672 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6673 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6674
6807 /* 6675 /*
6808 * This can happen where we create an inode, but somebody else also 6676 * This can happen where we create an inode, but somebody else also
@@ -6926,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6926 struct dentry *dentry, struct kstat *stat) 6794 struct dentry *dentry, struct kstat *stat)
6927{ 6795{
6928 struct inode *inode = dentry->d_inode; 6796 struct inode *inode = dentry->d_inode;
6797 u32 blocksize = inode->i_sb->s_blocksize;
6798
6929 generic_fillattr(inode, stat); 6799 generic_fillattr(inode, stat);
6930 stat->dev = BTRFS_I(inode)->root->anon_dev; 6800 stat->dev = BTRFS_I(inode)->root->anon_dev;
6931 stat->blksize = PAGE_CACHE_SIZE; 6801 stat->blksize = PAGE_CACHE_SIZE;
6932 stat->blocks = (inode_get_bytes(inode) + 6802 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6933 BTRFS_I(inode)->delalloc_bytes) >> 9; 6803 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6934 return 0; 6804 return 0;
6935} 6805}
6936 6806
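
The new stat->blocks computation above rounds both components up to the filesystem block size before converting to 512-byte units, so even one dirty, not-yet-allocated byte accounts for a whole block. A quick worked example (userspace, 4 KiB blocks assumed):

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t blocksize = 4096;
        uint64_t on_disk   = 6000;   /* inode_get_bytes() */
        uint64_t delalloc  = 1;      /* delalloc_bytes: dirty, unallocated */

        uint64_t blocks = (ALIGN_UP(on_disk, blocksize) +
                           ALIGN_UP(delalloc, blocksize)) >> 9;

        printf("%llu\n", (unsigned long long)blocks);  /* (8192 + 4096) / 512 = 24 */
        return 0;
}
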
@@ -7420,7 +7290,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7290 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7291 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7292 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7293 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7294 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7295 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba..72d461656f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
@@ -860,7 +870,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
861 struct page *page; 871 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 873 start_index + i, mask);
864 if (!page) 874 if (!page)
865 break; 875 break;
866 876
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
975 u64 features; 986 u64 features;
976 u64 last_len = 0; 987 u64 last_len = 0;
977 u64 skip = 0; 988 u64 skip = 0;
978 u64 defrag_end = 0; 989 u64 defrag_end = 0;
979 u64 newer_off = range->start; 990 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
982 int ret; 993 int ret;
983 int defrag_count = 0; 994 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1000 struct page **pages = NULL;
989 1001
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
998 } 1010 }
999 1011
1000 if (inode->i_size == 0) 1012 if (isize == 0)
1001 return 0; 1013 return 0;
1002 1014
1003 /* 1015 /*
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1025 ra = &file->f_ra;
1014 } 1026 }
1015 1027
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1029 GFP_NOFS);
1018 if (!pages) { 1030 if (!pages) {
1019 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1034
1023 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1039 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1041 }
1030 1042
1031 if (newer_than) { 1043 if (newer_than) {
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1039 */ 1051 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1053 } else
1043 goto out_ra; 1054 goto out_ra;
1044 } else { 1055 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1057 }
1047 if (!max_to_defrag) 1058 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1049 1060
1050 /* 1061 /*
 1051	 * make writeback start from i, so the defrag range can be	 1062	 * make writeback start from i, so the defrag range can be
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1090 i = max(i + 1, next);
1080 continue; 1091 continue;
1081 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1084 1104
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1086 1111
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1113 if (ret < 0)
1089 goto out_ra; 1114 goto out_ra;
1090 1115
1091 defrag_count += ret; 1116 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1118
1095 if (newer_than) { 1119 if (newer_than) {
1096 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1129 if (!ret) {
1106 range->start = newer_off; 1130 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1132 } else {
1110 break; 1133 break;
1111 } 1134 }
1112 } else { 1135 } else {
1113 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1114 } 1143 }
1115 } 1144 }
1116 1145
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1137 } 1166 }
1138 1167
1139 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1173 }
1145 1174
1146 if (!file) 1175 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1176
1150out_ra: 1177out_ra:
1151 if (!file) 1178 if (!file)
@@ -1189,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1189 *devstr = '\0'; 1216 *devstr = '\0';
1190 devstr = vol_args->name; 1217 devstr = vol_args->name;
1191 devid = simple_strtoull(devstr, &end, 10); 1218 devid = simple_strtoull(devstr, &end, 10);
1192 printk(KERN_INFO "resizing devid %llu\n", 1219 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1193 (unsigned long long)devid); 1220 (unsigned long long)devid);
1194 } 1221 }
1195 device = btrfs_find_device(root, devid, NULL, NULL); 1222 device = btrfs_find_device(root, devid, NULL, NULL);
1196 if (!device) { 1223 if (!device) {
1197 printk(KERN_INFO "resizer unable to find device %llu\n", 1224 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1198 (unsigned long long)devid); 1225 (unsigned long long)devid);
1199 ret = -EINVAL; 1226 ret = -EINVAL;
1200 goto out_unlock; 1227 goto out_unlock;
@@ -1240,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1240 do_div(new_size, root->sectorsize); 1267 do_div(new_size, root->sectorsize);
1241 new_size *= root->sectorsize; 1268 new_size *= root->sectorsize;
1242 1269
1243 printk(KERN_INFO "new size for %s is %llu\n", 1270 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1244 device->name, (unsigned long long)new_size); 1271 device->name, (unsigned long long)new_size);
1245 1272
1246 if (new_size > old_size) { 1273 if (new_size > old_size) {
@@ -1251,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1251 } 1278 }
1252 ret = btrfs_grow_device(trans, device, new_size); 1279 ret = btrfs_grow_device(trans, device, new_size);
1253 btrfs_commit_transaction(trans, root); 1280 btrfs_commit_transaction(trans, root);
1254 } else { 1281 } else if (new_size < old_size) {
1255 ret = btrfs_shrink_device(device, new_size); 1282 ret = btrfs_shrink_device(device, new_size);
1256 } 1283 }
1257 1284
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2588 } 2615 }
2589 2616
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2631 btrfs_free_path(path);
2605 2632
2606 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2891,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2891 return ret;
2865} 2892}
2866 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 u64 rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] -
2934 (u64)(unsigned long)ipath->fspath->val;
2935 ipath->fspath->val[i] = rel_ptr;
2936 }
2937
2938 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2939 (void *)(unsigned long)ipath->fspath, size);
2940 if (ret) {
2941 ret = -EFAULT;
2942 goto out;
2943 }
2944
2945out:
2946 btrfs_free_path(path);
2947 free_ipath(ipath);
2948 kfree(ipa);
2949
2950 return ret;
2951}
2952
2953static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2954{
2955 struct btrfs_data_container *inodes = ctx;
2956 const size_t c = 3 * sizeof(u64);
2957
2958 if (inodes->bytes_left >= c) {
2959 inodes->bytes_left -= c;
2960 inodes->val[inodes->elem_cnt] = inum;
2961 inodes->val[inodes->elem_cnt + 1] = offset;
2962 inodes->val[inodes->elem_cnt + 2] = root;
2963 inodes->elem_cnt += 3;
2964 } else {
2965 inodes->bytes_missing += c - inodes->bytes_left;
2966 inodes->bytes_left = 0;
2967 inodes->elem_missed += 3;
2968 }
2969
2970 return 0;
2971}
2972
2973static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2974 void __user *arg)
2975{
2976 int ret = 0;
2977 int size;
2978 u64 extent_offset;
2979 struct btrfs_ioctl_logical_ino_args *loi;
2980 struct btrfs_data_container *inodes = NULL;
2981 struct btrfs_path *path = NULL;
2982 struct btrfs_key key;
2983
2984 if (!capable(CAP_SYS_ADMIN))
2985 return -EPERM;
2986
2987 loi = memdup_user(arg, sizeof(*loi));
2988 if (IS_ERR(loi)) {
2989 ret = PTR_ERR(loi);
2990 loi = NULL;
2991 goto out;
2992 }
2993
2994 path = btrfs_alloc_path();
2995 if (!path) {
2996 ret = -ENOMEM;
2997 goto out;
2998 }
2999
3000 size = min_t(u32, loi->size, 4096);
3001 inodes = init_data_container(size);
3002 if (IS_ERR(inodes)) {
3003 ret = PTR_ERR(inodes);
3004 inodes = NULL;
3005 goto out;
3006 }
3007
3008 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3009
3010 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3011 ret = -ENOENT;
3012 if (ret < 0)
3013 goto out;
3014
3015 extent_offset = loi->logical - key.objectid;
3016 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3017 extent_offset, build_ino_list, inodes);
3018
3019 if (ret < 0)
3020 goto out;
3021
3022 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3023 (void *)(unsigned long)inodes, size);
3024 if (ret)
3025 ret = -EFAULT;
3026
3027out:
3028 btrfs_free_path(path);
3029 kfree(inodes);
3030 kfree(loi);
3031
3032 return ret;
3033}
3034
2867long btrfs_ioctl(struct file *file, unsigned int 3035long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3036 cmd, unsigned long arg)
2869{ 3037{
@@ -2921,6 +3089,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3089 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3090 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3091 return btrfs_ioctl_ino_lookup(file, argp);
3092 case BTRFS_IOC_INO_PATHS:
3093 return btrfs_ioctl_ino_to_path(root, argp);
3094 case BTRFS_IOC_LOGICAL_INO:
3095 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3096 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3097 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3098 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb..252ae9915de 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
278 struct btrfs_ioctl_ino_path_args)
279
251#endif 280#endif
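
Similarly, a hedged user-space sketch (editorial, not part of the patch) of the inode-to-path direction: btrfs_ioctl_ino_to_path() rewrites each fspath->val[i] as an offset relative to the start of the val[] array before copying it out, so user space finds the NUL-terminated path strings at that offset from the val base. The helper name below is illustrative only.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>
#include <sys/ioctl.h>
/* assumes the BTRFS_IOC_INO_PATHS definitions from fs/btrfs/ioctl.h above */

static int ino_to_paths(int fd, uint64_t inum)
{
	uint64_t buf[512] = { 0 };	/* 4096 bytes, u64-aligned */
	struct btrfs_data_container *fspath = (struct btrfs_data_container *)buf;
	struct btrfs_ioctl_ino_path_args ipa;
	uint32_t i;

	memset(&ipa, 0, sizeof(ipa));
	ipa.inum = inum;		/* inode number to resolve */
	ipa.size = sizeof(buf);
	ipa.fspath = (uintptr_t)buf;

	if (ioctl(fd, BTRFS_IOC_INO_PATHS, &ipa) < 0)
		return -errno;

	/* each val[i] is now an offset from the start of val[] */
	for (i = 0; i < fspath->elem_cnt; i++)
		printf("%s\n", (char *)fspath->val + fspath->val[i]);
	if (fspath->elem_missed)
		printf("%u paths did not fit into the buffer\n",
		       fspath->elem_missed);
	return 0;
}
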
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e..f38e452486b 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 00000000000..2373b39a132
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
 48 * will have its own read pointer and all disks will be utilized in parallel.
 49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
 54 * than 2 readaheads started one after another.
55 */
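
As a rough illustration of the interface described above (editorial sketch, not part of the patch; the key range here is just a placeholder covering the whole tree), a caller would do something like:

static void reada_whole_tree_example(struct btrfs_root *root)
{
	struct reada_control *rc;
	struct btrfs_key key_start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key key_end = {
		.objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1
	};

	rc = btrfs_reada_add(root, &key_start, &key_end);	/* kick it off */
	if (IS_ERR(rc))
		return;

	/* either block until everything requested has been read ... */
	btrfs_reada_wait(rc);

	/*
	 * ... or, instead of waiting, drop the handle and let the
	 * readahead finish in the background:
	 *
	 *	btrfs_reada_detach(rc);
	 */
}
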
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
 150 * effectively ignoring the content. As a next step we could
 151 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
 156 * this is the error case: the extent buffer has not been
 157 * read correctly. We won't access anything from it and
 158 * just clean up our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
 233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
458static void reada_kref_dummy(struct kref *kr)
459{
460}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
 644 * a contiguous block of extents, we could also coalesce them or use
 645 * plugging to speed things up.
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
 745 * more cores, we break out of the loop above after 10000 iterations and
 746 * enqueue the rest to workers to finish it. This distributes the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
855 for (i = 0; i < re->nzones; ++i) {
856 printk(KERN_CONT " zone %llu-%llu devs",
857 re->zones[i]->start,
858 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
864 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273..dff29d5e151 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2043 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2044 trans->block_rsv = rc->block_rsv;
2043 2045
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2046 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2047 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2048 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2049 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2153again:
2153 if (!err) { 2154 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2155 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2156 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2157 if (ret)
2158 err = ret; 2158 err = ret;
2159 } 2159 }
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2428
2429 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2430 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2431 if (ret) {
2432 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2922 unsigned long last_index;
2923 struct page *page; 2923 struct page *page;
2924 struct file_ra_state *ra; 2924 struct file_ra_state *ra;
2925 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2926 int nr = 0;
2926 int ret = 0; 2927 int ret = 0;
2927 2928
@@ -2956,7 +2957,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2957 ra, NULL, index,
2957 last_index + 1 - index); 2958 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2959 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2960 mask);
2960 if (!page) { 2961 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2962 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2963 PAGE_CACHE_SIZE);
@@ -3323,8 +3324,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3324 }
3324 3325
3325 key.objectid = ref_objectid; 3326 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3327 key.type = BTRFS_EXTENT_DATA_KEY;
3328 if (ref_offset > ((u64)-1 << 32))
3329 key.offset = 0;
3330 else
3331 key.offset = ref_offset;
3328 3332
3329 path->search_commit_root = 1; 3333 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3334 path->skip_locking = 1;
@@ -3645,14 +3649,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3649 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3650 * is no reservation in transaction handle.
3647 */ 3651 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3652 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3653 rc->extent_root->nodesize * 256);
3650 if (ret) 3654 if (ret)
3651 return ret; 3655 return ret;
3652 3656
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3657 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3658 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3659 rc->extents_found = 0;
@@ -3777,8 +3778,7 @@ restart:
3777 } 3778 }
3778 } 3779 }
3779 3780
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3781 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3782 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3783 if (ret != -EAGAIN) {
3784 err = ret; 3784 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5..c27bcb67f33 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,366 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
264 ret = paths_from_inode(inum, ipath);
265
266 if (ret < 0)
267 goto err;
268
269 /*
 270 * we deliberately ignore the fact that ipath might have been too small to
271 * hold all of the paths here
272 */
273 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
274 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
275 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
276 "length %llu, links %u (path: %s)\n", swarn->errstr,
277 swarn->logical, swarn->dev->name,
278 (unsigned long long)swarn->sector, root, inum, offset,
279 min(isize - offset, (u64)PAGE_SIZE), nlink,
280 (char *)(unsigned long)ipath->fspath->val[i]);
281
282 free_ipath(ipath);
283 return 0;
284
285err:
286 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
287 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
288 "resolving failed with ret=%d\n", swarn->errstr,
289 swarn->logical, swarn->dev->name,
290 (unsigned long long)swarn->sector, root, inum, offset, ret);
291
292 free_ipath(ipath);
293 return 0;
294}
295
296static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
297 int ix)
298{
299 struct btrfs_device *dev = sbio->sdev->dev;
300 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
301 struct btrfs_path *path;
302 struct btrfs_key found_key;
303 struct extent_buffer *eb;
304 struct btrfs_extent_item *ei;
305 struct scrub_warning swarn;
306 u32 item_size;
307 int ret;
308 u64 ref_root;
309 u8 ref_level;
310 unsigned long ptr = 0;
311 const int bufsize = 4096;
312 u64 extent_offset;
313
314 path = btrfs_alloc_path();
315
316 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
317 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
318 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
319 swarn.logical = sbio->logical + ix * PAGE_SIZE;
320 swarn.errstr = errstr;
321 swarn.dev = dev;
322 swarn.msg_bufsize = bufsize;
323 swarn.scratch_bufsize = bufsize;
324
325 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
326 goto out;
327
328 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
329 if (ret < 0)
330 goto out;
331
332 extent_offset = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset;
334
335 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]);
338
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do {
341 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
342 &ref_root, &ref_level);
343 printk(KERN_WARNING "%s at logical %llu on dev %s, "
344 "sector %llu: metadata %s (level %d) in tree "
345 "%llu\n", errstr, swarn.logical, dev->name,
346 (unsigned long long)swarn.sector,
347 ref_level ? "node" : "leaf",
348 ret < 0 ? -1 : ref_level,
349 ret < 0 ? -1 : ref_root);
350 } while (ret != 1);
351 } else {
352 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset,
355 scrub_print_warning_inode, &swarn);
356 }
357
358out:
359 btrfs_free_path(path);
360 kfree(swarn.scratch_buf);
361 kfree(swarn.msg_buf);
362}
363
364static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
365{
366 struct page *page = NULL;
367 unsigned long index;
368 struct scrub_fixup_nodatasum *fixup = ctx;
369 int ret;
370 int corrected = 0;
371 struct btrfs_key key;
372 struct inode *inode = NULL;
373 u64 end = offset + PAGE_SIZE - 1;
374 struct btrfs_root *local_root;
375
376 key.objectid = root;
377 key.type = BTRFS_ROOT_ITEM_KEY;
378 key.offset = (u64)-1;
379 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
380 if (IS_ERR(local_root))
381 return PTR_ERR(local_root);
382
383 key.type = BTRFS_INODE_ITEM_KEY;
384 key.objectid = inum;
385 key.offset = 0;
386 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
387 if (IS_ERR(inode))
388 return PTR_ERR(inode);
389
390 index = offset >> PAGE_CACHE_SHIFT;
391
392 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
393 if (!page) {
394 ret = -ENOMEM;
395 goto out;
396 }
397
398 if (PageUptodate(page)) {
399 struct btrfs_mapping_tree *map_tree;
400 if (PageDirty(page)) {
401 /*
402 * we need to write the data to the defect sector. the
403 * data that was in that sector is not in memory,
404 * because the page was modified. we must not write the
405 * modified page to that sector.
406 *
407 * TODO: what could be done here: wait for the delalloc
408 * runner to write out that page (might involve
409 * COW) and see whether the sector is still
410 * referenced afterwards.
411 *
 412 * For the time being, we'll treat this error as
 413 * uncorrectable, although there is a chance that a
414 * later scrub will find the bad sector again and that
415 * there's no dirty page in memory, then.
416 */
417 ret = -EIO;
418 goto out;
419 }
420 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
421 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
422 fixup->logical, page,
423 fixup->mirror_num);
424 unlock_page(page);
425 corrected = !ret;
426 } else {
427 /*
428 * we need to get good data first. the general readpage path
429 * will call repair_io_failure for us, we just have to make
430 * sure we read the bad mirror.
431 */
432 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
433 EXTENT_DAMAGED, GFP_NOFS);
434 if (ret) {
435 /* set_extent_bits should give proper error */
436 WARN_ON(ret > 0);
437 if (ret > 0)
438 ret = -EFAULT;
439 goto out;
440 }
441
442 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
443 btrfs_get_extent,
444 fixup->mirror_num);
445 wait_on_page_locked(page);
446
447 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
448 end, EXTENT_DAMAGED, 0, NULL);
449 if (!corrected)
450 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
451 EXTENT_DAMAGED, GFP_NOFS);
452 }
453
454out:
455 if (page)
456 put_page(page);
457 if (inode)
458 iput(inode);
459
460 if (ret < 0)
461 return ret;
462
463 if (ret == 0 && corrected) {
464 /*
465 * we only need to call readpage for one of the inodes belonging
466 * to this extent. so make iterate_extent_inodes stop
467 */
468 return 1;
469 }
470
471 return -EIO;
472}
473
474static void scrub_fixup_nodatasum(struct btrfs_work *work)
475{
476 int ret;
477 struct scrub_fixup_nodatasum *fixup;
478 struct scrub_dev *sdev;
479 struct btrfs_trans_handle *trans = NULL;
480 struct btrfs_fs_info *fs_info;
481 struct btrfs_path *path;
482 int uncorrectable = 0;
483
484 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
485 sdev = fixup->sdev;
486 fs_info = fixup->root->fs_info;
487
488 path = btrfs_alloc_path();
489 if (!path) {
490 spin_lock(&sdev->stat_lock);
491 ++sdev->stat.malloc_errors;
492 spin_unlock(&sdev->stat_lock);
493 uncorrectable = 1;
494 goto out;
495 }
496
497 trans = btrfs_join_transaction(fixup->root);
498 if (IS_ERR(trans)) {
499 uncorrectable = 1;
500 goto out;
501 }
502
503 /*
504 * the idea is to trigger a regular read through the standard path. we
505 * read a page from the (failed) logical address by specifying the
506 * corresponding copynum of the failed sector. thus, that readpage is
507 * expected to fail.
508 * that is the point where on-the-fly error correction will kick in
509 * (once it's finished) and rewrite the failed sector if a good copy
510 * can be found.
511 */
512 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
513 path, scrub_fixup_readpage,
514 fixup);
515 if (ret < 0) {
516 uncorrectable = 1;
517 goto out;
518 }
519 WARN_ON(ret != 1);
520
521 spin_lock(&sdev->stat_lock);
522 ++sdev->stat.corrected_errors;
523 spin_unlock(&sdev->stat_lock);
524
525out:
526 if (trans && !IS_ERR(trans))
527 btrfs_end_transaction(trans, fixup->root);
528 if (uncorrectable) {
529 spin_lock(&sdev->stat_lock);
530 ++sdev->stat.uncorrectable_errors;
531 spin_unlock(&sdev->stat_lock);
532 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
533 "(nodatasum) error at logical %llu\n",
534 fixup->logical);
535 }
536
537 btrfs_free_path(path);
538 kfree(fixup);
539
 540 /* see the caller for why we pretend to be paused in the scrub counters */
541 mutex_lock(&fs_info->scrub_lock);
542 atomic_dec(&fs_info->scrubs_running);
543 atomic_dec(&fs_info->scrubs_paused);
544 mutex_unlock(&fs_info->scrub_lock);
545 atomic_dec(&sdev->fixup_cnt);
546 wake_up(&fs_info->scrub_pause_wait);
547 wake_up(&sdev->list_wait);
548}
549
198/* 550/*
199 * scrub_recheck_error gets called when either verification of the page 551 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 552 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 553 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 554 * one may be bad
203 */ 555 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 556static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 557{
558 struct scrub_dev *sdev = sbio->sdev;
559 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
560 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
561 DEFAULT_RATELIMIT_BURST);
562
206 if (sbio->err) { 563 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 564 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 565 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 566 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 567 return 0;
212 } 568 }
569 if (__ratelimit(&_rs))
570 scrub_print_warning("i/o error", sbio, ix);
571 } else {
572 if (__ratelimit(&_rs))
573 scrub_print_warning("checksum error", sbio, ix);
213 } 574 }
214 575
576 spin_lock(&sdev->stat_lock);
577 ++sdev->stat.read_errors;
578 spin_unlock(&sdev->stat_lock);
579
215 scrub_fixup(sbio, ix); 580 scrub_fixup(sbio, ix);
581 return 1;
216} 582}
217 583
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 584static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 616 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 617 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 618 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 619 struct btrfs_bio *bbio = NULL;
620 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 621 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 622 u64 length;
256 int i; 623 int i;
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 626
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 627 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 628 (sbio->spag[ix].have_csum == 0)) {
629 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
630 if (!fixup)
631 goto uncorrectable;
632 fixup->sdev = sdev;
633 fixup->logical = logical;
634 fixup->root = fs_info->extent_root;
635 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 636 /*
263 * nodatasum, don't try to fix anything 637 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 638 * completing as long as a fixup worker is running. we must also
265 * writeback 639 * increment scrubs_paused to prevent deadlocking on pause
640 * requests used for transactions commits (as the worker uses a
641 * transaction context). it is safe to regard the fixup worker
642 * as paused for all matters practical. effectively, we only
643 * avoid cancellation requests from completing.
266 */ 644 */
267 goto uncorrectable; 645 mutex_lock(&fs_info->scrub_lock);
646 atomic_inc(&fs_info->scrubs_running);
647 atomic_inc(&fs_info->scrubs_paused);
648 mutex_unlock(&fs_info->scrub_lock);
649 atomic_inc(&sdev->fixup_cnt);
650 fixup->work.func = scrub_fixup_nodatasum;
651 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
652 return;
268 } 653 }
269 654
270 length = PAGE_SIZE; 655 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 656 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 657 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 658 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 659 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 660 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 661 (unsigned long long)logical);
277 WARN_ON(1); 662 WARN_ON(1);
663 kfree(bbio);
278 return; 664 return;
279 } 665 }
280 666
281 if (multi->num_stripes == 1) 667 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 668 /* there aren't any replicas */
283 goto uncorrectable; 669 goto uncorrectable;
284 670
285 /* 671 /*
286 * first find a good copy 672 * first find a good copy
287 */ 673 */
288 for (i = 0; i < multi->num_stripes; ++i) { 674 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 675 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 676 continue;
291 677
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 678 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 679 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 680 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 681 /* I/O-error, this is not a good copy */
296 continue; 682 continue;
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 685 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 686 break;
301 } 687 }
302 if (i == multi->num_stripes) 688 if (i == bbio->num_stripes)
303 goto uncorrectable; 689 goto uncorrectable;
304 690
305 if (!sdev->readonly) { 691 if (!sdev->readonly) {
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 700 }
315 } 701 }
316 702
317 kfree(multi); 703 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 704 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 705 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
321 707
322 if (printk_ratelimit()) 708 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 709 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 710 return;
326 711
327uncorrectable: 712uncorrectable:
328 kfree(multi); 713 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 714 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 715 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 716 spin_unlock(&sdev->stat_lock);
332 717
333 if (printk_ratelimit()) 718 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 719 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 720}
337 721
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 722static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 766 int ret;
383 767
384 if (sbio->err) { 768 if (sbio->err) {
769 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 770 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 771 ret |= scrub_recheck_error(sbio, i);
772 if (!ret) {
773 spin_lock(&sdev->stat_lock);
774 ++sdev->stat.unverified_errors;
775 spin_unlock(&sdev->stat_lock);
776 }
387 777
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 778 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 779 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 786 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 787 bi->bv_len = PAGE_SIZE;
398 } 788 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 789 goto out;
404 } 790 }
405 for (i = 0; i < sbio->count; ++i) { 791 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 806 WARN_ON(1);
421 } 807 }
422 kunmap_atomic(buffer, KM_USER0); 808 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 809 if (ret) {
424 scrub_recheck_error(sbio, i); 810 ret = scrub_recheck_error(sbio, i);
811 if (!ret) {
812 spin_lock(&sdev->stat_lock);
813 ++sdev->stat.unverified_errors;
814 spin_unlock(&sdev->stat_lock);
815 }
816 }
425 } 817 }
426 818
427out: 819out:
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
557static int scrub_submit(struct scrub_dev *sdev) 949static int scrub_submit(struct scrub_dev *sdev)
558{ 950{
559 struct scrub_bio *sbio; 951 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562 952
563 if (sdev->curr == -1) 953 if (sdev->curr == -1)
564 return 0; 954 return 0;
565 955
566 sbio = sdev->bios[sdev->curr]; 956 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0; 957 sbio->err = 0;
593 sdev->curr = -1; 958 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight); 959 atomic_inc(&sdev->in_flight);
595 960
596 submit_bio(READ, bio); 961 submit_bio(READ, sbio->bio);
597 962
598 return 0; 963 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604} 964}
605 965
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 966static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 967 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 968 u8 *csum, int force)
609{ 969{
610 struct scrub_bio *sbio; 970 struct scrub_bio *sbio;
971 struct page *page;
972 int ret;
611 973
612again: 974again:
613 /* 975 /*
@@ -628,12 +990,22 @@ again:
628 } 990 }
629 sbio = sdev->bios[sdev->curr]; 991 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) { 992 if (sbio->count == 0) {
993 struct bio *bio;
994
631 sbio->physical = physical; 995 sbio->physical = physical;
632 sbio->logical = logical; 996 sbio->logical = logical;
997 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
998 if (!bio)
999 return -ENOMEM;
1000
1001 bio->bi_private = sbio;
1002 bio->bi_end_io = scrub_bio_end_io;
1003 bio->bi_bdev = sdev->dev->bdev;
1004 bio->bi_sector = sbio->physical >> 9;
1005 sbio->err = 0;
1006 sbio->bio = bio;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1007 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1008 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev); 1009 ret = scrub_submit(sdev);
638 if (ret) 1010 if (ret)
639 return ret; 1011 return ret;
@@ -643,6 +1015,20 @@ again:
643 sbio->spag[sbio->count].generation = gen; 1015 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0; 1016 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num; 1017 sbio->spag[sbio->count].mirror_num = mirror_num;
1018
1019 page = alloc_page(GFP_NOFS);
1020 if (!page)
1021 return -ENOMEM;
1022
1023 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1024 if (!ret) {
1025 __free_page(page);
1026 ret = scrub_submit(sdev);
1027 if (ret)
1028 return ret;
1029 goto again;
1030 }
1031
646 if (csum) { 1032 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1; 1033 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1034 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
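With the bio now allocated lazily when the first page of a batch arrives, and pages appended one at a time, scrub_submit() loses its own allocation and error path: if bio_add_page() refuses a page, the current batch is flushed via scrub_submit() and the add is retried through the again label. A tiny runnable sketch of that fill-until-full, flush-and-retry loop, using hypothetical batch_add()/batch_flush() helpers in place of the bio API:

#include <stdio.h>

#define BATCH_MAX 4

struct batch {
        int items[BATCH_MAX];
        int count;
};

/* Mirrors the bio_add_page() contract: 0 means "no room, flush and retry",
 * non-zero means the item was accepted into the current batch. */
static int batch_add(struct batch *b, int item)
{
        if (b->count == BATCH_MAX)
                return 0;
        b->items[b->count++] = item;
        return 1;
}

static void batch_flush(struct batch *b)
{
        printf("submitting %d items\n", b->count);      /* stands in for submit_bio() */
        b->count = 0;
}

int main(void)
{
        struct batch b = { .count = 0 };

        for (int i = 0; i < 10; i++) {
again:
                if (!batch_add(&b, i)) {
                        batch_flush(&b);        /* like calling scrub_submit() on a full bio */
                        goto again;             /* retry the same page afterwards */
                }
        }
        if (b.count)
                batch_flush(&b);                /* submit the final partial batch */
        return 0;
}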
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1087
702/* scrub extent tries to collect up to 64 kB for each bio */ 1088/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1089static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1090 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1091{
706 int ret; 1092 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1093 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1127 int slot;
742 int i; 1128 int i;
743 u64 nstripes; 1129 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1130 struct extent_buffer *l;
746 struct btrfs_key key; 1131 struct btrfs_key key;
747 u64 physical; 1132 u64 physical;
748 u64 logical; 1133 u64 logical;
749 u64 generation; 1134 u64 generation;
750 u64 mirror_num; 1135 int mirror_num;
1136 struct reada_control *reada1;
1137 struct reada_control *reada2;
1138 struct btrfs_key key_start;
1139 struct btrfs_key key_end;
751 1140
752 u64 increment = map->stripe_len; 1141 u64 increment = map->stripe_len;
753 u64 offset; 1142 u64 offset;
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1147 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1148 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1149 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1150 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1152 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1153 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1154 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1155 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1156 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1157 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1158 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1159 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1160 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1161 mirror_num = num % map->num_stripes + 1;
773 } else { 1162 } else {
774 increment = map->stripe_len; 1163 increment = map->stripe_len;
775 mirror_num = 0; 1164 mirror_num = 1;
776 } 1165 }
777 1166
778 path = btrfs_alloc_path(); 1167 path = btrfs_alloc_path();
779 if (!path) 1168 if (!path)
780 return -ENOMEM; 1169 return -ENOMEM;
781 1170
782 path->reada = 2;
783 path->search_commit_root = 1; 1171 path->search_commit_root = 1;
784 path->skip_locking = 1; 1172 path->skip_locking = 1;
785 1173
786 /* 1174 /*
787 * find all extents for each stripe and just read them to get 1175 * trigger the readahead for extent tree csum tree and wait for
788 * them into the page cache 1176 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1177 * to not hold off transaction commits
790 */ 1178 */
791 logical = base + offset; 1179 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1180
817 break; 1181 wait_event(sdev->list_wait,
818 } 1182 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1183 atomic_inc(&fs_info->scrubs_paused);
1184 wake_up(&fs_info->scrub_pause_wait);
820 1185
821 if (key.objectid >= logical + map->stripe_len) 1186 /* FIXME it might be better to start readahead at commit root */
822 break; 1187 key_start.objectid = logical;
1188 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1189 key_start.offset = (u64)0;
1190 key_end.objectid = base + offset + nstripes * increment;
1191 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1192 key_end.offset = (u64)0;
1193 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1194
1195 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1196 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1197 key_start.offset = logical;
1198 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1199 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1200 key_end.offset = base + offset + nstripes * increment;
1201 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1202
1203 if (!IS_ERR(reada1))
1204 btrfs_reada_wait(reada1);
1205 if (!IS_ERR(reada2))
1206 btrfs_reada_wait(reada2);
823 1207
824 path->slots[0]++; 1208 mutex_lock(&fs_info->scrub_lock);
825 } 1209 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1210 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1211 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1212 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1213 mutex_lock(&fs_info->scrub_lock);
830 } 1214 }
1215 atomic_dec(&fs_info->scrubs_paused);
1216 mutex_unlock(&fs_info->scrub_lock);
1217 wake_up(&fs_info->scrub_pause_wait);
831 1218
832 /* 1219 /*
833 * collect all data csums for the stripe to avoid seeking during 1220 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up being about 1MB 1221 * the scrub. This might currently (crc32) end up being about 1MB
835 */ 1222 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1223 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1224
847 logical += increment;
848 cond_resched();
849 }
850 /* 1225 /*
851 * now find all extents for each stripe and scrub them 1226 * now find all extents for each stripe and scrub them
852 */ 1227 */
853 logical = base + offset + start_stripe * increment; 1228 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1229 physical = map->stripes[num].physical;
855 ret = 0; 1230 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1231 for (i = 0; i < nstripes; ++i) {
857 /* 1232 /*
858 * canceled? 1233 * canceled?
859 */ 1234 */
@@ -882,11 +1257,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1257 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1258 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1259 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1260 }
889 1261
1262 ret = btrfs_lookup_csums_range(csum_root, logical,
1263 logical + map->stripe_len - 1,
1264 &sdev->csum_list, 1);
1265 if (ret)
1266 goto out;
1267
890 key.objectid = logical; 1268 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1269 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1270 key.offset = (u64)0;
@@ -982,7 +1360,6 @@ next:
982 1360
983out: 1361out:
984 blk_finish_plug(&plug); 1362 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1363 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1364 return ret < 0 ? ret : 0;
988} 1365}
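Besides the readahead-based prefetch (two btrfs_reada_add() requests over the extent tree and csum tree ranges, then btrfs_reada_wait() on whichever of the two actually started), the stripe setup above turns mirror_num into a 1-based int; 0 apparently keeps its "let the mapping code pick a copy" meaning, so RAID1/RAID10/DUP workers now target copy (num % stripes) + 1. A minimal sketch of that mapping, with an example stripe count:

#include <stdio.h>

/* mirror_num after this change is 1-based; 0 is reserved, so each scrub
 * worker addresses an explicit copy.  num_stripes is just an example value. */
int main(void)
{
        int num_stripes = 2;            /* e.g. RAID1 with two copies */

        for (int num = 0; num < 4; num++) {
                int mirror_num = num % num_stripes + 1;
                printf("scrub worker %d reads mirror %d\n", num, mirror_num);
        }
        return 0;
}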
@@ -1253,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1630 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1631
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1632 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1633 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1634 wake_up(&fs_info->scrub_pause_wait);
1259 1635
1636 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1637
1260 if (progress) 1638 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1639 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1640
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d..e28ad4baf48 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
397 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
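The option handling above makes the free space cache the default whenever the super block carries a non-zero cache_generation, adds nospace_cache to turn it back off, and prints the "enabled" line once after parsing instead of on every space_cache occurrence. A minimal userspace sketch of that default-then-override flow; the option strings and the generation value are made up for the example:

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned long long cache_generation = 1234;     /* pretend on-disk value */
        int space_cache = cache_generation != 0;        /* default follows the super block */
        const char *opts[] = { "compress", "nospace_cache", "autodefrag" };

        for (unsigned i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
                if (!strcmp(opts[i], "space_cache"))
                        space_cache = 1;
                else if (!strcmp(opts[i], "nospace_cache"))
                        space_cache = 0;
        }

        if (space_cache)
                printf("btrfs: disk space caching is enabled\n");
        return 0;
}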
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
430 token = match_token(p, tokens, args); 448 token = match_token(p, tokens, args);
431 switch (token) { 449 switch (token) {
432 case Opt_subvol: 450 case Opt_subvol:
451 kfree(*subvol_name);
433 *subvol_name = match_strdup(&args[0]); 452 *subvol_name = match_strdup(&args[0]);
434 break; 453 break;
435 case Opt_subvolid: 454 case Opt_subvolid:
@@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 476 }
458 break; 477 break;
459 case Opt_device: 478 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 479 device_name = match_strdup(&args[0]);
480 if (!device_name) {
481 error = -ENOMEM;
482 goto out;
483 }
484 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 485 flags, holder, fs_devices);
486 kfree(device_name);
462 if (error) 487 if (error)
463 goto out_free_opts; 488 goto out;
464 break; 489 break;
465 default: 490 default:
466 break; 491 break;
467 } 492 }
468 } 493 }
469 494
470 out_free_opts: 495out:
471 kfree(orig); 496 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 497 return error;
484} 498}
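Two small leak fixes sit in this hunk: a repeated subvol= frees the previous string before duplicating the new one, and the device= value is duplicated into a NULL-checked device_name that is freed right after btrfs_scan_one_device() returns, success or not (the old code leaked the match_strdup() result). A compact userspace sketch of both patterns, with scan_one_device() as a hypothetical stand-in:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int scan_one_device(const char *name)
{
        printf("scanning %s\n", name);
        return 0;
}

int main(void)
{
        const char *args[] = { "subvol=a", "device=/dev/sdb", "subvol=b" };
        char *subvol_name = NULL;
        int err = 0;

        for (unsigned i = 0; i < sizeof(args) / sizeof(args[0]) && !err; i++) {
                if (!strncmp(args[i], "subvol=", 7)) {
                        free(subvol_name);              /* drop any earlier subvol= value */
                        subvol_name = strdup(args[i] + 7);
                        if (!subvol_name)
                                err = -ENOMEM;
                } else if (!strncmp(args[i], "device=", 7)) {
                        char *device_name = strdup(args[i] + 7);

                        if (!device_name) {
                                err = -ENOMEM;
                                continue;               /* loop condition ends the walk */
                        }
                        err = scan_one_device(device_name);
                        free(device_name);              /* freed on success and on failure */
                }
        }

        printf("subvol=%s err=%d\n", subvol_name ? subvol_name : "(none)", err);
        free(subvol_name);
        return 0;
}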
485 499
@@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 506 struct btrfs_path *path;
493 struct btrfs_key location; 507 struct btrfs_key location;
494 struct inode *inode; 508 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 509 u64 dir_id;
497 int new = 0; 510 int new = 0;
498 511
@@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 530 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 531 * to mount.
519 */ 532 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 535 if (IS_ERR(di)) {
523 btrfs_free_path(path); 536 btrfs_free_path(path);
@@ -566,29 +579,7 @@ setup_root:
566 return dget(sb->s_root); 579 return dget(sb->s_root);
567 } 580 }
568 581
569 if (new) { 582 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 583}
593 584
594static int btrfs_fill_super(struct super_block *sb, 585static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 710 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 711 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 712 seq_puts(seq, ",space_cache");
713 else
714 seq_puts(seq, ",nospace_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 715 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 716 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 717 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +746,111 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 746 return set_anon_super(s, data);
754} 747}
755 748
749/*
750 * subvolumes are identified by ino 256
751 */
752static inline int is_subvolume_inode(struct inode *inode)
753{
754 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
755 return 1;
756 return 0;
757}
758
759/*
760 * This will strip out the subvol=%s argument for an argument string and add
761 * subvolid=0 to make sure we get the actual tree root for path walking to the
762 * subvol we want.
763 */
764static char *setup_root_args(char *args)
765{
766 unsigned copied = 0;
767 unsigned len = strlen(args) + 2;
768 char *pos;
769 char *ret;
770
771 /*
772 * We need the same args as before, but minus
773 *
774 * subvol=a
775 *
776 * and add
777 *
778 * subvolid=0
779 *
780 * which is a difference of 2 characters, so we allocate strlen(args) +
781 * 2 characters.
782 */
783 ret = kzalloc(len * sizeof(char), GFP_NOFS);
784 if (!ret)
785 return NULL;
786 pos = strstr(args, "subvol=");
787
788 /* This shouldn't happen, but just in case.. */
789 if (!pos) {
790 kfree(ret);
791 return NULL;
792 }
793
794 /*
795 * The subvol=<> arg is not at the front of the string, copy everybody
796 * up to that into ret.
797 */
798 if (pos != args) {
799 *pos = '\0';
800 strcpy(ret, args);
801 copied += strlen(args);
802 pos++;
803 }
804
805 strncpy(ret + copied, "subvolid=0", len - copied);
806
807 /* Length of subvolid=0 */
808 copied += 10;
809
810 /*
811 * If there is no , after the subvol= option then we know there's no
812 * other options and we can just return.
813 */
814 pos = strchr(pos, ',');
815 if (!pos)
816 return ret;
817
818 /* Copy the rest of the arguments into our buffer */
819 strncpy(ret + copied, pos, len - copied);
820 copied += strlen(pos);
821
822 return ret;
823}
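setup_root_args() is almost plain string handling, so a userspace rendition of the same rewrite is easy to try out; the only liberty taken here is a more generous allocation (strlen(args) + strlen("subvolid=0") + 1), so the sketch does not need the tight "+ 2 characters" reasoning used above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace rendition of the rewrite above: drop "subvol=<name>" from the
 * option string and splice in "subvolid=0" so the inner mount resolves the
 * real tree root.  The allocation is a deliberately generous upper bound. */
static char *setup_root_args(const char *args)
{
        char *ret = calloc(strlen(args) + strlen("subvolid=0") + 1, 1);
        const char *pos, *rest;

        if (!ret)
                return NULL;
        pos = strstr(args, "subvol=");
        if (!pos) {                     /* mirror the kernel's "shouldn't happen" bail-out */
                free(ret);
                return NULL;
        }

        memcpy(ret, args, pos - args);  /* everything before subvol= */
        strcat(ret, "subvolid=0");
        rest = strchr(pos, ',');        /* any options that followed subvol=<name> */
        if (rest)
                strcat(ret, rest);
        return ret;
}

int main(void)
{
        char *s = setup_root_args("compress,subvol=home,space_cache");
        printf("%s\n", s ? s : "(error)");      /* compress,subvolid=0,space_cache */
        free(s);
        return 0;
}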
824
825static struct dentry *mount_subvol(const char *subvol_name, int flags,
826 const char *device_name, char *data)
827{
828 struct dentry *root;
829 struct vfsmount *mnt;
830 char *newargs;
831
832 newargs = setup_root_args(data);
833 if (!newargs)
834 return ERR_PTR(-ENOMEM);
835 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
836 newargs);
837 kfree(newargs);
838 if (IS_ERR(mnt))
839 return ERR_CAST(mnt);
840
841 root = mount_subtree(mnt, subvol_name);
842
843 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
844 struct super_block *s = root->d_sb;
845 dput(root);
846 root = ERR_PTR(-EINVAL);
847 deactivate_locked_super(s);
848 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
849 subvol_name);
850 }
851
852 return root;
853}
756 854
757/* 855/*
758 * Find a superblock for the given device / mount point. 856 * Find a superblock for the given device / mount point.
@@ -767,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
767 struct super_block *s; 865 struct super_block *s;
768 struct dentry *root; 866 struct dentry *root;
769 struct btrfs_fs_devices *fs_devices = NULL; 867 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL; 868 struct btrfs_fs_info *fs_info = NULL;
772 fmode_t mode = FMODE_READ; 869 fmode_t mode = FMODE_READ;
773 char *subvol_name = NULL; 870 char *subvol_name = NULL;
@@ -781,21 +878,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
781 error = btrfs_parse_early_options(data, mode, fs_type, 878 error = btrfs_parse_early_options(data, mode, fs_type,
782 &subvol_name, &subvol_objectid, 879 &subvol_name, &subvol_objectid,
783 &subvol_rootid, &fs_devices); 880 &subvol_rootid, &fs_devices);
784 if (error) 881 if (error) {
882 kfree(subvol_name);
785 return ERR_PTR(error); 883 return ERR_PTR(error);
884 }
786 885
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 886 if (subvol_name) {
788 if (error) 887 root = mount_subvol(subvol_name, flags, device_name, data);
789 goto error_free_subvol_name; 888 kfree(subvol_name);
889 return root;
890 }
790 891
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 892 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
792 if (error) 893 if (error)
793 goto error_free_subvol_name; 894 return ERR_PTR(error);
794
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES;
797 goto error_close_devices;
798 }
799 895
800 /* 896 /*
801 * Setup a dummy root and fs_info for test/set super. This is because 897 * Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +900,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
804 * then open_ctree will properly initialize everything later. 900 * then open_ctree will properly initialize everything later.
805 */ 901 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 902 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 903 if (!fs_info)
808 if (!fs_info || !tree_root) { 904 return ERR_PTR(-ENOMEM);
905
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
809 error = -ENOMEM; 908 error = -ENOMEM;
810 goto error_close_devices; 909 goto error_fs_info;
811 } 910 }
812 fs_info->tree_root = tree_root; 911 fs_info->tree_root->fs_info = fs_info;
813 fs_info->fs_devices = fs_devices; 912 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 913
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
915 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
916 if (!fs_info->super_copy || !fs_info->super_for_commit) {
917 error = -ENOMEM;
918 goto error_fs_info;
919 }
920
921 error = btrfs_open_devices(fs_devices, mode, fs_type);
922 if (error)
923 goto error_fs_info;
924
925 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
926 error = -EACCES;
927 goto error_close_devices;
928 }
815 929
816 bdev = fs_devices->latest_bdev; 930 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 931 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
818 if (IS_ERR(s)) 932 fs_info->tree_root);
819 goto error_s; 933 if (IS_ERR(s)) {
934 error = PTR_ERR(s);
935 goto error_close_devices;
936 }
820 937
821 if (s->s_root) { 938 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 939 if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +943,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
826 } 943 }
827 944
828 btrfs_close_devices(fs_devices); 945 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 946 free_fs_info(fs_info);
830 kfree(tree_root);
831 } else { 947 } else {
832 char b[BDEVNAME_SIZE]; 948 char b[BDEVNAME_SIZE];
833 949
834 s->s_flags = flags | MS_NOSEC; 950 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 953 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 954 flags & MS_SILENT ? 1 : 0);
838 if (error) { 955 if (error) {
839 deactivate_locked_super(s); 956 deactivate_locked_super(s);
840 goto error_free_subvol_name; 957 return ERR_PTR(error);
841 } 958 }
842 959
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 960 s->s_flags |= MS_ACTIVE;
845 } 961 }
846 962
847 /* if they gave us a subvolume name bind mount into that */ 963 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 964 if (IS_ERR(root)) {
849 struct dentry *new_root; 965 deactivate_locked_super(s);
850 966 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 967 }
886 968
887 kfree(subvol_name);
888 return root; 969 return root;
889 970
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 971error_close_devices:
893 btrfs_close_devices(fs_devices); 972 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 973error_fs_info:
895 kfree(tree_root); 974 free_fs_info(fs_info);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 975 return ERR_PTR(error);
899} 976}
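The mount path now builds the whole fs_info (a dummy tree_root plus the freshly dynamic super_copy/super_for_commit buffers) before touching devices or sget(), and every failure funnels into labels that free exactly what exists so far, with free_fs_info() releasing the sub-buffers. A generic sketch of that allocate-up-front, single-unwind shape; struct members and helper names are illustrative only:

#include <stdio.h>
#include <stdlib.h>

struct fs_info {
        void *tree_root;
        void *super_copy;
        void *super_for_commit;
};

static void free_fs_info(struct fs_info *fi)
{
        if (!fi)
                return;
        free(fi->tree_root);
        free(fi->super_copy);
        free(fi->super_for_commit);
        free(fi);
}

static int open_devices(void) { return 0; }     /* pretend device open succeeds */
static void close_devices(void) { }

int main(void)
{
        struct fs_info *fi = calloc(1, sizeof(*fi));
        int err;

        if (!fi)
                return 1;
        fi->tree_root = calloc(1, 64);
        fi->super_copy = calloc(1, 4096);
        fi->super_for_commit = calloc(1, 4096);
        if (!fi->tree_root || !fi->super_copy || !fi->super_for_commit)
                goto error_fs_info;

        err = open_devices();
        if (err)
                goto error_fs_info;

        /* sget()/fill_super() equivalents would run here; failures after the
         * devices are open must unwind through close_devices() as well. */
        close_devices();
        free_fs_info(fi);
        puts("mounted");
        return 0;

error_fs_info:
        free_fs_info(fi);
        return 1;
}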
900 977
@@ -919,7 +996,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 996 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 997 return -EACCES;
921 998
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1000 return -EINVAL;
924 1001
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1002 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -980,7 +1057,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
980 int i = 0, nr_devices; 1057 int i = 0, nr_devices;
981 int ret; 1058 int ret;
982 1059
983 nr_devices = fs_info->fs_devices->rw_devices; 1060 nr_devices = fs_info->fs_devices->open_devices;
984 BUG_ON(!nr_devices); 1061 BUG_ON(!nr_devices);
985 1062
986 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1063 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1002,8 +1079,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1002 else 1079 else
1003 min_stripe_size = BTRFS_STRIPE_LEN; 1080 min_stripe_size = BTRFS_STRIPE_LEN;
1004 1081
1005 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1082 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1006 if (!device->in_fs_metadata) 1083 if (!device->in_fs_metadata || !device->bdev)
1007 continue; 1084 continue;
1008 1085
1009 avail_space = device->total_bytes - device->bytes_used; 1086 avail_space = device->total_bytes - device->bytes_used;
@@ -1085,7 +1162,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1162static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1163{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1164 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1165 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1166 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1167 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1168 u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a15..81376d94cd3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
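The join_transaction() rework above is the classic drop-the-lock-to-allocate pattern: the GFP_NOFS allocation happens without trans_lock held, and when the lock is retaken and somebody else has meanwhile started a transaction, the allocation is freed and control jumps back to loop so the trans_no_join check is re-run too (the old code joined directly and skipped that recheck). A userspace sketch of the shape, with a pthread mutex standing in for the spinlock and simple globals for the shared state:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int blocked;             /* stands in for fs_info->trans_no_join */
static void *running;           /* stands in for fs_info->running_transaction */

static void *join_transaction(void)
{
        void *mine;

        pthread_mutex_lock(&lock);
loop:
        if (blocked) {
                pthread_mutex_unlock(&lock);
                return NULL;                    /* caller would wait and retry */
        }
        if (running) {
                void *cur = running;            /* join the existing transaction */
                pthread_mutex_unlock(&lock);
                return cur;
        }
        pthread_mutex_unlock(&lock);

        mine = malloc(64);                      /* allocation done without the lock */
        if (!mine)
                return NULL;

        pthread_mutex_lock(&lock);
        if (running) {
                free(mine);                     /* raced with another starter */
                goto loop;                      /* redo the blocked check as well */
        }
        running = mine;
        pthread_mutex_unlock(&lock);
        return mine;
}

int main(void)
{
        printf("transaction %p\n", join_transaction());
        return 0;
}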
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv get's used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632 612
633 while (1) { 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 614 EXTENT_NEED_WAIT)) {
635 mark); 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 616 err = filemap_fdatawait_range(mapping, start, end);
637 break; 617 if (err)
638 618 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 619 cond_resched();
640 while (start <= end) { 620 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
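Both helpers above stop walking the btree inode's pages by hand: the write side tags each dirty range EXTENT_NEED_WAIT and kicks filemap_fdatawrite_range(), and the wait side later clears the tag and calls filemap_fdatawait_range(). The closest userspace analogue is sync_file_range(), which likewise lets you start writeback on a byte range in one pass and wait for it in a second; a small Linux-only sketch, with an arbitrary file name and ranges:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int fd = open("marked-extents.tmp", O_RDWR | O_CREAT | O_TRUNC, 0644);
        off_t ranges[][2] = { { 0, 4096 }, { 1 << 20, 4096 } };
        char page[4096];

        if (fd < 0)
                return 1;
        memset(page, 0xaa, sizeof(page));
        for (unsigned i = 0; i < 2; i++)                /* dirty two separate ranges */
                pwrite(fd, page, sizeof(page), ranges[i][0]);

        for (unsigned i = 0; i < 2; i++)                /* start writeback, don't wait yet */
                sync_file_range(fd, ranges[i][0], ranges[i][1],
                                SYNC_FILE_RANGE_WRITE);
        for (unsigned i = 0; i < 2; i++)                /* second pass: wait for completion */
                sync_file_range(fd, ranges[i][0], ranges[i][1],
                                SYNC_FILE_RANGE_WAIT_AFTER);

        close(fd);
        unlink("marked-extents.tmp");
        return 0;
}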
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
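The return change above is worth calling out: ret || ret2 collapses any errno into 1, while the rewritten form hands the first real error code back to the caller. In miniature:

#include <stdio.h>

static int combine_old(int a, int b) { return a || b; }         /* -EIO becomes 1 */
static int combine_new(int a, int b) { return a ? a : b; }      /* the errno survives */

int main(void)
{
        printf("old: %d  new: %d\n", combine_old(-5, 0), combine_new(-5, 0));
        return 0;
}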
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
816 785
817 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
818 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
819 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
820 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
821 switch_commit_root(root); 794 switch_commit_root(root);
@@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 884 }
912 885
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 887
916 if (to_reserve > 0) { 888 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
918 to_reserve); 890 to_reserve);
919 if (ret) { 891 if (ret) {
920 pending->error = ret; 892 pending->error = ret;
921 goto fail; 893 goto fail;
@@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
979 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
980 free_extent_buffer(old); 952 free_extent_buffer(old);
981 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
982 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
983 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
984 key.offset = trans->transid; 960 key.offset = trans->transid;
@@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 978 BUG_ON(IS_ERR(pending->snap));
1003 979
1004 btrfs_reloc_post_snapshot(trans, pending); 980 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 981fail:
1007 kfree(new_root_item); 982 kfree(new_root_item);
1008 trans->block_rsv = rsv; 983 trans->block_rsv = rsv;
@@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 1007 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1008 struct btrfs_super_block *super;
1034 1009
1035 super = &root->fs_info->super_copy; 1010 super = root->fs_info->super_copy;
1036 1011
1037 root_item = &root->fs_info->chunk_root->root_item; 1012 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1013 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1018 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1019 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1020 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1021 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1022 super->cache_generation = root_item->generation;
1048} 1023}
1049 1024
@@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1143
1169 btrfs_run_ordered_operations(root, 0); 1144 btrfs_run_ordered_operations(root, 0);
1170 1145
1146 btrfs_trans_release_metadata(trans, root);
1147 trans->block_rsv = NULL;
1148
1171 /* make a pass through all the delayed refs we have so far 1149 /* make a pass through all the delayed refs we have so far
1172 * any running procs may add more while we are here 1150 * any running procs may add more while we are here
1173 */ 1151 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1152 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1153 BUG_ON(ret);
1176 1154
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1155 cur_trans = trans->transaction;
1180 /* 1156 /*
1181 * set the flushing flag so procs in this transaction have to 1157 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1317 update_super_roots(root);
1342 1318
1343 if (!root->fs_info->log_root_recovering) { 1319 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1320 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1321 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1322 }
1347 1323
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1324 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1325 sizeof(*root->fs_info->super_copy));
1350 1326
1351 trans->transaction->blocked = 0; 1327 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1328 spin_lock(&root->fs_info->trans_lock);
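The memcpy adjustment above is a direct consequence of super_copy becoming a pointer: sizeof(root->fs_info->super_copy) would now measure the pointer, not the superblock buffer, so the copy length has to come from sizeof(*...). A two-struct illustration; the type names are invented for the sketch:

#include <stdio.h>

struct super_copy_buf { char data[4096]; };

struct fs_info_embedded { struct super_copy_buf super_copy; };
struct fs_info_dynamic  { struct super_copy_buf *super_copy; };

int main(void)
{
        printf("embedded member: %zu bytes\n",
               sizeof(((struct fs_info_embedded *)0)->super_copy));
        printf("pointer member:  %zu bytes\n",
               sizeof(((struct fs_info_dynamic *)0)->super_copy));
        printf("pointed-to buf:  %zu bytes\n",
               sizeof(*((struct fs_info_dynamic *)0)->super_copy));
        return 0;
}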
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0618aa39740..3568374d419 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da6..0a8c8f8304b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -993,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
993 key.objectid = device->devid; 999 key.objectid = device->devid;
994 key.offset = start; 1000 key.offset = start;
995 key.type = BTRFS_DEV_EXTENT_KEY; 1001 key.type = BTRFS_DEV_EXTENT_KEY;
996 1002again:
997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1003 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 if (ret > 0) { 1004 if (ret > 0) {
999 ret = btrfs_previous_item(root, path, key.objectid, 1005 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1006,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1006 struct btrfs_dev_extent); 1012 struct btrfs_dev_extent);
1007 BUG_ON(found_key.offset > start || found_key.offset + 1013 BUG_ON(found_key.offset > start || found_key.offset +
1008 btrfs_dev_extent_length(leaf, extent) < start); 1014 btrfs_dev_extent_length(leaf, extent) < start);
1015 key = found_key;
1016 btrfs_release_path(path);
1017 goto again;
1009 } else if (ret == 0) { 1018 } else if (ret == 0) {
1010 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1020 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1013,8 +1022,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1022 }
1014 BUG_ON(ret); 1023 BUG_ON(ret);
1015 1024
1016 if (device->bytes_used > 0) 1025 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1026 u64 len = btrfs_dev_extent_length(leaf, extent);
1027 device->bytes_used -= len;
1028 spin_lock(&root->fs_info->free_chunk_lock);
1029 root->fs_info->free_chunk_space += len;
1030 spin_unlock(&root->fs_info->free_chunk_lock);
1031 }
1018 ret = btrfs_del_item(trans, root, path); 1032 ret = btrfs_del_item(trans, root, path);
1019 1033
1020out: 1034out:
@@ -1356,6 +1370,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1370 if (ret)
1357 goto error_undo; 1371 goto error_undo;
1358 1372
1373 spin_lock(&root->fs_info->free_chunk_lock);
1374 root->fs_info->free_chunk_space = device->total_bytes -
1375 device->bytes_used;
1376 spin_unlock(&root->fs_info->free_chunk_lock);
1377
1359 device->in_fs_metadata = 0; 1378 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1379 btrfs_scrub_cancel_dev(root, device);
1361 1380
@@ -1387,8 +1406,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1406 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1407 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1408
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1409 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1410 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1411
1393 if (cur_devices->open_devices == 0) { 1412 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1413 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1469,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1469 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1470 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1471 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1472 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1473 struct btrfs_device *device;
1455 u64 super_flags; 1474 u64 super_flags;
1456 1475
@@ -1592,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1592 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1611 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1593 return -EINVAL; 1612 return -EINVAL;
1594 1613
1595 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1614 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1596 root->fs_info->bdev_holder); 1615 root->fs_info->bdev_holder);
1597 if (IS_ERR(bdev)) 1616 if (IS_ERR(bdev))
1598 return PTR_ERR(bdev); 1617 return PTR_ERR(bdev);
@@ -1691,15 +1710,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1710 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1711 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1712
1713 spin_lock(&root->fs_info->free_chunk_lock);
1714 root->fs_info->free_chunk_space += device->total_bytes;
1715 spin_unlock(&root->fs_info->free_chunk_lock);
1716
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1717 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1718 root->fs_info->fs_devices->rotating = 1;
1696 1719
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1720 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1721 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1722 total_bytes + device->total_bytes);
1700 1723
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1724 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1725 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1726 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1727 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1728
@@ -1790,7 +1813,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1813 struct btrfs_device *device, u64 new_size)
1791{ 1814{
1792 struct btrfs_super_block *super_copy = 1815 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1816 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1817 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1818 u64 diff = new_size - device->total_bytes;
1796 1819
@@ -1849,7 +1872,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1872static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1873 chunk_offset)
1851{ 1874{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1875 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1876 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1877 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1878 u8 *ptr;
@@ -2175,7 +2198,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2198 bool retried = false;
2176 struct extent_buffer *l; 2199 struct extent_buffer *l;
2177 struct btrfs_key key; 2200 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2201 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2202 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2203 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2204 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2215,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2215 lock_chunks(root);
2193 2216
2194 device->total_bytes = new_size; 2217 device->total_bytes = new_size;
2195 if (device->writeable) 2218 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2219 device->fs_devices->total_rw_bytes -= diff;
2220 spin_lock(&root->fs_info->free_chunk_lock);
2221 root->fs_info->free_chunk_space -= diff;
2222 spin_unlock(&root->fs_info->free_chunk_lock);
2223 }
2197 unlock_chunks(root); 2224 unlock_chunks(root);
2198 2225
2199again: 2226again:
@@ -2257,6 +2284,9 @@ again:
2257 device->total_bytes = old_size; 2284 device->total_bytes = old_size;
2258 if (device->writeable) 2285 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2286 device->fs_devices->total_rw_bytes += diff;
2287 spin_lock(&root->fs_info->free_chunk_lock);
2288 root->fs_info->free_chunk_space += diff;
2289 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2290 unlock_chunks(root);
2261 goto done; 2291 goto done;
2262 } 2292 }
@@ -2292,7 +2322,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2322 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2323 struct btrfs_chunk *chunk, int item_size)
2294{ 2324{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2325 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2326 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2327 u32 array_size;
2298 u8 *ptr; 2328 u8 *ptr;
@@ -2615,6 +2645,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2645 index++;
2616 } 2646 }
2617 2647
2648 spin_lock(&extent_root->fs_info->free_chunk_lock);
2649 extent_root->fs_info->free_chunk_space -= (stripe_size *
2650 map->num_stripes);
2651 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2652
2618 index = 0; 2653 index = 0;
2619 stripe = &chunk->stripe; 2654 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2655 while (index < map->num_stripes) {
@@ -2848,7 +2883,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2883
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2884static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2885 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2886 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2887 int mirror_num)
2853{ 2888{
2854 struct extent_map *em; 2889 struct extent_map *em;
@@ -2866,18 +2901,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2901 int i;
2867 int num_stripes; 2902 int num_stripes;
2868 int max_errors = 0; 2903 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2904 struct btrfs_bio *bbio = NULL;
2870 2905
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2906 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2907 stripes_allocated = 1;
2873again: 2908again:
2874 if (multi_ret) { 2909 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2910 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2911 GFP_NOFS);
2877 if (!multi) 2912 if (!bbio)
2878 return -ENOMEM; 2913 return -ENOMEM;
2879 2914
2880 atomic_set(&multi->error, 0); 2915 atomic_set(&bbio->error, 0);
2881 } 2916 }
2882 2917
2883 read_lock(&em_tree->lock); 2918 read_lock(&em_tree->lock);
@@ -2898,7 +2933,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2933 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2934 mirror_num = 0;
2900 2935
2901 /* if our multi bio struct is too small, back off and try again */ 2936 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2937 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2938 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2939 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2952,11 @@ again:
2917 stripes_required = map->num_stripes; 2952 stripes_required = map->num_stripes;
2918 } 2953 }
2919 } 2954 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2955 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2956 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2957 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2958 free_extent_map(em);
2924 kfree(multi); 2959 kfree(bbio);
2925 goto again; 2960 goto again;
2926 } 2961 }
2927 stripe_nr = offset; 2962 stripe_nr = offset;
@@ -2950,7 +2985,7 @@ again:
2950 *length = em->len - offset; 2985 *length = em->len - offset;
2951 } 2986 }
2952 2987
2953 if (!multi_ret) 2988 if (!bbio_ret)
2954 goto out; 2989 goto out;
2955 2990
2956 num_stripes = 1; 2991 num_stripes = 1;
@@ -2975,13 +3010,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3010 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3011 map->num_stripes,
2977 current->pid % map->num_stripes); 3012 current->pid % map->num_stripes);
3013 mirror_num = stripe_index + 1;
2978 } 3014 }
2979 3015
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3016 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3017 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3018 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3019 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3020 stripe_index = mirror_num - 1;
3021 } else {
3022 mirror_num = 1;
3023 }
2985 3024
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3026 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3040,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3040 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3041 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3042 current->pid % map->sub_stripes);
3043 mirror_num = stripe_index + 1;
3004 } 3044 }
3005 } else { 3045 } else {
3006 /* 3046 /*
@@ -3009,15 +3049,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3049 * stripe_index is the number of our device in the stripe array
3010 */ 3050 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3051 stripe_index = do_div(stripe_nr, map->num_stripes);
3052 mirror_num = stripe_index + 1;
3012 } 3053 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3054 BUG_ON(stripe_index >= map->num_stripes);
3014 3055
3015 if (rw & REQ_DISCARD) { 3056 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3057 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3058 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3059 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3060 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3061 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3062
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3063 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3064 u64 stripes;
@@ -3038,16 +3079,16 @@ again:
3038 } 3079 }
3039 stripes = stripe_nr_end - 1 - j; 3080 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3081 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3082 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3083 (stripes - stripe_nr + 1);
3043 3084
3044 if (i == 0) { 3085 if (i == 0) {
3045 multi->stripes[i].length -= 3086 bbio->stripes[i].length -=
3046 stripe_offset; 3087 stripe_offset;
3047 stripe_offset = 0; 3088 stripe_offset = 0;
3048 } 3089 }
3049 if (stripe_index == last_stripe) 3090 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3091 bbio->stripes[i].length -=
3051 stripe_end_offset; 3092 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3093 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3094 u64 stripes;
@@ -3072,11 +3113,11 @@ again:
3072 } 3113 }
3073 stripes = stripe_nr_end - 1 - j; 3114 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3115 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3116 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3117 (stripes - stripe_nr + 1);
3077 3118
3078 if (i < map->sub_stripes) { 3119 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3120 bbio->stripes[i].length -=
3080 stripe_offset; 3121 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3122 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3123 stripe_offset = 0;
@@ -3084,11 +3125,11 @@ again:
3084 if (stripe_index >= last_stripe && 3125 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3126 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3127 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3128 bbio->stripes[i].length -=
3088 stripe_end_offset; 3129 stripe_end_offset;
3089 } 3130 }
3090 } else 3131 } else
3091 multi->stripes[i].length = *length; 3132 bbio->stripes[i].length = *length;
3092 3133
3093 stripe_index++; 3134 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3135 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3140,20 @@ again:
3099 } 3140 }
3100 } else { 3141 } else {
3101 for (i = 0; i < num_stripes; i++) { 3142 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3143 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3144 map->stripes[stripe_index].physical +
3104 stripe_offset + 3145 stripe_offset +
3105 stripe_nr * map->stripe_len; 3146 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3147 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3148 map->stripes[stripe_index].dev;
3108 stripe_index++; 3149 stripe_index++;
3109 } 3150 }
3110 } 3151 }
3111 if (multi_ret) { 3152 if (bbio_ret) {
3112 *multi_ret = multi; 3153 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3154 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3155 bbio->max_errors = max_errors;
3156 bbio->mirror_num = mirror_num;
3115 } 3157 }
3116out: 3158out:
3117 free_extent_map(em); 3159 free_extent_map(em);
@@ -3120,9 +3162,9 @@ out:
3120 3162
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3163int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3164 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3165 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3166{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3167 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3168 mirror_num);
3127} 3169}
3128 3170
@@ -3191,28 +3233,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3233 return 0;
3192} 3234}
3193 3235
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3236static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3237{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3238 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3239 int is_orig_bio = 0;
3198 3240
3199 if (err) 3241 if (err)
3200 atomic_inc(&multi->error); 3242 atomic_inc(&bbio->error);
3201 3243
3202 if (bio == multi->orig_bio) 3244 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3245 is_orig_bio = 1;
3204 3246
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3247 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3248 if (!is_orig_bio) {
3207 bio_put(bio); 3249 bio_put(bio);
3208 bio = multi->orig_bio; 3250 bio = bbio->orig_bio;
3209 } 3251 }
3210 bio->bi_private = multi->private; 3252 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3253 bio->bi_end_io = bbio->end_io;
3254 bio->bi_bdev = (struct block_device *)
3255 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3256 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3257 * beyond the tolerance of the multi-bio
3214 */ 3258 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3259 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3260 err = -EIO;
3217 } else if (err) { 3261 } else if (err) {
3218 /* 3262 /*
@@ -3222,7 +3266,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3266 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3267 err = 0;
3224 } 3268 }
3225 kfree(multi); 3269 kfree(bbio);
3226 3270
3227 bio_endio(bio, err); 3271 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3272 } else if (!is_orig_bio) {
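The hunk above also starts reporting which mirror serviced the request: before btrfs_end_bio hands the bio back to the original completion callback, it casts bbio->mirror_num into the now-unused bi_bdev pointer field so the reader can recover it with the reverse cast. A minimal user-space sketch of that cast trick (assumption: the struct below is a made-up stand-in, not the kernel's struct bio):

#include <stdio.h>

struct fake_bio {
	void *bi_bdev;		/* normally a struct block_device pointer */
};

static void stash_mirror(struct fake_bio *bio, int mirror_num)
{
	/* same double cast the patch uses: int -> unsigned long -> pointer */
	bio->bi_bdev = (void *)(unsigned long)mirror_num;
}

static int read_mirror(const struct fake_bio *bio)
{
	return (int)(unsigned long)bio->bi_bdev;
}

int main(void)
{
	struct fake_bio bio;

	stash_mirror(&bio, 2);
	printf("completed on mirror %d\n", read_mirror(&bio));
	return 0;
}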
@@ -3302,20 +3346,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3346 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3347 u64 length = 0;
3304 u64 map_length; 3348 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3349 int ret;
3307 int dev_nr = 0; 3350 int dev_nr = 0;
3308 int total_devs = 1; 3351 int total_devs = 1;
3352 struct btrfs_bio *bbio = NULL;
3309 3353
3310 length = bio->bi_size; 3354 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3355 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3356 map_length = length;
3313 3357
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3358 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3359 mirror_num);
3316 BUG_ON(ret); 3360 BUG_ON(ret);
3317 3361
3318 total_devs = multi->num_stripes; 3362 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3363 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3364 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3365 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3367,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3367 (unsigned long long)map_length);
3324 BUG(); 3368 BUG();
3325 } 3369 }
3326 multi->end_io = first_bio->bi_end_io; 3370
3327 multi->private = first_bio->bi_private; 3371 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3372 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3373 bbio->end_io = first_bio->bi_end_io;
3374 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3375
3331 while (dev_nr < total_devs) { 3376 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3377 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3378 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3379 BUG_ON(!bio);
3335 BUG_ON(!bio); 3380 } else {
3336 } else { 3381 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3382 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3383 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3384 bio->bi_end_io = btrfs_end_bio;
3385 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3386 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3387 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3388 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
3389 "(%s id %llu), size=%u\n", rw,
3390 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3391 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3392 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3393 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3394 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3401,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3401 }
3355 dev_nr++; 3402 dev_nr++;
3356 } 3403 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3404 return 0;
3360} 3405}
3361 3406
@@ -3616,15 +3661,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3661 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3662 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3663 device->in_fs_metadata = 1;
3619 if (device->writeable) 3664 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3665 device->fs_devices->total_rw_bytes += device->total_bytes;
3666 spin_lock(&root->fs_info->free_chunk_lock);
3667 root->fs_info->free_chunk_space += device->total_bytes -
3668 device->bytes_used;
3669 spin_unlock(&root->fs_info->free_chunk_lock);
3670 }
3621 ret = 0; 3671 ret = 0;
3622 return ret; 3672 return ret;
3623} 3673}
3624 3674
3625int btrfs_read_sys_array(struct btrfs_root *root) 3675int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3676{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3677 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3678 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3679 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3680 struct btrfs_chunk *chunk;
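A recurring change through this file is the new free_chunk_space counter: every path that adds a writeable device, frees a device extent, grows, shrinks or allocates a chunk adjusts it under free_chunk_lock. The following user-space model summarizes that bookkeeping; it is an illustration only, with a pthread mutex standing in for the kernel spinlock and made-up function names.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t free_chunk_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t free_chunk_space;

static void add_device(uint64_t total_bytes)
{
	pthread_mutex_lock(&free_chunk_lock);
	free_chunk_space += total_bytes;	/* btrfs_init_new_device / read_one_dev */
	pthread_mutex_unlock(&free_chunk_lock);
}

static void alloc_chunk(uint64_t stripe_size, int num_stripes)
{
	pthread_mutex_lock(&free_chunk_lock);
	free_chunk_space -= stripe_size * num_stripes;	/* __finish_chunk_alloc */
	pthread_mutex_unlock(&free_chunk_lock);
}

static void free_dev_extent(uint64_t len)
{
	pthread_mutex_lock(&free_chunk_lock);
	free_chunk_space += len;		/* btrfs_free_dev_extent */
	pthread_mutex_unlock(&free_chunk_lock);
}

int main(void)
{
	add_device(10ULL << 30);		/* 10 GiB device */
	alloc_chunk(1ULL << 30, 2);		/* 1 GiB chunk, two stripes */
	free_dev_extent(1ULL << 30);
	printf("free chunk space: %llu bytes\n",
	       (unsigned long long)free_chunk_space);
	return 0;
}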
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e17..78f2d4d4f37 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,20 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
95}; 109};
96 110
97struct btrfs_fs_devices { 111struct btrfs_fs_devices {
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 150 u64 length; /* only used for discard mappings */
137}; 151};
138 152
139struct btrfs_multi_bio { 153struct btrfs_bio;
154typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
155
156struct btrfs_bio {
140 atomic_t stripes_pending; 157 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 158 bio_end_io_t *end_io;
142 struct bio *orig_bio; 159 struct bio *orig_bio;
@@ -144,6 +161,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 161 atomic_t error;
145 int max_errors; 162 int max_errors;
146 int num_stripes; 163 int num_stripes;
164 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 165 struct btrfs_bio_stripe stripes[];
148}; 166};
149 167
@@ -171,7 +189,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 190 u64 end, u64 *length);
173 191
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 192#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 193 (sizeof(struct btrfs_bio_stripe) * (n)))
176 194
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 195int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 198 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 199int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 200 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 201 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 202int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 203 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 204 u64 **logical, int *naddrs, int *stripe_len);
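The renamed btrfs_bio keeps its stripes as a C flexible array member, so the btrfs_bio_size(n) macro above sizes one allocation for the header plus n stripe slots. A stand-alone sketch of that allocation pattern (plain malloc instead of kzalloc, simplified stripe type, names invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct stripe {
	unsigned long long physical;
	unsigned long long length;
};

struct example_bio {			/* simplified stand-in for struct btrfs_bio */
	int num_stripes;
	int mirror_num;
	struct stripe stripes[];	/* flexible array member */
};

/* same idea as btrfs_bio_size(n): header plus n trailing stripe entries */
#define example_bio_size(n) (sizeof(struct example_bio) + \
			     (sizeof(struct stripe) * (n)))

int main(void)
{
	int n = 3;
	struct example_bio *bbio = malloc(example_bio_size(n));

	if (!bbio)
		return 1;
	memset(bbio, 0, example_bio_size(n));	/* kzalloc equivalent */
	bbio->num_stripes = n;
	bbio->stripes[2].physical = 4096;	/* last slot is valid memory */
	printf("allocated %zu bytes for %d stripes\n",
	       example_bio_size(n), bbio->num_stripes);
	free(bbio);
	return 0;
}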
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1a..3848b04e310 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
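The comment in the hunk above describes the whole trick: an -EOVERFLOW from the insert is downgraded to -EEXIST so the existing replace path handles it. A compressed stand-alone model of that fallback follows; insert_item() and replace_item() are invented stand-ins, not btrfs functions.

#include <errno.h>
#include <stdio.h>

/* Invented stand-ins for btrfs_insert_xattr_item() and the replace path. */
static int insert_item(int leaf_too_tight) { return leaf_too_tight ? -EOVERFLOW : 0; }
static int replace_item(void) { return 0; }

static int set_item(int leaf_too_tight, int create_only)
{
	int ret = insert_item(leaf_too_tight);

	/* as in the hunk above: treat "leaf would overflow" like "item
	 * already exists" and let the replace path sort it out */
	if (ret == -EOVERFLOW)
		ret = -EEXIST;

	if (ret == -EEXIST) {
		if (create_only)	/* XATTR_CREATE-style caller */
			return ret;
		ret = replace_item();	/* fall through to the replace path */
	}
	return ret;
}

int main(void)
{
	printf("replace existing xattr -> %d\n", set_item(1, 0));
	printf("XATTR_CREATE on existing -> %d\n", set_item(1, 1));
	return 0;
}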
diff --git a/fs/buffer.c b/fs/buffer.c
index 70a19745cb6..19d8eb7fdc8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -288,7 +288,7 @@ static void free_more_memory(void)
288 struct zone *zone; 288 struct zone *zone;
289 int nid; 289 int nid;
290 290
291 wakeup_flusher_threads(1024); 291 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
292 yield(); 292 yield();
293 293
294 for_each_online_node(nid) { 294 for_each_online_node(nid) {
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 2cfb695d1f8..5d9b9acc5fc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -204,7 +204,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
204} 204}
205 205
206/* first calculate 24 bytes ntlm response and then 16 byte session key */ 206/* first calculate 24 bytes ntlm response and then 16 byte session key */
207int setup_ntlm_response(struct cifs_ses *ses) 207int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
208{ 208{
209 int rc = 0; 209 int rc = 0;
210 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 210 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -221,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses)
221 ses->auth_key.len = temp_len; 221 ses->auth_key.len = temp_len;
222 222
223 rc = SMBNTencrypt(ses->password, ses->server->cryptkey, 223 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
224 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 224 ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
225 if (rc) { 225 if (rc) {
226 cFYI(1, "%s Can't generate NTLM response, error: %d", 226 cFYI(1, "%s Can't generate NTLM response, error: %d",
227 __func__, rc); 227 __func__, rc);
228 return rc; 228 return rc;
229 } 229 }
230 230
231 rc = E_md4hash(ses->password, temp_key); 231 rc = E_md4hash(ses->password, temp_key, nls_cp);
232 if (rc) { 232 if (rc) {
233 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 233 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
234 return rc; 234 return rc;
@@ -404,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
404 } 404 }
405 405
406 /* calculate md4 hash of password */ 406 /* calculate md4 hash of password */
407 E_md4hash(ses->password, nt_hash); 407 E_md4hash(ses->password, nt_hash, nls_cp);
408 408
409 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, 409 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
410 CIFS_NTHASH_SIZE); 410 CIFS_NTHASH_SIZE);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d9dbaf869cd..30ff56005d8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
125extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
126#endif /* CONFIG_CIFS_NFSD_EXPORT */ 126#endif /* CONFIG_CIFS_NFSD_EXPORT */
127 127
128#define CIFS_VERSION "1.75" 128#define CIFS_VERSION "1.76"
129#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ef4f631e4c0..6f4e243e0f6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -395,8 +395,9 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
395extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, 395extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
396 struct TCP_Server_Info *server, 396 struct TCP_Server_Info *server,
397 __u32 expected_sequence_number); 397 __u32 expected_sequence_number);
398extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 398extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
399extern int setup_ntlm_response(struct cifs_ses *); 399 const struct nls_table *);
400extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
400extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); 401extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
401extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 402extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
402extern void cifs_crypto_shash_release(struct TCP_Server_Info *); 403extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
@@ -448,7 +449,8 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
448 const unsigned char *path, 449 const unsigned char *path,
449 struct cifs_sb_info *cifs_sb, int xid); 450 struct cifs_sb_info *cifs_sb, int xid);
450extern int mdfour(unsigned char *, unsigned char *, int); 451extern int mdfour(unsigned char *, unsigned char *, int);
451extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); 452extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
453 const struct nls_table *codepage);
452extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, 454extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
453 unsigned char *p24); 455 unsigned char *p24);
454 456
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d545a95c30e..8cd4b52d421 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -37,6 +37,7 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/module.h>
40#include <net/ipv6.h> 41#include <net/ipv6.h>
41#include "cifspdu.h" 42#include "cifspdu.h"
42#include "cifsglob.h" 43#include "cifsglob.h"
@@ -440,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
440 smb_msg.msg_controllen = 0; 441 smb_msg.msg_controllen = 0;
441 442
442 for (total_read = 0; to_read; total_read += length, to_read -= length) { 443 for (total_read = 0; to_read; total_read += length, to_read -= length) {
444 try_to_freeze();
445
443 if (server_unresponsive(server)) { 446 if (server_unresponsive(server)) {
444 total_read = -EAGAIN; 447 total_read = -EAGAIN;
445 break; 448 break;
@@ -3452,7 +3455,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
3452 else 3455 else
3453#endif /* CIFS_WEAK_PW_HASH */ 3456#endif /* CIFS_WEAK_PW_HASH */
3454 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, 3457 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3455 bcc_ptr); 3458 bcc_ptr, nls_codepage);
3456 3459
3457 bcc_ptr += CIFS_AUTH_RESP_SIZE; 3460 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3458 if (ses->capabilities & CAP_UNICODE) { 3461 if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ea096ce5d4f..4dd9283885e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -645,20 +645,20 @@ int cifs_closedir(struct inode *inode, struct file *file)
645} 645}
646 646
647static struct cifsLockInfo * 647static struct cifsLockInfo *
648cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) 648cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
649{ 649{
650 struct cifsLockInfo *li = 650 struct cifsLockInfo *lock =
651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); 651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
652 if (!li) 652 if (!lock)
653 return li; 653 return lock;
654 li->netfid = netfid; 654 lock->offset = offset;
655 li->offset = offset; 655 lock->length = length;
656 li->length = len; 656 lock->type = type;
657 li->type = type; 657 lock->netfid = netfid;
658 li->pid = current->tgid; 658 lock->pid = current->tgid;
659 INIT_LIST_HEAD(&li->blist); 659 INIT_LIST_HEAD(&lock->blist);
660 init_waitqueue_head(&li->block_q); 660 init_waitqueue_head(&lock->block_q);
661 return li; 661 return lock;
662} 662}
663 663
664static void 664static void
@@ -672,7 +672,7 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
672} 672}
673 673
674static bool 674static bool
675cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, 675__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
676 __u64 length, __u8 type, __u16 netfid, 676 __u64 length, __u8 type, __u16 netfid,
677 struct cifsLockInfo **conf_lock) 677 struct cifsLockInfo **conf_lock)
678{ 678{
@@ -694,6 +694,21 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
694 return false; 694 return false;
695} 695}
696 696
697static bool
698cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
699 struct cifsLockInfo **conf_lock)
700{
701 return __cifs_find_lock_conflict(cinode, lock->offset, lock->length,
702 lock->type, lock->netfid, conf_lock);
703}
704
705/*
706 * Check if there is another lock that prevents us from setting the lock (mandatory
707 * style). If such a lock exists, update the flock structure with its
708 * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
709 * or leave it the same if we can't. Returns 0 if we don't need to request to
710 * the server or 1 otherwise.
711 */
697static int 712static int
698cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 713cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
699 __u8 type, __u16 netfid, struct file_lock *flock) 714 __u8 type, __u16 netfid, struct file_lock *flock)
@@ -704,8 +719,8 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
704 719
705 mutex_lock(&cinode->lock_mutex); 720 mutex_lock(&cinode->lock_mutex);
706 721
707 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, 722 exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid,
708 &conf_lock); 723 &conf_lock);
709 if (exist) { 724 if (exist) {
710 flock->fl_start = conf_lock->offset; 725 flock->fl_start = conf_lock->offset;
711 flock->fl_end = conf_lock->offset + conf_lock->length - 1; 726 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -723,40 +738,33 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
723 return rc; 738 return rc;
724} 739}
725 740
726static int 741static void
727cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, 742cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
728 __u8 type, __u16 netfid)
729{ 743{
730 struct cifsLockInfo *li;
731
732 li = cifs_lock_init(len, offset, type, netfid);
733 if (!li)
734 return -ENOMEM;
735
736 mutex_lock(&cinode->lock_mutex); 744 mutex_lock(&cinode->lock_mutex);
737 list_add_tail(&li->llist, &cinode->llist); 745 list_add_tail(&lock->llist, &cinode->llist);
738 mutex_unlock(&cinode->lock_mutex); 746 mutex_unlock(&cinode->lock_mutex);
739 return 0;
740} 747}
741 748
749/*
750 * Set the byte-range lock (mandatory style). Returns:
751 * 1) 0, if we set the lock and don't need to request to the server;
752 * 2) 1, if no locks prevent us but we need to request to the server;
753 * 3) -EACCES, if there is a lock that prevents us and wait is false.
754 */
742static int 755static int
743cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 756cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
744 __u8 type, __u16 netfid, bool wait) 757 bool wait)
745{ 758{
746 struct cifsLockInfo *lock, *conf_lock; 759 struct cifsLockInfo *conf_lock;
747 bool exist; 760 bool exist;
748 int rc = 0; 761 int rc = 0;
749 762
750 lock = cifs_lock_init(length, offset, type, netfid);
751 if (!lock)
752 return -ENOMEM;
753
754try_again: 763try_again:
755 exist = false; 764 exist = false;
756 mutex_lock(&cinode->lock_mutex); 765 mutex_lock(&cinode->lock_mutex);
757 766
758 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, 767 exist = cifs_find_lock_conflict(cinode, lock, &conf_lock);
759 &conf_lock);
760 if (!exist && cinode->can_cache_brlcks) { 768 if (!exist && cinode->can_cache_brlcks) {
761 list_add_tail(&lock->llist, &cinode->llist); 769 list_add_tail(&lock->llist, &cinode->llist);
762 mutex_unlock(&cinode->lock_mutex); 770 mutex_unlock(&cinode->lock_mutex);
@@ -775,18 +783,21 @@ try_again:
775 (lock->blist.next == &lock->blist)); 783 (lock->blist.next == &lock->blist));
776 if (!rc) 784 if (!rc)
777 goto try_again; 785 goto try_again;
778 else { 786 mutex_lock(&cinode->lock_mutex);
779 mutex_lock(&cinode->lock_mutex); 787 list_del_init(&lock->blist);
780 list_del_init(&lock->blist);
781 mutex_unlock(&cinode->lock_mutex);
782 }
783 } 788 }
784 789
785 kfree(lock);
786 mutex_unlock(&cinode->lock_mutex); 790 mutex_unlock(&cinode->lock_mutex);
787 return rc; 791 return rc;
788} 792}
789 793
794/*
795 * Check if there is another lock that prevents us from setting the lock (posix
796 * style). If such a lock exists, update the flock structure with its
797 * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
798 * or leave it the same if we can't. Returns 0 if we don't need to request to
799 * the server or 1 otherwise.
800 */
790static int 801static int
791cifs_posix_lock_test(struct file *file, struct file_lock *flock) 802cifs_posix_lock_test(struct file *file, struct file_lock *flock)
792{ 803{
@@ -794,6 +805,9 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
794 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); 805 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
795 unsigned char saved_type = flock->fl_type; 806 unsigned char saved_type = flock->fl_type;
796 807
808 if ((flock->fl_flags & FL_POSIX) == 0)
809 return 1;
810
797 mutex_lock(&cinode->lock_mutex); 811 mutex_lock(&cinode->lock_mutex);
798 posix_test_lock(file, flock); 812 posix_test_lock(file, flock);
799 813
@@ -806,16 +820,25 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
806 return rc; 820 return rc;
807} 821}
808 822
823/*
824 * Set the byte-range lock (posix style). Returns:
825 * 1) 0, if we set the lock and don't need to request to the server;
826 * 2) 1, if we need to request to the server;
827 * 3) <0, if an error occurs while setting the lock.
828 */
809static int 829static int
810cifs_posix_lock_set(struct file *file, struct file_lock *flock) 830cifs_posix_lock_set(struct file *file, struct file_lock *flock)
811{ 831{
812 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); 832 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
813 int rc; 833 int rc = 1;
834
835 if ((flock->fl_flags & FL_POSIX) == 0)
836 return rc;
814 837
815 mutex_lock(&cinode->lock_mutex); 838 mutex_lock(&cinode->lock_mutex);
816 if (!cinode->can_cache_brlcks) { 839 if (!cinode->can_cache_brlcks) {
817 mutex_unlock(&cinode->lock_mutex); 840 mutex_unlock(&cinode->lock_mutex);
818 return 1; 841 return rc;
819 } 842 }
820 rc = posix_lock_file_wait(file, flock); 843 rc = posix_lock_file_wait(file, flock);
821 mutex_unlock(&cinode->lock_mutex); 844 mutex_unlock(&cinode->lock_mutex);
@@ -928,7 +951,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
928 else 951 else
929 type = CIFS_WRLCK; 952 type = CIFS_WRLCK;
930 953
931 lck = cifs_lock_init(length, flock->fl_start, type, 954 lck = cifs_lock_init(flock->fl_start, length, type,
932 cfile->netfid); 955 cfile->netfid);
933 if (!lck) { 956 if (!lck) {
934 rc = -ENOMEM; 957 rc = -ENOMEM;
@@ -1065,14 +1088,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
1065 if (rc != 0) 1088 if (rc != 0)
1066 cERROR(1, "Error unlocking previously locked " 1089 cERROR(1, "Error unlocking previously locked "
1067 "range %d during test of lock", rc); 1090 "range %d during test of lock", rc);
1068 rc = 0; 1091 return 0;
1069 return rc;
1070 } 1092 }
1071 1093
1072 if (type & LOCKING_ANDX_SHARED_LOCK) { 1094 if (type & LOCKING_ANDX_SHARED_LOCK) {
1073 flock->fl_type = F_WRLCK; 1095 flock->fl_type = F_WRLCK;
1074 rc = 0; 1096 return 0;
1075 return rc;
1076 } 1097 }
1077 1098
1078 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, 1099 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
@@ -1090,8 +1111,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
1090 } else 1111 } else
1091 flock->fl_type = F_WRLCK; 1112 flock->fl_type = F_WRLCK;
1092 1113
1093 rc = 0; 1114 return 0;
1094 return rc;
1095} 1115}
1096 1116
1097static void 1117static void
@@ -1249,20 +1269,26 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
1249 } 1269 }
1250 1270
1251 if (lock) { 1271 if (lock) {
1252 rc = cifs_lock_add_if(cinode, flock->fl_start, length, 1272 struct cifsLockInfo *lock;
1253 type, netfid, wait_flag); 1273
1274 lock = cifs_lock_init(flock->fl_start, length, type, netfid);
1275 if (!lock)
1276 return -ENOMEM;
1277
1278 rc = cifs_lock_add_if(cinode, lock, wait_flag);
1254 if (rc < 0) 1279 if (rc < 0)
1255 return rc; 1280 kfree(lock);
1256 else if (!rc) 1281 if (rc <= 0)
1257 goto out; 1282 goto out;
1258 1283
1259 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, 1284 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
1260 flock->fl_start, 0, 1, type, wait_flag, 0); 1285 flock->fl_start, 0, 1, type, wait_flag, 0);
1261 if (rc == 0) { 1286 if (rc) {
1262 /* For Windows locks we must store them. */ 1287 kfree(lock);
1263 rc = cifs_lock_add(cinode, length, flock->fl_start, 1288 goto out;
1264 type, netfid);
1265 } 1289 }
1290
1291 cifs_lock_add(cinode, lock);
1266 } else if (unlock) 1292 } else if (unlock)
1267 rc = cifs_unlock_range(cfile, flock, xid); 1293 rc = cifs_unlock_range(cfile, flock, xid);
1268 1294
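The reworked cifs_setlk() path above changes who owns the cifsLockInfo allocation: it is created up front with cifs_lock_init(), cifs_lock_add_if() only reports whether the server must be asked, and the structure is freed on any failure or finally queued with cifs_lock_add() once the server granted it. A user-space model of that ownership flow (an illustration, not the CIFS code; all helpers below are stand-ins):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct lock_info { unsigned long long offset, length; };

/* 0 = cached locally, 1 = server must be asked, <0 = conflicting lock */
static int lock_add_if(struct lock_info *lock, int conflict, int can_cache)
{
	(void)lock;
	if (conflict)
		return -EACCES;
	return can_cache ? 0 : 1;
}

static int server_lock(const struct lock_info *lock)
{
	(void)lock;
	return 0;	/* pretend the server granted the lock */
}

static void lock_add(struct lock_info *lock)
{
	/* models list_add_tail(&lock->llist, &cinode->llist) */
	printf("stored lock %llu+%llu\n", lock->offset, lock->length);
	free(lock);	/* the model has no real list to keep it on */
}

static int setlk(unsigned long long off, unsigned long long len,
		 int conflict, int can_cache)
{
	struct lock_info *lock = malloc(sizeof(*lock));
	int rc;

	if (!lock)
		return -ENOMEM;
	lock->offset = off;
	lock->length = len;

	rc = lock_add_if(lock, conflict, can_cache);
	if (rc < 0)
		free(lock);		/* conflict: nothing was queued */
	if (rc <= 0)
		return rc;		/* 0 = cached, <0 = error */

	rc = server_lock(lock);
	if (rc) {
		free(lock);		/* server refused: drop the allocation */
		return rc;
	}
	lock_add(lock);			/* only a granted lock is stored */
	return 0;
}

int main(void)
{
	printf("rc = %d\n", setlk(0, 4096, 0, 0));
	return 0;
}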
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5de03ec2014..a090bbe6ee2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
554 rc); 554 rc);
555 return rc; 555 return rc;
556 } 556 }
557 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 557 /* FindFirst/Next set last_entry to NULL on malformed reply */
558 if (cifsFile->srch_inf.last_entry)
559 cifs_save_resume_key(cifsFile->srch_inf.last_entry,
560 cifsFile);
558 } 561 }
559 562
560 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 563 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
@@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
562 cFYI(1, "calling findnext2"); 565 cFYI(1, "calling findnext2");
563 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 566 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
564 &cifsFile->srch_inf); 567 &cifsFile->srch_inf);
565 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 568 /* FindFirst/Next set last_entry to NULL on malformed reply */
569 if (cifsFile->srch_inf.last_entry)
570 cifs_save_resume_key(cifsFile->srch_inf.last_entry,
571 cifsFile);
566 if (rc) 572 if (rc)
567 return -ENOENT; 573 return -ENOENT;
568 } 574 }
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c7d80e24f24..4ec3ee9d72c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -683,7 +683,7 @@ ssetup_ntlmssp_authenticate:
683 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 683 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
684 684
685 /* calculate ntlm response and session key */ 685 /* calculate ntlm response and session key */
686 rc = setup_ntlm_response(ses); 686 rc = setup_ntlm_response(ses, nls_cp);
687 if (rc) { 687 if (rc) {
688 cERROR(1, "Error %d during NTLM authentication", rc); 688 cERROR(1, "Error %d during NTLM authentication", rc);
689 goto ssetup_exit; 689 goto ssetup_exit;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index ac1221d969d..80d85088193 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -199,75 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
199 return rc; 199 return rc;
200} 200}
201 201
202/* Routines for Windows NT MD4 Hash functions. */
203static int
204_my_wcslen(__u16 *str)
205{
206 int len = 0;
207 while (*str++ != 0)
208 len++;
209 return len;
210}
211
212/*
213 * Convert a string into an NT UNICODE string.
214 * Note that regardless of processor type
215 * this must be in intel (little-endian)
216 * format.
217 */
218
219static int
220_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
221{ /* BB not a very good conversion routine - change/fix */
222 int i;
223 __u16 val;
224
225 for (i = 0; i < len; i++) {
226 val = *src;
227 SSVAL(dst, 0, val);
228 dst++;
229 src++;
230 if (val == 0)
231 break;
232 }
233 return i;
234}
235
236/* 202/*
237 * Creates the MD4 Hash of the user's password in NT UNICODE. 203 * Creates the MD4 Hash of the user's password in NT UNICODE.
238 */ 204 */
239 205
240int 206int
241E_md4hash(const unsigned char *passwd, unsigned char *p16) 207E_md4hash(const unsigned char *passwd, unsigned char *p16,
208 const struct nls_table *codepage)
242{ 209{
243 int rc; 210 int rc;
244 int len; 211 int len;
245 __u16 wpwd[129]; 212 __le16 wpwd[129];
246 213
247 /* Password cannot be longer than 128 characters */ 214 /* Password cannot be longer than 128 characters */
248 if (passwd) { 215 if (passwd) /* Password must be converted to NT unicode */
249 len = strlen((char *) passwd); 216 len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
250 if (len > 128) 217 else {
251 len = 128;
252
253 /* Password must be converted to NT unicode */
254 _my_mbstowcs(wpwd, passwd, len);
255 } else
256 len = 0; 218 len = 0;
219 *wpwd = 0; /* Ensure string is null terminated */
220 }
257 221
258 wpwd[len] = 0; /* Ensure string is null terminated */ 222 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
259 /* Calculate length in bytes */ 223 memset(wpwd, 0, 129 * sizeof(__le16));
260 len = _my_wcslen(wpwd) * sizeof(__u16);
261
262 rc = mdfour(p16, (unsigned char *) wpwd, len);
263 memset(wpwd, 0, 129 * 2);
264 224
265 return rc; 225 return rc;
266} 226}
267 227
268/* Does the NT MD4 hash then des encryption. */ 228/* Does the NT MD4 hash then des encryption. */
269int 229int
270SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 230SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
231 const struct nls_table *codepage)
271{ 232{
272 int rc; 233 int rc;
273 unsigned char p16[16], p21[21]; 234 unsigned char p16[16], p21[21];
@@ -275,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
275 memset(p16, '\0', 16); 236 memset(p16, '\0', 16);
276 memset(p21, '\0', 21); 237 memset(p21, '\0', 21);
277 238
278 rc = E_md4hash(passwd, p16); 239 rc = E_md4hash(passwd, p16, codepage);
279 if (rc) { 240 if (rc) {
280 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 241 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
281 return rc; 242 return rc;
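The point of threading the nls_table through E_md4hash()/SMBNTencrypt() above is that the removed helper simply widened each password byte to 16 bits, which is only correct for plain ASCII. A tiny stand-alone illustration (simplified and assumed, not the CIFS converter) of how the two conversions diverge for a non-ASCII byte, and therefore produce different NT hashes:

#include <stdio.h>

int main(void)
{
	unsigned char byte = 0x82;		/* 'e acute' in CP850 */

	unsigned short old_widen = byte;	/* old _my_mbstowcs behaviour: U+0082 */
	unsigned short codepage  = 0x00E9;	/* what a CP850 NLS table maps it to */

	printf("old conversion: U+%04X, codepage-aware: U+%04X\n",
	       old_widen, codepage);
	/* different UTF-16 input means a different MD4, hence a failed login */
	return 0;
}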
diff --git a/fs/dcache.c b/fs/dcache.c
index 274f13e2f09..89509b5a090 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -36,6 +36,7 @@
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h> 37#include <linux/rculist_bl.h>
38#include <linux/prefetch.h> 38#include <linux/prefetch.h>
39#include <linux/ratelimit.h>
39#include "internal.h" 40#include "internal.h"
40 41
41/* 42/*
@@ -546,9 +547,11 @@ int d_invalidate(struct dentry * dentry)
546 * would make it unreachable from the root, 547 * would make it unreachable from the root,
547 * we might still populate it if it was a 548 * we might still populate it if it was a
548 * working directory or similar). 549 * working directory or similar).
550 * We also need to leave mountpoints alone,
551 * directory or not.
549 */ 552 */
550 if (dentry->d_count > 1) { 553 if (dentry->d_count > 1 && dentry->d_inode) {
551 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 554 if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
552 spin_unlock(&dentry->d_lock); 555 spin_unlock(&dentry->d_lock);
553 return -EBUSY; 556 return -EBUSY;
554 } 557 }
@@ -2381,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2381 actual = __d_unalias(inode, dentry, alias); 2384 actual = __d_unalias(inode, dentry, alias);
2382 } 2385 }
2383 write_sequnlock(&rename_lock); 2386 write_sequnlock(&rename_lock);
2384 if (IS_ERR(actual)) 2387 if (IS_ERR(actual)) {
2388 if (PTR_ERR(actual) == -ELOOP)
2389 pr_warn_ratelimited(
2390 "VFS: Lookup of '%s' in %s %s"
2391 " would have caused loop\n",
2392 dentry->d_name.name,
2393 inode->i_sb->s_type->name,
2394 inode->i_sb->s_id);
2385 dput(alias); 2395 dput(alias);
2396 }
2386 goto out_nolock; 2397 goto out_nolock;
2387 } 2398 }
2388 } 2399 }
@@ -2428,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2428/** 2439/**
2429 * prepend_path - Prepend path string to a buffer 2440 * prepend_path - Prepend path string to a buffer
2430 * @path: the dentry/vfsmount to report 2441 * @path: the dentry/vfsmount to report
2431 * @root: root vfsmnt/dentry (may be modified by this function) 2442 * @root: root vfsmnt/dentry
2432 * @buffer: pointer to the end of the buffer 2443 * @buffer: pointer to the end of the buffer
2433 * @buflen: pointer to buffer length 2444 * @buflen: pointer to buffer length
2434 * 2445 *
2435 * Caller holds the rename_lock. 2446 * Caller holds the rename_lock.
2436 *
2437 * If path is not reachable from the supplied root, then the value of
2438 * root is changed (without modifying refcounts).
2439 */ 2447 */
2440static int prepend_path(const struct path *path, struct path *root, 2448static int prepend_path(const struct path *path,
2449 const struct path *root,
2441 char **buffer, int *buflen) 2450 char **buffer, int *buflen)
2442{ 2451{
2443 struct dentry *dentry = path->dentry; 2452 struct dentry *dentry = path->dentry;
@@ -2472,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root,
2472 dentry = parent; 2481 dentry = parent;
2473 } 2482 }
2474 2483
2475out:
2476 if (!error && !slash) 2484 if (!error && !slash)
2477 error = prepend(buffer, buflen, "/", 1); 2485 error = prepend(buffer, buflen, "/", 1);
2478 2486
2487out:
2479 br_read_unlock(vfsmount_lock); 2488 br_read_unlock(vfsmount_lock);
2480 return error; 2489 return error;
2481 2490
@@ -2489,15 +2498,17 @@ global_root:
2489 WARN(1, "Root dentry has weird name <%.*s>\n", 2498 WARN(1, "Root dentry has weird name <%.*s>\n",
2490 (int) dentry->d_name.len, dentry->d_name.name); 2499 (int) dentry->d_name.len, dentry->d_name.name);
2491 } 2500 }
2492 root->mnt = vfsmnt; 2501 if (!slash)
2493 root->dentry = dentry; 2502 error = prepend(buffer, buflen, "/", 1);
2503 if (!error)
2504 error = vfsmnt->mnt_ns ? 1 : 2;
2494 goto out; 2505 goto out;
2495} 2506}
2496 2507
2497/** 2508/**
2498 * __d_path - return the path of a dentry 2509 * __d_path - return the path of a dentry
2499 * @path: the dentry/vfsmount to report 2510 * @path: the dentry/vfsmount to report
2500 * @root: root vfsmnt/dentry (may be modified by this function) 2511 * @root: root vfsmnt/dentry
2501 * @buf: buffer to return value in 2512 * @buf: buffer to return value in
2502 * @buflen: buffer length 2513 * @buflen: buffer length
2503 * 2514 *
@@ -2508,10 +2519,10 @@ global_root:
2508 * 2519 *
2509 * "buflen" should be positive. 2520 * "buflen" should be positive.
2510 * 2521 *
2511 * If path is not reachable from the supplied root, then the value of 2522 * If the path is not reachable from the supplied root, return %NULL.
2512 * root is changed (without modifying refcounts).
2513 */ 2523 */
2514char *__d_path(const struct path *path, struct path *root, 2524char *__d_path(const struct path *path,
2525 const struct path *root,
2515 char *buf, int buflen) 2526 char *buf, int buflen)
2516{ 2527{
2517 char *res = buf + buflen; 2528 char *res = buf + buflen;
@@ -2522,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root,
2522 error = prepend_path(path, root, &res, &buflen); 2533 error = prepend_path(path, root, &res, &buflen);
2523 write_sequnlock(&rename_lock); 2534 write_sequnlock(&rename_lock);
2524 2535
2525 if (error) 2536 if (error < 0)
2537 return ERR_PTR(error);
2538 if (error > 0)
2539 return NULL;
2540 return res;
2541}
2542
2543char *d_absolute_path(const struct path *path,
2544 char *buf, int buflen)
2545{
2546 struct path root = {};
2547 char *res = buf + buflen;
2548 int error;
2549
2550 prepend(&res, &buflen, "\0", 1);
2551 write_seqlock(&rename_lock);
2552 error = prepend_path(path, &root, &res, &buflen);
2553 write_sequnlock(&rename_lock);
2554
2555 if (error > 1)
2556 error = -EINVAL;
2557 if (error < 0)
2526 return ERR_PTR(error); 2558 return ERR_PTR(error);
2527 return res; 2559 return res;
2528} 2560}
@@ -2530,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root,
2530/* 2562/*
2531 * same as __d_path but appends "(deleted)" for unlinked files. 2563 * same as __d_path but appends "(deleted)" for unlinked files.
2532 */ 2564 */
2533static int path_with_deleted(const struct path *path, struct path *root, 2565static int path_with_deleted(const struct path *path,
2534 char **buf, int *buflen) 2566 const struct path *root,
2567 char **buf, int *buflen)
2535{ 2568{
2536 prepend(buf, buflen, "\0", 1); 2569 prepend(buf, buflen, "\0", 1);
2537 if (d_unlinked(path->dentry)) { 2570 if (d_unlinked(path->dentry)) {
@@ -2568,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen)
2568{ 2601{
2569 char *res = buf + buflen; 2602 char *res = buf + buflen;
2570 struct path root; 2603 struct path root;
2571 struct path tmp;
2572 int error; 2604 int error;
2573 2605
2574 /* 2606 /*
@@ -2583,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen)
2583 2615
2584 get_fs_root(current->fs, &root); 2616 get_fs_root(current->fs, &root);
2585 write_seqlock(&rename_lock); 2617 write_seqlock(&rename_lock);
2586 tmp = root; 2618 error = path_with_deleted(path, &root, &res, &buflen);
2587 error = path_with_deleted(path, &tmp, &res, &buflen); 2619 if (error < 0)
2588 if (error)
2589 res = ERR_PTR(error); 2620 res = ERR_PTR(error);
2590 write_sequnlock(&rename_lock); 2621 write_sequnlock(&rename_lock);
2591 path_put(&root); 2622 path_put(&root);
@@ -2606,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2606{ 2637{
2607 char *res = buf + buflen; 2638 char *res = buf + buflen;
2608 struct path root; 2639 struct path root;
2609 struct path tmp;
2610 int error; 2640 int error;
2611 2641
2612 if (path->dentry->d_op && path->dentry->d_op->d_dname) 2642 if (path->dentry->d_op && path->dentry->d_op->d_dname)
@@ -2614,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2614 2644
2615 get_fs_root(current->fs, &root); 2645 get_fs_root(current->fs, &root);
2616 write_seqlock(&rename_lock); 2646 write_seqlock(&rename_lock);
2617 tmp = root; 2647 error = path_with_deleted(path, &root, &res, &buflen);
2618 error = path_with_deleted(path, &tmp, &res, &buflen); 2648 if (error > 0)
2619 if (!error && !path_equal(&tmp, &root))
2620 error = prepend_unreachable(&res, &buflen); 2649 error = prepend_unreachable(&res, &buflen);
2621 write_sequnlock(&rename_lock); 2650 write_sequnlock(&rename_lock);
2622 path_put(&root); 2651 path_put(&root);
@@ -2747,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2747 write_seqlock(&rename_lock); 2776 write_seqlock(&rename_lock);
2748 if (!d_unlinked(pwd.dentry)) { 2777 if (!d_unlinked(pwd.dentry)) {
2749 unsigned long len; 2778 unsigned long len;
2750 struct path tmp = root;
2751 char *cwd = page + PAGE_SIZE; 2779 char *cwd = page + PAGE_SIZE;
2752 int buflen = PAGE_SIZE; 2780 int buflen = PAGE_SIZE;
2753 2781
2754 prepend(&cwd, &buflen, "\0", 1); 2782 prepend(&cwd, &buflen, "\0", 1);
2755 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2783 error = prepend_path(&pwd, &root, &cwd, &buflen);
2756 write_sequnlock(&rename_lock); 2784 write_sequnlock(&rename_lock);
2757 2785
2758 if (error) 2786 if (error < 0)
2759 goto out; 2787 goto out;
2760 2788
2761 /* Unreachable from current root */ 2789 /* Unreachable from current root */
2762 if (!path_equal(&tmp, &root)) { 2790 if (error > 0) {
2763 error = prepend_unreachable(&cwd, &buflen); 2791 error = prepend_unreachable(&cwd, &buflen);
2764 if (error) 2792 if (error)
2765 goto out; 2793 goto out;
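
The dcache.c hunks above move prepend_path() and its callers to a three-way return convention: a negative value is a hard error, zero means the path was fully built under the given root, and a positive value means the dentry is not reachable from that root (the caller then returns NULL, or prepends "(unreachable)"). A minimal userspace sketch of how a caller is expected to handle all three outcomes; build_path() and its arguments are simplified stand-ins, not the kernel functions:

#include <stdio.h>

/* Stand-in for prepend_path(): <0 error, 0 ok, >0 not reachable from root. */
static int build_path(int reachable, int fail, char **res)
{
	if (fail)
		return -12;			/* e.g. -ENOMEM */
	*res = reachable ? "/a/b/c" : "/detached/a/b/c";
	return reachable ? 0 : 1;
}

int main(void)
{
	char *res = NULL;
	int error = build_path(0, 0, &res);

	if (error < 0)
		return 1;				/* hard failure: would be ERR_PTR(error) */
	if (error > 0)
		printf("(unreachable)%s\n", res);	/* like prepend_unreachable() */
	else
		printf("%s\n", res);
	return 0;
}
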
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 58609bde3b9..2a834255c75 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals(
967 967
968/** 968/**
969 * ecryptfs_new_file_context 969 * ecryptfs_new_file_context
970 * @ecryptfs_dentry: The eCryptfs dentry 970 * @ecryptfs_inode: The eCryptfs inode
971 * 971 *
972 * If the crypto context for the file has not yet been established, 972 * If the crypto context for the file has not yet been established,
973 * this is where we do that. Establishing a new crypto context 973 * this is where we do that. Establishing a new crypto context
@@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals(
984 * 984 *
985 * Returns zero on success; non-zero otherwise 985 * Returns zero on success; non-zero otherwise
986 */ 986 */
987int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) 987int ecryptfs_new_file_context(struct inode *ecryptfs_inode)
988{ 988{
989 struct ecryptfs_crypt_stat *crypt_stat = 989 struct ecryptfs_crypt_stat *crypt_stat =
990 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 990 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
991 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 991 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
992 &ecryptfs_superblock_to_private( 992 &ecryptfs_superblock_to_private(
993 ecryptfs_dentry->d_sb)->mount_crypt_stat; 993 ecryptfs_inode->i_sb)->mount_crypt_stat;
994 int cipher_name_len; 994 int cipher_name_len;
995 int rc = 0; 995 int rc = 0;
996 996
@@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1299} 1299}
1300 1300
1301static int 1301static int
1302ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, 1302ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
1303 char *virt, size_t virt_len) 1303 char *virt, size_t virt_len)
1304{ 1304{
1305 int rc; 1305 int rc;
1306 1306
1307 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 1307 rc = ecryptfs_write_lower(ecryptfs_inode, virt,
1308 0, virt_len); 1308 0, virt_len);
1309 if (rc < 0) 1309 if (rc < 0)
1310 printk(KERN_ERR "%s: Error attempting to write header " 1310 printk(KERN_ERR "%s: Error attempting to write header "
@@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1338 1338
1339/** 1339/**
1340 * ecryptfs_write_metadata 1340 * ecryptfs_write_metadata
1341 * @ecryptfs_dentry: The eCryptfs dentry 1341 * @ecryptfs_dentry: The eCryptfs dentry, which should be negative
1342 * @ecryptfs_inode: The newly created eCryptfs inode
1342 * 1343 *
1343 * Write the file headers out. This will likely involve a userspace 1344 * Write the file headers out. This will likely involve a userspace
1344 * callout, in which the session key is encrypted with one or more 1345 * callout, in which the session key is encrypted with one or more
@@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1348 * 1349 *
1349 * Returns zero on success; non-zero on error 1350 * Returns zero on success; non-zero on error
1350 */ 1351 */
1351int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) 1352int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
1353 struct inode *ecryptfs_inode)
1352{ 1354{
1353 struct ecryptfs_crypt_stat *crypt_stat = 1355 struct ecryptfs_crypt_stat *crypt_stat =
1354 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 1356 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
1355 unsigned int order; 1357 unsigned int order;
1356 char *virt; 1358 char *virt;
1357 size_t virt_len; 1359 size_t virt_len;
@@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1391 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, 1393 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
1392 size); 1394 size);
1393 else 1395 else
1394 rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, 1396 rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
1395 virt_len); 1397 virt_len);
1396 if (rc) { 1398 if (rc) {
1397 printk(KERN_ERR "%s: Error writing metadata out to lower file; " 1399 printk(KERN_ERR "%s: Error writing metadata out to lower file; "
@@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
1943 1945
1944/* We could either offset on every reverse map or just pad some 0x00's 1946/* We could either offset on every reverse map or just pad some 0x00's
1945 * at the front here */ 1947 * at the front here */
1946static const unsigned char filename_rev_map[] = { 1948static const unsigned char filename_rev_map[256] = {
1947 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 1949 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
1948 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 1950 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
1949 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ 1951 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
@@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = {
1959 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 1961 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
1960 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 1962 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
1961 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ 1963 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
1962 0x3D, 0x3E, 0x3F 1964 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */
1963}; 1965};
1964 1966
1965/** 1967/**
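
Sizing filename_rev_map as [256] matters because the decoder indexes it with an arbitrary byte of the encoded name; with only 123 explicit initializers, the remaining entries are zero-filled, so every possible byte value is now an in-bounds index that maps to 0x00. A small standalone sketch of the same sizing pattern; the table contents here are illustrative, not eCryptfs's mapping:

#include <stdio.h>

/* 256 entries: indexing with any unsigned char can never run past the end.
 * Entries without an initializer are implicitly zero ("not a valid code"). */
static const unsigned char rev_map[256] = {
	['A'] = 1, ['B'] = 2, ['C'] = 3,	/* made-up values */
};

int main(void)
{
	unsigned char c = 0xFB;			/* byte outside the mapped range */

	printf("%u\n", rev_map[c]);		/* prints 0, no out-of-bounds read */
	return 0;
}
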
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 54481a3b2c7..a9f29b12fbf 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
584int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); 584int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
585int ecryptfs_encrypt_page(struct page *page); 585int ecryptfs_encrypt_page(struct page *page);
586int ecryptfs_decrypt_page(struct page *page); 586int ecryptfs_decrypt_page(struct page *page);
587int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 587int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
588 struct inode *ecryptfs_inode);
588int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 589int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
589int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 590int ecryptfs_new_file_context(struct inode *ecryptfs_inode);
590void ecryptfs_write_crypt_stat_flags(char *page_virt, 591void ecryptfs_write_crypt_stat_flags(char *page_virt,
591 struct ecryptfs_crypt_stat *crypt_stat, 592 struct ecryptfs_crypt_stat *crypt_stat,
592 size_t *written); 593 size_t *written);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c6ac98cf9ba..d3f95f941c4 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -139,6 +139,27 @@ out:
139 return rc; 139 return rc;
140} 140}
141 141
142static void ecryptfs_vma_close(struct vm_area_struct *vma)
143{
144 filemap_write_and_wait(vma->vm_file->f_mapping);
145}
146
147static const struct vm_operations_struct ecryptfs_file_vm_ops = {
148 .close = ecryptfs_vma_close,
149 .fault = filemap_fault,
150};
151
152static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma)
153{
154 int rc;
155
156 rc = generic_file_mmap(file, vma);
157 if (!rc)
158 vma->vm_ops = &ecryptfs_file_vm_ops;
159
160 return rc;
161}
162
142struct kmem_cache *ecryptfs_file_info_cache; 163struct kmem_cache *ecryptfs_file_info_cache;
143 164
144/** 165/**
@@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = {
349#ifdef CONFIG_COMPAT 370#ifdef CONFIG_COMPAT
350 .compat_ioctl = ecryptfs_compat_ioctl, 371 .compat_ioctl = ecryptfs_compat_ioctl,
351#endif 372#endif
352 .mmap = generic_file_mmap, 373 .mmap = ecryptfs_file_mmap,
353 .open = ecryptfs_open, 374 .open = ecryptfs_open,
354 .flush = ecryptfs_flush, 375 .flush = ecryptfs_flush,
355 .release = ecryptfs_release, 376 .release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a36d327f152..32f90a3ae63 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
172 * it. It will also update the eCryptfs directory inode to mimic the 172 * it. It will also update the eCryptfs directory inode to mimic the
173 * stat of the lower directory inode. 173 * stat of the lower directory inode.
174 * 174 *
175 * Returns zero on success; non-zero on error condition 175 * Returns the new eCryptfs inode on success; an ERR_PTR on error condition
176 */ 176 */
177static int 177static struct inode *
178ecryptfs_do_create(struct inode *directory_inode, 178ecryptfs_do_create(struct inode *directory_inode,
179 struct dentry *ecryptfs_dentry, int mode) 179 struct dentry *ecryptfs_dentry, int mode)
180{ 180{
181 int rc; 181 int rc;
182 struct dentry *lower_dentry; 182 struct dentry *lower_dentry;
183 struct dentry *lower_dir_dentry; 183 struct dentry *lower_dir_dentry;
184 struct inode *inode;
184 185
185 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 186 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
186 lower_dir_dentry = lock_parent(lower_dentry); 187 lower_dir_dentry = lock_parent(lower_dentry);
187 if (IS_ERR(lower_dir_dentry)) { 188 if (IS_ERR(lower_dir_dentry)) {
188 ecryptfs_printk(KERN_ERR, "Error locking directory of " 189 ecryptfs_printk(KERN_ERR, "Error locking directory of "
189 "dentry\n"); 190 "dentry\n");
190 rc = PTR_ERR(lower_dir_dentry); 191 inode = ERR_CAST(lower_dir_dentry);
191 goto out; 192 goto out;
192 } 193 }
193 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, 194 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
@@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode,
195 if (rc) { 196 if (rc) {
196 printk(KERN_ERR "%s: Failure to create dentry in lower fs; " 197 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
197 "rc = [%d]\n", __func__, rc); 198 "rc = [%d]\n", __func__, rc);
199 inode = ERR_PTR(rc);
198 goto out_lock; 200 goto out_lock;
199 } 201 }
200 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 202 inode = __ecryptfs_get_inode(lower_dentry->d_inode,
201 directory_inode->i_sb); 203 directory_inode->i_sb);
202 if (rc) { 204 if (IS_ERR(inode))
203 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
204 goto out_lock; 205 goto out_lock;
205 }
206 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); 206 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); 207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
208out_lock: 208out_lock:
209 unlock_dir(lower_dir_dentry); 209 unlock_dir(lower_dir_dentry);
210out: 210out:
211 return rc; 211 return inode;
212} 212}
213 213
214/** 214/**
@@ -219,26 +219,26 @@ out:
219 * 219 *
220 * Returns zero on success 220 * Returns zero on success
221 */ 221 */
222static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) 222static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
223 struct inode *ecryptfs_inode)
223{ 224{
224 struct ecryptfs_crypt_stat *crypt_stat = 225 struct ecryptfs_crypt_stat *crypt_stat =
225 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 226 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
226 int rc = 0; 227 int rc = 0;
227 228
228 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { 229 if (S_ISDIR(ecryptfs_inode->i_mode)) {
229 ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); 230 ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
230 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 231 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
231 goto out; 232 goto out;
232 } 233 }
233 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); 234 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
234 rc = ecryptfs_new_file_context(ecryptfs_dentry); 235 rc = ecryptfs_new_file_context(ecryptfs_inode);
235 if (rc) { 236 if (rc) {
236 ecryptfs_printk(KERN_ERR, "Error creating new file " 237 ecryptfs_printk(KERN_ERR, "Error creating new file "
237 "context; rc = [%d]\n", rc); 238 "context; rc = [%d]\n", rc);
238 goto out; 239 goto out;
239 } 240 }
240 rc = ecryptfs_get_lower_file(ecryptfs_dentry, 241 rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode);
241 ecryptfs_dentry->d_inode);
242 if (rc) { 242 if (rc) {
243 printk(KERN_ERR "%s: Error attempting to initialize " 243 printk(KERN_ERR "%s: Error attempting to initialize "
244 "the lower file for the dentry with name " 244 "the lower file for the dentry with name "
@@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
246 ecryptfs_dentry->d_name.name, rc); 246 ecryptfs_dentry->d_name.name, rc);
247 goto out; 247 goto out;
248 } 248 }
249 rc = ecryptfs_write_metadata(ecryptfs_dentry); 249 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
250 if (rc) 250 if (rc)
251 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); 251 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
252 ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); 252 ecryptfs_put_lower_file(ecryptfs_inode);
253out: 253out:
254 return rc; 254 return rc;
255} 255}
@@ -269,18 +269,28 @@ static int
269ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, 269ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
270 int mode, struct nameidata *nd) 270 int mode, struct nameidata *nd)
271{ 271{
272 struct inode *ecryptfs_inode;
272 int rc; 273 int rc;
273 274
274 /* ecryptfs_do_create() calls ecryptfs_interpose() */ 275 ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
275 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); 276 mode);
276 if (unlikely(rc)) { 277 if (unlikely(IS_ERR(ecryptfs_inode))) {
277 ecryptfs_printk(KERN_WARNING, "Failed to create file in" 278 ecryptfs_printk(KERN_WARNING, "Failed to create file in"
278 "lower filesystem\n"); 279 "lower filesystem\n");
280 rc = PTR_ERR(ecryptfs_inode);
279 goto out; 281 goto out;
280 } 282 }
281 /* At this point, a file exists on "disk"; we need to make sure 283 /* At this point, a file exists on "disk"; we need to make sure
282 * that this on disk file is prepared to be an ecryptfs file */ 284 * that this on disk file is prepared to be an ecryptfs file */
283 rc = ecryptfs_initialize_file(ecryptfs_dentry); 285 rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
286 if (rc) {
287 drop_nlink(ecryptfs_inode);
288 unlock_new_inode(ecryptfs_inode);
289 iput(ecryptfs_inode);
290 goto out;
291 }
292 d_instantiate(ecryptfs_dentry, ecryptfs_inode);
293 unlock_new_inode(ecryptfs_inode);
284out: 294out:
285 return rc; 295 return rc;
286} 296}
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index fa9a286c877..da42f32c49b 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -5,7 +5,7 @@
5# selected by any of the users. 5# selected by any of the users.
6config ORE 6config ORE
7 tristate 7 tristate
8 depends on EXOFS_FS 8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR 9 select ASYNC_XOR
10 default SCSI_OSD_ULD 10 default SCSI_OSD_ULD
11 11
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index fcfa86ae6fa..d271ad83720 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/module.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include <linux/lcm.h> 28#include <linux/lcm.h>
28 29
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 057b237b8b6..e6085ec192d 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -35,6 +35,7 @@
35#include <linux/parser.h> 35#include <linux/parser.h>
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/random.h> 37#include <linux/random.h>
38#include <linux/module.h>
38#include <linux/exportfs.h> 39#include <linux/exportfs.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40 41
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f6dba4505f1..12ccacda44e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
565 brelse(bitmap_bh); 565 brelse(bitmap_bh);
566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" 566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
567 ", computed = %llu, %llu\n", 567 ", computed = %llu, %llu\n",
568 EXT4_B2C(sbi, ext4_free_blocks_count(es)), 568 EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
569 desc_count, bitmap_count); 569 desc_count, bitmap_count);
570 return bitmap_count; 570 return bitmap_count;
571#else 571#else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cc5a6da030a..848f436df29 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2270,6 +2270,7 @@ retry:
2270 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2270 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2271 "%ld pages, ino %lu; err %d", __func__, 2271 "%ld pages, ino %lu; err %d", __func__,
2272 wbc->nr_to_write, inode->i_ino, ret); 2272 wbc->nr_to_write, inode->i_ino, ret);
2273 blk_finish_plug(&plug);
2273 goto out_writepages; 2274 goto out_writepages;
2274 } 2275 }
2275 2276
@@ -2372,7 +2373,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2372 * start pushing delalloc when 1/2 of free blocks are dirty. 2373 * start pushing delalloc when 1/2 of free blocks are dirty.
2373 */ 2374 */
2374 if (free_blocks < 2 * dirty_blocks) 2375 if (free_blocks < 2 * dirty_blocks)
2375 writeback_inodes_sb_if_idle(sb); 2376 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2376 2377
2377 return 0; 2378 return 0;
2378} 2379}
@@ -2806,8 +2807,8 @@ out:
2806 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 2807 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2807 2808
2808 /* queue the work to convert unwritten extents to written */ 2809 /* queue the work to convert unwritten extents to written */
2809 queue_work(wq, &io_end->work);
2810 iocb->private = NULL; 2810 iocb->private = NULL;
2811 queue_work(wq, &io_end->work);
2811 2812
2812 /* XXX: probably should move into the real I/O completion handler */ 2813 /* XXX: probably should move into the real I/O completion handler */
2813 inode_dio_done(inode); 2814 inode_dio_done(inode);
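
In the end_io hunk above, clearing iocb->private is moved in front of queue_work(): once the work item is queued it may run and complete the iocb on another CPU, after which the submitter must not touch it. A small pthread sketch of the same hand-off rule, with invented names (the worker owns and frees the object, so the producer finishes all of its bookkeeping before the hand-off; build with cc -pthread):

#include <pthread.h>
#include <stdlib.h>

struct io_end { int done; };

static void *worker(void *arg)
{
	struct io_end *io = arg;

	io->done = 1;
	free(io);			/* completion may free the object immediately */
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct io_end *io = calloc(1, sizeof(*io));
	struct io_end *private = io;

	/* Drop our reference BEFORE handing the object to the worker;
	 * touching it after pthread_create() would race with free(). */
	private = NULL;
	pthread_create(&t, NULL, worker, io);

	pthread_join(t, NULL);
	return private ? 1 : 0;
}
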
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9953d80145a..3858767ec67 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1683,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb,
1683 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1683 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1684 datacheck: 1684 datacheck:
1685 if (is_remount) { 1685 if (is_remount) {
1686 if (test_opt(sb, DATA_FLAGS) != data_opt) { 1686 if (!sbi->s_journal)
1687 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1688 else if (test_opt(sb, DATA_FLAGS) != data_opt) {
1687 ext4_msg(sb, KERN_ERR, 1689 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount"); 1690 "Cannot change data mode on remount");
1689 return 0; 1691 return 0;
@@ -3099,8 +3101,6 @@ static void ext4_destroy_lazyinit_thread(void)
3099} 3101}
3100 3102
3101static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3103static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3102 __releases(kernel_lock)
3103 __acquires(kernel_lock)
3104{ 3104{
3105 char *orig_data = kstrdup(data, GFP_KERNEL); 3105 char *orig_data = kstrdup(data, GFP_KERNEL);
3106 struct buffer_head *bh; 3106 struct buffer_head *bh;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b91e50..ac86f8b3e3c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,11 +41,23 @@ struct wb_writeback_work {
41 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
42 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
43 unsigned int for_background:1; 43 unsigned int for_background:1;
44 enum wb_reason reason; /* why was writeback initiated? */
44 45
45 struct list_head list; /* pending work list */ 46 struct list_head list; /* pending work list */
46 struct completion *done; /* set if the caller waits */ 47 struct completion *done; /* set if the caller waits */
47}; 48};
48 49
50const char *wb_reason_name[] = {
51 [WB_REASON_BACKGROUND] = "background",
52 [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages",
53 [WB_REASON_SYNC] = "sync",
54 [WB_REASON_PERIODIC] = "periodic",
55 [WB_REASON_LAPTOP_TIMER] = "laptop_timer",
56 [WB_REASON_FREE_MORE_MEM] = "free_more_memory",
57 [WB_REASON_FS_FREE_SPACE] = "fs_free_space",
58 [WB_REASON_FORKER_THREAD] = "forker_thread"
59};
60
49/* 61/*
50 * Include the creation of the trace points after defining the 62 * Include the creation of the trace points after defining the
51 * wb_writeback_work structure so that the definition remains local to this 63 * wb_writeback_work structure so that the definition remains local to this
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
115 127
116static void 128static void
117__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 129__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
118 bool range_cyclic) 130 bool range_cyclic, enum wb_reason reason)
119{ 131{
120 struct wb_writeback_work *work; 132 struct wb_writeback_work *work;
121 133
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
135 work->sync_mode = WB_SYNC_NONE; 147 work->sync_mode = WB_SYNC_NONE;
136 work->nr_pages = nr_pages; 148 work->nr_pages = nr_pages;
137 work->range_cyclic = range_cyclic; 149 work->range_cyclic = range_cyclic;
150 work->reason = reason;
138 151
139 bdi_queue_work(bdi, work); 152 bdi_queue_work(bdi, work);
140} 153}
@@ -143,6 +156,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
143 * bdi_start_writeback - start writeback 156 * bdi_start_writeback - start writeback
144 * @bdi: the backing device to write from 157 * @bdi: the backing device to write from
145 * @nr_pages: the number of pages to write 158 * @nr_pages: the number of pages to write
159 * @reason: reason why some writeback work was initiated
146 * 160 *
147 * Description: 161 * Description:
148 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 162 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
@@ -150,9 +164,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
150 * completion. Caller need not hold sb s_umount semaphore. 164 * completion. Caller need not hold sb s_umount semaphore.
151 * 165 *
152 */ 166 */
153void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 167void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
168 enum wb_reason reason)
154{ 169{
155 __bdi_start_writeback(bdi, nr_pages, true); 170 __bdi_start_writeback(bdi, nr_pages, true, reason);
156} 171}
157 172
158/** 173/**
@@ -251,7 +266,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
251 */ 266 */
252static int move_expired_inodes(struct list_head *delaying_queue, 267static int move_expired_inodes(struct list_head *delaying_queue,
253 struct list_head *dispatch_queue, 268 struct list_head *dispatch_queue,
254 unsigned long *older_than_this) 269 struct wb_writeback_work *work)
255{ 270{
256 LIST_HEAD(tmp); 271 LIST_HEAD(tmp);
257 struct list_head *pos, *node; 272 struct list_head *pos, *node;
@@ -262,8 +277,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 277
263 while (!list_empty(delaying_queue)) { 278 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 279 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 280 if (work->older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 281 inode_dirtied_after(inode, *work->older_than_this))
267 break; 282 break;
268 if (sb && sb != inode->i_sb) 283 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 284 do_sb_sort = 1;
@@ -302,13 +317,13 @@ out:
302 * | 317 * |
303 * +--> dequeue for IO 318 * +--> dequeue for IO
304 */ 319 */
305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 320static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
306{ 321{
307 int moved; 322 int moved;
308 assert_spin_locked(&wb->list_lock); 323 assert_spin_locked(&wb->list_lock);
309 list_splice_init(&wb->b_more_io, &wb->b_io); 324 list_splice_init(&wb->b_more_io, &wb->b_io);
310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 325 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
311 trace_writeback_queue_io(wb, older_than_this, moved); 326 trace_writeback_queue_io(wb, work, moved);
312} 327}
313 328
314static int write_inode(struct inode *inode, struct writeback_control *wbc) 329static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -641,31 +656,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
641 return wrote; 656 return wrote;
642} 657}
643 658
644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) 659long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
660 enum wb_reason reason)
645{ 661{
646 struct wb_writeback_work work = { 662 struct wb_writeback_work work = {
647 .nr_pages = nr_pages, 663 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE, 664 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1, 665 .range_cyclic = 1,
666 .reason = reason,
650 }; 667 };
651 668
652 spin_lock(&wb->list_lock); 669 spin_lock(&wb->list_lock);
653 if (list_empty(&wb->b_io)) 670 if (list_empty(&wb->b_io))
654 queue_io(wb, NULL); 671 queue_io(wb, &work);
655 __writeback_inodes_wb(wb, &work); 672 __writeback_inodes_wb(wb, &work);
656 spin_unlock(&wb->list_lock); 673 spin_unlock(&wb->list_lock);
657 674
658 return nr_pages - work.nr_pages; 675 return nr_pages - work.nr_pages;
659} 676}
660 677
661static inline bool over_bground_thresh(void) 678static bool over_bground_thresh(struct backing_dev_info *bdi)
662{ 679{
663 unsigned long background_thresh, dirty_thresh; 680 unsigned long background_thresh, dirty_thresh;
664 681
665 global_dirty_limits(&background_thresh, &dirty_thresh); 682 global_dirty_limits(&background_thresh, &dirty_thresh);
666 683
667 return (global_page_state(NR_FILE_DIRTY) + 684 if (global_page_state(NR_FILE_DIRTY) +
668 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 685 global_page_state(NR_UNSTABLE_NFS) > background_thresh)
686 return true;
687
688 if (bdi_stat(bdi, BDI_RECLAIMABLE) >
689 bdi_dirty_limit(bdi, background_thresh))
690 return true;
691
692 return false;
669} 693}
670 694
671/* 695/*
@@ -675,7 +699,7 @@ static inline bool over_bground_thresh(void)
675static void wb_update_bandwidth(struct bdi_writeback *wb, 699static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time) 700 unsigned long start_time)
677{ 701{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); 702 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
679} 703}
680 704
681/* 705/*
@@ -727,7 +751,7 @@ static long wb_writeback(struct bdi_writeback *wb,
727 * For background writeout, stop when we are below the 751 * For background writeout, stop when we are below the
728 * background dirty threshold 752 * background dirty threshold
729 */ 753 */
730 if (work->for_background && !over_bground_thresh()) 754 if (work->for_background && !over_bground_thresh(wb->bdi))
731 break; 755 break;
732 756
733 if (work->for_kupdate) { 757 if (work->for_kupdate) {
@@ -738,7 +762,7 @@ static long wb_writeback(struct bdi_writeback *wb,
738 762
739 trace_writeback_start(wb->bdi, work); 763 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io)) 764 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this); 765 queue_io(wb, work);
742 if (work->sb) 766 if (work->sb)
743 progress = writeback_sb_inodes(work->sb, wb, work); 767 progress = writeback_sb_inodes(work->sb, wb, work);
744 else 768 else
@@ -811,13 +835,14 @@ static unsigned long get_nr_dirty_pages(void)
811 835
812static long wb_check_background_flush(struct bdi_writeback *wb) 836static long wb_check_background_flush(struct bdi_writeback *wb)
813{ 837{
814 if (over_bground_thresh()) { 838 if (over_bground_thresh(wb->bdi)) {
815 839
816 struct wb_writeback_work work = { 840 struct wb_writeback_work work = {
817 .nr_pages = LONG_MAX, 841 .nr_pages = LONG_MAX,
818 .sync_mode = WB_SYNC_NONE, 842 .sync_mode = WB_SYNC_NONE,
819 .for_background = 1, 843 .for_background = 1,
820 .range_cyclic = 1, 844 .range_cyclic = 1,
845 .reason = WB_REASON_BACKGROUND,
821 }; 846 };
822 847
823 return wb_writeback(wb, &work); 848 return wb_writeback(wb, &work);
@@ -851,6 +876,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
851 .sync_mode = WB_SYNC_NONE, 876 .sync_mode = WB_SYNC_NONE,
852 .for_kupdate = 1, 877 .for_kupdate = 1,
853 .range_cyclic = 1, 878 .range_cyclic = 1,
879 .reason = WB_REASON_PERIODIC,
854 }; 880 };
855 881
856 return wb_writeback(wb, &work); 882 return wb_writeback(wb, &work);
@@ -969,7 +995,7 @@ int bdi_writeback_thread(void *data)
969 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 995 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
970 * the whole world. 996 * the whole world.
971 */ 997 */
972void wakeup_flusher_threads(long nr_pages) 998void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
973{ 999{
974 struct backing_dev_info *bdi; 1000 struct backing_dev_info *bdi;
975 1001
@@ -982,7 +1008,7 @@ void wakeup_flusher_threads(long nr_pages)
982 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1008 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
983 if (!bdi_has_dirty_io(bdi)) 1009 if (!bdi_has_dirty_io(bdi))
984 continue; 1010 continue;
985 __bdi_start_writeback(bdi, nr_pages, false); 1011 __bdi_start_writeback(bdi, nr_pages, false, reason);
986 } 1012 }
987 rcu_read_unlock(); 1013 rcu_read_unlock();
988} 1014}
@@ -1198,12 +1224,15 @@ static void wait_sb_inodes(struct super_block *sb)
1198 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 1224 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1199 * @sb: the superblock 1225 * @sb: the superblock
1200 * @nr: the number of pages to write 1226 * @nr: the number of pages to write
 1227 * @reason: reason why some writeback work was initiated
1201 * 1228 *
1202 * Start writeback on some inodes on this super_block. No guarantees are made 1229 * Start writeback on some inodes on this super_block. No guarantees are made
1203 * on how many (if any) will be written, and this function does not wait 1230 * on how many (if any) will be written, and this function does not wait
1204 * for IO completion of submitted IO. 1231 * for IO completion of submitted IO.
1205 */ 1232 */
1206void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1233void writeback_inodes_sb_nr(struct super_block *sb,
1234 unsigned long nr,
1235 enum wb_reason reason)
1207{ 1236{
1208 DECLARE_COMPLETION_ONSTACK(done); 1237 DECLARE_COMPLETION_ONSTACK(done);
1209 struct wb_writeback_work work = { 1238 struct wb_writeback_work work = {
@@ -1212,6 +1241,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1212 .tagged_writepages = 1, 1241 .tagged_writepages = 1,
1213 .done = &done, 1242 .done = &done,
1214 .nr_pages = nr, 1243 .nr_pages = nr,
1244 .reason = reason,
1215 }; 1245 };
1216 1246
1217 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1247 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1223,29 +1253,31 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1223/** 1253/**
1224 * writeback_inodes_sb - writeback dirty inodes from given super_block 1254 * writeback_inodes_sb - writeback dirty inodes from given super_block
1225 * @sb: the superblock 1255 * @sb: the superblock
1256 * @reason: reason why some writeback work was initiated
1226 * 1257 *
1227 * Start writeback on some inodes on this super_block. No guarantees are made 1258 * Start writeback on some inodes on this super_block. No guarantees are made
1228 * on how many (if any) will be written, and this function does not wait 1259 * on how many (if any) will be written, and this function does not wait
1229 * for IO completion of submitted IO. 1260 * for IO completion of submitted IO.
1230 */ 1261 */
1231void writeback_inodes_sb(struct super_block *sb) 1262void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1232{ 1263{
1233 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1264 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1234} 1265}
1235EXPORT_SYMBOL(writeback_inodes_sb); 1266EXPORT_SYMBOL(writeback_inodes_sb);
1236 1267
1237/** 1268/**
1238 * writeback_inodes_sb_if_idle - start writeback if none underway 1269 * writeback_inodes_sb_if_idle - start writeback if none underway
1239 * @sb: the superblock 1270 * @sb: the superblock
1271 * @reason: reason why some writeback work was initiated
1240 * 1272 *
1241 * Invoke writeback_inodes_sb if no writeback is currently underway. 1273 * Invoke writeback_inodes_sb if no writeback is currently underway.
1242 * Returns 1 if writeback was started, 0 if not. 1274 * Returns 1 if writeback was started, 0 if not.
1243 */ 1275 */
1244int writeback_inodes_sb_if_idle(struct super_block *sb) 1276int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
1245{ 1277{
1246 if (!writeback_in_progress(sb->s_bdi)) { 1278 if (!writeback_in_progress(sb->s_bdi)) {
1247 down_read(&sb->s_umount); 1279 down_read(&sb->s_umount);
1248 writeback_inodes_sb(sb); 1280 writeback_inodes_sb(sb, reason);
1249 up_read(&sb->s_umount); 1281 up_read(&sb->s_umount);
1250 return 1; 1282 return 1;
1251 } else 1283 } else
@@ -1257,16 +1289,18 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1257 * writeback_inodes_sb_if_idle - start writeback if none underway 1289 * writeback_inodes_sb_if_idle - start writeback if none underway
1258 * @sb: the superblock 1290 * @sb: the superblock
1259 * @nr: the number of pages to write 1291 * @nr: the number of pages to write
1292 * @reason: reason why some writeback work was initiated
1260 * 1293 *
1261 * Invoke writeback_inodes_sb if no writeback is currently underway. 1294 * Invoke writeback_inodes_sb if no writeback is currently underway.
1262 * Returns 1 if writeback was started, 0 if not. 1295 * Returns 1 if writeback was started, 0 if not.
1263 */ 1296 */
1264int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1297int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1265 unsigned long nr) 1298 unsigned long nr,
1299 enum wb_reason reason)
1266{ 1300{
1267 if (!writeback_in_progress(sb->s_bdi)) { 1301 if (!writeback_in_progress(sb->s_bdi)) {
1268 down_read(&sb->s_umount); 1302 down_read(&sb->s_umount);
1269 writeback_inodes_sb_nr(sb, nr); 1303 writeback_inodes_sb_nr(sb, nr, reason);
1270 up_read(&sb->s_umount); 1304 up_read(&sb->s_umount);
1271 return 1; 1305 return 1;
1272 } else 1306 } else
@@ -1290,6 +1324,7 @@ void sync_inodes_sb(struct super_block *sb)
1290 .nr_pages = LONG_MAX, 1324 .nr_pages = LONG_MAX,
1291 .range_cyclic = 0, 1325 .range_cyclic = 0,
1292 .done = &done, 1326 .done = &done,
1327 .reason = WB_REASON_SYNC,
1293 }; 1328 };
1294 1329
1295 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1330 WARN_ON(!rwsem_is_locked(&sb->s_umount));
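
The fs-writeback.c changes thread an enum wb_reason through every writeback work item so tracepoints can report why writeback was started, with wb_reason_name[] as the parallel string table. A compact userspace sketch of that enum-plus-name-table pattern; the values and the queue_writeback() helper are illustrative, not the kernel API:

#include <stdio.h>

enum wb_reason { WB_REASON_BACKGROUND, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_MAX };

static const char *wb_reason_name[WB_REASON_MAX] = {
	[WB_REASON_BACKGROUND]	= "background",
	[WB_REASON_SYNC]	= "sync",
	[WB_REASON_PERIODIC]	= "periodic",
};

struct work { long nr_pages; enum wb_reason reason; };

static void queue_writeback(long nr_pages, enum wb_reason reason)
{
	struct work w = { .nr_pages = nr_pages, .reason = reason };

	/* A tracepoint would report the symbolic reason here. */
	printf("writeback %ld pages, reason=%s\n", w.nr_pages, wb_reason_name[w.reason]);
}

int main(void)
{
	queue_writeback(1024, WB_REASON_SYNC);
	return 0;
}
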
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cca47f7b0..3426521f320 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -47,6 +47,7 @@
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/spinlock.h> 48#include <linux/spinlock.h>
49#include <linux/stat.h> 49#include <linux/stat.h>
50#include <linux/module.h>
50 51
51#include "fuse_i.h" 52#include "fuse_i.h"
52 53
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7e823bbd245..cb23c2be731 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -14,6 +14,7 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/export.h>
17#include <linux/namei.h> 18#include <linux/namei.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index e673a88b8ae..b1ce4c7ad3f 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
40 40
41 src = in->name; 41 src = in->name;
42 srclen = in->len; 42 srclen = in->len;
43 if (srclen > HFS_NAMELEN)
44 srclen = HFS_NAMELEN;
43 dst = out; 45 dst = out;
44 dstlen = HFS_MAX_NAMELEN; 46 dstlen = HFS_MAX_NAMELEN;
45 if (nls_io) { 47 if (nls_io) {
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7da2a06508e..f79dab83e17 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -21,6 +21,7 @@
21 */ 21 */
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/export.h>
24#include <linux/ioprio.h> 25#include <linux/ioprio.h>
25#include <linux/blkdev.h> 26#include <linux/blkdev.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index de4247021d2..5b6c9d1a2fb 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
53 return 0; 53 return 0;
54} 54}
55 55
56/*
57 * jffs2_selected_compress:
58 * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB).
59 * If 0, just take the first available compression mode.
60 * @data_in: Pointer to uncompressed data
61 * @cpage_out: Pointer to returned pointer to buffer for compressed data
62 * @datalen: On entry, holds the amount of data available for compression.
63 * On exit, expected to hold the amount of data actually compressed.
64 * @cdatalen: On entry, holds the amount of space available for compressed
65 * data. On exit, expected to hold the actual size of the compressed
66 * data.
67 *
68 * Returns: the compression type used. Zero is used to show that the data
69 * could not be compressed; probably because we couldn't find the requested
70 * compression mode.
71 */
72static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
73 unsigned char **cpage_out, u32 *datalen, u32 *cdatalen)
74{
75 struct jffs2_compressor *this;
76 int err, ret = JFFS2_COMPR_NONE;
77 uint32_t orig_slen, orig_dlen;
78 char *output_buf;
79
80 output_buf = kmalloc(*cdatalen, GFP_KERNEL);
81 if (!output_buf) {
82 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
83 return ret;
84 }
85 orig_slen = *datalen;
86 orig_dlen = *cdatalen;
87 spin_lock(&jffs2_compressor_list_lock);
88 list_for_each_entry(this, &jffs2_compressor_list, list) {
89 /* Skip decompress-only and disabled modules */
90 if (!this->compress || this->disabled)
91 continue;
92
93 /* Skip if not the desired compression type */
94 if (compr && (compr != this->compr))
95 continue;
96
97 /*
98 * Either compression type was unspecified, or we found our
99 * compressor; either way, we're good to go.
100 */
101 this->usecount++;
102 spin_unlock(&jffs2_compressor_list_lock);
103
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 err = this->compress(data_in, output_buf, datalen, cdatalen);
107
108 spin_lock(&jffs2_compressor_list_lock);
109 this->usecount--;
110 if (!err) {
111 /* Success */
112 ret = this->compr;
113 this->stat_compr_blocks++;
114 this->stat_compr_orig_size += *datalen;
115 this->stat_compr_new_size += *cdatalen;
116 break;
117 }
118 }
119 spin_unlock(&jffs2_compressor_list_lock);
120 if (ret == JFFS2_COMPR_NONE)
121 kfree(output_buf);
122 else
123 *cpage_out = output_buf;
124
125 return ret;
126}
127
56/* jffs2_compress: 128/* jffs2_compress:
57 * @data_in: Pointer to uncompressed data 129 * @data_in: Pointer to uncompressed data
58 * @cpage_out: Pointer to returned pointer to buffer for compressed data 130 * @cpage_out: Pointer to returned pointer to buffer for compressed data
@@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
76 uint32_t *datalen, uint32_t *cdatalen) 148 uint32_t *datalen, uint32_t *cdatalen)
77{ 149{
78 int ret = JFFS2_COMPR_NONE; 150 int ret = JFFS2_COMPR_NONE;
79 int compr_ret; 151 int mode, compr_ret;
80 struct jffs2_compressor *this, *best=NULL; 152 struct jffs2_compressor *this, *best=NULL;
81 unsigned char *output_buf = NULL, *tmp_buf; 153 unsigned char *output_buf = NULL, *tmp_buf;
82 uint32_t orig_slen, orig_dlen; 154 uint32_t orig_slen, orig_dlen;
83 uint32_t best_slen=0, best_dlen=0; 155 uint32_t best_slen=0, best_dlen=0;
84 156
85 switch (jffs2_compression_mode) { 157 if (c->mount_opts.override_compr)
158 mode = c->mount_opts.compr;
159 else
160 mode = jffs2_compression_mode;
161
162 switch (mode) {
86 case JFFS2_COMPR_MODE_NONE: 163 case JFFS2_COMPR_MODE_NONE:
87 break; 164 break;
88 case JFFS2_COMPR_MODE_PRIORITY: 165 case JFFS2_COMPR_MODE_PRIORITY:
89 output_buf = kmalloc(*cdatalen,GFP_KERNEL); 166 ret = jffs2_selected_compress(0, data_in, cpage_out, datalen,
90 if (!output_buf) { 167 cdatalen);
91 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
92 goto out;
93 }
94 orig_slen = *datalen;
95 orig_dlen = *cdatalen;
96 spin_lock(&jffs2_compressor_list_lock);
97 list_for_each_entry(this, &jffs2_compressor_list, list) {
98 /* Skip decompress-only backwards-compatibility and disabled modules */
99 if ((!this->compress)||(this->disabled))
100 continue;
101
102 this->usecount++;
103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--;
109 if (!compr_ret) {
110 ret = this->compr;
111 this->stat_compr_blocks++;
112 this->stat_compr_orig_size += *datalen;
113 this->stat_compr_new_size += *cdatalen;
114 break;
115 }
116 }
117 spin_unlock(&jffs2_compressor_list_lock);
118 if (ret == JFFS2_COMPR_NONE)
119 kfree(output_buf);
120 break; 168 break;
121 case JFFS2_COMPR_MODE_SIZE: 169 case JFFS2_COMPR_MODE_SIZE:
122 case JFFS2_COMPR_MODE_FAVOURLZO: 170 case JFFS2_COMPR_MODE_FAVOURLZO:
@@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
174 best->stat_compr_orig_size += best_slen; 222 best->stat_compr_orig_size += best_slen;
175 best->stat_compr_new_size += best_dlen; 223 best->stat_compr_new_size += best_dlen;
176 ret = best->compr; 224 ret = best->compr;
225 *cpage_out = output_buf;
177 } 226 }
178 spin_unlock(&jffs2_compressor_list_lock); 227 spin_unlock(&jffs2_compressor_list_lock);
179 break; 228 break;
229 case JFFS2_COMPR_MODE_FORCELZO:
230 ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
231 cpage_out, datalen, cdatalen);
232 break;
233 case JFFS2_COMPR_MODE_FORCEZLIB:
234 ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
235 cpage_out, datalen, cdatalen);
236 break;
180 default: 237 default:
181 printk(KERN_ERR "JFFS2: unknown compression mode.\n"); 238 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
182 } 239 }
183 out: 240
184 if (ret == JFFS2_COMPR_NONE) { 241 if (ret == JFFS2_COMPR_NONE) {
185 *cpage_out = data_in; 242 *cpage_out = data_in;
186 *datalen = *cdatalen; 243 *datalen = *cdatalen;
187 none_stat_compr_blocks++; 244 none_stat_compr_blocks++;
188 none_stat_compr_size += *datalen; 245 none_stat_compr_size += *datalen;
189 } 246 }
190 else {
191 *cpage_out = output_buf;
192 }
193 return ret; 247 return ret;
194} 248}
195 249
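
jffs2_selected_compress() folds the old PRIORITY-mode loop into a helper that either takes the first usable compressor (compr == 0) or accepts only the one explicitly requested, which is what the new FORCELZO/FORCEZLIB modes rely on. A stripped-down userspace sketch of that selection rule, with invented compressor ids and no actual compression:

#include <stdio.h>

enum { COMPR_NONE = 0, COMPR_ZLIB = 1, COMPR_LZO = 2 };

struct compressor { int id; int disabled; };

static struct compressor compressors[] = {
	{ COMPR_ZLIB, 0 },
	{ COMPR_LZO,  0 },
};

/* wanted == 0: take the first enabled compressor; otherwise require a match. */
static int select_compressor(int wanted)
{
	unsigned int i;

	for (i = 0; i < sizeof(compressors) / sizeof(compressors[0]); i++) {
		if (compressors[i].disabled)
			continue;
		if (wanted && wanted != compressors[i].id)
			continue;
		return compressors[i].id;
	}
	return COMPR_NONE;	/* nothing suitable: store the data uncompressed */
}

int main(void)
{
	printf("priority mode -> %d\n", select_compressor(0));
	printf("forced lzo    -> %d\n", select_compressor(COMPR_LZO));
	return 0;
}
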
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 13bb7597ab3..5e91d578f4e 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -40,6 +40,8 @@
40#define JFFS2_COMPR_MODE_PRIORITY 1 40#define JFFS2_COMPR_MODE_PRIORITY 1
41#define JFFS2_COMPR_MODE_SIZE 2 41#define JFFS2_COMPR_MODE_SIZE 2
42#define JFFS2_COMPR_MODE_FAVOURLZO 3 42#define JFFS2_COMPR_MODE_FAVOURLZO 3
43#define JFFS2_COMPR_MODE_FORCELZO 4
44#define JFFS2_COMPR_MODE_FORCEZLIB 5
43 45
44#define FAVOUR_LZO_PERCENT 80 46#define FAVOUR_LZO_PERCENT 80
45 47
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 7286e44ac66..4b8afe39a87 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
379 jffs2_do_setattr(inode, &iattr); 379 jffs2_do_setattr(inode, &iattr);
380} 380}
381 381
382int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) 382int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
383{ 383{
384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
385 385
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 0bc6a6c80a5..55a0c1dcead 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -29,6 +29,11 @@
29 29
30struct jffs2_inodirty; 30struct jffs2_inodirty;
31 31
32struct jffs2_mount_opts {
33 bool override_compr;
34 unsigned int compr;
35};
36
32/* A struct for the overall file system control. Pointers to 37/* A struct for the overall file system control. Pointers to
33 jffs2_sb_info structs are named `c' in the source code. 38 jffs2_sb_info structs are named `c' in the source code.
34 Nee jffs_control 39 Nee jffs_control
@@ -126,6 +131,7 @@ struct jffs2_sb_info {
126#endif 131#endif
127 132
128 struct jffs2_summary *summary; /* Summary information */ 133 struct jffs2_summary *summary; /* Summary information */
134 struct jffs2_mount_opts mount_opts;
129 135
130#ifdef CONFIG_JFFS2_FS_XATTR 136#ifdef CONFIG_JFFS2_FS_XATTR
131#define XATTRINDEX_HASHSIZE (57) 137#define XATTRINDEX_HASHSIZE (57)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6c1755c59c0..ab65ee3ec85 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_do_remount_fs(struct super_block *, int *, char *);
180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); 180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
181void jffs2_gc_release_inode(struct jffs2_sb_info *c, 181void jffs2_gc_release_inode(struct jffs2_sb_info *c,
182 struct jffs2_inode_info *f); 182 struct jffs2_inode_info *f);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 8d8cd3419d0..28107ca136e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
275 else 275 else
276 c->mtd->unpoint(c->mtd, 0, c->mtd->size); 276 c->mtd->unpoint(c->mtd, 0, c->mtd->size);
277#endif 277#endif
278 if (s) 278 kfree(s);
279 kfree(s);
280
281 return ret; 279 return ret;
282} 280}
283 281
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 853b8e30008..e7e97445411 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -17,11 +17,13 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/parser.h>
20#include <linux/jffs2.h> 21#include <linux/jffs2.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/mtd/super.h> 23#include <linux/mtd/super.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
24#include <linux/namei.h> 25#include <linux/namei.h>
26#include <linux/seq_file.h>
25#include <linux/exportfs.h> 27#include <linux/exportfs.h>
26#include "compr.h" 28#include "compr.h"
27#include "nodelist.h" 29#include "nodelist.h"
@@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb)
75 unlock_super(sb); 77 unlock_super(sb);
76} 78}
77 79
80static const char *jffs2_compr_name(unsigned int compr)
81{
82 switch (compr) {
83 case JFFS2_COMPR_MODE_NONE:
84 return "none";
85#ifdef CONFIG_JFFS2_LZO
86 case JFFS2_COMPR_MODE_FORCELZO:
87 return "lzo";
88#endif
89#ifdef CONFIG_JFFS2_ZLIB
90 case JFFS2_COMPR_MODE_FORCEZLIB:
91 return "zlib";
92#endif
93 default:
94 /* should never happen; programmer error */
95 WARN_ON(1);
96 return "";
97 }
98}
99
100static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
101{
102 struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
103 struct jffs2_mount_opts *opts = &c->mount_opts;
104
105 if (opts->override_compr)
106 seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr));
107
108 return 0;
109}
110
78static int jffs2_sync_fs(struct super_block *sb, int wait) 111static int jffs2_sync_fs(struct super_block *sb, int wait)
79{ 112{
80 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 113 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
@@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = {
133 .fh_to_parent = jffs2_fh_to_parent, 166 .fh_to_parent = jffs2_fh_to_parent,
134}; 167};
135 168
169/*
170 * JFFS2 mount options.
171 *
172 * Opt_override_compr: override default compressor
173 * Opt_err: just end of array marker
174 */
175enum {
176 Opt_override_compr,
177 Opt_err,
178};
179
180static const match_table_t tokens = {
181 {Opt_override_compr, "compr=%s"},
182 {Opt_err, NULL},
183};
184
185static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
186{
187 substring_t args[MAX_OPT_ARGS];
188 char *p, *name;
189
190 if (!data)
191 return 0;
192
193 while ((p = strsep(&data, ","))) {
194 int token;
195
196 if (!*p)
197 continue;
198
199 token = match_token(p, tokens, args);
200 switch (token) {
201 case Opt_override_compr:
202 name = match_strdup(&args[0]);
203
204 if (!name)
205 return -ENOMEM;
206 if (!strcmp(name, "none"))
207 c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
208#ifdef CONFIG_JFFS2_LZO
209 else if (!strcmp(name, "lzo"))
210 c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
211#endif
212#ifdef CONFIG_JFFS2_ZLIB
213 else if (!strcmp(name, "zlib"))
214 c->mount_opts.compr =
215 JFFS2_COMPR_MODE_FORCEZLIB;
216#endif
217 else {
218 printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"",
219 name);
220 kfree(name);
221 return -EINVAL;
222 }
223 kfree(name);
224 c->mount_opts.override_compr = true;
225 break;
226 default:
227 printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n",
228 p);
229 return -EINVAL;
230 }
231 }
232
233 return 0;
234}
235
236static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
237{
238 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
239 int err;
240
241 err = jffs2_parse_options(c, data);
242 if (err)
243 return -EINVAL;
244
245 return jffs2_do_remount_fs(sb, flags, data);
246}
247
136static const struct super_operations jffs2_super_operations = 248static const struct super_operations jffs2_super_operations =
137{ 249{
138 .alloc_inode = jffs2_alloc_inode, 250 .alloc_inode = jffs2_alloc_inode,
@@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations =
143 .remount_fs = jffs2_remount_fs, 255 .remount_fs = jffs2_remount_fs,
144 .evict_inode = jffs2_evict_inode, 256 .evict_inode = jffs2_evict_inode,
145 .dirty_inode = jffs2_dirty_inode, 257 .dirty_inode = jffs2_dirty_inode,
258 .show_options = jffs2_show_options,
146 .sync_fs = jffs2_sync_fs, 259 .sync_fs = jffs2_sync_fs,
147}; 260};
148 261
@@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
166 c->os_priv = sb; 279 c->os_priv = sb;
167 sb->s_fs_info = c; 280 sb->s_fs_info = c;
168 281
282 ret = jffs2_parse_options(c, data);
283 if (ret) {
284 kfree(c);
285 return -EINVAL;
286 }
287
169 /* Initialize JFFS2 superblock locks, the further initialization will 288 /* Initialize JFFS2 superblock locks, the further initialization will
170 * be done later */ 289 * be done later */
171 mutex_init(&c->alloc_sem); 290 mutex_init(&c->alloc_sem);
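
The new jffs2_parse_options() walks the option string with strsep() and match_token(), so mounting with "-o compr=lzo" sets c->mount_opts.override_compr and the forced compression mode, and jffs2_show_options() echoes it back in /proc/mounts. A self-contained sketch of the same strsep()-based loop without the kernel's match_table machinery; the helper and buffer handling are simplified (strsep() is the glibc/BSD function the kernel code also uses):

#include <stdio.h>
#include <string.h>

static int parse_options(char *data, char *compr_out, size_t len)
{
	char *p;

	while ((p = strsep(&data, ","))) {
		if (!*p)
			continue;
		if (!strncmp(p, "compr=", 6)) {
			snprintf(compr_out, len, "%s", p + 6);
			continue;
		}
		fprintf(stderr, "unrecognized mount option '%s'\n", p);
		return -1;
	}
	return 0;
}

int main(void)
{
	char opts[] = "compr=lzo";	/* as passed by: mount -o compr=lzo */
	char compr[16] = "default";

	if (parse_options(opts, compr, sizeof(compr)))
		return 1;
	printf("compressor override: %s\n", compr);
	return 0;
}
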
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4515bea0268..b09e51d2f81 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
578 if (!jffs2_is_writebuffered(c)) 578 if (!jffs2_is_writebuffered(c))
579 return 0; 579 return 0;
580 580
581 if (mutex_trylock(&c->alloc_sem)) { 581 if (!mutex_is_locked(&c->alloc_sem)) {
582 mutex_unlock(&c->alloc_sem);
583 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); 582 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n");
584 BUG(); 583 BUG();
585 } 584 }
@@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1026 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1025 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1027 struct mtd_oob_ops ops; 1026 struct mtd_oob_ops ops;
1028 1027
1029 ops.mode = MTD_OOB_AUTO; 1028 ops.mode = MTD_OPS_AUTO_OOB;
1030 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; 1029 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
1031 ops.oobbuf = c->oobbuf; 1030 ops.oobbuf = c->oobbuf;
1032 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1031 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1069 struct mtd_oob_ops ops; 1068 struct mtd_oob_ops ops;
1070 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1069 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1071 1070
1072 ops.mode = MTD_OOB_AUTO; 1071 ops.mode = MTD_OPS_AUTO_OOB;
1073 ops.ooblen = cmlen; 1072 ops.ooblen = cmlen;
1074 ops.oobbuf = c->oobbuf; 1073 ops.oobbuf = c->oobbuf;
1075 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1074 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1095 struct mtd_oob_ops ops; 1094 struct mtd_oob_ops ops;
1096 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1095 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1097 1096
1098 ops.mode = MTD_OOB_AUTO; 1097 ops.mode = MTD_OPS_AUTO_OOB;
1099 ops.ooblen = cmlen; 1098 ops.ooblen = cmlen;
1100 ops.oobbuf = (uint8_t *)&oob_cleanmarker; 1099 ops.oobbuf = (uint8_t *)&oob_cleanmarker;
1101 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1100 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 583636f745e..cc5f811ed38 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,6 +67,7 @@
67#include <linux/buffer_head.h> /* for sync_blockdev() */ 67#include <linux/buffer_head.h> /* for sync_blockdev() */
68#include <linux/bio.h> 68#include <linux/bio.h>
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/export.h>
70#include <linux/delay.h> 71#include <linux/delay.h>
71#include <linux/mutex.h> 72#include <linux/mutex.h>
72#include <linux/seq_file.h> 73#include <linux/seq_file.h>
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index f2697e4df10..e795c234ea3 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/module.h>
16#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3f32bcb0d9b..ef175cb8cfd 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -16,38 +16,26 @@
16#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18 18
19static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
20
21static DEFINE_SPINLOCK(bitmap_lock); 19static DEFINE_SPINLOCK(bitmap_lock);
22 20
23static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) 21/*
22 * bitmap consists of blocks filled with 16bit words
23 * bit set == busy, bit clear == free
24 * endianness is a mess, but for counting zero bits it really doesn't matter...
25 */
26static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
24{ 27{
25 unsigned i, j, sum = 0; 28 __u32 sum = 0;
26 struct buffer_head *bh; 29 unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
27
28 for (i=0; i<numblocks-1; i++) {
29 if (!(bh=map[i]))
30 return(0);
31 for (j=0; j<bh->b_size; j++)
32 sum += nibblemap[bh->b_data[j] & 0xf]
33 + nibblemap[(bh->b_data[j]>>4) & 0xf];
34 }
35 30
36 if (numblocks==0 || !(bh=map[numblocks-1])) 31 while (blocks--) {
37 return(0); 32 unsigned words = blocksize / 2;
38 i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; 33 __u16 *p = (__u16 *)(*map++)->b_data;
39 for (j=0; j<i; j++) { 34 while (words--)
40 sum += nibblemap[bh->b_data[j] & 0xf] 35 sum += 16 - hweight16(*p++);
41 + nibblemap[(bh->b_data[j]>>4) & 0xf];
42 } 36 }
43 37
44 i = numbits%16; 38 return sum;
45 if (i!=0) {
46 i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1);
47 sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf];
48 sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf];
49 }
50 return(sum);
51} 39}
52 40
53void minix_free_block(struct inode *inode, unsigned long block) 41void minix_free_block(struct inode *inode, unsigned long block)
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode)
105 return 0; 93 return 0;
106} 94}
107 95
108unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) 96unsigned long minix_count_free_blocks(struct super_block *sb)
109{ 97{
110 return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, 98 struct minix_sb_info *sbi = minix_sb(sb);
111 sbi->s_nzones - sbi->s_firstdatazone + 1) 99 u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
100
101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
112 << sbi->s_log_zone_size); 102 << sbi->s_log_zone_size);
113} 103}
114 104
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
273 return inode; 263 return inode;
274} 264}
275 265
276unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) 266unsigned long minix_count_free_inodes(struct super_block *sb)
277{ 267{
278 return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); 268 struct minix_sb_info *sbi = minix_sb(sb);
269 u32 bits = sbi->s_ninodes + 1;
270
271 return count_free(sbi->s_imap, sb->s_blocksize, bits);
279} 272}
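
The rewritten count_free() above treats each bitmap block as an array of 16-bit words and counts clear bits with hweight16(), with a set bit meaning busy and a clear bit meaning free. A minimal user-space sketch of that counting approach, using __builtin_popcount() in place of hweight16() and a flat buffer in place of buffer_head blocks (names and values here are illustrative, not taken from the kernel):

    #include <stdint.h>
    #include <stdio.h>

    /* bit set == busy, bit clear == free; whole blocks of 16-bit words */
    static uint32_t count_free_bits(const uint16_t *map, unsigned blocksize,
                                    uint32_t numbits)
    {
        uint32_t sum = 0;
        unsigned blocks = (numbits + blocksize * 8 - 1) / (blocksize * 8);

        while (blocks--) {
            unsigned words = blocksize / 2;

            while (words--)
                sum += 16 - __builtin_popcount(*map++);
        }
        return sum;
    }

    int main(void)
    {
        /* one 4-byte "block" == two words == 32 bits */
        uint16_t map[2] = { 0x0003, 0xffff };   /* 2 busy bits, then all busy */

        printf("%u free\n", count_free_bits(map, 4, 32));   /* prints 14 */
        return 0;
    }

Like the rewritten kernel function, the sketch rounds the block count up and counts every bit in those blocks rather than masking off the tail.
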
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 64cdcd662ff..1d9e33966db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
279 else if (sbi->s_mount_state & MINIX_ERROR_FS) 279 else if (sbi->s_mount_state & MINIX_ERROR_FS)
280 printk("MINIX-fs: mounting file system with errors, " 280 printk("MINIX-fs: mounting file system with errors, "
281 "running fsck is recommended\n"); 281 "running fsck is recommended\n");
282
283 /* Apparently minix can create filesystems that allocate more blocks for
284 * the bitmaps than needed. We simply ignore that, but verify it didn't
285 * create one with not enough blocks and bail out if so.
286 */
287 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
288 if (sbi->s_imap_blocks < block) {
289 printk("MINIX-fs: file system does not have enough "
290 "imap blocks allocated. Refusing to mount\n");
291 goto out_iput;
292 }
293
294 block = minix_blocks_needed(
295 (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
296 s->s_blocksize);
297 if (sbi->s_zmap_blocks < block) {
298 printk("MINIX-fs: file system does not have enough "
299 "zmap blocks allocated. Refusing to mount.\n");
300 goto out_iput;
301 }
302
282 return 0; 303 return 0;
283 304
284out_iput: 305out_iput:
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
339 buf->f_type = sb->s_magic; 360 buf->f_type = sb->s_magic;
340 buf->f_bsize = sb->s_blocksize; 361 buf->f_bsize = sb->s_blocksize;
341 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 362 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
342 buf->f_bfree = minix_count_free_blocks(sbi); 363 buf->f_bfree = minix_count_free_blocks(sb);
343 buf->f_bavail = buf->f_bfree; 364 buf->f_bavail = buf->f_bfree;
344 buf->f_files = sbi->s_ninodes; 365 buf->f_files = sbi->s_ninodes;
345 buf->f_ffree = minix_count_free_inodes(sbi); 366 buf->f_ffree = minix_count_free_inodes(sb);
346 buf->f_namelen = sbi->s_namelen; 367 buf->f_namelen = sbi->s_namelen;
347 buf->f_fsid.val[0] = (u32)id; 368 buf->f_fsid.val[0] = (u32)id;
348 buf->f_fsid.val[1] = (u32)(id >> 32); 369 buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 341e2122879..26bbd55e82e 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode *, int, int *); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct super_block *sb);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
53extern void minix_free_block(struct inode *inode, unsigned long block); 53extern void minix_free_block(struct inode *inode, unsigned long block);
54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); 54extern unsigned long minix_count_free_blocks(struct super_block *sb);
55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); 55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); 56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
57 57
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
92{
93 return DIV_ROUND_UP(bits, blocksize * 8);
94}
95
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ 96#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) 97 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93 98
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
125 if (!size) 130 if (!size)
126 return 0; 131 return 0;
127 132
128 size = (size >> 4) + ((size & 15) > 0); 133 size >>= 4;
129 while (*p++ == 0xffff) { 134 while (*p++ == 0xffff) {
130 if (--size == 0) 135 if (--size == 0)
131 return (p - addr) << 4; 136 return (p - addr) << 4;
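
The new mount-time checks in fs/minix/inode.c above use minix_blocks_needed(), which is simply a ceiling division of a bit count by the number of bits per block. A small user-space sketch of that arithmetic and of the "refuse to mount when the on-disk bitmap is too small" check; the superblock numbers below are made up for illustration:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    static unsigned blocks_needed(unsigned bits, unsigned blocksize)
    {
        return DIV_ROUND_UP(bits, blocksize * 8);   /* 8 bits per byte */
    }

    int main(void)
    {
        unsigned blocksize   = 1024;    /* bytes per block */
        unsigned ninodes     = 65535;   /* made-up superblock value */
        unsigned imap_blocks = 7;       /* made-up on-disk bitmap size */
        unsigned need = blocks_needed(ninodes, blocksize);

        if (imap_blocks < need)
            printf("refusing to mount: have %u imap blocks, need %u\n",
                   imap_blocks, need);
        else
            printf("imap ok: have %u, need %u\n", imap_blocks, need);
        return 0;
    }
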
diff --git a/fs/namei.c b/fs/namei.c
index ac6d214da82..5008f01787f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -852,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags)
852 mntput(path->mnt); 852 mntput(path->mnt);
853 if (ret == -EISDIR) 853 if (ret == -EISDIR)
854 ret = 0; 854 ret = 0;
855 return ret; 855 return ret < 0 ? ret : need_mntput;
856} 856}
857 857
858int follow_down_one(struct path *path) 858int follow_down_one(struct path *path)
@@ -900,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
900 break; 900 break;
901 path->mnt = mounted; 901 path->mnt = mounted;
902 path->dentry = mounted->mnt_root; 902 path->dentry = mounted->mnt_root;
903 nd->flags |= LOOKUP_JUMPED;
903 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 904 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
904 /* 905 /*
905 * Update the inode too. We don't need to re-check the 906 * Update the inode too. We don't need to re-check the
@@ -1213,6 +1214,8 @@ retry:
1213 path_put_conditional(path, nd); 1214 path_put_conditional(path, nd);
1214 return err; 1215 return err;
1215 } 1216 }
1217 if (err)
1218 nd->flags |= LOOKUP_JUMPED;
1216 *inode = path->dentry->d_inode; 1219 *inode = path->dentry->d_inode;
1217 return 0; 1220 return 0;
1218} 1221}
@@ -2146,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2146 } 2149 }
2147 2150
2148 /* create side of things */ 2151 /* create side of things */
2152 /*
2153 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
2154 * cleared when we got to the last component we are about to look up
2155 */
2149 error = complete_walk(nd); 2156 error = complete_walk(nd);
2150 if (error) 2157 if (error)
2151 return ERR_PTR(error); 2158 return ERR_PTR(error);
@@ -2214,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2214 if (error < 0) 2221 if (error < 0)
2215 goto exit_dput; 2222 goto exit_dput;
2216 2223
2224 if (error)
2225 nd->flags |= LOOKUP_JUMPED;
2226
2217 error = -ENOENT; 2227 error = -ENOENT;
2218 if (!path->dentry->d_inode) 2228 if (!path->dentry->d_inode)
2219 goto exit_dput; 2229 goto exit_dput;
@@ -2223,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2223 2233
2224 path_to_nameidata(path, nd); 2234 path_to_nameidata(path, nd);
2225 nd->inode = path->dentry->d_inode; 2235 nd->inode = path->dentry->d_inode;
2236 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2237 error = complete_walk(nd);
2238 if (error)
2239 goto exit;
2226 error = -EISDIR; 2240 error = -EISDIR;
2227 if (S_ISDIR(nd->inode->i_mode)) 2241 if (S_ISDIR(nd->inode->i_mode))
2228 goto exit; 2242 goto exit;
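
The follow_managed() change above packs two results into one return value: a negative errno on failure, otherwise zero or a positive flag telling the caller whether it now holds an extra vfsmount reference to drop. A toy user-space sketch of that calling convention (names invented for illustration only):

    #include <errno.h>
    #include <stdio.h>

    /* <0: error, 0: nothing to drop, >0: caller must release one extra ref */
    static int traverse(int fail, int took_ref)
    {
        if (fail)
            return -ENOENT;
        return took_ref;
    }

    int main(void)
    {
        int ret = traverse(0, 1);

        if (ret < 0) {
            printf("error %d\n", ret);
            return 1;
        }
        if (ret > 0)
            printf("dropping the extra reference\n");
        printf("lookup continues\n");
        return 0;
    }

The `ret < 0 ? ret : need_mntput` form keeps the error path and the success flag from shadowing each other.
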
diff --git a/fs/namespace.c b/fs/namespace.c
index e5e1c7d1839..cfc6d4448aa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1048 if (err) 1048 if (err)
1049 goto out; 1049 goto out;
1050 seq_putc(m, ' '); 1050 seq_putc(m, ' ');
1051 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1051
1052 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1052 /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
1053 /* 1053 err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
1054 * Mountpoint is outside root, discard that one. Ugly, 1054 if (err)
1055 * but less so than trying to do that in iterator in a 1055 goto out;
1056 * race-free way (due to renames). 1056
1057 */
1058 return SEQ_SKIP;
1059 }
1060 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); 1057 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
1061 show_mnt_opts(m, mnt); 1058 show_mnt_opts(m, mnt);
1062 1059
@@ -2483,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2483 __mnt_make_longterm(mnt); 2480 __mnt_make_longterm(mnt);
2484 new_ns->root = mnt; 2481 new_ns->root = mnt;
2485 list_add(&new_ns->list, &new_ns->root->mnt_list); 2482 list_add(&new_ns->list, &new_ns->root->mnt_list);
2483 } else {
2484 mntput(mnt);
2486 } 2485 }
2487 return new_ns; 2486 return new_ns;
2488} 2487}
2489EXPORT_SYMBOL(create_mnt_ns); 2488EXPORT_SYMBOL(create_mnt_ns);
2490 2489
2490struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
2491{
2492 struct mnt_namespace *ns;
2493 struct super_block *s;
2494 struct path path;
2495 int err;
2496
2497 ns = create_mnt_ns(mnt);
2498 if (IS_ERR(ns))
2499 return ERR_CAST(ns);
2500
2501 err = vfs_path_lookup(mnt->mnt_root, mnt,
2502 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2503
2504 put_mnt_ns(ns);
2505
2506 if (err)
2507 return ERR_PTR(err);
2508
2509 /* trade a vfsmount reference for active sb one */
2510 s = path.mnt->mnt_sb;
2511 atomic_inc(&s->s_active);
2512 mntput(path.mnt);
2513 /* lock the sucker */
2514 down_write(&s->s_umount);
2515 /* ... and return the root of (sub)tree on it */
2516 return path.dentry;
2517}
2518EXPORT_SYMBOL(mount_subtree);
2519
2491SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2520SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2492 char __user *, type, unsigned long, flags, void __user *, data) 2521 char __user *, type, unsigned long, flags, void __user *, data)
2493{ 2522{
@@ -2744,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt)
2744 } 2773 }
2745} 2774}
2746EXPORT_SYMBOL(kern_unmount); 2775EXPORT_SYMBOL(kern_unmount);
2776
2777bool our_mnt(struct vfsmount *mnt)
2778{
2779 return check_mnt(mnt);
2780}
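
The new mount_subtree() above pins its result by taking an s_active reference on the superblock before dropping the vfsmount reference it was handed, so the returned dentry stays valid after the temporary namespace is torn down. A user-space analog of that take-the-new-reference-before-dropping-the-old ordering, with plain counters standing in for s_active and the mount refcount (purely illustrative, not kernel code):

    #include <stdio.h>

    struct sb  { int s_active; };
    struct mnt { int mnt_count; struct sb *sb; };

    static void grab_sb(struct sb *sb)   { sb->s_active++; }
    static void put_mnt(struct mnt *mnt) { mnt->mnt_count--; }

    /* Pin the superblock by s_active instead of by the mount reference. */
    static struct sb *trade_ref(struct mnt *mnt)
    {
        struct sb *sb = mnt->sb;

        grab_sb(sb);    /* take the longer-lived reference first ... */
        put_mnt(mnt);   /* ... only then drop the one we were given */
        return sb;
    }

    int main(void)
    {
        struct sb  sb  = { .s_active = 1 };
        struct mnt mnt = { .mnt_count = 1, .sb = &sb };
        struct sb *pinned = trade_ref(&mnt);

        printf("s_active=%d mnt_count=%d\n", pinned->s_active, mnt.mnt_count);
        return 0;
    }
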
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 918ad647afe..726e59a9e50 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
488 struct xdr_stream *xdr, 488 struct xdr_stream *xdr,
489 struct cb_recallanyargs *args) 489 struct cb_recallanyargs *args)
490{ 490{
491 __be32 *p; 491 uint32_t bitmap[2];
492 __be32 *p, status;
492 493
493 args->craa_addr = svc_addr(rqstp); 494 args->craa_addr = svc_addr(rqstp);
494 p = read_buf(xdr, 4); 495 p = read_buf(xdr, 4);
495 if (unlikely(p == NULL)) 496 if (unlikely(p == NULL))
496 return htonl(NFS4ERR_BADXDR); 497 return htonl(NFS4ERR_BADXDR);
497 args->craa_objs_to_keep = ntohl(*p++); 498 args->craa_objs_to_keep = ntohl(*p++);
498 p = read_buf(xdr, 4); 499 status = decode_bitmap(xdr, bitmap);
499 if (unlikely(p == NULL)) 500 if (unlikely(status))
500 return htonl(NFS4ERR_BADXDR); 501 return status;
501 args->craa_type_mask = ntohl(*p); 502 args->craa_type_mask = bitmap[0];
502 503
503 return 0; 504 return 0;
504} 505}
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = {
986 .vs_proc = nfs4_callback_procedures1, 987 .vs_proc = nfs4_callback_procedures1,
987 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 988 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
988 .vs_dispatch = NULL, 989 .vs_dispatch = NULL,
990 .vs_hidden = 1,
989}; 991};
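
decode_recallany_args() above now parses the type mask as a proper NFSv4 bitmap (a 32-bit word count followed by that many words) instead of assuming a single word. A rough user-space sketch of that wire shape, keeping the first two words and skipping any extras; this is only an illustration, not the kernel's decode_bitmap():

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>   /* ntohl/htonl */

    /* bitmap4: a word count followed by that many big-endian 32-bit words */
    static int decode_bitmap2(const uint32_t **pp, const uint32_t *end,
                              uint32_t bitmap[2])
    {
        const uint32_t *p = *pp;
        uint32_t count;

        if (p >= end)
            return -1;
        count = ntohl(*p++);
        if ((size_t)(end - p) < count)
            return -1;

        bitmap[0] = count >= 1 ? ntohl(p[0]) : 0;
        bitmap[1] = count >= 2 ? ntohl(p[1]) : 0;
        *pp = p + count;   /* skip any words we do not understand */
        return 0;
    }

    int main(void)
    {
        uint32_t buf[] = { htonl(2), htonl(0x0010), htonl(0x0200) };
        const uint32_t *p = buf;
        uint32_t bitmap[2];

        if (decode_bitmap2(&p, buf + 3, bitmap) == 0)
            printf("mask0=0x%x mask1=0x%x\n", bitmap[0], bitmap[1]);
        return 0;
    }
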
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b238d95ac48..ac289909814 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1468 res = NULL; 1468 res = NULL;
1469 goto out; 1469 goto out;
1470 /* This turned out not to be a regular file */ 1470 /* This turned out not to be a regular file */
1471 case -EISDIR:
1471 case -ENOTDIR: 1472 case -ENOTDIR:
1472 goto no_open; 1473 goto no_open;
1473 case -ELOOP: 1474 case -ELOOP:
1474 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1475 if (!(nd->intent.open.flags & O_NOFOLLOW))
1475 goto no_open; 1476 goto no_open;
1476 /* case -EISDIR: */
1477 /* case -EINVAL: */ 1477 /* case -EINVAL: */
1478 default: 1478 default:
1479 res = ERR_CAST(inode); 1479 res = ERR_CAST(inode);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 91c01f0a4c3..eca56d4b39c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -40,48 +40,8 @@
40 40
41#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
42 42
43static int nfs_file_open(struct inode *, struct file *);
44static int nfs_file_release(struct inode *, struct file *);
45static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
46static int nfs_file_mmap(struct file *, struct vm_area_struct *);
47static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
48 struct pipe_inode_info *pipe,
49 size_t count, unsigned int flags);
50static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
51 unsigned long nr_segs, loff_t pos);
52static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53 struct file *filp, loff_t *ppos,
54 size_t count, unsigned int flags);
55static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
56 unsigned long nr_segs, loff_t pos);
57static int nfs_file_flush(struct file *, fl_owner_t id);
58static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
59static int nfs_check_flags(int flags);
60static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
61static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
62static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
63
64static const struct vm_operations_struct nfs_file_vm_ops; 43static const struct vm_operations_struct nfs_file_vm_ops;
65 44
66const struct file_operations nfs_file_operations = {
67 .llseek = nfs_file_llseek,
68 .read = do_sync_read,
69 .write = do_sync_write,
70 .aio_read = nfs_file_read,
71 .aio_write = nfs_file_write,
72 .mmap = nfs_file_mmap,
73 .open = nfs_file_open,
74 .flush = nfs_file_flush,
75 .release = nfs_file_release,
76 .fsync = nfs_file_fsync,
77 .lock = nfs_lock,
78 .flock = nfs_flock,
79 .splice_read = nfs_file_splice_read,
80 .splice_write = nfs_file_splice_write,
81 .check_flags = nfs_check_flags,
82 .setlease = nfs_setlease,
83};
84
85const struct inode_operations nfs_file_inode_operations = { 45const struct inode_operations nfs_file_inode_operations = {
86 .permission = nfs_permission, 46 .permission = nfs_permission,
87 .getattr = nfs_getattr, 47 .getattr = nfs_getattr,
@@ -137,11 +97,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
137static int 97static int
138nfs_file_release(struct inode *inode, struct file *filp) 98nfs_file_release(struct inode *inode, struct file *filp)
139{ 99{
140 struct dentry *dentry = filp->f_path.dentry;
141
142 dprintk("NFS: release(%s/%s)\n", 100 dprintk("NFS: release(%s/%s)\n",
143 dentry->d_parent->d_name.name, 101 filp->f_path.dentry->d_parent->d_name.name,
144 dentry->d_name.name); 102 filp->f_path.dentry->d_name.name);
145 103
146 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 104 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
147 return nfs_release(inode, filp); 105 return nfs_release(inode, filp);
@@ -228,14 +186,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
228 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 186 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
229 struct inode * inode = dentry->d_inode; 187 struct inode * inode = dentry->d_inode;
230 ssize_t result; 188 ssize_t result;
231 size_t count = iov_length(iov, nr_segs);
232 189
233 if (iocb->ki_filp->f_flags & O_DIRECT) 190 if (iocb->ki_filp->f_flags & O_DIRECT)
234 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 191 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
235 192
236 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 193 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
237 dentry->d_parent->d_name.name, dentry->d_name.name, 194 dentry->d_parent->d_name.name, dentry->d_name.name,
238 (unsigned long) count, (unsigned long) pos); 195 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
239 196
240 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 197 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
241 if (!result) { 198 if (!result) {
@@ -889,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
889 file->f_path.dentry->d_name.name, arg); 846 file->f_path.dentry->d_name.name, arg);
890 return -EINVAL; 847 return -EINVAL;
891} 848}
849
850const struct file_operations nfs_file_operations = {
851 .llseek = nfs_file_llseek,
852 .read = do_sync_read,
853 .write = do_sync_write,
854 .aio_read = nfs_file_read,
855 .aio_write = nfs_file_write,
856 .mmap = nfs_file_mmap,
857 .open = nfs_file_open,
858 .flush = nfs_file_flush,
859 .release = nfs_file_release,
860 .fsync = nfs_file_fsync,
861 .lock = nfs_lock,
862 .flock = nfs_flock,
863 .splice_read = nfs_file_splice_read,
864 .splice_write = nfs_file_splice_write,
865 .check_flags = nfs_check_flags,
866 .setlease = nfs_setlease,
867};
868
869#ifdef CONFIG_NFS_V4
870static int
871nfs4_file_open(struct inode *inode, struct file *filp)
872{
873 /*
874 * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
875 * this point, then something is very wrong
876 */
877 dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
878 return -ENOTDIR;
879}
880
881const struct file_operations nfs4_file_operations = {
882 .llseek = nfs_file_llseek,
883 .read = do_sync_read,
884 .write = do_sync_write,
885 .aio_read = nfs_file_read,
886 .aio_write = nfs_file_write,
887 .mmap = nfs_file_mmap,
888 .open = nfs4_file_open,
889 .flush = nfs_file_flush,
890 .release = nfs_file_release,
891 .fsync = nfs_file_fsync,
892 .lock = nfs_lock,
893 .flock = nfs_flock,
894 .splice_read = nfs_file_splice_read,
895 .splice_write = nfs_file_splice_write,
896 .check_flags = nfs_check_flags,
897 .setlease = nfs_setlease,
898};
899#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c07a55aec83..50a15fa8cf9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 */ 291 */
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; 292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
293 if (S_ISREG(inode->i_mode)) { 293 if (S_ISREG(inode->i_mode)) {
294 inode->i_fop = &nfs_file_operations; 294 inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
295 inode->i_data.a_ops = &nfs_file_aops; 295 inode->i_data.a_ops = &nfs_file_aops;
296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; 296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
297 } else if (S_ISDIR(inode->i_mode)) { 297 } else if (S_ISDIR(inode->i_mode)) {
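
With the hunk above, nfs_fhget() no longer hard-codes nfs_file_operations; the table comes from rpc_ops->file_ops, which is what lets fs/nfs/file.c (earlier in this diff) define a separate nfs4_file_operations with its own open routine. A tiny user-space sketch of selecting an operations table through a per-version ops structure; the types and names are stand-ins, not the kernel's:

    #include <stdio.h>

    struct file_ops    { int (*open)(void); };
    struct version_ops { const struct file_ops *file_ops; };

    static int v3_open(void) { puts("regular v3 open"); return 0; }
    static int v4_open(void) { puts("v4 open: handled earlier in lookup"); return -1; }

    static const struct file_ops v3_fops = { .open = v3_open };
    static const struct file_ops v4_fops = { .open = v4_open };

    static const struct version_ops v3_clientops = { .file_ops = &v3_fops };
    static const struct version_ops v4_clientops = { .file_ops = &v4_fops };

    int main(void)
    {
        int version = 4;   /* decided once, at "mount" time */
        const struct version_ops *rpc_ops =
                (version == 4) ? &v4_clientops : &v3_clientops;

        /* analogous to: inode->i_fop = ...->rpc_ops->file_ops */
        const struct file_ops *i_fop = rpc_ops->file_ops;

        i_fop->open();
        return 0;
    }
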
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c1a1bd8ddf1..3f4d95751d5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
299extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, 299extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
300 struct list_head *head); 300 struct list_head *head);
301 301
302extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
303 struct inode *inode);
302extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); 304extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
303extern void nfs_readdata_release(struct nfs_read_data *rdata); 305extern void nfs_readdata_release(struct nfs_read_data *rdata);
304 306
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 85f1690ca08..d4bc9ed9174 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
853 .dentry_ops = &nfs_dentry_operations, 853 .dentry_ops = &nfs_dentry_operations,
854 .dir_inode_ops = &nfs3_dir_inode_operations, 854 .dir_inode_ops = &nfs3_dir_inode_operations,
855 .file_inode_ops = &nfs3_file_inode_operations, 855 .file_inode_ops = &nfs3_file_inode_operations,
856 .file_ops = &nfs_file_operations,
856 .getroot = nfs3_proc_get_root, 857 .getroot = nfs3_proc_get_root,
857 .getattr = nfs3_proc_getattr, 858 .getattr = nfs3_proc_getattr,
858 .setattr = nfs3_proc_setattr, 859 .setattr = nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 09119418402..a62d36b9a99 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -31,6 +31,7 @@
31 31
32#include <linux/nfs_fs.h> 32#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 33#include <linux/nfs_page.h>
34#include <linux/module.h>
34 35
35#include "internal.h" 36#include "internal.h"
36#include "nfs4filelayout.h" 37#include "nfs4filelayout.h"
@@ -449,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
449 450
450 fl->dsaddr = dsaddr; 451 fl->dsaddr = dsaddr;
451 452
452 if (fl->first_stripe_index < 0 || 453 if (fl->first_stripe_index >= dsaddr->stripe_count) {
453 fl->first_stripe_index >= dsaddr->stripe_count) { 454 dprintk("%s Bad first_stripe_index %u\n",
454 dprintk("%s Bad first_stripe_index %d\n",
455 __func__, fl->first_stripe_index); 455 __func__, fl->first_stripe_index);
456 goto out_put; 456 goto out_put;
457 } 457 }
@@ -552,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
552 552
553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
554 * Further checking is done in filelayout_check_layout */ 554 * Further checking is done in filelayout_check_layout */
555 if (fl->num_fh < 0 || fl->num_fh > 555 if (fl->num_fh >
556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) 556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
557 goto out_err; 557 goto out_err;
558 558
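
Both hunks above drop "< 0" tests on fields that are unsigned (first_stripe_index and num_fh), because such comparisons can never be true and only draw compiler warnings; the dprintk format switches from %d to %u for the same reason. A two-line illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t first_stripe_index = (uint32_t)-5;   /* wraps to a huge value */

        if (first_stripe_index < 0)            /* always false for unsigned */
            puts("never printed");
        printf("%u\n", first_stripe_index);    /* 4294967291 */
        return 0;
    }
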
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2ae413c986..be2bbac1381 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
2464 case -NFS4ERR_BADNAME: 2464 case -NFS4ERR_BADNAME:
2465 return -ENOENT; 2465 return -ENOENT;
2466 case -NFS4ERR_MOVED: 2466 case -NFS4ERR_MOVED:
2467 err = nfs4_get_referral(dir, name, fattr, fhandle); 2467 return nfs4_get_referral(dir, name, fattr, fhandle);
2468 break;
2469 case -NFS4ERR_WRONGSEC: 2468 case -NFS4ERR_WRONGSEC:
2470 nfs_fixup_secinfo_attributes(fattr, fhandle); 2469 nfs_fixup_secinfo_attributes(fattr, fhandle);
2471 } 2470 }
@@ -5950,6 +5949,7 @@ static void nfs4_layoutcommit_release(void *calldata)
5950{ 5949{
5951 struct nfs4_layoutcommit_data *data = calldata; 5950 struct nfs4_layoutcommit_data *data = calldata;
5952 struct pnfs_layout_segment *lseg, *tmp; 5951 struct pnfs_layout_segment *lseg, *tmp;
5952 unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
5953 5953
5954 pnfs_cleanup_layoutcommit(data); 5954 pnfs_cleanup_layoutcommit(data);
5955 /* Matched by references in pnfs_set_layoutcommit */ 5955 /* Matched by references in pnfs_set_layoutcommit */
@@ -5959,6 +5959,11 @@ static void nfs4_layoutcommit_release(void *calldata)
5959 &lseg->pls_flags)) 5959 &lseg->pls_flags))
5960 put_lseg(lseg); 5960 put_lseg(lseg);
5961 } 5961 }
5962
5963 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
5964 smp_mb__after_clear_bit();
5965 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
5966
5962 put_rpccred(data->cred); 5967 put_rpccred(data->cred);
5963 kfree(data); 5968 kfree(data);
5964} 5969}
@@ -6247,6 +6252,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
6247 .dentry_ops = &nfs4_dentry_operations, 6252 .dentry_ops = &nfs4_dentry_operations,
6248 .dir_inode_ops = &nfs4_dir_inode_operations, 6253 .dir_inode_ops = &nfs4_dir_inode_operations,
6249 .file_inode_ops = &nfs4_file_inode_operations, 6254 .file_inode_ops = &nfs4_file_inode_operations,
6255 .file_ops = &nfs4_file_operations,
6250 .getroot = nfs4_proc_get_root, 6256 .getroot = nfs4_proc_get_root,
6251 .getattr = nfs4_proc_getattr, 6257 .getattr = nfs4_proc_getattr,
6252 .setattr = nfs4_proc_setattr, 6258 .setattr = nfs4_proc_setattr,
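
nfs4_layoutcommit_release() above clears NFS_INO_LAYOUTCOMMITTING with clear_bit_unlock(), adds the barrier the waker side needs, and then wakes anything sleeping on that bit. A rough user-space analog of the clear-the-flag-then-wake-the-waiters pattern, using a condition variable in place of the kernel's bit-wait API (purely illustrative, not how the kernel implements it; build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool committing = true;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;

    static void *waiter(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        while (atomic_load(&committing))
            pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
        puts("layoutcommit finished");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);

        /* ... the commit work itself would happen here ... */

        pthread_mutex_lock(&lock);
        atomic_store(&committing, false);   /* clear the flag ... */
        pthread_cond_broadcast(&done);      /* ... then wake the waiters */
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
    }
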
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dce12f41a4..e6161b213ed 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6602 if (status) 6602 if (status)
6603 goto out; 6603 goto out;
6604 status = decode_secinfo(xdr, res); 6604 status = decode_secinfo(xdr, res);
6605 if (status)
6606 goto out;
6607out: 6605out:
6608 return status; 6606 return status;
6609} 6607}
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc..c807ab93140 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
38 */ 38 */
39 39
40#include <linux/module.h> 40#include <linux/module.h>
41#include <scsi/osd_initiator.h> 41#include <scsi/osd_ore.h>
42 42
43#include "objlayout.h" 43#include "objlayout.h"
44 44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD 45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46 46
47#define _LLU(x) ((unsigned long long)x)
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent { 47struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node; 48 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od; 49 struct ore_dev od;
56}; 50};
57 51
58static void 52static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{ 54{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); 55 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62 56
63 dprintk("%s: free od=%p\n", __func__, de->od); 57 dprintk("%s: free od=%p\n", __func__, de->od.od);
64 osduld_put_device(de->od); 58 osduld_put_device(de->od.od);
65 kfree(de); 59 kfree(de);
66} 60}
67 61
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
98 nfss->pnfs_curr_ld, 92 nfss->pnfs_curr_ld,
99 nfss->nfs_client, 93 nfss->nfs_client,
100 d_id); 94 d_id);
101 de->od = od; 95 de->od.od = od;
102 96
103 d = nfs4_insert_deviceid_node(&de->id_node); 97 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node); 98 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) { 99 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od); 100 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
107 objio_free_deviceid_node(&de->id_node); 101 objio_free_deviceid_node(&de->id_node);
108 de = n; 102 de = n;
109 } 103 }
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
111 return de; 105 return de;
112} 106}
113 107
114struct caps_buffers {
115 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
116 u8 creds[OSD_CAP_LEN];
117};
118
119struct objio_segment { 108struct objio_segment {
120 struct pnfs_layout_segment lseg; 109 struct pnfs_layout_segment lseg;
121 110
122 struct pnfs_osd_object_cred *comps; 111 struct ore_layout layout;
123 112 struct ore_components oc;
124 unsigned mirrors_p1;
125 unsigned stripe_unit;
126 unsigned group_width; /* Data stripe_units without integrity comps */
127 u64 group_depth;
128 unsigned group_count;
129
130 unsigned max_io_size;
131
132 unsigned comps_index;
133 unsigned num_comps;
134 /* variable length */
135 struct objio_dev_ent *ods[];
136}; 113};
137 114
138static inline struct objio_segment * 115static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141 return container_of(lseg, struct objio_segment, lseg); 118 return container_of(lseg, struct objio_segment, lseg);
142} 119}
143 120
144struct objio_state;
145typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
146
147struct objio_state { 121struct objio_state {
148 /* Generic layer */ 122 /* Generic layer */
149 struct objlayout_io_state ol_state; 123 struct objlayout_io_res oir;
150 124
151 struct objio_segment *layout; 125 bool sync;
152 126 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
153 struct kref kref; 127 struct ore_io_state *ios;
154 objio_done_fn done;
155 void *private;
156
157 unsigned long length;
158 unsigned numdevs; /* Actually used devs in this IO */
159 /* A per-device variable array of size numdevs */
160 struct _objio_per_comp {
161 struct bio *bio;
162 struct osd_request *or;
163 unsigned long length;
164 u64 offset;
165 unsigned dev;
166 } per_dev[];
167}; 128};
168 129
169/* Send and wait for a get_device_info of devices in the layout, 130/* Send and wait for a get_device_info of devices in the layout,
170 then look them up with the osd_initiator library */ 131 then look them up with the osd_initiator library */
171static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, 132static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
172 struct objio_segment *objio_seg, unsigned comp, 133 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
173 gfp_t gfp_flags) 134 gfp_t gfp_flags)
174{ 135{
175 struct pnfs_osd_deviceaddr *deviceaddr; 136 struct pnfs_osd_deviceaddr *deviceaddr;
176 struct nfs4_deviceid *d_id;
177 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
178 struct osd_dev *od; 138 struct osd_dev *od;
179 struct osd_dev_info odi; 139 struct osd_dev_info odi;
180 int err; 140 int err;
181 141
182 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
183
184 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
185 if (ode) 143 if (ode) {
186 return ode; 144 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
145 return 0;
146 }
187 147
188 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 148 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
189 if (unlikely(err)) { 149 if (unlikely(err)) {
190 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", 150 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
191 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); 151 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
192 return ERR_PTR(err); 152 return err;
193 } 153 }
194 154
195 odi.systemid_len = deviceaddr->oda_systemid.len; 155 odi.systemid_len = deviceaddr->oda_systemid.len;
196 if (odi.systemid_len > sizeof(odi.systemid)) { 156 if (odi.systemid_len > sizeof(odi.systemid)) {
157 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
158 __func__, sizeof(odi.systemid));
197 err = -EINVAL; 159 err = -EINVAL;
198 goto out; 160 goto out;
199 } else if (odi.systemid_len) 161 } else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
218 180
219 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, 181 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
220 gfp_flags); 182 gfp_flags);
221 183 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
184 dprintk("Adding new dev_id(%llx:%llx)\n",
185 _DEVID_LO(d_id), _DEVID_HI(d_id));
222out: 186out:
223 dprintk("%s: return=%d\n", __func__, err);
224 objlayout_put_deviceinfo(deviceaddr); 187 objlayout_put_deviceinfo(deviceaddr);
225 return err ? ERR_PTR(err) : ode; 188 return err;
226} 189}
227 190
228static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 191static void copy_single_comp(struct ore_components *oc, unsigned c,
229 struct objio_segment *objio_seg, 192 struct pnfs_osd_object_cred *src_comp)
230 gfp_t gfp_flags)
231{ 193{
232 unsigned i; 194 struct ore_comp *ocomp = &oc->comps[c];
233 int err;
234 195
235 /* lookup all devices */ 196 WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
236 for (i = 0; i < objio_seg->num_comps; i++) { 197 WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
237 struct objio_dev_ent *ode;
238 198
239 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); 199 ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
240 if (unlikely(IS_ERR(ode))) { 200 ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
241 err = PTR_ERR(ode);
242 goto out;
243 }
244 objio_seg->ods[i] = ode;
245 }
246 err = 0;
247 201
248out: 202 memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
249 dprintk("%s: return=%d\n", __func__, err);
250 return err;
251} 203}
252 204
253static int _verify_data_map(struct pnfs_osd_layout *layout) 205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg)
254{ 207{
255 struct pnfs_osd_data_map *data_map = &layout->olo_map; 208 struct __alloc_objio_segment {
256 u64 stripe_length; 209 struct objio_segment olseg;
257 u32 group_width; 210 struct ore_dev *ods[numdevs];
258 211 struct ore_comp comps[numdevs];
259/* FIXME: Only raid0 for now. if not go through MDS */ 212 } *aolseg;
260 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
261 printk(KERN_ERR "Only RAID_0 for now\n");
262 return -ENOTSUPP;
263 }
264 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
265 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
266 data_map->odm_num_comps, data_map->odm_mirror_cnt);
267 return -EINVAL;
268 }
269 213
270 if (data_map->odm_group_width) 214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
271 group_width = data_map->odm_group_width; 215 if (unlikely(!aolseg)) {
272 else 216 dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
273 group_width = data_map->odm_num_comps / 217 numdevs, sizeof(*aolseg));
274 (data_map->odm_mirror_cnt + 1); 218 return -ENOMEM;
275
276 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
277 if (stripe_length >= (1ULL << 32)) {
278 printk(KERN_ERR "Total Stripe length(0x%llx)"
279 " >= 32bit is not supported\n", _LLU(stripe_length));
280 return -ENOTSUPP;
281 } 219 }
282 220
283 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { 221 aolseg->olseg.oc.numdevs = numdevs;
284 printk(KERN_ERR "Stripe Unit(0x%llx)" 222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
285 " must be Multples of PAGE_SIZE(0x%lx)\n", 223 aolseg->olseg.oc.comps = aolseg->comps;
286 _LLU(data_map->odm_stripe_unit), PAGE_SIZE); 224 aolseg->olseg.oc.ods = aolseg->ods;
287 return -ENOTSUPP;
288 }
289 225
226 *pseg = &aolseg->olseg;
290 return 0; 227 return 0;
291} 228}
292 229
293static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
294 struct pnfs_osd_object_cred *src_comp,
295 struct caps_buffers *caps_p)
296{
297 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
298 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
299
300 *cur_comp = *src_comp;
301
302 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
303 sizeof(caps_p->caps_key));
304 cur_comp->oc_cap_key.cred = caps_p->caps_key;
305
306 memcpy(caps_p->creds, src_comp->oc_cap.cred,
307 sizeof(caps_p->creds));
308 cur_comp->oc_cap.cred = caps_p->creds;
309}
310
311int objio_alloc_lseg(struct pnfs_layout_segment **outp, 230int objio_alloc_lseg(struct pnfs_layout_segment **outp,
312 struct pnfs_layout_hdr *pnfslay, 231 struct pnfs_layout_hdr *pnfslay,
313 struct pnfs_layout_range *range, 232 struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
317 struct objio_segment *objio_seg; 236 struct objio_segment *objio_seg;
318 struct pnfs_osd_xdr_decode_layout_iter iter; 237 struct pnfs_osd_xdr_decode_layout_iter iter;
319 struct pnfs_osd_layout layout; 238 struct pnfs_osd_layout layout;
320 struct pnfs_osd_object_cred *cur_comp, src_comp; 239 struct pnfs_osd_object_cred src_comp;
321 struct caps_buffers *caps_p; 240 unsigned cur_comp;
322 int err; 241 int err;
323 242
324 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); 243 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
325 if (unlikely(err)) 244 if (unlikely(err))
326 return err; 245 return err;
327 246
328 err = _verify_data_map(&layout); 247 err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
329 if (unlikely(err)) 248 if (unlikely(err))
330 return err; 249 return err;
331 250
332 objio_seg = kzalloc(sizeof(*objio_seg) + 251 objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
333 sizeof(objio_seg->ods[0]) * layout.olo_num_comps + 252 objio_seg->layout.group_width = layout.olo_map.odm_group_width;
334 sizeof(*objio_seg->comps) * layout.olo_num_comps + 253 objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
335 sizeof(struct caps_buffers) * layout.olo_num_comps, 254 objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
336 gfp_flags); 255 objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
337 if (!objio_seg)
338 return -ENOMEM;
339 256
340 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); 257 err = ore_verify_layout(layout.olo_map.odm_num_comps,
341 cur_comp = objio_seg->comps; 258 &objio_seg->layout);
342 caps_p = (void *)(cur_comp + layout.olo_num_comps);
343 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
344 copy_single_comp(cur_comp++, &src_comp, caps_p++);
345 if (unlikely(err)) 259 if (unlikely(err))
346 goto err; 260 goto err;
347 261
348 objio_seg->num_comps = layout.olo_num_comps; 262 objio_seg->oc.first_dev = layout.olo_comps_index;
349 objio_seg->comps_index = layout.olo_comps_index; 263 cur_comp = 0;
350 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); 264 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
351 if (err) 265 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
352 goto err; 266 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
353 267 &src_comp.oc_object_id.oid_device_id,
354 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; 268 gfp_flags);
355 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; 269 if (err)
356 if (layout.olo_map.odm_group_width) { 270 goto err;
357 objio_seg->group_width = layout.olo_map.odm_group_width; 271 ++cur_comp;
358 objio_seg->group_depth = layout.olo_map.odm_group_depth;
359 objio_seg->group_count = layout.olo_map.odm_num_comps /
360 objio_seg->mirrors_p1 /
361 objio_seg->group_width;
362 } else {
363 objio_seg->group_width = layout.olo_map.odm_num_comps /
364 objio_seg->mirrors_p1;
365 objio_seg->group_depth = -1;
366 objio_seg->group_count = 1;
367 } 272 }
368 273 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
369 /* Cache this calculation it will hit for every page */ 274 if (unlikely(err))
370 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - 275 goto err;
371 objio_seg->stripe_unit) *
372 objio_seg->group_width;
373 276
374 *outp = &objio_seg->lseg; 277 *outp = &objio_seg->lseg;
375 return 0; 278 return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
386 int i; 289 int i;
387 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 290 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
388 291
389 for (i = 0; i < objio_seg->num_comps; i++) { 292 for (i = 0; i < objio_seg->oc.numdevs; i++) {
390 if (!objio_seg->ods[i]) 293 struct ore_dev *od = objio_seg->oc.ods[i];
294 struct objio_dev_ent *ode;
295
296 if (!od)
391 break; 297 break;
392 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); 298 ode = container_of(od, typeof(*ode), od);
299 nfs4_put_deviceid_node(&ode->id_node);
393 } 300 }
394 kfree(objio_seg); 301 kfree(objio_seg);
395} 302}
396 303
397int objio_alloc_io_state(struct pnfs_layout_segment *lseg, 304static int
398 struct objlayout_io_state **outp, 305objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
399 gfp_t gfp_flags) 306 struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
307 loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
308 struct objio_state **outp)
400{ 309{
401 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 310 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
402 struct objio_state *ios; 311 struct ore_io_state *ios;
403 const unsigned first_size = sizeof(*ios) + 312 int ret;
404 objio_seg->num_comps * sizeof(ios->per_dev[0]); 313 struct __alloc_objio_state {
405 const unsigned sec_size = objio_seg->num_comps * 314 struct objio_state objios;
406 sizeof(ios->ol_state.ioerrs[0]); 315 struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
407 316 } *aos;
408 ios = kzalloc(first_size + sec_size, gfp_flags); 317
409 if (unlikely(!ios)) 318 aos = kzalloc(sizeof(*aos), gfp_flags);
319 if (unlikely(!aos))
410 return -ENOMEM; 320 return -ENOMEM;
411 321
412 ios->layout = objio_seg; 322 objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
413 ios->ol_state.ioerrs = ((void *)ios) + first_size; 323 aos->ioerrs, rpcdata, pnfs_layout_type);
414 ios->ol_state.num_comps = objio_seg->num_comps;
415 324
416 *outp = &ios->ol_state; 325 ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
326 offset, count, &ios);
327 if (unlikely(ret)) {
328 kfree(aos);
329 return ret;
330 }
331
332 ios->pages = pages;
333 ios->pgbase = pgbase;
334 ios->private = aos;
335 BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
336
337 aos->objios.sync = 0;
338 aos->objios.ios = ios;
339 *outp = &aos->objios;
417 return 0; 340 return 0;
418} 341}
419 342
420void objio_free_io_state(struct objlayout_io_state *ol_state) 343void objio_free_result(struct objlayout_io_res *oir)
421{ 344{
422 struct objio_state *ios = container_of(ol_state, struct objio_state, 345 struct objio_state *objios = container_of(oir, struct objio_state, oir);
423 ol_state);
424 346
425 kfree(ios); 347 ore_put_io_state(objios->ios);
348 kfree(objios);
426} 349}
427 350
428enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) 351enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
455 } 378 }
456} 379}
457 380
458static void _clear_bio(struct bio *bio) 381static void __on_dev_error(struct ore_io_state *ios,
382 struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
383 u64 dev_offset, u64 dev_len)
459{ 384{
460 struct bio_vec *bv; 385 struct objio_state *objios = ios->private;
461 unsigned i; 386 struct pnfs_osd_objid pooid;
462 387 struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
463 __bio_for_each_segment(bv, bio, i, 0) { 388 /* FIXME: what to do with more-then-one-group layouts. We need to
464 unsigned this_count = bv->bv_len; 389 * translate from ore_io_state index to oc->comps index
465 390 */
466 if (likely(PAGE_SIZE == this_count)) 391 unsigned comp = dev_index;
467 clear_highpage(bv->bv_page);
468 else
469 zero_user(bv->bv_page, bv->bv_offset, this_count);
470 }
471}
472
473static int _io_check(struct objio_state *ios, bool is_write)
474{
475 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
476 int lin_ret = 0;
477 int i;
478
479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or;
482 int ret;
483
484 if (!or)
485 continue;
486 392
487 ret = osd_req_decode_sense(or, &osi); 393 pooid.oid_device_id = ode->id_node.deviceid;
488 if (likely(!ret)) 394 pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
489 continue; 395 pooid.oid_object_id = ios->oc->comps[comp].obj.id;
490 396
491 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 397 objlayout_io_set_result(&objios->oir, comp,
492 /* start read offset passed endof file */ 398 &pooid, osd_pri_2_pnfs_err(oep),
493 BUG_ON(is_write); 399 dev_offset, dev_len, !ios->reading);
494 _clear_bio(ios->per_dev[i].bio);
495 dprintk("%s: start read offset passed end of file "
496 "offset=0x%llx, length=0x%lx\n", __func__,
497 _LLU(ios->per_dev[i].offset),
498 ios->per_dev[i].length);
499
500 continue; /* we recovered */
501 }
502 objlayout_io_set_result(&ios->ol_state, i,
503 &ios->layout->comps[i].oc_object_id,
504 osd_pri_2_pnfs_err(osi.osd_err_pri),
505 ios->per_dev[i].offset,
506 ios->per_dev[i].length,
507 is_write);
508
509 if (osi.osd_err_pri >= oep) {
510 oep = osi.osd_err_pri;
511 lin_ret = ret;
512 }
513 }
514
515 return lin_ret;
516}
517
518/*
519 * Common IO state helpers.
520 */
521static void _io_free(struct objio_state *ios)
522{
523 unsigned i;
524
525 for (i = 0; i < ios->numdevs; i++) {
526 struct _objio_per_comp *per_dev = &ios->per_dev[i];
527
528 if (per_dev->or) {
529 osd_end_request(per_dev->or);
530 per_dev->or = NULL;
531 }
532
533 if (per_dev->bio) {
534 bio_put(per_dev->bio);
535 per_dev->bio = NULL;
536 }
537 }
538}
539
540struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
541{
542 unsigned min_dev = ios->layout->comps_index;
543 unsigned max_dev = min_dev + ios->layout->num_comps;
544
545 BUG_ON(dev < min_dev || max_dev <= dev);
546 return ios->layout->ods[dev - min_dev]->od;
547}
548
549struct _striping_info {
550 u64 obj_offset;
551 u64 group_length;
552 unsigned dev;
553 unsigned unit_off;
554};
555
556static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
557 struct _striping_info *si)
558{
559 u32 stripe_unit = ios->layout->stripe_unit;
560 u32 group_width = ios->layout->group_width;
561 u64 group_depth = ios->layout->group_depth;
562 u32 U = stripe_unit * group_width;
563
564 u64 T = U * group_depth;
565 u64 S = T * ios->layout->group_count;
566 u64 M = div64_u64(file_offset, S);
567
568 /*
569 G = (L - (M * S)) / T
570 H = (L - (M * S)) % T
571 */
572 u64 LmodU = file_offset - M * S;
573 u32 G = div64_u64(LmodU, T);
574 u64 H = LmodU - G * T;
575
576 u32 N = div_u64(H, U);
577
578 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
579 si->obj_offset = si->unit_off + (N * stripe_unit) +
580 (M * group_depth * stripe_unit);
581
582 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
583 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
584 si->dev *= ios->layout->mirrors_p1;
585
586 si->group_length = T - H;
587}
588
589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
591 gfp_t gfp_flags)
592{
593 unsigned pg = *cur_pg;
594 int cur_len = len;
595 struct request_queue *q =
596 osd_request_queue(_io_od(ios, per_dev->dev));
597
598 if (per_dev->bio == NULL) {
599 unsigned pages_in_stripe = ios->layout->group_width *
600 (ios->layout->stripe_unit / PAGE_SIZE);
601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
602 ios->layout->group_width;
603
604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
605 bio_size = BIO_MAX_PAGES_KMALLOC;
606
607 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
608 if (unlikely(!per_dev->bio)) {
609 dprintk("Faild to allocate BIO size=%u\n", bio_size);
610 return -ENOMEM;
611 }
612 }
613
614 while (cur_len > 0) {
615 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
616 unsigned added_len;
617
618 BUG_ON(ios->ol_state.nr_pages <= pg);
619 cur_len -= pglen;
620
621 added_len = bio_add_pc_page(q, per_dev->bio,
622 ios->ol_state.pages[pg], pglen, pgbase);
623 if (unlikely(pglen != added_len))
624 return -ENOMEM;
625 pgbase = 0;
626 ++pg;
627 }
628 BUG_ON(cur_len);
629
630 per_dev->length += len;
631 *cur_pg = pg;
632 return 0;
633}
634
635static int _prepare_one_group(struct objio_state *ios, u64 length,
636 struct _striping_info *si, unsigned *last_pg,
637 gfp_t gfp_flags)
638{
639 unsigned stripe_unit = ios->layout->stripe_unit;
640 unsigned mirrors_p1 = ios->layout->mirrors_p1;
641 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
642 unsigned dev = si->dev;
643 unsigned first_dev = dev - (dev % devs_in_group);
644 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
645 unsigned cur_pg = *last_pg;
646 int ret = 0;
647
648 while (length) {
649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
650 unsigned cur_len, page_off = 0;
651
652 if (!per_dev->length) {
653 per_dev->dev = dev;
654 if (dev < si->dev) {
655 per_dev->offset = si->obj_offset + stripe_unit -
656 si->unit_off;
657 cur_len = stripe_unit;
658 } else if (dev == si->dev) {
659 per_dev->offset = si->obj_offset;
660 cur_len = stripe_unit - si->unit_off;
661 page_off = si->unit_off & ~PAGE_MASK;
662 BUG_ON(page_off &&
663 (page_off != ios->ol_state.pgbase));
664 } else { /* dev > si->dev */
665 per_dev->offset = si->obj_offset - si->unit_off;
666 cur_len = stripe_unit;
667 }
668
669 if (max_comp < dev - first_dev)
670 max_comp = dev - first_dev;
671 } else {
672 cur_len = stripe_unit;
673 }
674 if (cur_len >= length)
675 cur_len = length;
676
677 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
678 cur_len, gfp_flags);
679 if (unlikely(ret))
680 goto out;
681
682 dev += mirrors_p1;
683 dev = (dev % devs_in_group) + first_dev;
684
685 length -= cur_len;
686 ios->length += cur_len;
687 }
688out:
689 ios->numdevs = max_comp + mirrors_p1;
690 *last_pg = cur_pg;
691 return ret;
692}
693
694static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
695{
696 u64 length = ios->ol_state.count;
697 u64 offset = ios->ol_state.offset;
698 struct _striping_info si;
699 unsigned last_pg = 0;
700 int ret = 0;
701
702 while (length) {
703 _calc_stripe_info(ios, offset, &si);
704
705 if (length < si.group_length)
706 si.group_length = length;
707
708 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
709 if (unlikely(ret))
710 goto out;
711
712 offset += si.group_length;
713 length -= si.group_length;
714 }
715
716out:
717 if (!ios->length)
718 return ret;
719
720 return 0;
721}
722
723static ssize_t _sync_done(struct objio_state *ios)
724{
725 struct completion *waiting = ios->private;
726
727 complete(waiting);
728 return 0;
729}
730
731static void _last_io(struct kref *kref)
732{
733 struct objio_state *ios = container_of(kref, struct objio_state, kref);
734
735 ios->done(ios);
736}
737
738static void _done_io(struct osd_request *or, void *p)
739{
740 struct objio_state *ios = p;
741
742 kref_put(&ios->kref, _last_io);
743}
744
745static ssize_t _io_exec(struct objio_state *ios)
746{
747 DECLARE_COMPLETION_ONSTACK(wait);
748 ssize_t status = 0; /* sync status */
749 unsigned i;
750 objio_done_fn saved_done_fn = ios->done;
751 bool sync = ios->ol_state.sync;
752
753 if (sync) {
754 ios->done = _sync_done;
755 ios->private = &wait;
756 }
757
758 kref_init(&ios->kref);
759
760 for (i = 0; i < ios->numdevs; i++) {
761 struct osd_request *or = ios->per_dev[i].or;
762
763 if (!or)
764 continue;
765
766 kref_get(&ios->kref);
767 osd_execute_request_async(or, _done_io, ios);
768 }
769
770 kref_put(&ios->kref, _last_io);
771
772 if (sync) {
773 wait_for_completion(&wait);
774 status = saved_done_fn(ios);
775 }
776
777 return status;
778} 400}
779 401
780/* 402/*
781 * read 403 * read
782 */ 404 */
783static ssize_t _read_done(struct objio_state *ios) 405static void _read_done(struct ore_io_state *ios, void *private)
784{ 406{
407 struct objio_state *objios = private;
785 ssize_t status; 408 ssize_t status;
786 int ret = _io_check(ios, false); 409 int ret = ore_check_io(ios, &__on_dev_error);
787 410
788 _io_free(ios); 411 /* FIXME: _io_free(ios) can we deallocate the libosd resources? */
789 412
790 if (likely(!ret)) 413 if (likely(!ret))
791 status = ios->length; 414 status = ios->length;
792 else 415 else
793 status = ret; 416 status = ret;
794 417
795 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); 418 objlayout_read_done(&objios->oir, status, objios->sync);
796 return status;
797} 419}
798 420
799static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) 421int objio_read_pagelist(struct nfs_read_data *rdata)
800{ 422{
801 struct osd_request *or = NULL; 423 struct objio_state *objios;
802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
803 unsigned dev = per_dev->dev;
804 struct pnfs_osd_object_cred *cred =
805 &ios->layout->comps[cur_comp];
806 struct osd_obj_id obj = {
807 .partition = cred->oc_object_id.oid_partition_id,
808 .id = cred->oc_object_id.oid_object_id,
809 };
810 int ret; 424 int ret;
811 425
812 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); 426 ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
813 if (unlikely(!or)) { 427 rdata->lseg, rdata->args.pages, rdata->args.pgbase,
814 ret = -ENOMEM; 428 rdata->args.offset, rdata->args.count, rdata,
815 goto err; 429 GFP_KERNEL, &objios);
816 }
817 per_dev->or = or;
818
819 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
820
821 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
822 if (ret) {
823 dprintk("%s: Faild to osd_finalize_request() => %d\n",
824 __func__, ret);
825 goto err;
826 }
827
828 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
829 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
830 per_dev->length);
831
832err:
833 return ret;
834}
835
836static ssize_t _read_exec(struct objio_state *ios)
837{
838 unsigned i;
839 int ret;
840
841 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
842 if (!ios->per_dev[i].length)
843 continue;
844 ret = _read_mirrors(ios, i);
845 if (unlikely(ret))
846 goto err;
847 }
848
849 ios->done = _read_done;
850 return _io_exec(ios); /* In sync mode exec returns the io status */
851
852err:
853 _io_free(ios);
854 return ret;
855}
856
857ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
858{
859 struct objio_state *ios = container_of(ol_state, struct objio_state,
860 ol_state);
861 int ret;
862
863 ret = _io_rw_pagelist(ios, GFP_KERNEL);
864 if (unlikely(ret)) 430 if (unlikely(ret))
865 return ret; 431 return ret;
866 432
867 return _read_exec(ios); 433 objios->ios->done = _read_done;
434 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
435 rdata->args.offset, rdata->args.count);
436 return ore_read(objios->ios);
868} 437}
869 438
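
A note on the new read path above: the per-device OSD request loop is gone and a single ORE submission drives all devices. The ore_io_state carries a completion callback plus the objio_state as its private pointer, and ore_read() does the fan-out. A condensed sketch of that callback pattern, using only names that appear in this hunk (error handling abbreviated, so this is illustrative rather than the exact code):

static void my_read_done(struct ore_io_state *ios, void *private)
{
	struct objio_state *objios = private;
	int ret = ore_check_io(ios, &__on_dev_error);
	ssize_t status = likely(!ret) ? ios->length : ret;

	/* hand the result back to the generic objlayout code */
	objlayout_read_done(&objios->oir, status, objios->sync);
}

static int my_read(struct nfs_read_data *rdata)
{
	struct objio_state *objios;
	int ret;

	/* bind an ORE io state to this layout segment and page list */
	ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
				   rdata->lseg, rdata->args.pages,
				   rdata->args.pgbase, rdata->args.offset,
				   rdata->args.count, rdata, GFP_KERNEL, &objios);
	if (unlikely(ret))
		return ret;

	objios->ios->done = my_read_done;	/* called when all devices finish */
	return ore_read(objios->ios);		/* submits the per-device requests */
}
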
870/* 439/*
871 * write 440 * write
872 */ 441 */
873static ssize_t _write_done(struct objio_state *ios) 442static void _write_done(struct ore_io_state *ios, void *private)
874{ 443{
444 struct objio_state *objios = private;
875 ssize_t status; 445 ssize_t status;
876 int ret = _io_check(ios, true); 446 int ret = ore_check_io(ios, &__on_dev_error);
877 447
878 _io_free(ios); 448 /* FIXME: _io_free(ios): can we deallocate the libosd resources here? */
879 449
880 if (likely(!ret)) { 450 if (likely(!ret)) {
881 /* FIXME: should be based on the OSD's persistence model 451 /* FIXME: should be based on the OSD's persistence model
882 * See OSD2r05 Section 4.13 Data persistence model */ 452 * See OSD2r05 Section 4.13 Data persistence model */
883 ios->ol_state.committed = NFS_FILE_SYNC; 453 objios->oir.committed = NFS_FILE_SYNC;
884 status = ios->length; 454 status = ios->length;
885 } else { 455 } else {
886 status = ret; 456 status = ret;
887 } 457 }
888 458
889 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); 459 objlayout_write_done(&objios->oir, status, objios->sync);
890 return status;
891} 460}
892 461
893static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) 462static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
894{ 463{
895 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; 464 struct objio_state *objios = priv;
896 unsigned dev = ios->per_dev[cur_comp].dev; 465 struct nfs_write_data *wdata = objios->oir.rpcdata;
897 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 466 pgoff_t index = offset / PAGE_SIZE;
898 int ret; 467 struct page *page = find_get_page(wdata->inode->i_mapping, index);
899
900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
901 struct osd_request *or = NULL;
902 struct pnfs_osd_object_cred *cred =
903 &ios->layout->comps[cur_comp];
904 struct osd_obj_id obj = {
905 .partition = cred->oc_object_id.oid_partition_id,
906 .id = cred->oc_object_id.oid_object_id,
907 };
908 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
909 struct bio *bio;
910
911 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
912 if (unlikely(!or)) {
913 ret = -ENOMEM;
914 goto err;
915 }
916 per_dev->or = or;
917
918 if (per_dev != master_dev) {
919 bio = bio_kmalloc(GFP_NOFS,
920 master_dev->bio->bi_max_vecs);
921 if (unlikely(!bio)) {
922 dprintk("Faild to allocate BIO size=%u\n",
923 master_dev->bio->bi_max_vecs);
924 ret = -ENOMEM;
925 goto err;
926 }
927
928 __bio_clone(bio, master_dev->bio);
929 bio->bi_bdev = NULL;
930 bio->bi_next = NULL;
931 per_dev->bio = bio;
932 per_dev->dev = dev;
933 per_dev->length = master_dev->length;
934 per_dev->offset = master_dev->offset;
935 } else {
936 bio = master_dev->bio;
937 bio->bi_rw |= REQ_WRITE;
938 }
939
940 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
941 468
942 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); 469 if (!page) {
943 if (ret) { 470 page = find_or_create_page(wdata->inode->i_mapping,
944 dprintk("%s: Faild to osd_finalize_request() => %d\n", 471 index, GFP_NOFS);
945 __func__, ret); 472 if (unlikely(!page)) {
946 goto err; 473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
474 __func__, index);
475 return NULL;
947 } 476 }
948 477 unlock_page(page);
949 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
950 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
951 per_dev->length);
952 } 478 }
479 if (PageDirty(page) || PageWriteback(page))
480 *uptodate = true;
481 else
482 *uptodate = PageUptodate(page);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
484 return page;
485}
953 486
954err: 487static void __r4w_put_page(void *priv, struct page *page)
955 return ret; 488{
489 dprintk("%s: index=0x%lx\n", __func__, page->index);
490 page_cache_release(page);
491 return;
956} 492}
957 493
958static ssize_t _write_exec(struct objio_state *ios) 494static const struct _ore_r4w_op _r4w_op = {
495 .get_page = &__r4w_get_page,
496 .put_page = &__r4w_put_page,
497};
498
499int objio_write_pagelist(struct nfs_write_data *wdata, int how)
959{ 500{
960 unsigned i; 501 struct objio_state *objios;
961 int ret; 502 int ret;
962 503
963 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 504 ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
964 if (!ios->per_dev[i].length) 505 wdata->lseg, wdata->args.pages, wdata->args.pgbase,
965 continue; 506 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
966 ret = _write_mirrors(ios, i); 507 &objios);
967 if (unlikely(ret)) 508 if (unlikely(ret))
968 goto err; 509 return ret;
969 }
970
971 ios->done = _write_done;
972 return _io_exec(ios); /* In sync mode exec returns the io->status */
973 510
974err: 511 objios->sync = 0 != (how & FLUSH_SYNC);
975 _io_free(ios); 512 objios->ios->r4w = &_r4w_op;
976 return ret;
977}
978 513
979ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) 514 if (!objios->sync)
980{ 515 objios->ios->done = _write_done;
981 struct objio_state *ios = container_of(ol_state, struct objio_state,
982 ol_state);
983 int ret;
984 516
985 /* TODO: ios->stable = stable; */ 517 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
986 ret = _io_rw_pagelist(ios, GFP_NOFS); 518 wdata->args.offset, wdata->args.count);
519 ret = ore_write(objios->ios);
987 if (unlikely(ret)) 520 if (unlikely(ret))
988 return ret; 521 return ret;
989 522
990 return _write_exec(ios); 523 if (objios->sync)
524 _write_done(objios->ios, objios);
525
526 return 0;
991} 527}
992 528
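
Two things in the write side above are worth calling out. First, the read-for-write hooks: ios->r4w points at _r4w_op, and the get_page/put_page pair lets the raid engine borrow page-cache pages it is not itself writing, presumably so it can read the rest of a stripe; that rationale is inferred, not stated in this hunk. Second, completion splits on FLUSH_SYNC: async writes get the _write_done callback, sync writes call it directly after ore_write() returns. A reduced sketch of the page-hook contract, mirroring __r4w_get_page()/__r4w_put_page() above (illustrative only):

static struct page *example_get_page(void *priv, u64 offset, bool *uptodate)
{
	struct objio_state *objios = priv;
	struct nfs_write_data *wdata = objios->oir.rpcdata;
	pgoff_t index = offset / PAGE_SIZE;
	struct page *page = find_get_page(wdata->inode->i_mapping, index);

	if (!page) {
		/* not cached yet: create it, but hand it back unlocked */
		page = find_or_create_page(wdata->inode->i_mapping, index, GFP_NOFS);
		if (unlikely(!page))
			return NULL;
		unlock_page(page);
	}
	/* dirty or in-flight pages already carry the newest data */
	*uptodate = PageDirty(page) || PageWriteback(page) || PageUptodate(page);
	return page;	/* reference dropped in example_put_page() */
}

static void example_put_page(void *priv, struct page *page)
{
	page_cache_release(page);
}
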
993static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, 529static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
997 return false; 533 return false;
998 534
999 return pgio->pg_count + req->wb_bytes <= 535 return pgio->pg_count + req->wb_bytes <=
1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 536 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
1001} 537}
1002 538
1003static const struct nfs_pageio_ops objio_pg_read_ops = { 539static const struct nfs_pageio_ops objio_pg_read_ops = {
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2ade..72074e3a04f 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 157}
158 158
159static struct objlayout_io_state * 159void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, 160 struct page ***p_pages, unsigned *p_pgbase,
161 struct page **pages, 161 u64 offset, unsigned long count)
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{ 162{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset; 163 u64 lseg_end_offset;
171 164
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset); 165 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset, 166 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length); 167 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset); 168 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) { 169 WARN_ON(offset + count > lseg_end_offset);
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185 170
186 if (pgbase > PAGE_SIZE) { 171 if (*p_pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT; 172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
188 pgbase &= ~PAGE_MASK; 173 *p_pages += *p_pgbase >> PAGE_SHIFT;
174 *p_pgbase &= ~PAGE_MASK;
189 } 175 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
204static void
205objlayout_free_io_state(struct objlayout_io_state *state)
206{
207 dprintk("%s: freeing io_state\n", __func__);
208 if (unlikely(!state))
209 return;
210
211 objio_free_io_state(state);
212} 176}
213 177
214/* 178/*
215 * I/O done common code 179 * I/O done common code
216 */ 180 */
217static void 181static void
218objlayout_iodone(struct objlayout_io_state *state) 182objlayout_iodone(struct objlayout_io_res *oir)
219{ 183{
220 dprintk("%s: state %p status\n", __func__, state); 184 if (likely(oir->status >= 0)) {
221 185 objio_free_result(oir);
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else { 186 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); 187 struct objlayout *objlay = oir->objlay;
226 188
227 spin_lock(&objlay->lock); 189 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID; 190 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list); 191 list_add(&objlay->err_list, &oir->err_list);
230 spin_unlock(&objlay->lock); 192 spin_unlock(&objlay->lock);
231 } 193 }
232} 194}
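
_fix_verify_io_params() above normalizes the (pages, pgbase) pair before it reaches the raid engine: any whole pages hidden in pgbase are folded into the page-array pointer, so pgbase always ends up as an offset inside the first page. For example, pgbase = 2*PAGE_SIZE + 100 becomes pages += 2, pgbase = 100. The core of it, condensed:

	if (*p_pgbase > PAGE_SIZE) {
		*p_pages  += *p_pgbase >> PAGE_SHIFT;	/* advance by whole pages */
		*p_pgbase &= ~PAGE_MASK;		/* keep the in-page offset */
	}
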
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
238 * the error for later reporting at layout-return. 200 * the error for later reporting at layout-return.
239 */ 201 */
240void 202void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, 203objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error, 204 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write) 205 u64 offset, u64 length, bool is_write)
244{ 206{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; 207 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
246 208
247 BUG_ON(index >= state->num_comps); 209 BUG_ON(index >= oir->num_comps);
248 if (osd_error) { 210 if (osd_error) {
249 ioerr->oer_component = *pooid; 211 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset; 212 ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
285} 247}
286 248
287void 249void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) 250objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
289{ 251{
290 int eof = state->eof; 252 struct nfs_read_data *rdata = oir->rpcdata;
291 struct nfs_read_data *rdata;
292 253
293 state->status = status; 254 oir->status = rdata->task.tk_status = status;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); 255 if (status >= 0)
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status; 256 rdata->res.count = status;
299 rdata->res.eof = eof; 257 objlayout_iodone(oir);
300 } 258 /* must not use oir after this point */
301 objlayout_iodone(state); 259
302 /* must not use state after this point */ 260 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
261 status, rdata->res.eof, sync);
303 262
304 if (sync) 263 if (sync)
305 pnfs_ld_read_done(rdata); 264 pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
317{ 276{
318 loff_t offset = rdata->args.offset; 277 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count; 278 size_t count = rdata->args.count;
320 struct objlayout_io_state *state; 279 int err;
321 ssize_t status = 0;
322 loff_t eof; 280 loff_t eof;
323 281
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode); 282 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) { 283 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) { 284 if (offset >= eof) {
330 status = 0; 285 err = 0;
331 rdata->res.count = 0; 286 rdata->res.count = 0;
332 rdata->res.eof = 1; 287 rdata->res.eof = 1;
288 /*FIXME: do we need to call pnfs_ld_read_done() */
333 goto out; 289 goto out;
334 } 290 }
335 count = eof - offset; 291 count = eof - offset;
336 } 292 }
337 293
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, 294 rdata->res.eof = (offset + count) >= eof;
339 rdata->args.pages, rdata->args.pgbase, 295 _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
340 offset, count, 296 &rdata->args.pgbase,
341 rdata->lseg, rdata, 297 rdata->args.offset, rdata->args.count);
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347 298
348 state->eof = state->offset + state->count >= eof; 299 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
300 __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
349 301
350 status = objio_read_pagelist(state); 302 err = objio_read_pagelist(rdata);
351 out: 303 out:
352 dprintk("%s: Return status %Zd\n", __func__, status); 304 if (unlikely(err)) {
353 rdata->pnfs_error = status; 305 rdata->pnfs_error = err;
306 dprintk("%s: Returned Error %d\n", __func__, err);
307 return PNFS_NOT_ATTEMPTED;
308 }
354 return PNFS_ATTEMPTED; 309 return PNFS_ATTEMPTED;
355} 310}
356 311
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
371} 326}
372 327
373void 328void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status, 329objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
375 bool sync)
376{ 330{
377 struct nfs_write_data *wdata; 331 struct nfs_write_data *wdata = oir->rpcdata;
378 332
379 dprintk("%s: Begin\n", __func__); 333 oir->status = wdata->task.tk_status = status;
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) { 334 if (status >= 0) {
384 wdata->res.count = status; 335 wdata->res.count = status;
385 wdata->verf.committed = state->committed; 336 wdata->verf.committed = oir->committed;
386 dprintk("%s: Return status %d committed %d\n", 337 }
387 __func__, wdata->task.tk_status, 338 objlayout_iodone(oir);
388 wdata->verf.committed); 339 /* must not use oir after this point */
389 } else 340
390 dprintk("%s: Return status %d\n", 341 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
391 __func__, wdata->task.tk_status); 342 status, wdata->verf.committed, sync);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394 343
395 if (sync) 344 if (sync)
396 pnfs_ld_write_done(wdata); 345 pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata, 356objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how) 357 int how)
409{ 358{
410 struct objlayout_io_state *state; 359 int err;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427 360
428 state->sync = how & FLUSH_SYNC; 361 _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
362 &wdata->args.pgbase,
363 wdata->args.offset, wdata->args.count);
429 364
430 status = objio_write_pagelist(state, how & FLUSH_STABLE); 365 err = objio_write_pagelist(wdata, how);
431 out: 366 if (unlikely(err)) {
432 dprintk("%s: Return status %Zd\n", __func__, status); 367 wdata->pnfs_error = err;
433 wdata->pnfs_error = status; 368 dprintk("%s: Returned Error %d\n", __func__, err);
369 return PNFS_NOT_ATTEMPTED;
370 }
434 return PNFS_ATTEMPTED; 371 return PNFS_ATTEMPTED;
435} 372}
436 373
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
537static void 474static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p) 475encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{ 476{
540 struct objlayout_io_state *state, *tmp; 477 struct objlayout_io_res *oir, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; 478 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542 479
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 480 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
544 unsigned i; 481 unsigned i;
545 482
546 for (i = 0; i < state->num_comps; i++) { 483 for (i = 0; i < oir->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 484 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
548 485
549 if (!ioerr->oer_errno) 486 if (!ioerr->oer_errno)
550 continue; 487 continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
563 500
564 merge_ioerr(&accumulated_err, ioerr); 501 merge_ioerr(&accumulated_err, ioerr);
565 } 502 }
566 list_del(&state->err_list); 503 list_del(&oir->err_list);
567 objlayout_free_io_state(state); 504 objio_free_result(oir);
568 } 505 }
569 506
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); 507 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
576 const struct nfs4_layoutreturn_args *args) 513 const struct nfs4_layoutreturn_args *args)
577{ 514{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay); 515 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp; 516 struct objlayout_io_res *oir, *tmp;
580 __be32 *start; 517 __be32 *start;
581 518
582 dprintk("%s: Begin\n", __func__); 519 dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
585 522
586 spin_lock(&objlay->lock); 523 spin_lock(&objlay->lock);
587 524
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 525 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p; 526 __be32 *last_xdr = NULL, *p;
590 unsigned i; 527 unsigned i;
591 int res = 0; 528 int res = 0;
592 529
593 for (i = 0; i < state->num_comps; i++) { 530 for (i = 0; i < oir->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 531 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
595 532
596 if (!ioerr->oer_errno) 533 if (!ioerr->oer_errno)
597 continue; 534 continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
615 } 552 }
616 553
617 last_xdr = p; 554 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); 555 pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
619 } 556 }
620 557
621 /* TODO: use xdr_write_pages */ 558 /* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
631 encode_accumulated_error(objlay, last_xdr); 568 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done; 569 goto loop_done;
633 } 570 }
634 list_del(&state->err_list); 571 list_del(&oir->err_list);
635 objlayout_free_io_state(state); 572 objio_free_result(oir);
636 } 573 }
637loop_done: 574loop_done:
638 spin_unlock(&objlay->lock); 575 spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042..8ec34727ed2 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
74 * per-I/O operation state 74 * per-I/O operation state
75 * embedded in objects provider io_state data structure 75 * embedded in objects provider io_state data structure
76 */ 76 */
77struct objlayout_io_state { 77struct objlayout_io_res {
78 struct pnfs_layout_segment *lseg; 78 struct objlayout *objlay;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86 79
87 void *rpcdata; 80 void *rpcdata;
88 int status; /* res */ 81 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */ 82 int committed; /* res */
91 83
92 /* Error reporting (layout_return) */ 84 /* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
100 struct pnfs_osd_ioerr *ioerrs; 92 struct pnfs_osd_ioerr *ioerrs;
101}; 93};
102 94
95static inline
96void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
97 struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
98 struct pnfs_layout_hdr *pnfs_layout_type)
99{
100 oir->objlay = OBJLAYOUT(pnfs_layout_type);
101 oir->rpcdata = rpcdata;
102 INIT_LIST_HEAD(&oir->err_list);
103 oir->num_comps = num_comps;
104 oir->ioerrs = ioerrs;
105}
106
103/* 107/*
104 * Raid engine I/O API 108 * Raid engine I/O API
105 */ 109 */
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
110 gfp_t gfp_flags); 114 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg); 115extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112 116
113extern int objio_alloc_io_state( 117/* objio_free_result will free these @oir structs received from
114 struct pnfs_layout_segment *lseg, 118 * objlayout_{read,write}_done
115 struct objlayout_io_state **outp, 119 */
116 gfp_t gfp_flags); 120extern void objio_free_result(struct objlayout_io_res *oir);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118 121
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); 122extern int objio_read_pagelist(struct nfs_read_data *rdata);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, 123extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
121 bool stable);
122 124
123/* 125/*
124 * callback API 126 * callback API
125 */ 127 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state, 128extern void objlayout_io_set_result(struct objlayout_io_res *oir,
127 unsigned index, struct pnfs_osd_objid *pooid, 129 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write); 130 int osd_error, u64 offset, u64 length, bool is_write);
129 131
130static inline void 132static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) 133objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
132{ 134{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was 135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate 136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported. 137 * the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
144 spin_unlock(&objlay->lock); 144 spin_unlock(&objlay->lock);
145} 145}
146 146
147extern void objlayout_read_done(struct objlayout_io_state *state, 147extern void objlayout_read_done(struct objlayout_io_res *oir,
148 ssize_t status, bool sync); 148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, 152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b60970cc7f1..5668f7c54c4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -18,6 +18,7 @@
18#include <linux/nfs_page.h> 18#include <linux/nfs_page.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/export.h>
21 22
22#include "internal.h" 23#include "internal.h"
23#include "pnfs.h" 24#include "pnfs.h"
@@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p)
41 42
42/** 43/**
43 * nfs_create_request - Create an NFS read/write request. 44 * nfs_create_request - Create an NFS read/write request.
44 * @file: file descriptor to use 45 * @ctx: open context to use
45 * @inode: inode to which the request is attached 46 * @inode: inode to which the request is attached
46 * @page: page to write 47 * @page: page to write
47 * @offset: starting offset within the page for the write 48 * @offset: starting offset within the page for the write
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ee73d9a4f70..8e672a2b2d6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h> 31#include <linux/nfs_page.h>
32#include <linux/module.h>
32#include "internal.h" 33#include "internal.h"
33#include "pnfs.h" 34#include "pnfs.h"
34#include "iostat.h" 35#include "iostat.h"
@@ -1259,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1259} 1260}
1260EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1261EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1261 1262
1263static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
1264{
1265 struct nfs_pageio_descriptor pgio;
1266
1267 put_lseg(data->lseg);
1268 data->lseg = NULL;
1269 dprintk("pnfs write error = %d\n", data->pnfs_error);
1270
1271 nfs_pageio_init_read_mds(&pgio, data->inode);
1272
1273 while (!list_empty(&data->pages)) {
1274 struct nfs_page *req = nfs_list_entry(data->pages.next);
1275
1276 nfs_list_remove_request(req);
1277 nfs_pageio_add_request(&pgio, req);
1278 }
1279 nfs_pageio_complete(&pgio);
1280}
1281
1262/* 1282/*
1263 * Called by non rpc-based layout drivers 1283 * Called by non rpc-based layout drivers
1264 */ 1284 */
@@ -1267,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
1267 if (likely(!data->pnfs_error)) { 1287 if (likely(!data->pnfs_error)) {
1268 __nfs4_read_done_cb(data); 1288 __nfs4_read_done_cb(data);
1269 data->mds_ops->rpc_call_done(&data->task, data); 1289 data->mds_ops->rpc_call_done(&data->task, data);
1270 } else { 1290 } else
1271 put_lseg(data->lseg); 1291 pnfs_ld_handle_read_error(data);
1272 data->lseg = NULL;
1273 dprintk("pnfs write error = %d\n", data->pnfs_error);
1274 }
1275 data->mds_ops->rpc_release(data); 1292 data->mds_ops->rpc_release(data);
1276} 1293}
1277EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1294EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
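
pnfs_ld_handle_read_error() above changes the failure policy: instead of merely dropping the layout segment and recording pnfs_error, the pages of the failed request are re-queued through the regular MDS read path (this pairs with the fs/nfs/read.c hunk further down, which removes the old fallback from nfs_readpage_release_full). Condensed, the resend looks like this (illustrative, error paths omitted):

static void resend_reads_through_mds(struct nfs_read_data *data)
{
	struct nfs_pageio_descriptor pgio;

	put_lseg(data->lseg);		/* give up on the failed layout segment */
	data->lseg = NULL;

	nfs_pageio_init_read_mds(&pgio, data->inode);
	while (!list_empty(&data->pages)) {
		struct nfs_page *req = nfs_list_entry(data->pages.next);

		nfs_list_remove_request(req);
		nfs_pageio_add_request(&pgio, req);	/* re-coalesce for the MDS */
	}
	nfs_pageio_complete(&pgio);			/* issue the MDS RPCs */
}
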
@@ -1443,17 +1460,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1443 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1460 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1444 data = kzalloc(sizeof(*data), GFP_NOFS); 1461 data = kzalloc(sizeof(*data), GFP_NOFS);
1445 if (!data) { 1462 if (!data) {
1446 mark_inode_dirty_sync(inode);
1447 status = -ENOMEM; 1463 status = -ENOMEM;
1448 goto out; 1464 goto out;
1449 } 1465 }
1450 1466
1467 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1468 goto out_free;
1469
1470 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1471 if (!sync) {
1472 status = -EAGAIN;
1473 goto out_free;
1474 }
1475 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1476 nfs_wait_bit_killable, TASK_KILLABLE);
1477 if (status)
1478 goto out_free;
1479 }
1480
1451 INIT_LIST_HEAD(&data->lseg_list); 1481 INIT_LIST_HEAD(&data->lseg_list);
1452 spin_lock(&inode->i_lock); 1482 spin_lock(&inode->i_lock);
1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1483 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1484 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1454 spin_unlock(&inode->i_lock); 1485 spin_unlock(&inode->i_lock);
1455 kfree(data); 1486 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1456 goto out; 1487 goto out_free;
1457 } 1488 }
1458 1489
1459 pnfs_list_write_lseg(inode, &data->lseg_list); 1490 pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1475,6 +1506,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1475 1506
1476 status = nfs4_proc_layoutcommit(data, sync); 1507 status = nfs4_proc_layoutcommit(data, sync);
1477out: 1508out:
1509 if (status)
1510 mark_inode_dirty_sync(inode);
1478 dprintk("<-- %s status %d\n", __func__, status); 1511 dprintk("<-- %s status %d\n", __func__, status);
1479 return status; 1512 return status;
1513out_free:
1514 kfree(data);
1515 goto out;
1480} 1516}
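
The pnfs_layoutcommit_inode() changes above serialize concurrent layoutcommits on a per-inode flag bit: the first caller sets NFS_INO_LAYOUTCOMMITTING, a later async caller backs off with -EAGAIN, a sync caller sleeps on the bit, and the bit is cleared and waiters woken when there turns out to be nothing to commit (clearing it after a successful commit presumably happens outside this hunk). A reduced sketch of the bit-lock pattern, with made-up helper names:

static int begin_layoutcommit(struct nfs_inode *nfsi, bool sync)
{
	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags))
		return 0;			/* we own the commit now */
	if (!sync)
		return -EAGAIN;			/* someone else is already committing */
	/* sync callers wait for the current owner, then take the bit */
	return wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
				nfs_wait_bit_killable, TASK_KILLABLE);
}

static void end_layoutcommit(struct nfs_inode *nfsi)
{
	clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
	wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
}
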
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6fda5228ef5..4f359d2a26e 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -28,6 +28,7 @@
28 * such damages. 28 * such damages.
29 */ 29 */
30 30
31#include <linux/export.h>
31#include "pnfs.h" 32#include "pnfs.h"
32 33
33#define NFSDBG_FACILITY NFSDBG_PNFS 34#define NFSDBG_FACILITY NFSDBG_PNFS
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b8535d7..f48125da198 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
710 .dentry_ops = &nfs_dentry_operations, 710 .dentry_ops = &nfs_dentry_operations,
711 .dir_inode_ops = &nfs_dir_inode_operations, 711 .dir_inode_ops = &nfs_dir_inode_operations,
712 .file_inode_ops = &nfs_file_inode_operations, 712 .file_inode_ops = &nfs_file_inode_operations,
713 .file_ops = &nfs_file_operations,
713 .getroot = nfs_proc_get_root, 714 .getroot = nfs_proc_get_root,
714 .getattr = nfs_proc_getattr, 715 .getattr = nfs_proc_getattr,
715 .setattr = nfs_proc_setattr, 716 .setattr = nfs_proc_setattr,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8b48ec63f72..cfa175c223d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
109 } 109 }
110} 110}
111 111
112static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, 112void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
113 struct inode *inode) 113 struct inode *inode)
114{ 114{
115 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, 115 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
@@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
534static void nfs_readpage_release_full(void *calldata) 534static void nfs_readpage_release_full(void *calldata)
535{ 535{
536 struct nfs_read_data *data = calldata; 536 struct nfs_read_data *data = calldata;
537 struct nfs_pageio_descriptor pgio;
538 537
539 if (data->pnfs_error) {
540 nfs_pageio_init_read_mds(&pgio, data->inode);
541 pgio.pg_recoalesce = 1;
542 }
543 while (!list_empty(&data->pages)) { 538 while (!list_empty(&data->pages)) {
544 struct nfs_page *req = nfs_list_entry(data->pages.next); 539 struct nfs_page *req = nfs_list_entry(data->pages.next);
545 540
546 nfs_list_remove_request(req); 541 nfs_list_remove_request(req);
547 if (!data->pnfs_error) 542 nfs_readpage_release(req);
548 nfs_readpage_release(req);
549 else
550 nfs_pageio_add_request(&pgio, req);
551 } 543 }
552 if (data->pnfs_error)
553 nfs_pageio_complete(&pgio);
554 nfs_readdata_release(calldata); 544 nfs_readdata_release(calldata);
555} 545}
556 546
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 480b3b6bf71..134777406ee 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void)
2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, 2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2788 const char *export_path) 2788 const char *export_path)
2789{ 2789{
2790 struct mnt_namespace *ns_private;
2791 struct super_block *s;
2792 struct dentry *dentry; 2790 struct dentry *dentry;
2793 struct path path; 2791 int ret = nfs_referral_loop_protect();
2794 int ret;
2795
2796 ns_private = create_mnt_ns(root_mnt);
2797 ret = PTR_ERR(ns_private);
2798 if (IS_ERR(ns_private))
2799 goto out_mntput;
2800
2801 ret = nfs_referral_loop_protect();
2802 if (ret != 0)
2803 goto out_put_mnt_ns;
2804 2792
2805 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2793 if (ret) {
2806 export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2794 mntput(root_mnt);
2795 return ERR_PTR(ret);
2796 }
2807 2797
2798 dentry = mount_subtree(root_mnt, export_path);
2808 nfs_referral_loop_unprotect(); 2799 nfs_referral_loop_unprotect();
2809 put_mnt_ns(ns_private);
2810
2811 if (ret != 0)
2812 goto out_err;
2813
2814 s = path.mnt->mnt_sb;
2815 atomic_inc(&s->s_active);
2816 dentry = dget(path.dentry);
2817 2800
2818 path_put(&path);
2819 down_write(&s->s_umount);
2820 return dentry; 2801 return dentry;
2821out_put_mnt_ns:
2822 put_mnt_ns(ns_private);
2823out_mntput:
2824 mntput(root_mnt);
2825out_err:
2826 return ERR_PTR(ret);
2827} 2802}
2828 2803
2829static struct dentry *nfs4_try_mount(int flags, const char *dev_name, 2804static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
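
nfs_follow_remote_path() collapses onto mount_subtree(): the private mount namespace, the vfs_path_lookup() and the manual s_active/s_umount juggling all move into that helper, which walks export_path inside root_mnt and returns the target dentry with its superblock pinned. Judging by the error path above, mount_subtree() also takes over the root_mnt reference, which is why mntput() is only called when the referral protection fails before the helper runs. The resulting shape of the caller, condensed:

static struct dentry *follow_export(struct vfsmount *root_mnt,
				    const char *export_path)
{
	struct dentry *dentry;
	int ret = nfs_referral_loop_protect();

	if (ret) {
		mntput(root_mnt);	/* mount_subtree() never got to consume it */
		return ERR_PTR(ret);
	}
	dentry = mount_subtree(root_mnt, export_path);
	nfs_referral_loop_unprotect();
	return dentry;			/* dentry or ERR_PTR from mount_subtree() */
}
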
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2219c88d96b..1dda78db6a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/nfs_page.h> 21#include <linux/nfs_page.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/export.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25 26
@@ -1243,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1243{ 1244{
1244 struct nfs_writeargs *argp = &data->args; 1245 struct nfs_writeargs *argp = &data->args;
1245 struct nfs_writeres *resp = &data->res; 1246 struct nfs_writeres *resp = &data->res;
1246 struct nfs_server *server = NFS_SERVER(data->inode);
1247 int status; 1247 int status;
1248 1248
1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1277,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1277 if (time_before(complain, jiffies)) { 1277 if (time_before(complain, jiffies)) {
1278 dprintk("NFS: faulty NFS server %s:" 1278 dprintk("NFS: faulty NFS server %s:"
1279 " (committed = %d) != (stable = %d)\n", 1279 " (committed = %d) != (stable = %d)\n",
1280 server->nfs_client->cl_hostname, 1280 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1281 resp->verf->committed, argp->stable); 1281 resp->verf->committed, argp->stable);
1282 complain = jiffies + 300 * HZ; 1282 complain = jiffies + 300 * HZ;
1283 } 1283 }
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index ad88f1c0a4c..9c51aff02ae 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,6 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/export.h>
39#include "acl.h" 40#include "acl.h"
40 41
41 42
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index db34a585e11..c45a2ea4a09 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -13,6 +13,7 @@
13#include <linux/sunrpc/clnt.h> 13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/gss_api.h> 14#include <linux/sunrpc/gss_api.h>
15#include <linux/sunrpc/gss_krb5_enctypes.h> 15#include <linux/sunrpc/gss_krb5_enctypes.h>
16#include <linux/module.h>
16 17
17#include "idmap.h" 18#include "idmap.h"
18#include "nfsd.h" 19#include "nfsd.h"
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dc5a1bf476b..eda7d7e55e0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/freezer.h> 10#include <linux/freezer.h>
11#include <linux/module.h>
11#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
@@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
256 nfsd_serv = NULL; 257 nfsd_serv = NULL;
257 nfsd_shutdown(); 258 nfsd_shutdown();
258 259
260 svc_rpcb_cleanup(serv);
261
259 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 262 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
260 "cache\n"); 263 "cache\n");
261 nfsd_export_flush(); 264 nfsd_export_flush();
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ed553c60de8..3165aebb43c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 OCFS2_JOURNAL_ACCESS_WRITE); 5699 OCFS2_JOURNAL_ACCESS_WRITE);
5700 if (ret) { 5700 if (ret) {
5701 mlog_errno(ret); 5701 mlog_errno(ret);
5702 goto out; 5702 goto out_commit;
5703 } 5703 }
5704 5704
5705 dquot_free_space_nodirty(inode, 5705 dquot_free_space_nodirty(inode,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c1efe939c77..78b68af3b0e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
290 } 290 }
291 291
292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
293 /*
294 * Unlock the page and cycle ip_alloc_sem so that we don't
295 * busyloop waiting for ip_alloc_sem to unlock
296 */
293 ret = AOP_TRUNCATED_PAGE; 297 ret = AOP_TRUNCATED_PAGE;
298 unlock_page(page);
299 unlock = 0;
300 down_read(&oi->ip_alloc_sem);
301 up_read(&oi->ip_alloc_sem);
294 goto out_inode_unlock; 302 goto out_inode_unlock;
295 } 303 }
296 304
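
The ocfs2_readpage() hunk is a small anti-busy-loop trick: when down_read_trylock(&oi->ip_alloc_sem) fails, returning AOP_TRUNCATED_PAGE makes the VM retry ->readpage, and against a long-held write lock that retry can spin. Taking and immediately releasing the semaphore first puts the reader to sleep until the writer is done, so the retry is likely to succeed. A stripped-down sketch of the pattern (names hypothetical, the real function does more):

static int my_readpage(struct file *file, struct page *page)
{
	struct my_inode_info *oi = MY_I(page->mapping->host);	/* hypothetical */

	if (!down_read_trylock(&oi->ip_alloc_sem)) {
		unlock_page(page);
		/* sleep until the current writer drops the lock ... */
		down_read(&oi->ip_alloc_sem);
		up_read(&oi->ip_alloc_sem);
		/* ... then ask the VM to call us again */
		return AOP_TRUNCATED_PAGE;
	}

	/* ... normal read path under ip_alloc_sem (reads the page and
	 * unlocks it on completion) ... */

	up_read(&oi->ip_alloc_sem);
	return 0;
}
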
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
563{ 571{
564 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 572 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
565 int level; 573 int level;
574 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
566 575
567 /* this io's submitter should not have unlocked this before we could */ 576 /* this io's submitter should not have unlocked this before we could */
568 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 577 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
570 if (ocfs2_iocb_is_sem_locked(iocb)) 579 if (ocfs2_iocb_is_sem_locked(iocb))
571 ocfs2_iocb_clear_sem_locked(iocb); 580 ocfs2_iocb_clear_sem_locked(iocb);
572 581
582 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
583 ocfs2_iocb_clear_unaligned_aio(iocb);
584
585 if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
586 waitqueue_active(wq)) {
587 wake_up_all(wq);
588 }
589 }
590
573 ocfs2_iocb_clear_rw_locked(iocb); 591 ocfs2_iocb_clear_rw_locked(iocb);
574 592
575 level = ocfs2_iocb_rw_locked_level(iocb); 593 level = ocfs2_iocb_rw_locked_level(iocb);
@@ -863,6 +881,12 @@ struct ocfs2_write_ctxt {
863 struct page *w_target_page; 881 struct page *w_target_page;
864 882
865 /* 883 /*
 884 * w_target_locked is used in the page_mkwrite path to indicate that
 885 * w_target_page is not to be unlocked in ocfs2_write_end_nolock.
886 */
887 unsigned int w_target_locked:1;
888
889 /*
866 * ocfs2_write_end() uses this to know what the real range to 890 * ocfs2_write_end() uses this to know what the real range to
867 * write in the target should be. 891 * write in the target should be.
868 */ 892 */
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
895 919
896static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 920static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
897{ 921{
922 int i;
923
924 /*
925 * w_target_locked is only set to true in the page_mkwrite() case.
926 * The intent is to allow us to lock the target page from write_begin()
927 * to write_end(). The caller must hold a ref on w_target_page.
928 */
929 if (wc->w_target_locked) {
930 BUG_ON(!wc->w_target_page);
931 for (i = 0; i < wc->w_num_pages; i++) {
932 if (wc->w_target_page == wc->w_pages[i]) {
933 wc->w_pages[i] = NULL;
934 break;
935 }
936 }
937 mark_page_accessed(wc->w_target_page);
938 page_cache_release(wc->w_target_page);
939 }
898 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 940 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
899 941
900 brelse(wc->w_di_bh); 942 brelse(wc->w_di_bh);
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1132 */ 1174 */
1133 lock_page(mmap_page); 1175 lock_page(mmap_page);
1134 1176
1177 /* Exit and let the caller retry */
1135 if (mmap_page->mapping != mapping) { 1178 if (mmap_page->mapping != mapping) {
1179 WARN_ON(mmap_page->mapping);
1136 unlock_page(mmap_page); 1180 unlock_page(mmap_page);
1137 /* 1181 ret = -EAGAIN;
1138 * Sanity check - the locking in
1139 * ocfs2_pagemkwrite() should ensure
1140 * that this code doesn't trigger.
1141 */
1142 ret = -EINVAL;
1143 mlog_errno(ret);
1144 goto out; 1182 goto out;
1145 } 1183 }
1146 1184
1147 page_cache_get(mmap_page); 1185 page_cache_get(mmap_page);
1148 wc->w_pages[i] = mmap_page; 1186 wc->w_pages[i] = mmap_page;
1187 wc->w_target_locked = true;
1149 } else { 1188 } else {
1150 wc->w_pages[i] = find_or_create_page(mapping, index, 1189 wc->w_pages[i] = find_or_create_page(mapping, index,
1151 GFP_NOFS); 1190 GFP_NOFS);
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1160 wc->w_target_page = wc->w_pages[i]; 1199 wc->w_target_page = wc->w_pages[i];
1161 } 1200 }
1162out: 1201out:
1202 if (ret)
1203 wc->w_target_locked = false;
1163 return ret; 1204 return ret;
1164} 1205}
1165 1206
@@ -1817,11 +1858,23 @@ try_again:
1817 */ 1858 */
1818 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, 1859 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1819 cluster_of_pages, mmap_page); 1860 cluster_of_pages, mmap_page);
1820 if (ret) { 1861 if (ret && ret != -EAGAIN) {
1821 mlog_errno(ret); 1862 mlog_errno(ret);
1822 goto out_quota; 1863 goto out_quota;
1823 } 1864 }
1824 1865
1866 /*
1867 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
1868 * the target page. In this case, we exit with no error and no target
1869 * page. This will trigger the caller, page_mkwrite(), to re-try
1870 * the operation.
1871 */
1872 if (ret == -EAGAIN) {
1873 BUG_ON(wc->w_target_page);
1874 ret = 0;
1875 goto out_quota;
1876 }
1877
1825 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, 1878 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1826 len); 1879 len);
1827 if (ret) { 1880 if (ret) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad987a..ffb2da370a9 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0, 78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL, 79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM, 80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_UNALIGNED_IO,
81 OCFS2_IOCB_NUM_LOCKS 82 OCFS2_IOCB_NUM_LOCKS
82}; 83};
83 84
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 92 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \ 93#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 94 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
95
96#define ocfs2_iocb_set_unaligned_aio(iocb) \
97 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
98#define ocfs2_iocb_clear_unaligned_aio(iocb) \
99 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
100#define ocfs2_iocb_is_unaligned_aio(iocb) \
101 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
102
103#define OCFS2_IOEND_WQ_HASH_SZ 37
104#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
105 OCFS2_IOEND_WQ_HASH_SZ])
106extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
107
94#endif /* OCFS2_FILE_H */ 108#endif /* OCFS2_FILE_H */
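
The new OCFS2_IOCB_UNALIGNED_IO bit and the ocfs2_ioend_wq() macro support the unaligned-AIO accounting used in ocfs2_dio_end_io() above: the inode pointer is hashed onto one of 37 shared wait queues, a per-inode counter (ip_unaligned_aio) tracks in-flight unaligned requests, and the last completion wakes any waiter. The completion side is in the aops.c hunk; a waiter would look roughly like this (the waiter is an assumption, it is not part of this diff):

	/* completion side, as in ocfs2_dio_end_io() */
	if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
	    waitqueue_active(ocfs2_ioend_wq(inode)))
		wake_up_all(ocfs2_ioend_wq(inode));

	/* assumed waiter side: block until all unaligned AIO against the
	 * inode has drained */
	wait_event(*ocfs2_ioend_wq(inode),
		   atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0);
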
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bbff27..a4e855e3690 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
216 216
217 struct list_head hr_all_item; 217 struct list_head hr_all_item;
218 unsigned hr_unclean_stop:1, 218 unsigned hr_unclean_stop:1,
219 hr_aborted_start:1,
219 hr_item_pinned:1, 220 hr_item_pinned:1,
220 hr_item_dropped:1; 221 hr_item_dropped:1;
221 222
@@ -254,6 +255,10 @@ struct o2hb_region {
254 * a more complete api that doesn't lead to this sort of fragility. */ 255 * a more complete api that doesn't lead to this sort of fragility. */
255 atomic_t hr_steady_iterations; 256 atomic_t hr_steady_iterations;
256 257
258 /* terminate o2hb thread if it does not reach steady state
259 * (hr_steady_iterations == 0) within hr_unsteady_iterations */
260 atomic_t hr_unsteady_iterations;
261
257 char hr_dev_name[BDEVNAME_SIZE]; 262 char hr_dev_name[BDEVNAME_SIZE];
258 263
259 unsigned int hr_timeout_ms; 264 unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
324 329
325static void o2hb_arm_write_timeout(struct o2hb_region *reg) 330static void o2hb_arm_write_timeout(struct o2hb_region *reg)
326{ 331{
332 /* Arm writeout only after thread reaches steady state */
333 if (atomic_read(&reg->hr_steady_iterations) != 0)
334 return;
335
327 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 336 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
328 O2HB_MAX_WRITE_TIMEOUT_MS); 337 O2HB_MAX_WRITE_TIMEOUT_MS);
329 338
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
537 return read == computed; 546 return read == computed;
538} 547}
539 548
540/* We want to make sure that nobody is heartbeating on top of us -- 549/*
541 * this will help detect an invalid configuration. */ 550 * Compare the slot data with what we wrote in the last iteration.
542static void o2hb_check_last_timestamp(struct o2hb_region *reg) 551 * If the match fails, print an appropriate error message. This is to
552 * detect errors like... another node hearting on the same slot,
553 * flaky device that is losing writes, etc.
554 * Returns 1 if check succeeds, 0 otherwise.
555 */
556static int o2hb_check_own_slot(struct o2hb_region *reg)
543{ 557{
544 struct o2hb_disk_slot *slot; 558 struct o2hb_disk_slot *slot;
545 struct o2hb_disk_heartbeat_block *hb_block; 559 struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
548 slot = &reg->hr_slots[o2nm_this_node()]; 562 slot = &reg->hr_slots[o2nm_this_node()];
549 /* Don't check on our 1st timestamp */ 563 /* Don't check on our 1st timestamp */
550 if (!slot->ds_last_time) 564 if (!slot->ds_last_time)
551 return; 565 return 0;
552 566
553 hb_block = slot->ds_raw_block; 567 hb_block = slot->ds_raw_block;
554 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && 568 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
555 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && 569 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
556 hb_block->hb_node == slot->ds_node_num) 570 hb_block->hb_node == slot->ds_node_num)
557 return; 571 return 1;
558 572
559#define ERRSTR1 "Another node is heartbeating on device" 573#define ERRSTR1 "Another node is heartbeating on device"
560#define ERRSTR2 "Heartbeat generation mismatch on device" 574#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
574 (unsigned long long)slot->ds_last_time, hb_block->hb_node, 588 (unsigned long long)slot->ds_last_time, hb_block->hb_node,
575 (unsigned long long)le64_to_cpu(hb_block->hb_generation), 589 (unsigned long long)le64_to_cpu(hb_block->hb_generation),
576 (unsigned long long)le64_to_cpu(hb_block->hb_seq)); 590 (unsigned long long)le64_to_cpu(hb_block->hb_seq));
591
592 return 0;
577} 593}
578 594
579static inline void o2hb_prepare_block(struct o2hb_region *reg, 595static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
719 o2nm_node_put(node); 735 o2nm_node_put(node);
720} 736}
721 737
722static void o2hb_set_quorum_device(struct o2hb_region *reg, 738static void o2hb_set_quorum_device(struct o2hb_region *reg)
723 struct o2hb_disk_slot *slot)
724{ 739{
725 assert_spin_locked(&o2hb_live_lock);
726
727 if (!o2hb_global_heartbeat_active()) 740 if (!o2hb_global_heartbeat_active())
728 return; 741 return;
729 742
730 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 743 /* Prevent race with o2hb_heartbeat_group_drop_item() */
744 if (kthread_should_stop())
745 return;
746
747 /* Tag region as quorum only after thread reaches steady state */
748 if (atomic_read(&reg->hr_steady_iterations) != 0)
731 return; 749 return;
732 750
751 spin_lock(&o2hb_live_lock);
752
753 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
754 goto unlock;
755
733 /* 756 /*
734 * A region can be added to the quorum only when it sees all 757 * A region can be added to the quorum only when it sees all
735 * live nodes heartbeat on it. In other words, the region has been 758 * live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
737 */ 760 */
738 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, 761 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
739 sizeof(o2hb_live_node_bitmap))) 762 sizeof(o2hb_live_node_bitmap)))
740 return; 763 goto unlock;
741
742 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
743 return;
744 764
745 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", 765 printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
746 config_item_name(&reg->hr_item)); 766 config_item_name(&reg->hr_item), reg->hr_dev_name);
747 767
748 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 768 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
749 769
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
754 if (o2hb_pop_count(&o2hb_quorum_region_bitmap, 774 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
755 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) 775 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
756 o2hb_region_unpin(NULL); 776 o2hb_region_unpin(NULL);
777unlock:
778 spin_unlock(&o2hb_live_lock);
757} 779}
758 780
759static int o2hb_check_slot(struct o2hb_region *reg, 781static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
925 slot->ds_equal_samples = 0; 947 slot->ds_equal_samples = 0;
926 } 948 }
927out: 949out:
928 o2hb_set_quorum_device(reg, slot);
929
930 spin_unlock(&o2hb_live_lock); 950 spin_unlock(&o2hb_live_lock);
931 951
932 o2hb_run_event_list(&event); 952 o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
957 977
958static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) 978static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
959{ 979{
960 int i, ret, highest_node, change = 0; 980 int i, ret, highest_node;
981 int membership_change = 0, own_slot_ok = 0;
961 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 982 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
962 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 983 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
963 struct o2hb_bio_wait_ctxt write_wc; 984 struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
966 sizeof(configured_nodes)); 987 sizeof(configured_nodes));
967 if (ret) { 988 if (ret) {
968 mlog_errno(ret); 989 mlog_errno(ret);
969 return ret; 990 goto bail;
970 } 991 }
971 992
972 /* 993 /*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
982 1003
983 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 1004 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
984 if (highest_node >= O2NM_MAX_NODES) { 1005 if (highest_node >= O2NM_MAX_NODES) {
985 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 1006 mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
986 return -EINVAL; 1007 ret = -EINVAL;
1008 goto bail;
987 } 1009 }
988 1010
989 /* No sense in reading the slots of nodes that don't exist 1011 /* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
993 ret = o2hb_read_slots(reg, highest_node + 1); 1015 ret = o2hb_read_slots(reg, highest_node + 1);
994 if (ret < 0) { 1016 if (ret < 0) {
995 mlog_errno(ret); 1017 mlog_errno(ret);
996 return ret; 1018 goto bail;
997 } 1019 }
998 1020
999 /* With an up to date view of the slots, we can check that no 1021 /* With an up to date view of the slots, we can check that no
1000 * other node has been improperly configured to heartbeat in 1022 * other node has been improperly configured to heartbeat in
1001 * our slot. */ 1023 * our slot. */
1002 o2hb_check_last_timestamp(reg); 1024 own_slot_ok = o2hb_check_own_slot(reg);
1003 1025
1004 /* fill in the proper info for our next heartbeat */ 1026 /* fill in the proper info for our next heartbeat */
1005 o2hb_prepare_block(reg, reg->hr_generation); 1027 o2hb_prepare_block(reg, reg->hr_generation);
1006 1028
1007 /* And fire off the write. Note that we don't wait on this I/O
1008 * until later. */
1009 ret = o2hb_issue_node_write(reg, &write_wc); 1029 ret = o2hb_issue_node_write(reg, &write_wc);
1010 if (ret < 0) { 1030 if (ret < 0) {
1011 mlog_errno(ret); 1031 mlog_errno(ret);
1012 return ret; 1032 goto bail;
1013 } 1033 }
1014 1034
1015 i = -1; 1035 i = -1;
1016 while((i = find_next_bit(configured_nodes, 1036 while((i = find_next_bit(configured_nodes,
1017 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { 1037 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
1018 change |= o2hb_check_slot(reg, &reg->hr_slots[i]); 1038 membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
1019 } 1039 }
1020 1040
1021 /* 1041 /*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
1030 * disk */ 1050 * disk */
1031 mlog(ML_ERROR, "Write error %d on device \"%s\"\n", 1051 mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
1032 write_wc.wc_error, reg->hr_dev_name); 1052 write_wc.wc_error, reg->hr_dev_name);
1033 return write_wc.wc_error; 1053 ret = write_wc.wc_error;
1054 goto bail;
1034 } 1055 }
1035 1056
1036 o2hb_arm_write_timeout(reg); 1057 /* Skip disarming the timeout if own slot has stale/bad data */
1058 if (own_slot_ok) {
1059 o2hb_set_quorum_device(reg);
1060 o2hb_arm_write_timeout(reg);
1061 }
1037 1062
1063bail:
1038 /* let the person who launched us know when things are steady */ 1064 /* let the person who launched us know when things are steady */
1039 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { 1065 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1040 if (atomic_dec_and_test(&reg->hr_steady_iterations)) 1066 if (!ret && own_slot_ok && !membership_change) {
1067 if (atomic_dec_and_test(&reg->hr_steady_iterations))
1068 wake_up(&o2hb_steady_queue);
1069 }
1070 }
1071
1072 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1073 if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
1074 printk(KERN_NOTICE "o2hb: Unable to stabilize "
1075 "heartbeart on region %s (%s)\n",
1076 config_item_name(&reg->hr_item),
1077 reg->hr_dev_name);
1078 atomic_set(&reg->hr_steady_iterations, 0);
1079 reg->hr_aborted_start = 1;
1041 wake_up(&o2hb_steady_queue); 1080 wake_up(&o2hb_steady_queue);
1081 ret = -EIO;
1082 }
1042 } 1083 }
1043 1084
1044 return 0; 1085 return ret;
1045} 1086}
1046 1087
1047/* Subtract b from a, storing the result in a. a *must* have a larger 1088/* Subtract b from a, storing the result in a. a *must* have a larger
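
The rewritten tail of o2hb_do_disk_heartbeat() replaces the old "retry twice" loop in o2hb_thread() with two countdowns: hr_steady_iterations only ticks down on a fully clean pass (no I/O error, own slot verified, no membership change) and wakes the mount path when it reaches zero, while hr_unsteady_iterations ticks down on every pass that leaves the region still unsteady and, once exhausted, aborts the start via hr_aborted_start. Condensed:

	if (atomic_read(&reg->hr_steady_iterations) != 0) {
		/* a clean pass brings us one step closer to "steady" */
		if (!ret && own_slot_ok && !membership_change &&
		    atomic_dec_and_test(&reg->hr_steady_iterations))
			wake_up(&o2hb_steady_queue);
	}

	if (atomic_read(&reg->hr_steady_iterations) != 0) {
		/* still not steady: burn one credit, abort when they run out */
		if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
			atomic_set(&reg->hr_steady_iterations, 0);
			reg->hr_aborted_start = 1;
			wake_up(&o2hb_steady_queue);
			ret = -EIO;
		}
	}
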
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
1095 /* Pin node */ 1136 /* Pin node */
1096 o2nm_depend_this_node(); 1137 o2nm_depend_this_node();
1097 1138
1098 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1139 while (!kthread_should_stop() &&
1140 !reg->hr_unclean_stop && !reg->hr_aborted_start) {
1099 /* We track the time spent inside 1141 /* We track the time spent inside
1100 * o2hb_do_disk_heartbeat so that we avoid more than 1142 * o2hb_do_disk_heartbeat so that we avoid more than
1101 * hr_timeout_ms between disk writes. On busy systems 1143 * hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
1103 * likely to time itself out. */ 1145 * likely to time itself out. */
1104 do_gettimeofday(&before_hb); 1146 do_gettimeofday(&before_hb);
1105 1147
1106 i = 0; 1148 ret = o2hb_do_disk_heartbeat(reg);
1107 do {
1108 ret = o2hb_do_disk_heartbeat(reg);
1109 } while (ret && ++i < 2);
1110 1149
1111 do_gettimeofday(&after_hb); 1150 do_gettimeofday(&after_hb);
1112 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1151 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
1117 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 1156 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1118 elapsed_msec); 1157 elapsed_msec);
1119 1158
1120 if (elapsed_msec < reg->hr_timeout_ms) { 1159 if (!kthread_should_stop() &&
1160 elapsed_msec < reg->hr_timeout_ms) {
1121 /* the kthread api has blocked signals for us so no 1161 /* the kthread api has blocked signals for us so no
1122 * need to record the return value. */ 1162 * need to record the return value. */
1123 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); 1163 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
1134 * to timeout on this region when we could just as easily 1174 * to timeout on this region when we could just as easily
1135 * write a clear generation - thus indicating to them that 1175 * write a clear generation - thus indicating to them that
1136 * this node has left this region. 1176 * this node has left this region.
1137 * 1177 */
1138 * XXX: Should we skip this on unclean_stop? */ 1178 if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
1139 o2hb_prepare_block(reg, 0); 1179 o2hb_prepare_block(reg, 0);
1140 ret = o2hb_issue_node_write(reg, &write_wc); 1180 ret = o2hb_issue_node_write(reg, &write_wc);
1141 if (ret == 0) { 1181 if (ret == 0)
1142 o2hb_wait_on_io(reg, &write_wc); 1182 o2hb_wait_on_io(reg, &write_wc);
1143 } else { 1183 else
1144 mlog_errno(ret); 1184 mlog_errno(ret);
1145 } 1185 }
1146 1186
1147 /* Unpin node */ 1187 /* Unpin node */
1148 o2nm_undepend_this_node(); 1188 o2nm_undepend_this_node();
1149 1189
1150 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1190 mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
1151 1191
1152 return 0; 1192 return 0;
1153} 1193}
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1158 struct o2hb_debug_buf *db = inode->i_private; 1198 struct o2hb_debug_buf *db = inode->i_private;
1159 struct o2hb_region *reg; 1199 struct o2hb_region *reg;
1160 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1200 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1201 unsigned long lts;
1161 char *buf = NULL; 1202 char *buf = NULL;
1162 int i = -1; 1203 int i = -1;
1163 int out = 0; 1204 int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1194 1235
1195 case O2HB_DB_TYPE_REGION_ELAPSED_TIME: 1236 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1196 reg = (struct o2hb_region *)db->db_data; 1237 reg = (struct o2hb_region *)db->db_data;
1197 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", 1238 lts = reg->hr_last_timeout_start;
1198 jiffies_to_msecs(jiffies - 1239 /* If 0, it has never been set before */
1199 reg->hr_last_timeout_start)); 1240 if (lts)
1241 lts = jiffies_to_msecs(jiffies - lts);
1242 out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
1200 goto done; 1243 goto done;
1201 1244
1202 case O2HB_DB_TYPE_REGION_PINNED: 1245 case O2HB_DB_TYPE_REGION_PINNED:
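The lts guard above keeps the debugfs file from reporting a bogus elapsed time before the first write timeout has ever been armed (hr_last_timeout_start stays 0 until then); in isolation the pattern is simply:

	unsigned long lts = reg->hr_last_timeout_start;	/* 0 until the first arm */

	if (lts)
		lts = jiffies_to_msecs(jiffies - lts);
	out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);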
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
1426 struct page *page; 1469 struct page *page;
1427 struct o2hb_region *reg = to_o2hb_region(item); 1470 struct o2hb_region *reg = to_o2hb_region(item);
1428 1471
1472 mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
1473
1429 if (reg->hr_tmp_block) 1474 if (reg->hr_tmp_block)
1430 kfree(reg->hr_tmp_block); 1475 kfree(reg->hr_tmp_block);
1431 1476
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1792 live_threshold <<= 1; 1837 live_threshold <<= 1;
1793 spin_unlock(&o2hb_live_lock); 1838 spin_unlock(&o2hb_live_lock);
1794 } 1839 }
1795 atomic_set(&reg->hr_steady_iterations, live_threshold + 1); 1840 ++live_threshold;
1841 atomic_set(&reg->hr_steady_iterations, live_threshold);
1842 /* unsteady_iterations is double the steady_iterations */
1843 atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
1796 1844
1797 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1845 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1798 reg->hr_item.ci_name); 1846 reg->hr_item.ci_name);
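To make the start budget concrete (numbers are illustrative only): if live_threshold was doubled to 4 by the branch above, the increment brings it to 5, so the region needs 5 clean heartbeat iterations to be declared steady and the thread gives up after 10:

	++live_threshold;						/* 4 -> 5 */
	atomic_set(&reg->hr_steady_iterations, live_threshold);	/* 5 clean writes needed */
	atomic_set(&reg->hr_unsteady_iterations, live_threshold << 1);	/* abort after 10 tries */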
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1809 ret = wait_event_interruptible(o2hb_steady_queue, 1857 ret = wait_event_interruptible(o2hb_steady_queue,
1810 atomic_read(&reg->hr_steady_iterations) == 0); 1858 atomic_read(&reg->hr_steady_iterations) == 0);
1811 if (ret) { 1859 if (ret) {
1812 /* We got interrupted (hello ptrace!). Clean up */ 1860 atomic_set(&reg->hr_steady_iterations, 0);
1813 spin_lock(&o2hb_live_lock); 1861 reg->hr_aborted_start = 1;
1814 hb_task = reg->hr_task; 1862 }
1815 reg->hr_task = NULL;
1816 spin_unlock(&o2hb_live_lock);
1817 1863
1818 if (hb_task) 1864 if (reg->hr_aborted_start) {
1819 kthread_stop(hb_task); 1865 ret = -EIO;
1820 goto out; 1866 goto out;
1821 } 1867 }
1822 1868
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1833 ret = -EIO; 1879 ret = -EIO;
1834 1880
1835 if (hb_task && o2hb_global_heartbeat_active()) 1881 if (hb_task && o2hb_global_heartbeat_active())
1836 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", 1882 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
1837 config_item_name(&reg->hr_item)); 1883 config_item_name(&reg->hr_item), reg->hr_dev_name);
1838 1884
1839out: 1885out:
1840 if (filp) 1886 if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2092 2138
2093 /* stop the thread when the user removes the region dir */ 2139 /* stop the thread when the user removes the region dir */
2094 spin_lock(&o2hb_live_lock); 2140 spin_lock(&o2hb_live_lock);
2095 if (o2hb_global_heartbeat_active()) {
2096 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2097 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2098 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2099 quorum_region = 1;
2100 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2101 }
2102 hb_task = reg->hr_task; 2141 hb_task = reg->hr_task;
2103 reg->hr_task = NULL; 2142 reg->hr_task = NULL;
2104 reg->hr_item_dropped = 1; 2143 reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2107 if (hb_task) 2146 if (hb_task)
2108 kthread_stop(hb_task); 2147 kthread_stop(hb_task);
2109 2148
2149 if (o2hb_global_heartbeat_active()) {
2150 spin_lock(&o2hb_live_lock);
2151 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2152 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2153 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2154 quorum_region = 1;
2155 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2156 spin_unlock(&o2hb_live_lock);
2157 printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
2158 ((atomic_read(&reg->hr_steady_iterations) == 0) ?
2159 "stopped" : "start aborted"), config_item_name(item),
2160 reg->hr_dev_name);
2161 }
2162
2110 /* 2163 /*
2111 * If we're racing a dev_write(), we need to wake them. They will 2164 * If we're racing a dev_write(), we need to wake them. They will
2112 * check reg->hr_task 2165 * check reg->hr_task
2113 */ 2166 */
2114 if (atomic_read(&reg->hr_steady_iterations) != 0) { 2167 if (atomic_read(&reg->hr_steady_iterations) != 0) {
2168 reg->hr_aborted_start = 1;
2115 atomic_set(&reg->hr_steady_iterations, 0); 2169 atomic_set(&reg->hr_steady_iterations, 0);
2116 wake_up(&o2hb_steady_queue); 2170 wake_up(&o2hb_steady_queue);
2117 } 2171 }
2118 2172
2119 if (o2hb_global_heartbeat_active())
2120 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2121 config_item_name(&reg->hr_item));
2122
2123 config_item_put(item); 2173 config_item_put(item);
2124 2174
2125 if (!o2hb_global_heartbeat_active() || !quorum_region) 2175 if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 3a5835904b3..dc45deb19e6 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats" 49#define STATS_DEBUG_NAME "stats"
50#define NODES_DEBUG_NAME "connected_nodes"
50 51
51#define SHOW_SOCK_CONTAINERS 0 52#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1 53#define SHOW_SOCK_STATS 1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
55static struct dentry *sc_dentry; 56static struct dentry *sc_dentry;
56static struct dentry *nst_dentry; 57static struct dentry *nst_dentry;
57static struct dentry *stats_dentry; 58static struct dentry *stats_dentry;
59static struct dentry *nodes_dentry;
58 60
59static DEFINE_SPINLOCK(o2net_debug_lock); 61static DEFINE_SPINLOCK(o2net_debug_lock);
60 62
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
491 .release = sc_fop_release, 493 .release = sc_fop_release,
492}; 494};
493 495
494int o2net_debugfs_init(void) 496static int o2net_fill_bitmap(char *buf, int len)
495{ 497{
496 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); 498 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
497 if (!o2net_dentry) { 499 int i = -1, out = 0;
498 mlog_errno(-ENOMEM);
499 goto bail;
500 }
501 500
502 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, 501 o2net_fill_node_map(map, sizeof(map));
503 o2net_dentry, NULL,
504 &nst_seq_fops);
505 if (!nst_dentry) {
506 mlog_errno(-ENOMEM);
507 goto bail;
508 }
509 502
510 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, 503 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
511 o2net_dentry, NULL, 504 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
512 &sc_seq_fops); 505 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
513 if (!sc_dentry) {
514 mlog_errno(-ENOMEM);
515 goto bail;
516 }
517 506
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, 507 return out;
519 o2net_dentry, NULL, 508}
520 &stats_seq_fops); 509
521 if (!stats_dentry) { 510static int nodes_fop_open(struct inode *inode, struct file *file)
522 mlog_errno(-ENOMEM); 511{
523 goto bail; 512 char *buf;
524 } 513
514 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
515 if (!buf)
516 return -ENOMEM;
517
518 i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
519
520 file->private_data = buf;
525 521
526 return 0; 522 return 0;
527bail:
528 debugfs_remove(stats_dentry);
529 debugfs_remove(sc_dentry);
530 debugfs_remove(nst_dentry);
531 debugfs_remove(o2net_dentry);
532 return -ENOMEM;
533} 523}
534 524
525static int o2net_debug_release(struct inode *inode, struct file *file)
526{
527 kfree(file->private_data);
528 return 0;
529}
530
531static ssize_t o2net_debug_read(struct file *file, char __user *buf,
532 size_t nbytes, loff_t *ppos)
533{
534 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
535 i_size_read(file->f_mapping->host));
536}
537
538static const struct file_operations nodes_fops = {
539 .open = nodes_fop_open,
540 .release = o2net_debug_release,
541 .read = o2net_debug_read,
542 .llseek = generic_file_llseek,
543};
544
535void o2net_debugfs_exit(void) 545void o2net_debugfs_exit(void)
536{ 546{
547 debugfs_remove(nodes_dentry);
537 debugfs_remove(stats_dentry); 548 debugfs_remove(stats_dentry);
538 debugfs_remove(sc_dentry); 549 debugfs_remove(sc_dentry);
539 debugfs_remove(nst_dentry); 550 debugfs_remove(nst_dentry);
540 debugfs_remove(o2net_dentry); 551 debugfs_remove(o2net_dentry);
541} 552}
542 553
554int o2net_debugfs_init(void)
555{
556 mode_t mode = S_IFREG|S_IRUSR;
557
558 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
559 if (o2net_dentry)
560 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
561 o2net_dentry, NULL, &nst_seq_fops);
562 if (nst_dentry)
563 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
564 o2net_dentry, NULL, &sc_seq_fops);
565 if (sc_dentry)
566 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
567 o2net_dentry, NULL, &stats_seq_fops);
568 if (stats_dentry)
569 nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
570 o2net_dentry, NULL, &nodes_fops);
571 if (nodes_dentry)
572 return 0;
573
574 o2net_debugfs_exit();
575 mlog_errno(-ENOMEM);
576 return -ENOMEM;
577}
578
543#endif /* CONFIG_DEBUG_FS */ 579#endif /* CONFIG_DEBUG_FS */
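The rewritten o2net_debugfs_init() above trades the per-file error labels for a chain of guarded creates and a single cleanup path; a stripped-down sketch of the pattern (parent, child and some_fops are placeholder names, not from the patch):

	struct dentry *parent, *child = NULL;

	parent = debugfs_create_dir("o2net", NULL);
	if (parent)
		child = debugfs_create_file("some_file", S_IFREG|S_IRUSR,
					    parent, NULL, &some_fops);
	if (!child) {
		/* debugfs_remove() ignores a NULL dentry, so one exit routine
		 * can tear down whatever subset was actually created */
		debugfs_remove(child);
		debugfs_remove(parent);
		return -ENOMEM;
	}
	return 0;

This is also why o2net_debugfs_exit() can be called unconditionally from the failure path of the new init.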
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b4f47..044e7b58d31 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
59#include <linux/idr.h> 59#include <linux/idr.h>
60#include <linux/kref.h> 60#include <linux/kref.h>
61#include <linux/net.h> 61#include <linux/net.h>
62#include <linux/export.h>
62#include <net/tcp.h> 63#include <net/tcp.h>
63 64
64#include <asm/uaccess.h> 65#include <asm/uaccess.h>
@@ -545,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
545 } 546 }
546 547
547 if (was_valid && !valid) { 548 if (was_valid && !valid) {
548 printk(KERN_NOTICE "o2net: no longer connected to " 549 printk(KERN_NOTICE "o2net: No longer connected to "
549 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 550 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
550 o2net_complete_nodes_nsw(nn); 551 o2net_complete_nodes_nsw(nn);
551 } 552 }
@@ -555,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
555 cancel_delayed_work(&nn->nn_connect_expired); 556 cancel_delayed_work(&nn->nn_connect_expired);
556 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", 557 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
557 o2nm_this_node() > sc->sc_node->nd_num ? 558 o2nm_this_node() > sc->sc_node->nd_num ?
558 "connected to" : "accepted connection from", 559 "Connected to" : "Accepted connection from",
559 SC_NODEF_ARGS(sc)); 560 SC_NODEF_ARGS(sc));
560 } 561 }
561 562
@@ -643,7 +644,7 @@ static void o2net_state_change(struct sock *sk)
643 o2net_sc_queue_work(sc, &sc->sc_connect_work); 644 o2net_sc_queue_work(sc, &sc->sc_connect_work);
644 break; 645 break;
645 default: 646 default:
646 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT 647 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
647 " shutdown, state %d\n", 648 " shutdown, state %d\n",
648 SC_NODEF_ARGS(sc), sk->sk_state); 649 SC_NODEF_ARGS(sc), sk->sk_state);
649 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 650 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1034,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
1034 return ret; 1035 return ret;
1035} 1036}
1036 1037
 1038/* Get a map of all nodes to which this node is currently connected */
1039void o2net_fill_node_map(unsigned long *map, unsigned bytes)
1040{
1041 struct o2net_sock_container *sc;
1042 int node, ret;
1043
1044 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
1045
1046 memset(map, 0, bytes);
1047 for (node = 0; node < O2NM_MAX_NODES; ++node) {
1048 o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
1049 if (!ret) {
1050 set_bit(node, map);
1051 sc_put(sc);
1052 }
1053 }
1054}
1055EXPORT_SYMBOL_GPL(o2net_fill_node_map);
1056
1037int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 1057int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1038 size_t caller_veclen, u8 target_node, int *status) 1058 size_t caller_veclen, u8 target_node, int *status)
1039{ 1059{
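A minimal caller sketch for the new export (assumption: the caller includes cluster/tcp.h and cluster/nodemanager.h for O2NM_MAX_NODES; the BUG_ON above requires the buffer to cover every node bit):

	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int node = -1;

	o2net_fill_node_map(map, sizeof(map));
	while ((node = find_next_bit(map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES)
		printk(KERN_INFO "o2net: connected to node %d\n", node);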
@@ -1284,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1284 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1304 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1285 1305
1286 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { 1306 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
1287 mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " 1307 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
1288 "version %llu but %llu is required, disconnecting\n", 1308 "protocol version %llu but %llu is required. "
1289 SC_NODEF_ARGS(sc), 1309 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1290 (unsigned long long)be64_to_cpu(hand->protocol_version), 1310 (unsigned long long)be64_to_cpu(hand->protocol_version),
1291 O2NET_PROTOCOL_VERSION); 1311 O2NET_PROTOCOL_VERSION);
1292 1312
1293 /* don't bother reconnecting if its the wrong version. */ 1313 /* don't bother reconnecting if its the wrong version. */
1294 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1314 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1302,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1302 */ 1322 */
1303 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1323 if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
1304 o2net_idle_timeout()) { 1324 o2net_idle_timeout()) {
1305 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1325 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
1306 "%u ms, but we use %u ms locally. disconnecting\n", 1326 "idle timeout of %u ms, but we use %u ms locally. "
1307 SC_NODEF_ARGS(sc), 1327 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1308 be32_to_cpu(hand->o2net_idle_timeout_ms), 1328 be32_to_cpu(hand->o2net_idle_timeout_ms),
1309 o2net_idle_timeout()); 1329 o2net_idle_timeout());
1310 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1330 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1311 return -1; 1331 return -1;
1312 } 1332 }
1313 1333
1314 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1334 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
1315 o2net_keepalive_delay()) { 1335 o2net_keepalive_delay()) {
1316 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1336 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
1317 "%u ms, but we use %u ms locally. disconnecting\n", 1337 "delay of %u ms, but we use %u ms locally. "
1318 SC_NODEF_ARGS(sc), 1338 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1319 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1339 be32_to_cpu(hand->o2net_keepalive_delay_ms),
1320 o2net_keepalive_delay()); 1340 o2net_keepalive_delay());
1321 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1341 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1322 return -1; 1342 return -1;
1323 } 1343 }
1324 1344
1325 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1345 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
1326 O2HB_MAX_WRITE_TIMEOUT_MS) { 1346 O2HB_MAX_WRITE_TIMEOUT_MS) {
1327 mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " 1347 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
1328 "%u ms, but we use %u ms locally. disconnecting\n", 1348 "timeout of %u ms, but we use %u ms locally. "
1329 SC_NODEF_ARGS(sc), 1349 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1330 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1350 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
1331 O2HB_MAX_WRITE_TIMEOUT_MS); 1351 O2HB_MAX_WRITE_TIMEOUT_MS);
1332 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1352 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1333 return -1; 1353 return -1;
1334 } 1354 }
@@ -1539,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data)
1539{ 1559{
1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1560 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1561 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1542
1543#ifdef CONFIG_DEBUG_FS 1562#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get(); 1563 unsigned long msecs = ktime_to_ms(ktime_get()) -
1564 ktime_to_ms(sc->sc_tv_timer);
1565#else
1566 unsigned long msecs = o2net_idle_timeout();
1545#endif 1567#endif
1546 1568
1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1569 printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1570 "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
1549 o2net_idle_timeout() / 1000, 1571 msecs / 1000, msecs % 1000);
1550 o2net_idle_timeout() % 1000);
1551
1552#ifdef CONFIG_DEBUG_FS
1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1560 sc->sc_msg_key, sc->sc_msg_type,
1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1564 1572
1565 /* 1573 /*
1566 * Initialize the nn_timeout so that the next connection attempt 1574 * Initialize the nn_timeout so that the next connection attempt
@@ -1693,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work)
1693 1701
1694out: 1702out:
1695 if (ret) { 1703 if (ret) {
1696 mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " 1704 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
1697 "with errno %d\n", SC_NODEF_ARGS(sc), ret); 1705 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
1698 /* 0 err so that another will be queued and attempted 1706 /* 0 err so that another will be queued and attempted
1699 * from set_nn_state */ 1707 * from set_nn_state */
1700 if (sc) 1708 if (sc)
@@ -1717,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work)
1717 1725
1718 spin_lock(&nn->nn_lock); 1726 spin_lock(&nn->nn_lock);
1719 if (!nn->nn_sc_valid) { 1727 if (!nn->nn_sc_valid) {
1720 mlog(ML_ERROR, "no connection established with node %u after " 1728 printk(KERN_NOTICE "o2net: No connection established with "
1721 "%u.%u seconds, giving up and returning errors.\n", 1729 "node %u after %u.%u seconds, giving up.\n",
1722 o2net_num_from_nn(nn), 1730 o2net_num_from_nn(nn),
1723 o2net_idle_timeout() / 1000, 1731 o2net_idle_timeout() / 1000,
1724 o2net_idle_timeout() % 1000); 1732 o2net_idle_timeout() % 1000);
@@ -1861,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock)
1861 1869
1862 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); 1870 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
1863 if (node == NULL) { 1871 if (node == NULL) {
1864 mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", 1872 printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
1865 &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1873 "node at %pI4:%d\n", &sin.sin_addr.s_addr,
1874 ntohs(sin.sin_port));
1866 ret = -EINVAL; 1875 ret = -EINVAL;
1867 goto out; 1876 goto out;
1868 } 1877 }
1869 1878
1870 if (o2nm_this_node() >= node->nd_num) { 1879 if (o2nm_this_node() >= node->nd_num) {
1871 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1880 local_node = o2nm_get_node_by_num(o2nm_this_node());
1872 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" 1881 printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
1873 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", 1882 "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
1874 local_node->nd_name, local_node->nd_num, 1883 "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
1875 &(local_node->nd_ipv4_address), 1884 &(local_node->nd_ipv4_address),
1876 ntohs(local_node->nd_ipv4_port), 1885 ntohs(local_node->nd_ipv4_port), node->nd_name,
1877 node->nd_name, node->nd_num, &sin.sin_addr.s_addr, 1886 node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
1878 ntohs(sin.sin_port));
1879 ret = -EINVAL; 1887 ret = -EINVAL;
1880 goto out; 1888 goto out;
1881 } 1889 }
@@ -1900,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock)
1900 ret = 0; 1908 ret = 0;
1901 spin_unlock(&nn->nn_lock); 1909 spin_unlock(&nn->nn_lock);
1902 if (ret) { 1910 if (ret) {
1903 mlog(ML_NOTICE, "attempt to connect from node '%s' at " 1911 printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
1904 "%pI4:%d but it already has an open connection\n", 1912 "at %pI4:%d but it already has an open connection\n",
1905 node->nd_name, &sin.sin_addr.s_addr, 1913 node->nd_name, &sin.sin_addr.s_addr,
1906 ntohs(sin.sin_port)); 1914 ntohs(sin.sin_port));
1907 goto out; 1915 goto out;
1908 } 1916 }
1909 1917
@@ -1983,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
1983 1991
1984 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1992 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1985 if (ret < 0) { 1993 if (ret < 0) {
1986 mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); 1994 printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
1987 goto out; 1995 goto out;
1988 } 1996 }
1989 1997
@@ -2000,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
2000 sock->sk->sk_reuse = 1; 2008 sock->sk->sk_reuse = 1;
2001 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 2009 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
2002 if (ret < 0) { 2010 if (ret < 0) {
2003 mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " 2011 printk(KERN_ERR "o2net: Error %d while binding socket at "
2004 "ret=%d\n", &addr, ntohs(port), ret); 2012 "%pI4:%u\n", ret, &addr, ntohs(port));
2005 goto out; 2013 goto out;
2006 } 2014 }
2007 2015
2008 ret = sock->ops->listen(sock, 64); 2016 ret = sock->ops->listen(sock, 64);
2009 if (ret < 0) { 2017 if (ret < 0)
2010 mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", 2018 printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
2011 &addr, ntohs(port), ret); 2019 ret, &addr, ntohs(port));
2012 }
2013 2020
2014out: 2021out:
2015 if (ret) { 2022 if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d..5bada2a69b5 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
106 struct list_head *unreg_list); 106 struct list_head *unreg_list);
107void o2net_unregister_handler_list(struct list_head *list); 107void o2net_unregister_handler_list(struct list_head *list);
108 108
109void o2net_fill_node_map(unsigned long *map, unsigned bytes);
110
109struct o2nm_node; 111struct o2nm_node;
110int o2net_register_hb_callbacks(void); 112int o2net_register_hb_callbacks(void);
111void o2net_unregister_hb_callbacks(void); 113void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e2878b5895f..8fe4e2892ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1184 if (pde) 1184 if (pde)
1185 le16_add_cpu(&pde->rec_len, 1185 le16_add_cpu(&pde->rec_len,
1186 le16_to_cpu(de->rec_len)); 1186 le16_to_cpu(de->rec_len));
1187 else 1187 de->inode = 0;
1188 de->inode = 0;
1189 dir->i_version++; 1188 dir->i_version++;
1190 ocfs2_journal_dirty(handle, bh); 1189 ocfs2_journal_dirty(handle, bh);
1191 goto bail; 1190 goto bail;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d602abb51b6..a5952ceecba 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
859void dlm_wait_for_recovery(struct dlm_ctxt *dlm); 859void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); 860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); 861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
862int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 862void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
863int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); 863void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
864 864
865void dlm_put(struct dlm_ctxt *dlm); 865void dlm_put(struct dlm_ctxt *dlm);
866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
877 kref_get(&res->refs); 877 kref_get(&res->refs);
878} 878}
879void dlm_lockres_put(struct dlm_lock_resource *res); 879void dlm_lockres_put(struct dlm_lock_resource *res);
880void __dlm_unhash_lockres(struct dlm_lock_resource *res); 880void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
881void __dlm_insert_lockres(struct dlm_ctxt *dlm, 881void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
882 struct dlm_lock_resource *res);
883struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 882struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
884 const char *name, 883 const char *name,
885 unsigned int len, 884 unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
902 const char *name, 901 const char *name,
903 unsigned int namelen); 902 unsigned int namelen);
904 903
905#define dlm_lockres_set_refmap_bit(bit,res) \ 904void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
906 __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) 905 struct dlm_lock_resource *res, int bit);
907#define dlm_lockres_clear_refmap_bit(bit,res) \ 906void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
908 __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) 907 struct dlm_lock_resource *res, int bit);
909 908
910static inline void __dlm_lockres_set_refmap_bit(int bit, 909void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
911 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res);
912 const char *file, 911void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
913 int line) 912 struct dlm_lock_resource *res);
914{
915 //printk("%s:%d:%.*s: setting bit %d\n", file, line,
916 // res->lockname.len, res->lockname.name, bit);
917 set_bit(bit, res->refmap);
918}
919
920static inline void __dlm_lockres_clear_refmap_bit(int bit,
921 struct dlm_lock_resource *res,
922 const char *file,
923 int line)
924{
925 //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
926 // res->lockname.len, res->lockname.name, bit);
927 clear_bit(bit, res->refmap);
928}
929
930void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
931 struct dlm_lock_resource *res,
932 const char *file,
933 int line);
934void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
935 struct dlm_lock_resource *res,
936 int new_lockres,
937 const char *file,
938 int line);
939#define dlm_lockres_drop_inflight_ref(d,r) \
940 __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
941#define dlm_lockres_grab_inflight_ref(d,r) \
942 __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
943#define dlm_lockres_grab_inflight_ref_new(d,r) \
944 __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
945 913
946void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 914void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
947void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 915void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 56f82cb912e..0e28e242226 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
30#include <linux/sysctl.h> 30#include <linux/sysctl.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/export.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ed6b95dcf9..92f2ead0fab 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
157 157
158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); 158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
159 159
160void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) 160void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
161{ 161{
162 if (!hlist_unhashed(&lockres->hash_node)) { 162 if (hlist_unhashed(&res->hash_node))
163 hlist_del_init(&lockres->hash_node); 163 return;
164 dlm_lockres_put(lockres); 164
165 } 165 mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
166 res->lockname.name);
167 hlist_del_init(&res->hash_node);
168 dlm_lockres_put(res);
166} 169}
167 170
168void __dlm_insert_lockres(struct dlm_ctxt *dlm, 171void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
169 struct dlm_lock_resource *res)
170{ 172{
171 struct hlist_head *bucket; 173 struct hlist_head *bucket;
172 struct qstr *q; 174 struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
180 dlm_lockres_get(res); 182 dlm_lockres_get(res);
181 183
182 hlist_add_head(&res->hash_node, bucket); 184 hlist_add_head(&res->hash_node, bucket);
185
186 mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
187 res->lockname.name);
183} 188}
184 189
185struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 190struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
539 544
540static void __dlm_print_nodes(struct dlm_ctxt *dlm) 545static void __dlm_print_nodes(struct dlm_ctxt *dlm)
541{ 546{
542 int node = -1; 547 int node = -1, num = 0;
543 548
544 assert_spin_locked(&dlm->spinlock); 549 assert_spin_locked(&dlm->spinlock);
545 550
546 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); 551 printk("( ");
547
548 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 552 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
549 node + 1)) < O2NM_MAX_NODES) { 553 node + 1)) < O2NM_MAX_NODES) {
550 printk("%d ", node); 554 printk("%d ", node);
555 ++num;
551 } 556 }
552 printk("\n"); 557 printk(") %u nodes\n", num);
553} 558}
554 559
555static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 560static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
566 571
567 node = exit_msg->node_idx; 572 node = exit_msg->node_idx;
568 573
569 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
570
571 spin_lock(&dlm->spinlock); 574 spin_lock(&dlm->spinlock);
572 clear_bit(node, dlm->domain_map); 575 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map); 576 clear_bit(node, dlm->exit_domain_map);
577 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
574 __dlm_print_nodes(dlm); 578 __dlm_print_nodes(dlm);
575 579
576 /* notify anything attached to the heartbeat events */ 580 /* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
755 759
756 dlm_mark_domain_leaving(dlm); 760 dlm_mark_domain_leaving(dlm);
757 dlm_leave_domain(dlm); 761 dlm_leave_domain(dlm);
762 printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
758 dlm_force_free_mles(dlm); 763 dlm_force_free_mles(dlm);
759 dlm_complete_dlm_shutdown(dlm); 764 dlm_complete_dlm_shutdown(dlm);
760 } 765 }
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
970 clear_bit(assert->node_idx, dlm->exit_domain_map); 975 clear_bit(assert->node_idx, dlm->exit_domain_map);
971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 976 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
972 977
973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 978 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
974 assert->node_idx, dlm->name); 979 assert->node_idx, dlm->name);
975 __dlm_print_nodes(dlm); 980 __dlm_print_nodes(dlm);
976 981
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1701bail: 1706bail:
1702 spin_lock(&dlm->spinlock); 1707 spin_lock(&dlm->spinlock);
1703 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 1708 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
1704 if (!status) 1709 if (!status) {
1710 printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
1705 __dlm_print_nodes(dlm); 1711 __dlm_print_nodes(dlm);
1712 }
1706 spin_unlock(&dlm->spinlock); 1713 spin_unlock(&dlm->spinlock);
1707 1714
1708 if (ctxt) { 1715 if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
2131 goto leave; 2138 goto leave;
2132 } 2139 }
2133 2140
2134 if (!o2hb_check_local_node_heartbeating()) {
2135 mlog(ML_ERROR, "the local node has not been configured, or is "
2136 "not heartbeating\n");
2137 ret = -EPROTO;
2138 goto leave;
2139 }
2140
2141 mlog(0, "register called for domain \"%s\"\n", domain); 2141 mlog(0, "register called for domain \"%s\"\n", domain);
2142 2142
2143retry: 2143retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 8d39e0fd66f..975810b9849 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
183 kick_thread = 1; 183 kick_thread = 1;
184 } 184 }
185 } 185 }
186 /* reduce the inflight count, this may result in the lockres
187 * being purged below during calc_usage */
188 if (lock->ml.node == dlm->node_num)
189 dlm_lockres_drop_inflight_ref(dlm, res);
190 186
191 spin_unlock(&res->spinlock); 187 spin_unlock(&res->spinlock);
192 wake_up(&res->wq); 188 wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
231 lock->ml.type, res->lockname.len, 227 lock->ml.type, res->lockname.len,
232 res->lockname.name, flags); 228 res->lockname.name, flags);
233 229
230 /*
231 * Wait if resource is getting recovered, remastered, etc.
232 * If the resource was remastered and new owner is self, then exit.
233 */
234 spin_lock(&res->spinlock); 234 spin_lock(&res->spinlock);
235
236 /* will exit this call with spinlock held */
237 __dlm_wait_on_lockres(res); 235 __dlm_wait_on_lockres(res);
236 if (res->owner == dlm->node_num) {
237 spin_unlock(&res->spinlock);
238 return DLM_RECOVERING;
239 }
238 res->state |= DLM_LOCK_RES_IN_PROGRESS; 240 res->state |= DLM_LOCK_RES_IN_PROGRESS;
239 241
240 /* add lock to local (secondary) queue */ 242 /* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
319 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, 321 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
320 sizeof(create), res->owner, &status); 322 sizeof(create), res->owner, &status);
321 if (tmpret >= 0) { 323 if (tmpret >= 0) {
322 // successfully sent and received 324 ret = status;
323 ret = status; // this is already a dlm_status
324 if (ret == DLM_REJECTED) { 325 if (ret == DLM_REJECTED) {
325 mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " 326 mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
326 "no longer owned by %u. that node is coming back " 327 "owned by node %u. That node is coming back up "
327 "up currently.\n", dlm->name, create.namelen, 328 "currently.\n", dlm->name, create.namelen,
328 create.name, res->owner); 329 create.name, res->owner);
329 dlm_print_one_lock_resource(res); 330 dlm_print_one_lock_resource(res);
330 BUG(); 331 BUG();
331 } 332 }
332 } else { 333 } else {
333 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 334 mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
334 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, 335 "node %u\n", dlm->name, create.namelen, create.name,
335 res->owner); 336 tmpret, res->owner);
336 if (dlm_is_host_down(tmpret)) { 337 if (dlm_is_host_down(tmpret))
337 ret = DLM_RECOVERING; 338 ret = DLM_RECOVERING;
338 mlog(0, "node %u died so returning DLM_RECOVERING " 339 else
339 "from lock message!\n", res->owner);
340 } else {
341 ret = dlm_err_to_dlm_status(tmpret); 340 ret = dlm_err_to_dlm_status(tmpret);
342 }
343 } 341 }
344 342
345 return ret; 343 return ret;
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
440 /* zero memory only if kernel-allocated */ 438 /* zero memory only if kernel-allocated */
441 lksb = kzalloc(sizeof(*lksb), GFP_NOFS); 439 lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
442 if (!lksb) { 440 if (!lksb) {
443 kfree(lock); 441 kmem_cache_free(dlm_lock_cache, lock);
444 return NULL; 442 return NULL;
445 } 443 }
446 kernel_allocated = 1; 444 kernel_allocated = 1;
@@ -718,18 +716,10 @@ retry_lock:
718 716
719 if (status == DLM_RECOVERING || status == DLM_MIGRATING || 717 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
720 status == DLM_FORWARD) { 718 status == DLM_FORWARD) {
721 mlog(0, "retrying lock with migration/"
722 "recovery/in progress\n");
723 msleep(100); 719 msleep(100);
724 /* no waiting for dlm_reco_thread */
725 if (recovery) { 720 if (recovery) {
726 if (status != DLM_RECOVERING) 721 if (status != DLM_RECOVERING)
727 goto retry_lock; 722 goto retry_lock;
728
729 mlog(0, "%s: got RECOVERING "
730 "for $RECOVERY lock, master "
731 "was %u\n", dlm->name,
732 res->owner);
733 /* wait to see the node go down, then 723 /* wait to see the node go down, then
734 * drop down and allow the lockres to 724 * drop down and allow the lockres to
735 * get cleaned up. need to remaster. */ 725 * get cleaned up. need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
741 } 731 }
742 } 732 }
743 733
734 /* Inflight taken in dlm_get_lock_resource() is dropped here */
735 spin_lock(&res->spinlock);
736 dlm_lockres_drop_inflight_ref(dlm, res);
737 spin_unlock(&res->spinlock);
738
739 dlm_lockres_calc_usage(dlm, res);
740 dlm_kick_thread(dlm, res);
741
744 if (status != DLM_NORMAL) { 742 if (status != DLM_NORMAL) {
745 lock->lksb->flags &= ~DLM_LKSB_GET_LVB; 743 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
746 if (status != DLM_NOTQUEUED) 744 if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 11eefb8c12e..005261c333b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,54 @@ error:
631 return NULL; 631 return NULL;
632} 632}
633 633
634void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 634void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
635 struct dlm_lock_resource *res, 635 struct dlm_lock_resource *res, int bit)
636 int new_lockres,
637 const char *file,
638 int line)
639{ 636{
640 if (!new_lockres) 637 assert_spin_locked(&res->spinlock);
641 assert_spin_locked(&res->spinlock); 638
639 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
640 res->lockname.name, bit, __builtin_return_address(0));
641
642 set_bit(bit, res->refmap);
643}
644
645void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
646 struct dlm_lock_resource *res, int bit)
647{
648 assert_spin_locked(&res->spinlock);
649
650 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
651 res->lockname.name, bit, __builtin_return_address(0));
652
653 clear_bit(bit, res->refmap);
654}
655
656
657void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
658 struct dlm_lock_resource *res)
659{
660 assert_spin_locked(&res->spinlock);
642 661
643 if (!test_bit(dlm->node_num, res->refmap)) {
644 BUG_ON(res->inflight_locks != 0);
645 dlm_lockres_set_refmap_bit(dlm->node_num, res);
646 }
647 res->inflight_locks++; 662 res->inflight_locks++;
648 mlog(0, "%s:%.*s: inflight++: now %u\n", 663
649 dlm->name, res->lockname.len, res->lockname.name, 664 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
650 res->inflight_locks); 665 res->lockname.len, res->lockname.name, res->inflight_locks,
666 __builtin_return_address(0));
651} 667}
652 668
653void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 669void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
654 struct dlm_lock_resource *res, 670 struct dlm_lock_resource *res)
655 const char *file,
656 int line)
657{ 671{
658 assert_spin_locked(&res->spinlock); 672 assert_spin_locked(&res->spinlock);
659 673
660 BUG_ON(res->inflight_locks == 0); 674 BUG_ON(res->inflight_locks == 0);
675
661 res->inflight_locks--; 676 res->inflight_locks--;
662 mlog(0, "%s:%.*s: inflight--: now %u\n", 677
663 dlm->name, res->lockname.len, res->lockname.name, 678 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
664 res->inflight_locks); 679 res->lockname.len, res->lockname.name, res->inflight_locks,
665 if (res->inflight_locks == 0) 680 __builtin_return_address(0));
666 dlm_lockres_clear_refmap_bit(dlm->node_num, res); 681
667 wake_up(&res->wq); 682 wake_up(&res->wq);
668} 683}
669 684
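The former set/clear refmap macros and __dlm_lockres_*_inflight_ref() wrappers are now ordinary functions; callers in this patch (dlm_get_lock_resource(), dlmlock.c) hold res->spinlock, and the call site is logged via %ps / __builtin_return_address(0) instead of the old __FILE__/__LINE__ arguments. A usage sketch mirroring those call sites:

	spin_lock(&res->spinlock);
	dlm_lockres_grab_inflight_ref(dlm, res);	/* pin the lockres while messaging */
	spin_unlock(&res->spinlock);

	/* ... lock request sent and answered ... */

	spin_lock(&res->spinlock);
	dlm_lockres_drop_inflight_ref(dlm, res);	/* may allow the lockres to be purged */
	spin_unlock(&res->spinlock);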
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
697 unsigned int hash; 712 unsigned int hash;
698 int tries = 0; 713 int tries = 0;
699 int bit, wait_on_recovery = 0; 714 int bit, wait_on_recovery = 0;
700 int drop_inflight_if_nonlocal = 0;
701 715
702 BUG_ON(!lockid); 716 BUG_ON(!lockid);
703 717
@@ -709,36 +723,33 @@ lookup:
709 spin_lock(&dlm->spinlock); 723 spin_lock(&dlm->spinlock);
710 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 724 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
711 if (tmpres) { 725 if (tmpres) {
712 int dropping_ref = 0;
713
714 spin_unlock(&dlm->spinlock); 726 spin_unlock(&dlm->spinlock);
715
716 spin_lock(&tmpres->spinlock); 727 spin_lock(&tmpres->spinlock);
717 /* We wait for the other thread that is mastering the resource */ 728 /* Wait on the thread that is mastering the resource */
718 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 729 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
719 __dlm_wait_on_lockres(tmpres); 730 __dlm_wait_on_lockres(tmpres);
720 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 731 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
732 spin_unlock(&tmpres->spinlock);
733 dlm_lockres_put(tmpres);
734 tmpres = NULL;
735 goto lookup;
721 } 736 }
722 737
723 if (tmpres->owner == dlm->node_num) { 738 /* Wait on the resource purge to complete before continuing */
724 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 739 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
725 dlm_lockres_grab_inflight_ref(dlm, tmpres); 740 BUG_ON(tmpres->owner == dlm->node_num);
726 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 741 __dlm_wait_on_lockres_flags(tmpres,
727 dropping_ref = 1; 742 DLM_LOCK_RES_DROPPING_REF);
728 spin_unlock(&tmpres->spinlock);
729
730 /* wait until done messaging the master, drop our ref to allow
731 * the lockres to be purged, start over. */
732 if (dropping_ref) {
733 spin_lock(&tmpres->spinlock);
734 __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
735 spin_unlock(&tmpres->spinlock); 743 spin_unlock(&tmpres->spinlock);
736 dlm_lockres_put(tmpres); 744 dlm_lockres_put(tmpres);
737 tmpres = NULL; 745 tmpres = NULL;
738 goto lookup; 746 goto lookup;
739 } 747 }
740 748
741 mlog(0, "found in hash!\n"); 749 /* Grab inflight ref to pin the resource */
750 dlm_lockres_grab_inflight_ref(dlm, tmpres);
751
752 spin_unlock(&tmpres->spinlock);
742 if (res) 753 if (res)
743 dlm_lockres_put(res); 754 dlm_lockres_put(res);
744 res = tmpres; 755 res = tmpres;
@@ -829,8 +840,8 @@ lookup:
829 * but they might own this lockres. wait on them. */ 840 * but they might own this lockres. wait on them. */
830 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 841 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
831 if (bit < O2NM_MAX_NODES) { 842 if (bit < O2NM_MAX_NODES) {
832 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 843 mlog(0, "%s: res %.*s, At least one node (%d) "
833 "recover before lock mastery can begin\n", 844 "to recover before lock mastery can begin\n",
834 dlm->name, namelen, (char *)lockid, bit); 845 dlm->name, namelen, (char *)lockid, bit);
835 wait_on_recovery = 1; 846 wait_on_recovery = 1;
836 } 847 }
@@ -843,12 +854,11 @@ lookup:
843 854
844 /* finally add the lockres to its hash bucket */ 855 /* finally add the lockres to its hash bucket */
845 __dlm_insert_lockres(dlm, res); 856 __dlm_insert_lockres(dlm, res);
846 /* since this lockres is new it doesn't not require the spinlock */
847 dlm_lockres_grab_inflight_ref_new(dlm, res);
848 857
849 /* if this node does not become the master make sure to drop 858 /* Grab inflight ref to pin the resource */
850 * this inflight reference below */ 859 spin_lock(&res->spinlock);
851 drop_inflight_if_nonlocal = 1; 860 dlm_lockres_grab_inflight_ref(dlm, res);
861 spin_unlock(&res->spinlock);
852 862
853 /* get an extra ref on the mle in case this is a BLOCK 863 /* get an extra ref on the mle in case this is a BLOCK
854 * if so, the creator of the BLOCK may try to put the last 864 * if so, the creator of the BLOCK may try to put the last
@@ -864,8 +874,8 @@ redo_request:
864 * dlm spinlock would be detectable be a change on the mle, 874 * dlm spinlock would be detectable be a change on the mle,
865 * so we only need to clear out the recovery map once. */ 875 * so we only need to clear out the recovery map once. */
866 if (dlm_is_recovery_lock(lockid, namelen)) { 876 if (dlm_is_recovery_lock(lockid, namelen)) {
867 mlog(ML_NOTICE, "%s: recovery map is not empty, but " 877 mlog(0, "%s: Recovery map is not empty, but must "
868 "must master $RECOVERY lock now\n", dlm->name); 878 "master $RECOVERY lock now\n", dlm->name);
869 if (!dlm_pre_master_reco_lockres(dlm, res)) 879 if (!dlm_pre_master_reco_lockres(dlm, res))
870 wait_on_recovery = 0; 880 wait_on_recovery = 0;
871 else { 881 else {
@@ -883,8 +893,8 @@ redo_request:
883 spin_lock(&dlm->spinlock); 893 spin_lock(&dlm->spinlock);
884 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 894 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
885 if (bit < O2NM_MAX_NODES) { 895 if (bit < O2NM_MAX_NODES) {
886 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 896 mlog(0, "%s: res %.*s, At least one node (%d) "
887 "recover before lock mastery can begin\n", 897 "to recover before lock mastery can begin\n",
888 dlm->name, namelen, (char *)lockid, bit); 898 dlm->name, namelen, (char *)lockid, bit);
889 wait_on_recovery = 1; 899 wait_on_recovery = 1;
890 } else 900 } else
@@ -913,8 +923,8 @@ redo_request:
913 * yet, keep going until it does. this is how the 923 * yet, keep going until it does. this is how the
914 * master will know that asserts are needed back to 924 * master will know that asserts are needed back to
915 * the lower nodes. */ 925 * the lower nodes. */
916 mlog(0, "%s:%.*s: requests only up to %u but master " 926 mlog(0, "%s: res %.*s, Requests only up to %u but "
917 "is %u, keep going\n", dlm->name, namelen, 927 "master is %u, keep going\n", dlm->name, namelen,
918 lockid, nodenum, mle->master); 928 lockid, nodenum, mle->master);
919 } 929 }
920 } 930 }
@@ -924,13 +934,12 @@ wait:
924 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 934 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
925 if (ret < 0) { 935 if (ret < 0) {
926 wait_on_recovery = 1; 936 wait_on_recovery = 1;
927 mlog(0, "%s:%.*s: node map changed, redo the " 937 mlog(0, "%s: res %.*s, Node map changed, redo the master "
928 "master request now, blocked=%d\n", 938 "request now, blocked=%d\n", dlm->name, res->lockname.len,
929 dlm->name, res->lockname.len,
930 res->lockname.name, blocked); 939 res->lockname.name, blocked);
931 if (++tries > 20) { 940 if (++tries > 20) {
932 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s: res %.*s, Spinning on "
933 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked = %d\n",
934 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
935 res->lockname.name, blocked); 944 res->lockname.name, blocked);
936 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
@@ -940,7 +949,8 @@ wait:
940 goto redo_request; 949 goto redo_request;
941 } 950 }
942 951
943 mlog(0, "lockres mastered by %u\n", res->owner); 952 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
953 res->lockname.name, res->owner);
944 /* make sure we never continue without this */ 954 /* make sure we never continue without this */
945 BUG_ON(res->owner == O2NM_MAX_NODES); 955 BUG_ON(res->owner == O2NM_MAX_NODES);
946 956
@@ -952,8 +962,6 @@ wait:
952 962
953wake_waiters: 963wake_waiters:
954 spin_lock(&res->spinlock); 964 spin_lock(&res->spinlock);
955 if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
956 dlm_lockres_drop_inflight_ref(dlm, res);
957 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 965 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
958 spin_unlock(&res->spinlock); 966 spin_unlock(&res->spinlock);
959 wake_up(&res->wq); 967 wake_up(&res->wq);
@@ -1426,9 +1434,7 @@ way_up_top:
1426 } 1434 }
1427 1435
1428 if (res->owner == dlm->node_num) { 1436 if (res->owner == dlm->node_num) {
1429 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1437 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
1430 dlm->name, namelen, name, request->node_idx);
1431 dlm_lockres_set_refmap_bit(request->node_idx, res);
1432 spin_unlock(&res->spinlock); 1438 spin_unlock(&res->spinlock);
1433 response = DLM_MASTER_RESP_YES; 1439 response = DLM_MASTER_RESP_YES;
1434 if (mle) 1440 if (mle)
@@ -1493,10 +1499,8 @@ way_up_top:
1493 * go back and clean the mles on any 1499 * go back and clean the mles on any
1494 * other nodes */ 1500 * other nodes */
1495 dispatch_assert = 1; 1501 dispatch_assert = 1;
1496 dlm_lockres_set_refmap_bit(request->node_idx, res); 1502 dlm_lockres_set_refmap_bit(dlm, res,
1497 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1503 request->node_idx);
1498 dlm->name, namelen, name,
1499 request->node_idx);
1500 } else 1504 } else
1501 response = DLM_MASTER_RESP_NO; 1505 response = DLM_MASTER_RESP_NO;
1502 } else { 1506 } else {
@@ -1702,7 +1706,7 @@ again:
1702 "lockres, set the bit in the refmap\n", 1706 "lockres, set the bit in the refmap\n",
1703 namelen, lockname, to); 1707 namelen, lockname, to);
1704 spin_lock(&res->spinlock); 1708 spin_lock(&res->spinlock);
1705 dlm_lockres_set_refmap_bit(to, res); 1709 dlm_lockres_set_refmap_bit(dlm, res, to);
1706 spin_unlock(&res->spinlock); 1710 spin_unlock(&res->spinlock);
1707 } 1711 }
1708 } 1712 }
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2187 namelen = res->lockname.len; 2191 namelen = res->lockname.len;
2188 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2192 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2189 2193
2190 mlog(0, "%s:%.*s: sending deref to %d\n",
2191 dlm->name, namelen, lockname, res->owner);
2192 memset(&deref, 0, sizeof(deref)); 2194 memset(&deref, 0, sizeof(deref));
2193 deref.node_idx = dlm->node_num; 2195 deref.node_idx = dlm->node_num;
2194 deref.namelen = namelen; 2196 deref.namelen = namelen;
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2197 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2199 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2198 &deref, sizeof(deref), res->owner, &r); 2200 &deref, sizeof(deref), res->owner, &r);
2199 if (ret < 0) 2201 if (ret < 0)
2200 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 2202 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2201 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 dlm->name, namelen, lockname, ret, res->owner);
2202 res->owner);
2203 else if (r < 0) { 2204 else if (r < 0) {
2204 /* BAD. other node says I did not have a ref. */ 2205 /* BAD. other node says I did not have a ref. */
2205 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2206 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2206 "(master=%u) got %d.\n", dlm->name, namelen, 2207 dlm->name, namelen, lockname, res->owner, r);
2207 lockname, res->owner, r);
2208 dlm_print_one_lock_resource(res); 2208 dlm_print_one_lock_resource(res);
2209 BUG(); 2209 BUG();
2210 } 2210 }
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2260 else { 2260 else {
2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2262 if (test_bit(node, res->refmap)) { 2262 if (test_bit(node, res->refmap)) {
2263 dlm_lockres_clear_refmap_bit(node, res); 2263 dlm_lockres_clear_refmap_bit(dlm, res, node);
2264 cleared = 1; 2264 cleared = 1;
2265 } 2265 }
2266 } 2266 }
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2321 if (test_bit(node, res->refmap)) { 2321 if (test_bit(node, res->refmap)) {
2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2323 dlm_lockres_clear_refmap_bit(node, res); 2323 dlm_lockres_clear_refmap_bit(dlm, res, node);
2324 cleared = 1; 2324 cleared = 1;
2325 } 2325 }
2326 spin_unlock(&res->spinlock); 2326 spin_unlock(&res->spinlock);
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2802 BUG_ON(!list_empty(&lock->bast_list)); 2802 BUG_ON(!list_empty(&lock->bast_list));
2803 BUG_ON(lock->ast_pending); 2803 BUG_ON(lock->ast_pending);
2804 BUG_ON(lock->bast_pending); 2804 BUG_ON(lock->bast_pending);
2805 dlm_lockres_clear_refmap_bit(lock->ml.node, res); 2805 dlm_lockres_clear_refmap_bit(dlm, res,
2806 lock->ml.node);
2806 list_del_init(&lock->list); 2807 list_del_init(&lock->list);
2807 dlm_lock_put(lock); 2808 dlm_lock_put(lock);
2808 /* In a normal unlock, we would have added a 2809 /* In a normal unlock, we would have added a
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2823 mlog(0, "%s:%.*s: node %u had a ref to this " 2824 mlog(0, "%s:%.*s: node %u had a ref to this "
2824 "migrating lockres, clearing\n", dlm->name, 2825 "migrating lockres, clearing\n", dlm->name,
2825 res->lockname.len, res->lockname.name, bit); 2826 res->lockname.len, res->lockname.name, bit);
2826 dlm_lockres_clear_refmap_bit(bit, res); 2827 dlm_lockres_clear_refmap_bit(dlm, res, bit);
2827 } 2828 }
2828 bit++; 2829 bit++;
2829 } 2830 }
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2916 &migrate, sizeof(migrate), nodenum, 2917 &migrate, sizeof(migrate), nodenum,
2917 &status); 2918 &status);
2918 if (ret < 0) { 2919 if (ret < 0) {
2919 mlog(ML_ERROR, "Error %d when sending message %u (key " 2920 mlog(ML_ERROR, "%s: res %.*s, Error %d send "
2920 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, 2921 "MIGRATE_REQUEST to node %u\n", dlm->name,
2921 dlm->key, nodenum); 2922 migrate.namelen, migrate.name, ret, nodenum);
2922 if (!dlm_is_host_down(ret)) { 2923 if (!dlm_is_host_down(ret)) {
2923 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2924 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2924 BUG(); 2925 BUG();
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2937 dlm->name, res->lockname.len, res->lockname.name, 2938 dlm->name, res->lockname.len, res->lockname.name,
2938 nodenum); 2939 nodenum);
2939 spin_lock(&res->spinlock); 2940 spin_lock(&res->spinlock);
2940 dlm_lockres_set_refmap_bit(nodenum, res); 2941 dlm_lockres_set_refmap_bit(dlm, res, nodenum);
2941 spin_unlock(&res->spinlock); 2942 spin_unlock(&res->spinlock);
2942 } 2943 }
2943 } 2944 }
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3271 * mastery reference here since old_master will briefly have 3272 * mastery reference here since old_master will briefly have
3272 * a reference after the migration completes */ 3273 * a reference after the migration completes */
3273 spin_lock(&res->spinlock); 3274 spin_lock(&res->spinlock);
3274 dlm_lockres_set_refmap_bit(old_master, res); 3275 dlm_lockres_set_refmap_bit(dlm, res, old_master);
3275 spin_unlock(&res->spinlock); 3276 spin_unlock(&res->spinlock);
3276 3277
3277 mlog(0, "now time to do a migrate request to other nodes\n"); 3278 mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d28a2..01ebfd0bdad 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
362} 362}
363 363
364 364
365int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 365void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
366{ 366{
367 if (timeout) { 367 if (dlm_is_node_dead(dlm, node))
368 mlog(ML_NOTICE, "%s: waiting %dms for notification of " 368 return;
369 "death of node %u\n", dlm->name, timeout, node); 369
370 printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
371 "domain %s\n", node, dlm->name);
372
373 if (timeout)
370 wait_event_timeout(dlm->dlm_reco_thread_wq, 374 wait_event_timeout(dlm->dlm_reco_thread_wq,
371 dlm_is_node_dead(dlm, node), 375 dlm_is_node_dead(dlm, node),
372 msecs_to_jiffies(timeout)); 376 msecs_to_jiffies(timeout));
373 } else { 377 else
374 mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
375 "of death of node %u\n", dlm->name, node);
376 wait_event(dlm->dlm_reco_thread_wq, 378 wait_event(dlm->dlm_reco_thread_wq,
377 dlm_is_node_dead(dlm, node)); 379 dlm_is_node_dead(dlm, node));
378 }
379 /* for now, return 0 */
380 return 0;
381} 380}
382 381
383int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) 382void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
384{ 383{
385 if (timeout) { 384 if (dlm_is_node_recovered(dlm, node))
386 mlog(0, "%s: waiting %dms for notification of " 385 return;
387 "recovery of node %u\n", dlm->name, timeout, node); 386
387 printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
388 "domain %s\n", node, dlm->name);
389
390 if (timeout)
388 wait_event_timeout(dlm->dlm_reco_thread_wq, 391 wait_event_timeout(dlm->dlm_reco_thread_wq,
389 dlm_is_node_recovered(dlm, node), 392 dlm_is_node_recovered(dlm, node),
390 msecs_to_jiffies(timeout)); 393 msecs_to_jiffies(timeout));
391 } else { 394 else
392 mlog(0, "%s: waiting indefinitely for notification "
393 "of recovery of node %u\n", dlm->name, node);
394 wait_event(dlm->dlm_reco_thread_wq, 395 wait_event(dlm->dlm_reco_thread_wq,
395 dlm_is_node_recovered(dlm, node)); 396 dlm_is_node_recovered(dlm, node));
396 }
397 /* for now, return 0 */
398 return 0;
399} 397}
400 398
401/* callers of the top-level api calls (dlmlock/dlmunlock) should 399/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
430{ 428{
431 spin_lock(&dlm->spinlock); 429 spin_lock(&dlm->spinlock);
432 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 430 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
431 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
432 dlm->name, dlm->reco.dead_node);
433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
434 spin_unlock(&dlm->spinlock); 434 spin_unlock(&dlm->spinlock);
435} 435}
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); 440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; 441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
442 spin_unlock(&dlm->spinlock); 442 spin_unlock(&dlm->spinlock);
443 printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
443 wake_up(&dlm->reco.event); 444 wake_up(&dlm->reco.event);
444} 445}
445 446
447static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
448{
449 printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
450 "dead node %u in domain %s\n", dlm->reco.new_master,
451 (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
452 dlm->reco.dead_node, dlm->name);
453}
454
446static int dlm_do_recovery(struct dlm_ctxt *dlm) 455static int dlm_do_recovery(struct dlm_ctxt *dlm)
447{ 456{
448 int status = 0; 457 int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
505 } 514 }
506 mlog(0, "another node will master this recovery session.\n"); 515 mlog(0, "another node will master this recovery session.\n");
507 } 516 }
508 mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", 517
509 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, 518 dlm_print_recovery_master(dlm);
510 dlm->node_num, dlm->reco.dead_node);
511 519
512 /* it is safe to start everything back up here 520 /* it is safe to start everything back up here
513 * because all of the dead node's lock resources 521 * because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
518 return 0; 526 return 0;
519 527
520master_here: 528master_here:
521 mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " 529 dlm_print_recovery_master(dlm);
522 "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
523 dlm->node_num, dlm->reco.dead_node, dlm->name);
524 530
525 status = dlm_remaster_locks(dlm, dlm->reco.dead_node); 531 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
526 if (status < 0) { 532 if (status < 0) {
527 /* we should never hit this anymore */ 533 /* we should never hit this anymore */
528 mlog(ML_ERROR, "error %d remastering locks for node %u, " 534 mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
529 "retrying.\n", status, dlm->reco.dead_node); 535 "retrying.\n", dlm->name, status, dlm->reco.dead_node);
530 /* yield a bit to allow any final network messages 536 /* yield a bit to allow any final network messages
531 * to get handled on remaining nodes */ 537 * to get handled on remaining nodes */
532 msleep(100); 538 msleep(100);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 573 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
568 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 574 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
569 575
570 mlog(0, "requesting lock info from node %u\n", 576 mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
571 ndata->node_num); 577 ndata->node_num);
572 578
573 if (ndata->node_num == dlm->node_num) { 579 if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
640 spin_unlock(&dlm_reco_state_lock); 646 spin_unlock(&dlm_reco_state_lock);
641 } 647 }
642 648
643 mlog(0, "done requesting all lock info\n"); 649 mlog(0, "%s: Done requesting all lock info\n", dlm->name);
644 650
645 /* nodes should be sending reco data now 651 /* nodes should be sending reco data now
646 * just need to wait */ 652 * just need to wait */
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 808
803 /* negative status is handled by caller */ 809 /* negative status is handled by caller */
804 if (ret < 0) 810 if (ret < 0)
805 mlog(ML_ERROR, "Error %d when sending message %u (key " 811 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
806 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, 812 "to recover dead node %u\n", dlm->name, ret,
807 dlm->key, request_from); 813 request_from, dead_node);
808
809 // return from here, then 814 // return from here, then
810 // sleep until all received or error 815 // sleep until all received or error
811 return ret; 816 return ret;
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
956 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 961 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
957 sizeof(done_msg), send_to, &tmpret); 962 sizeof(done_msg), send_to, &tmpret);
958 if (ret < 0) { 963 if (ret < 0) {
959 mlog(ML_ERROR, "Error %d when sending message %u (key " 964 mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
960 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, 965 "to recover dead node %u\n", dlm->name, ret, send_to,
961 dlm->key, send_to); 966 dead_node);
962 if (!dlm_is_host_down(ret)) { 967 if (!dlm_is_host_down(ret)) {
963 BUG(); 968 BUG();
964 } 969 }
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1127 if (ret < 0) { 1132 if (ret < 0) {
1128 /* XXX: negative status is not handled. 1133 /* XXX: negative status is not handled.
1129 * this will end up killing this node. */ 1134 * this will end up killing this node. */
1130 mlog(ML_ERROR, "Error %d when sending message %u (key " 1135 mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
1131 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, 1136 "node %u (%s)\n", dlm->name, mres->lockname_len,
1132 dlm->key, send_to); 1137 mres->lockname, ret, send_to,
1138 (orig_flags & DLM_MRES_MIGRATION ?
1139 "migration" : "recovery"));
1133 } else { 1140 } else {
1134 /* might get an -ENOMEM back here */ 1141 /* might get an -ENOMEM back here */
1135 ret = status; 1142 ret = status;
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1767 dlm->name, mres->lockname_len, mres->lockname, 1774 dlm->name, mres->lockname_len, mres->lockname,
1768 from); 1775 from);
1769 spin_lock(&res->spinlock); 1776 spin_lock(&res->spinlock);
1770 dlm_lockres_set_refmap_bit(from, res); 1777 dlm_lockres_set_refmap_bit(dlm, res, from);
1771 spin_unlock(&res->spinlock); 1778 spin_unlock(&res->spinlock);
1772 added++; 1779 added++;
1773 break; 1780 break;
@@ -1965,7 +1972,7 @@ skip_lvb:
1965 mlog(0, "%s:%.*s: added lock for node %u, " 1972 mlog(0, "%s:%.*s: added lock for node %u, "
1966 "setting refmap bit\n", dlm->name, 1973 "setting refmap bit\n", dlm->name,
1967 res->lockname.len, res->lockname.name, ml->node); 1974 res->lockname.len, res->lockname.name, ml->node);
1968 dlm_lockres_set_refmap_bit(ml->node, res); 1975 dlm_lockres_set_refmap_bit(dlm, res, ml->node);
1969 added++; 1976 added++;
1970 } 1977 }
1971 spin_unlock(&res->spinlock); 1978 spin_unlock(&res->spinlock);
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2084 2091
2085 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { 2092 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2086 if (res->owner == dead_node) { 2093 if (res->owner == dead_node) {
2094 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2095 dlm->name, res->lockname.len, res->lockname.name,
2096 res->owner, new_master);
2087 list_del_init(&res->recovering); 2097 list_del_init(&res->recovering);
2088 spin_lock(&res->spinlock); 2098 spin_lock(&res->spinlock);
2089 /* new_master has our reference from 2099 /* new_master has our reference from
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2105 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2115 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2106 bucket = dlm_lockres_hash(dlm, i); 2116 bucket = dlm_lockres_hash(dlm, i);
2107 hlist_for_each_entry(res, hash_iter, bucket, hash_node) { 2117 hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
2108 if (res->state & DLM_LOCK_RES_RECOVERING) { 2118 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2109 if (res->owner == dead_node) { 2119 continue;
2110 mlog(0, "(this=%u) res %.*s owner=%u "
2111 "was not on recovering list, but "
2112 "clearing state anyway\n",
2113 dlm->node_num, res->lockname.len,
2114 res->lockname.name, new_master);
2115 } else if (res->owner == dlm->node_num) {
2116 mlog(0, "(this=%u) res %.*s owner=%u "
2117 "was not on recovering list, "
2118 "owner is THIS node, clearing\n",
2119 dlm->node_num, res->lockname.len,
2120 res->lockname.name, new_master);
2121 } else
2122 continue;
2123 2120
2124 if (!list_empty(&res->recovering)) { 2121 if (res->owner != dead_node &&
2125 mlog(0, "%s:%.*s: lockres was " 2122 res->owner != dlm->node_num)
2126 "marked RECOVERING, owner=%u\n", 2123 continue;
2127 dlm->name, res->lockname.len, 2124
2128 res->lockname.name, res->owner); 2125 if (!list_empty(&res->recovering)) {
2129 list_del_init(&res->recovering); 2126 list_del_init(&res->recovering);
2130 dlm_lockres_put(res); 2127 dlm_lockres_put(res);
2131 }
2132 spin_lock(&res->spinlock);
2133 /* new_master has our reference from
2134 * the lock state sent during recovery */
2135 dlm_change_lockres_owner(dlm, res, new_master);
2136 res->state &= ~DLM_LOCK_RES_RECOVERING;
2137 if (__dlm_lockres_has_locks(res))
2138 __dlm_dirty_lockres(dlm, res);
2139 spin_unlock(&res->spinlock);
2140 wake_up(&res->wq);
2141 } 2128 }
2129
2130 /* new_master has our reference from
2131 * the lock state sent during recovery */
2132 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2133 dlm->name, res->lockname.len, res->lockname.name,
2134 res->owner, new_master);
2135 spin_lock(&res->spinlock);
2136 dlm_change_lockres_owner(dlm, res, new_master);
2137 res->state &= ~DLM_LOCK_RES_RECOVERING;
2138 if (__dlm_lockres_has_locks(res))
2139 __dlm_dirty_lockres(dlm, res);
2140 spin_unlock(&res->spinlock);
2141 wake_up(&res->wq);
2142 } 2142 }
2143 } 2143 }
2144} 2144}
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2252 res->lockname.len, res->lockname.name, freed, dead_node); 2252 res->lockname.len, res->lockname.name, freed, dead_node);
2253 __dlm_print_one_lock_resource(res); 2253 __dlm_print_one_lock_resource(res);
2254 } 2254 }
2255 dlm_lockres_clear_refmap_bit(dead_node, res); 2255 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2256 } else if (test_bit(dead_node, res->refmap)) { 2256 } else if (test_bit(dead_node, res->refmap)) {
2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2258 "no locks and had not purged before dying\n", dlm->name, 2258 "no locks and had not purged before dying\n", dlm->name,
2259 res->lockname.len, res->lockname.name, dead_node); 2259 res->lockname.len, res->lockname.name, dead_node);
2260 dlm_lockres_clear_refmap_bit(dead_node, res); 2260 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2261 } 2261 }
2262 2262
2263 /* do not kick thread yet */ 2263 /* do not kick thread yet */
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2324 dlm_revalidate_lvb(dlm, res, dead_node); 2324 dlm_revalidate_lvb(dlm, res, dead_node);
2325 if (res->owner == dead_node) { 2325 if (res->owner == dead_node) {
2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2327 mlog(ML_NOTICE, "Ignore %.*s for " 2327 mlog(ML_NOTICE, "%s: res %.*s, Skip "
2328 "recovery as it is being freed\n", 2328 "recovery as it is being freed\n",
2329 res->lockname.len, 2329 dlm->name, res->lockname.len,
2330 res->lockname.name); 2330 res->lockname.name);
2331 } else 2331 } else
2332 dlm_move_lockres_to_recovery_list(dlm, 2332 dlm_move_lockres_to_recovery_list(dlm,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 1d6d1d22c47..e73c833fc2a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 int bit; 95 int bit;
96 96
97 assert_spin_locked(&res->spinlock);
98
97 if (__dlm_lockres_has_locks(res)) 99 if (__dlm_lockres_has_locks(res))
98 return 0; 100 return 0;
99 101
102 /* Locks are in the process of being created */
103 if (res->inflight_locks)
104 return 0;
105
100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 return 0; 107 return 0;
102 108
103 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 0; 110 return 0;
105 111
112 /* Another node has this resource with this node as the master */
106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 113 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 if (bit < O2NM_MAX_NODES) 114 if (bit < O2NM_MAX_NODES)
108 return 0; 115 return 0;
109 116
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1; 117 return 1;
116} 118}
117 119
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
185 /* clear our bit from the master's refmap, ignore errors */ 187 /* clear our bit from the master's refmap, ignore errors */
186 ret = dlm_drop_lockres_ref(dlm, res); 188 ret = dlm_drop_lockres_ref(dlm, res);
187 if (ret < 0) { 189 if (ret < 0) {
188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
190 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
191 BUG(); 191 BUG();
192 } 192 }
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
209 BUG(); 209 BUG();
210 } 210 }
211 211
212 __dlm_unhash_lockres(res); 212 __dlm_unhash_lockres(dlm, res);
213 213
214 /* lockres is not in the hash now. drop the flag and wake up 214 /* lockres is not in the hash now. drop the flag and wake up
215 * any processes waiting in dlm_get_lock_resource. */ 215 * any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e1ed5e502ff..81a4cd22f80 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode)
1692 mlog(0, "inode %llu take PRMODE open lock\n", 1692 mlog(0, "inode %llu take PRMODE open lock\n",
1693 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1693 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1694 1694
1695 if (ocfs2_mount_local(osb)) 1695 if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
1696 goto out; 1696 goto out;
1697 1697
1698 lockres = &OCFS2_I(inode)->ip_open_lockres; 1698 lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1718 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1718 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1719 write ? "EXMODE" : "PRMODE"); 1719 write ? "EXMODE" : "PRMODE");
1720 1720
1721 if (ocfs2_is_hard_readonly(osb)) {
1722 if (write)
1723 status = -EROFS;
1724 goto out;
1725 }
1726
1721 if (ocfs2_mount_local(osb)) 1727 if (ocfs2_mount_local(osb))
1722 goto out; 1728 goto out;
1723 1729
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
2298 if (ocfs2_is_hard_readonly(osb)) { 2304 if (ocfs2_is_hard_readonly(osb)) {
2299 if (ex) 2305 if (ex)
2300 status = -EROFS; 2306 status = -EROFS;
2301 goto bail; 2307 goto getbh;
2302 } 2308 }
2303 2309
2304 if (ocfs2_mount_local(osb)) 2310 if (ocfs2_mount_local(osb))
@@ -2356,7 +2362,7 @@ local:
2356 mlog_errno(status); 2362 mlog_errno(status);
2357 goto bail; 2363 goto bail;
2358 } 2364 }
2359 2365getbh:
2360 if (ret_bh) { 2366 if (ret_bh) {
2361 status = ocfs2_assign_bh(inode, ret_bh, local_bh); 2367 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
2362 if (status < 0) { 2368 if (status < 0) {
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2628 2634
2629 BUG_ON(!dl); 2635 BUG_ON(!dl);
2630 2636
2631 if (ocfs2_is_hard_readonly(osb)) 2637 if (ocfs2_is_hard_readonly(osb)) {
2632 return -EROFS; 2638 if (ex)
2639 return -EROFS;
2640 return 0;
2641 }
2633 2642
2634 if (ocfs2_mount_local(osb)) 2643 if (ocfs2_mount_local(osb))
2635 return 0; 2644 return 0;
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2647 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2656 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2648 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2657 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2649 2658
2650 if (!ocfs2_mount_local(osb)) 2659 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
2651 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); 2660 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
2652} 2661}
2653 2662
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 23457b491e8..2f5b92ef0e5 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,6 +832,102 @@ out:
832 return ret; 832 return ret;
833} 833}
834 834
835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
836{
837 struct inode *inode = file->f_mapping->host;
838 int ret;
839 unsigned int is_last = 0, is_data = 0;
840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
841 u32 cpos, cend, clen, hole_size;
842 u64 extoff, extlen;
843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec;
845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) {
850 mlog_errno(ret);
851 goto out;
852 }
853
854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855
856 if (*offset >= inode->i_size) {
857 ret = -ENXIO;
858 goto out_unlock;
859 }
860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE)
863 *offset = inode->i_size;
864 goto out_unlock;
865 }
866
867 clen = 0;
868 cpos = *offset >> cs_bits;
869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
870
871 while (cpos < cend && !is_last) {
872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
873 &rec, &is_last);
874 if (ret) {
875 mlog_errno(ret);
876 goto out_unlock;
877 }
878
879 extoff = cpos;
880 extoff <<= cs_bits;
881
882 if (rec.e_blkno == 0ULL) {
883 clen = hole_size;
884 is_data = 0;
885 } else {
886 clen = le16_to_cpu(rec.e_leaf_clusters) -
887 (cpos - le32_to_cpu(rec.e_cpos));
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 }
890
891 if ((!is_data && origin == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) {
893 if (extoff > *offset)
894 *offset = extoff;
895 goto out_unlock;
896 }
897
898 if (!is_last)
899 cpos += clen;
900 }
901
902 if (origin == SEEK_HOLE) {
903 extoff = cpos;
904 extoff <<= cs_bits;
905 extlen = clen;
906 extlen <<= cs_bits;
907
908 if ((extoff + extlen) > inode->i_size)
909 extlen = inode->i_size - extoff;
910 extoff += extlen;
911 if (extoff > *offset)
912 *offset = extoff;
913 goto out_unlock;
914 }
915
916 ret = -ENXIO;
917
918out_unlock:
919
920 brelse(di_bh);
921
922 up_read(&OCFS2_I(inode)->ip_alloc_sem);
923
924 ocfs2_inode_unlock(inode, 0);
925out:
926 if (ret && ret != -ENXIO)
927 ret = -ENXIO;
928 return ret;
929}
930
835int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 931int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
836 struct buffer_head *bhs[], int flags, 932 struct buffer_head *bhs[], int flags,
837 int (*validate)(struct super_block *sb, 933 int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c2c90..67ea57d2fd5 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len); 54 u64 map_start, u64 map_len);
55 55
56int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
57
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 58int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 59 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el, 60 struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de4ea1af041..6e396683c3d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1950 if (ret < 0) 1950 if (ret < 0)
1951 mlog_errno(ret); 1951 mlog_errno(ret);
1952 1952
1953 if (file->f_flags & O_SYNC)
1954 handle->h_sync = 1;
1955
1953 ocfs2_commit_trans(osb, handle); 1956 ocfs2_commit_trans(osb, handle);
1954 1957
1955out_inode_unlock: 1958out_inode_unlock:
@@ -2052,6 +2055,23 @@ out:
2052 return ret; 2055 return ret;
2053} 2056}
2054 2057
2058static void ocfs2_aiodio_wait(struct inode *inode)
2059{
2060 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2061
2062 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2063}
2064
2065static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2066{
2067 int blockmask = inode->i_sb->s_blocksize - 1;
2068 loff_t final_size = pos + count;
2069
2070 if ((pos & blockmask) || (final_size & blockmask))
2071 return 1;
2072 return 0;
2073}
2074
2055static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2075static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2056 struct file *file, 2076 struct file *file,
2057 loff_t pos, size_t count, 2077 loff_t pos, size_t count,
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2230 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2231 int full_coherency = !(osb->s_mount_opt & 2251 int full_coherency = !(osb->s_mount_opt &
2232 OCFS2_MOUNT_COHERENCY_BUFFERED); 2252 OCFS2_MOUNT_COHERENCY_BUFFERED);
2253 int unaligned_dio = 0;
2233 2254
2234 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2255 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2235 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2256 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2297,6 +2318,10 @@ relock:
2297 goto out; 2318 goto out;
2298 } 2319 }
2299 2320
2321 if (direct_io && !is_sync_kiocb(iocb))
2322 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2323 *ppos);
2324
2300 /* 2325 /*
2301 * We can't complete the direct I/O as requested, fall back to 2326 * We can't complete the direct I/O as requested, fall back to
2302 * buffered I/O. 2327 * buffered I/O.
@@ -2311,6 +2336,18 @@ relock:
2311 goto relock; 2336 goto relock;
2312 } 2337 }
2313 2338
2339 if (unaligned_dio) {
2340 /*
2341 * Wait on previous unaligned aio to complete before
2342 * proceeding.
2343 */
2344 ocfs2_aiodio_wait(inode);
2345
2346 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2347 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2348 ocfs2_iocb_set_unaligned_aio(iocb);
2349 }
2350
2314 /* 2351 /*
2315 * To later detect whether a journal commit for sync writes is 2352 * To later detect whether a journal commit for sync writes is
2316 * necessary, we sample i_size, and cluster count here. 2353 * necessary, we sample i_size, and cluster count here.
@@ -2382,8 +2419,12 @@ out_dio:
2382 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2419 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2383 rw_level = -1; 2420 rw_level = -1;
2384 have_alloc_sem = 0; 2421 have_alloc_sem = 0;
2422 unaligned_dio = 0;
2385 } 2423 }
2386 2424
2425 if (unaligned_dio)
2426 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2427
2387out: 2428out:
2388 if (rw_level != -1) 2429 if (rw_level != -1)
2389 ocfs2_rw_unlock(inode, rw_level); 2430 ocfs2_rw_unlock(inode, rw_level);
@@ -2591,6 +2632,57 @@ bail:
2591 return ret; 2632 return ret;
2592} 2633}
2593 2634
2635/* Refer generic_file_llseek_unlocked() */
2636static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2637{
2638 struct inode *inode = file->f_mapping->host;
2639 int ret = 0;
2640
2641 mutex_lock(&inode->i_mutex);
2642
2643 switch (origin) {
2644 case SEEK_SET:
2645 break;
2646 case SEEK_END:
2647 offset += inode->i_size;
2648 break;
2649 case SEEK_CUR:
2650 if (offset == 0) {
2651 offset = file->f_pos;
2652 goto out;
2653 }
2654 offset += file->f_pos;
2655 break;
2656 case SEEK_DATA:
2657 case SEEK_HOLE:
2658 ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
2659 if (ret)
2660 goto out;
2661 break;
2662 default:
2663 ret = -EINVAL;
2664 goto out;
2665 }
2666
2667 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2668 ret = -EINVAL;
2669 if (!ret && offset > inode->i_sb->s_maxbytes)
2670 ret = -EINVAL;
2671 if (ret)
2672 goto out;
2673
2674 if (offset != file->f_pos) {
2675 file->f_pos = offset;
2676 file->f_version = 0;
2677 }
2678
2679out:
2680 mutex_unlock(&inode->i_mutex);
2681 if (ret)
2682 return ret;
2683 return offset;
2684}
2685
2594const struct inode_operations ocfs2_file_iops = { 2686const struct inode_operations ocfs2_file_iops = {
2595 .setattr = ocfs2_setattr, 2687 .setattr = ocfs2_setattr,
2596 .getattr = ocfs2_getattr, 2688 .getattr = ocfs2_getattr,
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2615 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2707 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2616 */ 2708 */
2617const struct file_operations ocfs2_fops = { 2709const struct file_operations ocfs2_fops = {
2618 .llseek = generic_file_llseek, 2710 .llseek = ocfs2_file_llseek,
2619 .read = do_sync_read, 2711 .read = do_sync_read,
2620 .write = do_sync_write, 2712 .write = do_sync_write,
2621 .mmap = ocfs2_mmap, 2713 .mmap = ocfs2_mmap,
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = {
2663 * the cluster. 2755 * the cluster.
2664 */ 2756 */
2665const struct file_operations ocfs2_fops_no_plocks = { 2757const struct file_operations ocfs2_fops_no_plocks = {
2666 .llseek = generic_file_llseek, 2758 .llseek = ocfs2_file_llseek,
2667 .read = do_sync_read, 2759 .read = do_sync_read,
2668 .write = do_sync_write, 2760 .write = do_sync_write,
2669 .mmap = ocfs2_mmap, 2761 .mmap = ocfs2_mmap,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index a22d2c09889..17454a904d7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
951 trace_ocfs2_cleanup_delete_inode( 951 trace_ocfs2_cleanup_delete_inode(
952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
953 if (sync_data) 953 if (sync_data)
954 write_inode_now(inode, 1); 954 filemap_write_and_wait(inode->i_mapping);
955 truncate_inode_pages(&inode->i_data, 0); 955 truncate_inode_pages(&inode->i_data, 0);
956} 956}
957 957
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b149b3..88924a3133f 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 atomic_t ip_unaligned_aio;
48
46 /* These fields are protected by ip_lock */ 49 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 50 spinlock_t ip_lock;
48 u32 ip_open_count; 51 u32 ip_open_count;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index bc91072b721..726ff265b29 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
122 if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & 122 if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
123 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { 123 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
124 if (!capable(CAP_LINUX_IMMUTABLE)) 124 if (!capable(CAP_LINUX_IMMUTABLE))
125 goto bail_unlock; 125 goto bail_commit;
126 } 126 }
127 127
128 ocfs2_inode->ip_attr = flags; 128 ocfs2_inode->ip_attr = flags;
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
132 if (status < 0) 132 if (status < 0)
133 mlog_errno(status); 133 mlog_errno(status);
134 134
135bail_commit:
135 ocfs2_commit_trans(osb, handle); 136 ocfs2_commit_trans(osb, handle);
136bail_unlock: 137bail_unlock:
137 ocfs2_inode_unlock(inode, 1); 138 ocfs2_inode_unlock(inode, 1);
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
381 if (!oifi) { 382 if (!oifi) {
382 status = -ENOMEM; 383 status = -ENOMEM;
383 mlog_errno(status); 384 mlog_errno(status);
384 goto bail; 385 goto out_err;
385 } 386 }
386 387
387 if (o2info_from_user(*oifi, req)) 388 if (o2info_from_user(*oifi, req))
@@ -431,7 +432,7 @@ bail:
431 o2info_set_request_error(&oifi->ifi_req, req); 432 o2info_set_request_error(&oifi->ifi_req, req);
432 433
433 kfree(oifi); 434 kfree(oifi);
434 435out_err:
435 return status; 436 return status;
436} 437}
437 438
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
666 if (!oiff) { 667 if (!oiff) {
667 status = -ENOMEM; 668 status = -ENOMEM;
668 mlog_errno(status); 669 mlog_errno(status);
669 goto bail; 670 goto out_err;
670 } 671 }
671 672
672 if (o2info_from_user(*oiff, req)) 673 if (o2info_from_user(*oiff, req))
@@ -716,7 +717,7 @@ bail:
716 o2info_set_request_error(&oiff->iff_req, req); 717 o2info_set_request_error(&oiff->iff_req, req);
717 718
718 kfree(oiff); 719 kfree(oiff);
719 720out_err:
720 return status; 721 return status;
721} 722}
722 723
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 295d56454e8..0a42ae96dca 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1544 /* we need to run complete recovery for offline orphan slots */ 1544 /* we need to run complete recovery for offline orphan slots */
1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1546 1546
1547 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1547 printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
1548 node_num, slot_num, 1548 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1549 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1549 MINOR(osb->sb->s_dev));
1550 1550
1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1552 1552
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1601 1601
1602 jbd2_journal_destroy(journal); 1602 jbd2_journal_destroy(journal);
1603 1603
1604 printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
1605 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1606 MINOR(osb->sb->s_dev));
1604done: 1607done:
1605 /* drop the lock on this nodes journal */ 1608 /* drop the lock on this nodes journal */
1606 if (got_lock) 1609 if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
1808 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This 1811 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
1809 * is done to catch any orphans that are left over in orphan directories. 1812 * is done to catch any orphans that are left over in orphan directories.
1810 * 1813 *
1814 * It scans all slots, even ones that are in use. It does so to handle the
1815 * case described below:
1816 *
1817 * Node 1 has an inode it was using. The dentry went away due to memory
1818 * pressure. Node 1 closes the inode, but it's on the free list. The node
1819 * has the open lock.
1820 * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
1821 * but node 1 has no dentry and doesn't get the message. It trylocks the
1822 * open lock, sees that another node has a PR, and does nothing.
1823 * Later node 2 runs its orphan dir. It igets the inode, trylocks the
1824 * open lock, sees the PR still, and does nothing.
1825 * Basically, we have to trigger an orphan iput on node 1. The only way
1826 * for this to happen is if node 1 runs node 2's orphan dir.
1827 *
1811 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT 1828 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
1812 * seconds. It gets an EX lock on os_lockres and checks sequence number 1829 * seconds. It gets an EX lock on os_lockres and checks sequence number
1813 * stored in LVB. If the sequence number has changed, it means some other 1830 * stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 68cf2f6d3c6..a3385b63ff5 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
441#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 441#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
442 442
443/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 443/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
444 * update on dir + index leaf + dx root update for free list */ 444 * update on dir + index leaf + dx root update for free list +
445 * previous dirblock update in the free list */
445static inline int ocfs2_link_credits(struct super_block *sb) 446static inline int ocfs2_link_credits(struct super_block *sb)
446{ 447{
447 return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + 448 return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
448 ocfs2_quota_trans_credits(sb); 449 ocfs2_quota_trans_credits(sb);
449} 450}
450 451
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3e9393ca39e..9cd41083e99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, 61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
62 struct page *page) 62 struct page *page)
63{ 63{
64 int ret; 64 int ret = VM_FAULT_NOPAGE;
65 struct inode *inode = file->f_path.dentry->d_inode; 65 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 66 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 67 loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
71 void *fsdata; 71 void *fsdata;
72 loff_t size = i_size_read(inode); 72 loff_t size = i_size_read(inode);
73 73
74 /*
75 * Another node might have truncated while we were waiting on
76 * cluster locks.
77 * We don't check size == 0 before the shift. This is borrowed
78 * from do_generic_file_read.
79 */
80 last_index = (size - 1) >> PAGE_CACHE_SHIFT; 74 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
81 if (unlikely(!size || page->index > last_index)) {
82 ret = -EINVAL;
83 goto out;
84 }
85 75
86 /* 76 /*
87 * The i_size check above doesn't catch the case where nodes 77 * There are cases that lead to the page no longer bebongs to the
88 * truncated and then re-extended the file. We'll re-check the 78 * mapping.
89 * page mapping after taking the page lock inside of 79 * 1) pagecache truncates locally due to memory pressure.
90 * ocfs2_write_begin_nolock(). 80 * 2) pagecache truncates when another is taking EX lock against
81 * inode lock. see ocfs2_data_convert_worker.
82 *
83 * The i_size check doesn't catch the case where nodes truncated and
84 * then re-extended the file. We'll re-check the page mapping after
85 * taking the page lock inside of ocfs2_write_begin_nolock().
86 *
87 * Let VM retry with these cases.
91 */ 88 */
92 if (!PageUptodate(page) || page->mapping != inode->i_mapping) { 89 if ((page->mapping != inode->i_mapping) ||
93 /* 90 (!PageUptodate(page)) ||
94 * the page has been umapped in ocfs2_data_downconvert_worker. 91 (page_offset(page) >= size))
95 * So return 0 here and let VFS retry.
96 */
97 ret = 0;
98 goto out; 92 goto out;
99 }
100 93
101 /* 94 /*
102 * Call ocfs2_write_begin() and ocfs2_write_end() to take 95 * Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
116 if (ret) { 109 if (ret) {
117 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
118 mlog_errno(ret); 111 mlog_errno(ret);
112 if (ret == -ENOMEM)
113 ret = VM_FAULT_OOM;
114 else
115 ret = VM_FAULT_SIGBUS;
119 goto out; 116 goto out;
120 } 117 }
121 118
122 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, 119 if (!locked_page) {
123 fsdata); 120 ret = VM_FAULT_NOPAGE;
124 if (ret < 0) {
125 mlog_errno(ret);
126 goto out; 121 goto out;
127 } 122 }
123 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
124 fsdata);
128 BUG_ON(ret != len); 125 BUG_ON(ret != len);
129 ret = 0; 126 ret = VM_FAULT_LOCKED;
130out: 127out:
131 return ret; 128 return ret;
132} 129}
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
168 165
169out: 166out:
170 ocfs2_unblock_signals(&oldset); 167 ocfs2_unblock_signals(&oldset);
171 if (ret)
172 ret = VM_FAULT_SIGBUS;
173 return ret; 168 return ret;
174} 169}
175 170
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index d53cb706f14..184c76b8c29 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
745 */ 745 */
746 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 746 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
747 new_phys_cpos); 747 new_phys_cpos);
748 if (!new_phys_cpos) { 748 if (!*new_phys_cpos) {
749 ret = -ENOSPC; 749 ret = -ENOSPC;
750 goto out_commit; 750 goto out_commit;
751 } 751 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 409285854f6..d355e6e36b3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
836 836
837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
838{ 838{
839 __test_and_set_bit_le(bit, bitmap); 839 __set_bit_le(bit, bitmap);
840} 840}
841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
842 842
843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
844{ 844{
845 __test_and_clear_bit_le(bit, bitmap); 845 __clear_bit_le(bit, bitmap);
846} 846}
847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
848 848
849#define ocfs2_test_bit test_bit_le 849#define ocfs2_test_bit test_bit_le
850#define ocfs2_find_next_zero_bit find_next_zero_bit_le 850#define ocfs2_find_next_zero_bit find_next_zero_bit_le
851#define ocfs2_find_next_bit find_next_bit_le 851#define ocfs2_find_next_bit find_next_bit_le
852
853static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
854{
855#if BITS_PER_LONG == 64
856 *bit += ((unsigned long) addr & 7UL) << 3;
857 addr = (void *) ((unsigned long) addr & ~7UL);
858#elif BITS_PER_LONG == 32
859 *bit += ((unsigned long) addr & 3UL) << 3;
860 addr = (void *) ((unsigned long) addr & ~3UL);
861#else
862#error "how many bits you are?!"
863#endif
864 return addr;
865}
866
867static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
868{
869 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
870 ocfs2_set_bit(bit, bitmap);
871}
872
873static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
874{
875 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
876 ocfs2_clear_bit(bit, bitmap);
877}
878
879static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
880{
881 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
882 return ocfs2_test_bit(bit, bitmap);
883}
884
885static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
886 int start)
887{
888 int fix = 0, ret, tmpmax;
889 bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
890 tmpmax = max + fix;
891 start += fix;
892
893 ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
894 if (ret > max)
895 return max;
896 return ret;
897}
898
852#endif /* OCFS2_H */ 899#endif /* OCFS2_H */
853 900
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc8007fc924..f100bf70a90 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
404 int status = 0; 404 int status = 0;
405 struct ocfs2_quota_recovery *rec; 405 struct ocfs2_quota_recovery *rec;
406 406
407 mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); 407 printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
408 "slot %u\n", osb->dev_str, slot_num);
409
408 rec = ocfs2_alloc_quota_recovery(); 410 rec = ocfs2_alloc_quota_recovery();
409 if (!rec) 411 if (!rec)
410 return ERR_PTR(-ENOMEM); 412 return ERR_PTR(-ENOMEM);
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
549 goto out_commit; 551 goto out_commit;
550 } 552 }
551 lock_buffer(qbh); 553 lock_buffer(qbh);
552 WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); 554 WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 555 ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
554 le32_add_cpu(&dchunk->dqc_free, 1); 556 le32_add_cpu(&dchunk->dqc_free, 1);
555 unlock_buffer(qbh); 557 unlock_buffer(qbh);
556 ocfs2_journal_dirty(handle, qbh); 558 ocfs2_journal_dirty(handle, qbh);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
596 struct inode *lqinode; 598 struct inode *lqinode;
597 unsigned int flags; 599 unsigned int flags;
598 600
599 mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); 601 printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
602 "slot %u\n", osb->dev_str, slot_num);
603
600 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 604 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
601 for (type = 0; type < MAXQUOTAS; type++) { 605 for (type = 0; type < MAXQUOTAS; type++) {
602 if (list_empty(&(rec->r_list[type]))) 606 if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
612 /* Someone else is holding the lock? Then he must be 616 /* Someone else is holding the lock? Then he must be
613 * doing the recovery. Just skip the file... */ 617 * doing the recovery. Just skip the file... */
614 if (status == -EAGAIN) { 618 if (status == -EAGAIN) {
615 mlog(ML_NOTICE, "skipping quota recovery for slot %d " 619 printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
616 "because quota file is locked.\n", slot_num); 620 "device (%s) for slot %d because quota file is "
621 "locked.\n", osb->dev_str, slot_num);
617 status = 0; 622 status = 0;
618 goto out_put; 623 goto out_put;
619 } else if (status < 0) { 624 } else if (status < 0) {
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
944 * ol_quota_entries_per_block(sb); 949 * ol_quota_entries_per_block(sb);
945 } 950 }
946 951
947 found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); 952 found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
948 /* We failed? */ 953 /* We failed? */
949 if (found == len) { 954 if (found == len) {
950 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" 955 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1208 struct ocfs2_local_disk_chunk *dchunk; 1213 struct ocfs2_local_disk_chunk *dchunk;
1209 1214
1210 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 1215 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
1211 ocfs2_set_bit(*offset, dchunk->dqc_bitmap); 1216 ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
1212 le32_add_cpu(&dchunk->dqc_free, -1); 1217 le32_add_cpu(&dchunk->dqc_free, -1);
1213} 1218}
1214 1219
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1289 (od->dq_chunk->qc_headerbh->b_data); 1294 (od->dq_chunk->qc_headerbh->b_data);
1290 /* Mark structure as freed */ 1295 /* Mark structure as freed */
1291 lock_buffer(od->dq_chunk->qc_headerbh); 1296 lock_buffer(od->dq_chunk->qc_headerbh);
1292 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1297 ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
1293 le32_add_cpu(&dchunk->dqc_free, 1); 1298 le32_add_cpu(&dchunk->dqc_free, 1);
1294 unlock_buffer(od->dq_chunk->qc_headerbh); 1299 unlock_buffer(od->dq_chunk->qc_headerbh);
1295 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1300 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 26fc0014d50..1424c151ccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
493 goto bail; 493 goto bail;
494 } 494 }
495 } else 495 } else
496 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 496 printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
497 slot); 497 "allocated to this node!\n", slot, osb->dev_str);
498 498
499 ocfs2_set_slot(si, slot, osb->node_num); 499 ocfs2_set_slot(si, slot, osb->node_num);
500 osb->slot_num = slot; 500 osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 19965b00c43..94368017edb 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
28#include "cluster/masklog.h" 28#include "cluster/masklog.h"
29#include "cluster/nodemanager.h" 29#include "cluster/nodemanager.h"
30#include "cluster/heartbeat.h" 30#include "cluster/heartbeat.h"
31#include "cluster/tcp.h"
31 32
32#include "stackglue.h" 33#include "stackglue.h"
33 34
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256} 257}
257 258
258/* 259/*
260 * Check if this node is heartbeating and is connected to all other
261 * heartbeating nodes.
262 */
263static int o2cb_cluster_check(void)
264{
265 u8 node_num;
266 int i;
267 unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
268 unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
269
270 node_num = o2nm_this_node();
271 if (node_num == O2NM_MAX_NODES) {
272 printk(KERN_ERR "o2cb: This node has not been configured.\n");
273 return -EINVAL;
274 }
275
276 /*
277 * o2dlm expects o2net sockets to be created. If not, then
278 * dlm_join_domain() fails with a stack of errors which are both cryptic
279 * and incomplete. The idea here is to detect upfront whether we have
280 * managed to connect to all nodes or not. If not, then list the nodes
281 * to allow the user to check the configuration (incorrect IP, firewall,
282 * etc.) Yes, this is racy. But its not the end of the world.
283 */
284#define O2CB_MAP_STABILIZE_COUNT 60
285 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
286 o2hb_fill_node_map(hbmap, sizeof(hbmap));
287 if (!test_bit(node_num, hbmap)) {
288 printk(KERN_ERR "o2cb: %s heartbeat has not been "
289 "started.\n", (o2hb_global_heartbeat_active() ?
290 "Global" : "Local"));
291 return -EINVAL;
292 }
293 o2net_fill_node_map(netmap, sizeof(netmap));
294 /* Force set the current node to allow easy compare */
295 set_bit(node_num, netmap);
296 if (!memcmp(hbmap, netmap, sizeof(hbmap)))
297 return 0;
298 if (i < O2CB_MAP_STABILIZE_COUNT)
299 msleep(1000);
300 }
301
302 printk(KERN_ERR "o2cb: This node could not connect to nodes:");
303 i = -1;
304 while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
305 i + 1)) < O2NM_MAX_NODES) {
306 if (!test_bit(i, netmap))
307 printk(" %u", i);
308 }
309 printk(".\n");
310
311 return -ENOTCONN;
312}
313
314/*
259 * Called from the dlm when it's about to evict a node. This is how the 315 * Called from the dlm when it's about to evict a node. This is how the
260 * classic stack signals node death. 316 * classic stack signals node death.
261 */ 317 */
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
263{ 319{
264 struct ocfs2_cluster_connection *conn = data; 320 struct ocfs2_cluster_connection *conn = data;
265 321
266 mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", 322 printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
267 node_num, conn->cc_namelen, conn->cc_name); 323 node_num, conn->cc_namelen, conn->cc_name);
268 324
269 conn->cc_recovery_handler(node_num, conn->cc_recovery_data); 325 conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
270} 326}
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 BUG_ON(conn == NULL); 336 BUG_ON(conn == NULL);
281 BUG_ON(conn->cc_proto == NULL); 337 BUG_ON(conn->cc_proto == NULL);
282 338
283 /* for now we only have one cluster/node, make sure we see it 339 /* Ensure cluster stack is up and all nodes are connected */
284 * in the heartbeat universe */ 340 rc = o2cb_cluster_check();
285 if (!o2hb_check_local_node_heartbeating()) { 341 if (rc) {
286 if (o2hb_global_heartbeat_active()) 342 printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
287 mlog(ML_ERROR, "Global heartbeat not started\n"); 343 "before retrying.\n");
288 rc = -EINVAL;
289 goto out; 344 goto out;
290 } 345 }
291 346
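
The new o2cb_cluster_check() above compares the heartbeat node bitmap against the o2net connection bitmap, retrying for up to O2CB_MAP_STABILIZE_COUNT seconds before reporting which peers never connected. A minimal sketch of the same compare-and-report idea using generic bitmap helpers follows; the MAX_NODES constant and the function name are illustrative, not part of the patch:

	#include <linux/bitmap.h>
	#include <linux/bitops.h>
	#include <linux/errno.h>
	#include <linux/printk.h>

	#define MAX_NODES 255	/* illustrative; o2cb uses O2NM_MAX_NODES */

	/*
	 * Return 0 if every heartbeating node is also network-connected,
	 * otherwise print the missing peers and return -ENOTCONN.
	 */
	static int check_node_maps(const unsigned long *hbmap,
				   const unsigned long *netmap)
	{
		int i = -1;

		if (bitmap_subset(hbmap, netmap, MAX_NODES))
			return 0;

		pr_err("cluster: not connected to nodes:");
		while ((i = find_next_bit(hbmap, MAX_NODES, i + 1)) < MAX_NODES) {
			if (!test_bit(i, netmap))
				pr_cont(" %d", i);
		}
		pr_cont(".\n");
		return -ENOTCONN;
	}

The patch itself gets the same effect with memcmp() after force-setting the local node in the net map, which avoids a per-bit subset walk in the common case.
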
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f61027236..4994f8b0e60 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
54#include "ocfs1_fs_compat.h" 54#include "ocfs1_fs_compat.h"
55 55
56#include "alloc.h" 56#include "alloc.h"
57#include "aops.h"
57#include "blockcheck.h" 58#include "blockcheck.h"
58#include "dlmglue.h" 59#include "dlmglue.h"
59#include "export.h" 60#include "export.h"
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1107 1108
1108 ocfs2_set_ro_flag(osb, 1); 1109 ocfs2_set_ro_flag(osb, 1);
1109 1110
1110 printk(KERN_NOTICE "Readonly device detected. No cluster " 1111 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
1111 "services will be utilized for this mount. Recovery " 1112 "Cluster services will not be used for this mount. "
1112 "will be skipped.\n"); 1113 "Recovery will be skipped.\n", osb->dev_str);
1113 } 1114 }
1114 1115
1115 if (!ocfs2_is_hard_readonly(osb)) { 1116 if (!ocfs2_is_hard_readonly(osb)) {
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1616 return 0; 1617 return 0;
1617} 1618}
1618 1619
1620wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1621
1619static int __init ocfs2_init(void) 1622static int __init ocfs2_init(void)
1620{ 1623{
1621 int status; 1624 int status, i;
1622 1625
1623 ocfs2_print_version(); 1626 ocfs2_print_version();
1624 1627
1628 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1629 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1630
1625 status = init_ocfs2_uptodate_cache(); 1631 status = init_ocfs2_uptodate_cache();
1626 if (status < 0) { 1632 if (status < 0) {
1627 mlog_errno(status); 1633 mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
1760 ocfs2_extent_map_init(&oi->vfs_inode); 1766 ocfs2_extent_map_init(&oi->vfs_inode);
1761 INIT_LIST_HEAD(&oi->ip_io_markers); 1767 INIT_LIST_HEAD(&oi->ip_io_markers);
1762 oi->ip_dir_start_lookup = 0; 1768 oi->ip_dir_start_lookup = 0;
1763 1769 atomic_set(&oi->ip_unaligned_aio, 0);
1764 init_rwsem(&oi->ip_alloc_sem); 1770 init_rwsem(&oi->ip_alloc_sem);
1765 init_rwsem(&oi->ip_xattr_sem); 1771 init_rwsem(&oi->ip_xattr_sem);
1766 mutex_init(&oi->ip_io_mutex); 1772 mutex_init(&oi->ip_io_mutex);
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1974 * If we failed before we got a uuid_str yet, we can't stop 1980 * If we failed before we got a uuid_str yet, we can't stop
1975 * heartbeat. Otherwise, do it. 1981 * heartbeat. Otherwise, do it.
1976 */ 1982 */
1977 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) 1983 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
1984 !ocfs2_is_hard_readonly(osb))
1978 hangup_needed = 1; 1985 hangup_needed = 1;
1979 1986
1980 if (osb->cconn) 1987 if (osb->cconn)
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2353 mlog_errno(status); 2360 mlog_errno(status);
2354 goto bail; 2361 goto bail;
2355 } 2362 }
2356 cleancache_init_shared_fs((char *)&uuid_net_key, sb); 2363 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
2357 2364
2358bail: 2365bail:
2359 return status; 2366 return status;
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2462 goto finally; 2469 goto finally;
2463 } 2470 }
2464 } else { 2471 } else {
2465 mlog(ML_NOTICE, "File system was not unmounted cleanly, " 2472 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
2466 "recovering volume.\n"); 2473 "unmounted cleanly, recovering it.\n", osb->dev_str);
2467 } 2474 }
2468 2475
2469 local = ocfs2_mount_local(osb); 2476 local = ocfs2_mount_local(osb);
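
The super.c hunks above also wire up a small table of wait queues (ocfs2__ioend_wq) and a per-inode ip_unaligned_aio counter; together with the new aops.h include this suggests unaligned AIO now waits on a hashed wait queue rather than one embedded in every inode. A rough sketch of that hashed wait-queue pattern, where the table size and the modulo hash are assumptions rather than the patch's exact macro:

	#include <linux/fs.h>
	#include <linux/wait.h>

	#define IOEND_WQ_SZ 37			/* assumed table size */
	static wait_queue_head_t ioend_wq[IOEND_WQ_SZ];

	/* Share a fixed pool of wait queues between all inodes. */
	static wait_queue_head_t *ioend_wq_for(struct inode *inode)
	{
		return &ioend_wq[(unsigned long)inode % IOEND_WQ_SZ];
	}

	static void ioend_wq_init(void)
	{
		int i;

		for (i = 0; i < IOEND_WQ_SZ; i++)
			init_waitqueue_head(&ioend_wq[i]);
	}

Hashing the inode pointer keeps the memory cost constant, at the price of occasional false sharing when two inodes map to the same queue.
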
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 194fb22ef79..aa9e8777b09 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
2376 } 2376 }
2377 2377
2378 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); 2378 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
2379 if (ret < 0) {
2380 mlog_errno(ret);
2381 break;
2382 }
2383 2379
2384 ocfs2_commit_trans(osb, ctxt.handle); 2380 ocfs2_commit_trans(osb, ctxt.handle);
2385 if (ctxt.meta_ac) { 2381 if (ctxt.meta_ac) {
2386 ocfs2_free_alloc_context(ctxt.meta_ac); 2382 ocfs2_free_alloc_context(ctxt.meta_ac);
2387 ctxt.meta_ac = NULL; 2383 ctxt.meta_ac = NULL;
2388 } 2384 }
2385
2386 if (ret < 0) {
2387 mlog_errno(ret);
2388 break;
2389 }
2390
2389 } 2391 }
2390 2392
2391 if (ctxt.meta_ac) 2393 if (ctxt.meta_ac)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2db1bd3173b..851ba3dcdc2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1652,46 +1652,12 @@ out:
1652 return error; 1652 return error;
1653} 1653}
1654 1654
1655static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
1656 struct kstat *stat)
1657{
1658 struct inode *inode = dentry->d_inode;
1659 struct task_struct *task = get_proc_task(inode);
1660 int rc;
1661
1662 if (task == NULL)
1663 return -ESRCH;
1664
1665 rc = -EACCES;
1666 if (lock_trace(task))
1667 goto out_task;
1668
1669 generic_fillattr(inode, stat);
1670 unlock_trace(task);
1671 rc = 0;
1672out_task:
1673 put_task_struct(task);
1674 return rc;
1675}
1676
1677static const struct inode_operations proc_pid_link_inode_operations = { 1655static const struct inode_operations proc_pid_link_inode_operations = {
1678 .readlink = proc_pid_readlink, 1656 .readlink = proc_pid_readlink,
1679 .follow_link = proc_pid_follow_link, 1657 .follow_link = proc_pid_follow_link,
1680 .setattr = proc_setattr, 1658 .setattr = proc_setattr,
1681}; 1659};
1682 1660
1683static const struct inode_operations proc_fdinfo_link_inode_operations = {
1684 .setattr = proc_setattr,
1685 .getattr = proc_pid_fd_link_getattr,
1686};
1687
1688static const struct inode_operations proc_fd_link_inode_operations = {
1689 .readlink = proc_pid_readlink,
1690 .follow_link = proc_pid_follow_link,
1691 .setattr = proc_setattr,
1692 .getattr = proc_pid_fd_link_getattr,
1693};
1694
1695 1661
1696/* building an inode */ 1662/* building an inode */
1697 1663
@@ -1923,61 +1889,49 @@ out:
1923 1889
1924static int proc_fd_info(struct inode *inode, struct path *path, char *info) 1890static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1925{ 1891{
1926 struct task_struct *task; 1892 struct task_struct *task = get_proc_task(inode);
1927 struct files_struct *files; 1893 struct files_struct *files = NULL;
1928 struct file *file; 1894 struct file *file;
1929 int fd = proc_fd(inode); 1895 int fd = proc_fd(inode);
1930 int rc;
1931
1932 task = get_proc_task(inode);
1933 if (!task)
1934 return -ENOENT;
1935
1936 rc = -EACCES;
1937 if (lock_trace(task))
1938 goto out_task;
1939
1940 rc = -ENOENT;
1941 files = get_files_struct(task);
1942 if (files == NULL)
1943 goto out_unlock;
1944 1896
1945 /* 1897 if (task) {
1946 * We are not taking a ref to the file structure, so we must 1898 files = get_files_struct(task);
1947 * hold ->file_lock. 1899 put_task_struct(task);
1948 */ 1900 }
1949 spin_lock(&files->file_lock); 1901 if (files) {
1950 file = fcheck_files(files, fd); 1902 /*
1951 if (file) { 1903 * We are not taking a ref to the file structure, so we must
1952 unsigned int f_flags; 1904 * hold ->file_lock.
1953 struct fdtable *fdt; 1905 */
1954 1906 spin_lock(&files->file_lock);
1955 fdt = files_fdtable(files); 1907 file = fcheck_files(files, fd);
1956 f_flags = file->f_flags & ~O_CLOEXEC; 1908 if (file) {
1957 if (FD_ISSET(fd, fdt->close_on_exec)) 1909 unsigned int f_flags;
1958 f_flags |= O_CLOEXEC; 1910 struct fdtable *fdt;
1959 1911
1960 if (path) { 1912 fdt = files_fdtable(files);
1961 *path = file->f_path; 1913 f_flags = file->f_flags & ~O_CLOEXEC;
1962 path_get(&file->f_path); 1914 if (FD_ISSET(fd, fdt->close_on_exec))
1915 f_flags |= O_CLOEXEC;
1916
1917 if (path) {
1918 *path = file->f_path;
1919 path_get(&file->f_path);
1920 }
1921 if (info)
1922 snprintf(info, PROC_FDINFO_MAX,
1923 "pos:\t%lli\n"
1924 "flags:\t0%o\n",
1925 (long long) file->f_pos,
1926 f_flags);
1927 spin_unlock(&files->file_lock);
1928 put_files_struct(files);
1929 return 0;
1963 } 1930 }
1964 if (info) 1931 spin_unlock(&files->file_lock);
1965 snprintf(info, PROC_FDINFO_MAX, 1932 put_files_struct(files);
1966 "pos:\t%lli\n" 1933 }
1967 "flags:\t0%o\n", 1934 return -ENOENT;
1968 (long long) file->f_pos,
1969 f_flags);
1970 rc = 0;
1971 } else
1972 rc = -ENOENT;
1973 spin_unlock(&files->file_lock);
1974 put_files_struct(files);
1975
1976out_unlock:
1977 unlock_trace(task);
1978out_task:
1979 put_task_struct(task);
1980 return rc;
1981} 1935}
1982 1936
1983static int proc_fd_link(struct inode *inode, struct path *path) 1937static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
2072 spin_unlock(&files->file_lock); 2026 spin_unlock(&files->file_lock);
2073 put_files_struct(files); 2027 put_files_struct(files);
2074 2028
2075 inode->i_op = &proc_fd_link_inode_operations; 2029 inode->i_op = &proc_pid_link_inode_operations;
2076 inode->i_size = 64; 2030 inode->i_size = 64;
2077 ei->op.proc_get_link = proc_fd_link; 2031 ei->op.proc_get_link = proc_fd_link;
2078 d_set_d_op(dentry, &tid_fd_dentry_operations); 2032 d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
2104 if (fd == ~0U) 2058 if (fd == ~0U)
2105 goto out; 2059 goto out;
2106 2060
2107 result = ERR_PTR(-EACCES);
2108 if (lock_trace(task))
2109 goto out;
2110
2111 result = instantiate(dir, dentry, task, &fd); 2061 result = instantiate(dir, dentry, task, &fd);
2112 unlock_trace(task);
2113out: 2062out:
2114 put_task_struct(task); 2063 put_task_struct(task);
2115out_no_task: 2064out_no_task:
@@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2129 retval = -ENOENT; 2078 retval = -ENOENT;
2130 if (!p) 2079 if (!p)
2131 goto out_no_task; 2080 goto out_no_task;
2132
2133 retval = -EACCES;
2134 if (lock_trace(p))
2135 goto out;
2136
2137 retval = 0; 2081 retval = 0;
2138 2082
2139 fd = filp->f_pos; 2083 fd = filp->f_pos;
2140 switch (fd) { 2084 switch (fd) {
2141 case 0: 2085 case 0:
2142 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) 2086 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
2143 goto out_unlock; 2087 goto out;
2144 filp->f_pos++; 2088 filp->f_pos++;
2145 case 1: 2089 case 1:
2146 ino = parent_ino(dentry); 2090 ino = parent_ino(dentry);
2147 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) 2091 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
2148 goto out_unlock; 2092 goto out;
2149 filp->f_pos++; 2093 filp->f_pos++;
2150 default: 2094 default:
2151 files = get_files_struct(p); 2095 files = get_files_struct(p);
2152 if (!files) 2096 if (!files)
2153 goto out_unlock; 2097 goto out;
2154 rcu_read_lock(); 2098 rcu_read_lock();
2155 for (fd = filp->f_pos-2; 2099 for (fd = filp->f_pos-2;
2156 fd < files_fdtable(files)->max_fds; 2100 fd < files_fdtable(files)->max_fds;
@@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2174 rcu_read_unlock(); 2118 rcu_read_unlock();
2175 put_files_struct(files); 2119 put_files_struct(files);
2176 } 2120 }
2177
2178out_unlock:
2179 unlock_trace(p);
2180out: 2121out:
2181 put_task_struct(p); 2122 put_task_struct(p);
2182out_no_task: 2123out_no_task:
@@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2254 ei->fd = fd; 2195 ei->fd = fd;
2255 inode->i_mode = S_IFREG | S_IRUSR; 2196 inode->i_mode = S_IFREG | S_IRUSR;
2256 inode->i_fop = &proc_fdinfo_file_operations; 2197 inode->i_fop = &proc_fdinfo_file_operations;
2257 inode->i_op = &proc_fdinfo_link_inode_operations;
2258 d_set_d_op(dentry, &tid_fd_dentry_operations); 2198 d_set_d_op(dentry, &tid_fd_dentry_operations);
2259 d_add(dentry, inode); 2199 d_add(dentry, inode);
2260 /* Close the race of the process dying before we return the dentry */ 2200 /* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 586174168e2..80e4645f799 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
131 K(i.freeswap), 131 K(i.freeswap),
132 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
133 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE 134#ifdef CONFIG_TRANSPARENT_HUGEPAGE
135 K(global_page_state(NR_ANON_PAGES)
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * 136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR 137 HPAGE_PMD_NR),
138#else
139 K(global_page_state(NR_ANON_PAGES)),
138#endif 140#endif
139 ),
140 K(global_page_state(NR_FILE_MAPPED)), 141 K(global_page_state(NR_FILE_MAPPED)),
141 K(global_page_state(NR_SHMEM)), 142 K(global_page_state(NR_SHMEM)),
142 K(global_page_state(NR_SLAB_RECLAIMABLE) + 143 K(global_page_state(NR_SLAB_RECLAIMABLE) +
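
The meminfo hunk above rebuilds the AnonPages line so each configuration emits one complete K(...) argument: with CONFIG_TRANSPARENT_HUGEPAGE the reported value is NR_ANON_PAGES plus NR_ANON_TRANSPARENT_HUGEPAGES expanded into base pages, otherwise just NR_ANON_PAGES. As a rough worked example (HPAGE_PMD_NR taken as 512, i.e. x86-64 with 4 KiB pages, purely for illustration): 10,000 ordinary anonymous pages plus 20 transparent huge pages report as 10,000 + 20 * 512 = 20,240 pages, which K() prints as 80,960 kB.
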
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 42b274da92c..2a30d67dd6b 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu)
32 idle = kstat_cpu(cpu).cpustat.idle; 32 idle = kstat_cpu(cpu).cpustat.idle;
33 idle = cputime64_add(idle, arch_idle_time(cpu)); 33 idle = cputime64_add(idle, arch_idle_time(cpu));
34 } else 34 } else
35 idle = usecs_to_cputime(idle_time); 35 idle = nsecs_to_jiffies64(1000 * idle_time);
36 36
37 return idle; 37 return idle;
38} 38}
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu)
46 /* !NO_HZ so we can rely on cpustat.iowait */ 46 /* !NO_HZ so we can rely on cpustat.iowait */
47 iowait = kstat_cpu(cpu).cpustat.iowait; 47 iowait = kstat_cpu(cpu).cpustat.iowait;
48 else 48 else
49 iowait = usecs_to_cputime(iowait_time); 49 iowait = nsecs_to_jiffies64(1000 * iowait_time);
50 50
51 return iowait; 51 return iowait;
52} 52}
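
With NO_HZ, get_idle_time() and get_iowait_time() receive the per-CPU accumulated time in microseconds, so the fix scales it to nanoseconds and converts with nsecs_to_jiffies64() instead of usecs_to_cputime(). A rough worked example, assuming HZ = 100: 2,500,000 µs * 1000 = 2,500,000,000 ns, and nsecs_to_jiffies64() of that is 250 jiffies, i.e. the expected 2.5 seconds.
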
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf55765..b0f450a2bb7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/export.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/highmem.h> 17#include <linux/highmem.h>
17#include <linux/bootmem.h> 18#include <linux/bootmem.h>
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2bd620f0d79..57bbf9078ac 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -167,6 +167,7 @@ int pstore_register(struct pstore_info *psi)
167 } 167 }
168 168
169 psinfo = psi; 169 psinfo = psi;
170 mutex_init(&psinfo->read_mutex);
170 spin_unlock(&pstore_lock); 171 spin_unlock(&pstore_lock);
171 172
172 if (owner && !try_module_get(owner)) { 173 if (owner && !try_module_get(owner)) {
@@ -195,30 +196,32 @@ EXPORT_SYMBOL_GPL(pstore_register);
195void pstore_get_records(int quiet) 196void pstore_get_records(int quiet)
196{ 197{
197 struct pstore_info *psi = psinfo; 198 struct pstore_info *psi = psinfo;
199 char *buf = NULL;
198 ssize_t size; 200 ssize_t size;
199 u64 id; 201 u64 id;
200 enum pstore_type_id type; 202 enum pstore_type_id type;
201 struct timespec time; 203 struct timespec time;
202 int failed = 0, rc; 204 int failed = 0, rc;
203 unsigned long flags;
204 205
205 if (!psi) 206 if (!psi)
206 return; 207 return;
207 208
208 spin_lock_irqsave(&psinfo->buf_lock, flags); 209 mutex_lock(&psi->read_mutex);
209 rc = psi->open(psi); 210 rc = psi->open(psi);
210 if (rc) 211 if (rc)
211 goto out; 212 goto out;
212 213
213 while ((size = psi->read(&id, &type, &time, psi)) > 0) { 214 while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
214 rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, 215 rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
215 time, psi); 216 time, psi);
217 kfree(buf);
218 buf = NULL;
216 if (rc && (rc != -EEXIST || !quiet)) 219 if (rc && (rc != -EEXIST || !quiet))
217 failed++; 220 failed++;
218 } 221 }
219 psi->close(psi); 222 psi->close(psi);
220out: 223out:
221 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 224 mutex_unlock(&psi->read_mutex);
222 225
223 if (failed) 226 if (failed)
224 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", 227 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
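
After this change a pstore backend's ->read() hands each record back in a freshly allocated *buf, which pstore_get_records() kfree()s once pstore_mkfile() has consumed it, and the whole walk is serialised by the new read_mutex instead of buf_lock. A minimal backend read() sketch under those assumptions; demo_next_record() is a made-up helper standing in for whatever bookkeeping a real backend keeps:

	#include <linux/pstore.h>
	#include <linux/slab.h>

	/* Hypothetical backend bookkeeping, not part of the patch. */
	extern void *demo_next_record(struct pstore_info *psi, u64 *id,
				      enum pstore_type_id *type,
				      struct timespec *time, size_t *len);

	static ssize_t demo_pstore_read(u64 *id, enum pstore_type_id *type,
					struct timespec *time, char **buf,
					struct pstore_info *psi)
	{
		size_t len;
		void *rec = demo_next_record(psi, id, type, time, &len);

		if (!rec)
			return 0;	/* zero or negative ends the loop */

		*buf = kmemdup(rec, len, GFP_KERNEL);
		if (!*buf)
			return -ENOMEM;
		return len;
	}
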
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index aae0edb95c6..35f4b0ecdeb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
286 /* caller already holds s_umount */ 286 /* caller already holds s_umount */
287 if (sb->s_flags & MS_RDONLY) 287 if (sb->s_flags & MS_RDONLY)
288 return -EROFS; 288 return -EROFS;
289 writeback_inodes_sb(sb); 289 writeback_inodes_sb(sb, WB_REASON_SYNC);
290 return 0; 290 return 0;
291 default: 291 default:
292 return -EINVAL; 292 return -EINVAL;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 05d6b0e78c9..dba43c3ea3a 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path);
449 449
450/* 450/*
451 * Same as seq_path, but relative to supplied root. 451 * Same as seq_path, but relative to supplied root.
452 *
453 * root may be changed, see __d_path().
454 */ 452 */
455int seq_path_root(struct seq_file *m, struct path *path, struct path *root, 453int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
456 char *esc) 454 char *esc)
@@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
463 char *p; 461 char *p;
464 462
465 p = __d_path(path, root, buf, size); 463 p = __d_path(path, root, buf, size);
464 if (!p)
465 return SEQ_SKIP;
466 res = PTR_ERR(p); 466 res = PTR_ERR(p);
467 if (!IS_ERR(p)) { 467 if (!IS_ERR(p)) {
468 char *end = mangle_path(buf, p, esc); 468 char *end = mangle_path(buf, p, esc);
@@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
474 } 474 }
475 seq_commit(m, res); 475 seq_commit(m, res);
476 476
477 return res < 0 ? res : 0; 477 return res < 0 && res != -ENAMETOOLONG ? res : 0;
478} 478}
479 479
480/* 480/*
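
seq_path_root() now maps a NULL return from __d_path() (the path is not reachable from the supplied root) to SEQ_SKIP, and the caller no longer treats -ENAMETOOLONG as fatal, so a ->show() routine that propagates the return value simply drops that record. For reference, SEQ_SKIP from a show callback discards whatever was written for the current item; a small sketch with made-up names:

	#include <linux/seq_file.h>
	#include <linux/types.h>

	struct demo_entry {			/* hypothetical record type */
		const char *name;
		int value;
		bool hidden;
	};

	static int demo_show(struct seq_file *m, void *v)
	{
		struct demo_entry *e = v;

		if (e->hidden)
			return SEQ_SKIP;	/* drop this record's output */

		seq_printf(m, "%s %d\n", e->name, e->value);
		return 0;
	}
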
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 048b59d5b2f..c70111ebefd 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -78,6 +78,28 @@ config SQUASHFS_XZ
78 78
79 If unsure, say N. 79 If unsure, say N.
80 80
81config SQUASHFS_4K_DEVBLK_SIZE
82 bool "Use 4K device block size?"
83 depends on SQUASHFS
84 help
85 By default Squashfs sets the dev block size (sb_min_blocksize)
86 to 1K or the smallest block size supported by the block device
 87 (if larger). Because blocks are packed together and unaligned
 88 in Squashfs, this should reduce latency.
89
90 This, however, gives poor performance on MTD NAND devices where
91 the optimal I/O size is 4K (even though the devices can support
92 smaller block sizes).
93
94 Using a 4K device block size may also improve overall I/O
95 performance for some file access patterns (e.g. sequential
96 accesses of files in filesystem order) on all media.
97
98 Setting this option will force Squashfs to use a 4K device block
99 size by default.
100
101 If unsure, say N.
102
81config SQUASHFS_EMBEDDED 103config SQUASHFS_EMBEDDED
82 bool "Additional option for memory-constrained systems" 104 bool "Additional option for memory-constrained systems"
83 depends on SQUASHFS 105 depends on SQUASHFS
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b4a4e539a08..e8e14645de9 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -36,6 +36,13 @@
36#define SQUASHFS_FILE_SIZE 131072 36#define SQUASHFS_FILE_SIZE 131072
37#define SQUASHFS_FILE_LOG 17 37#define SQUASHFS_FILE_LOG 17
38 38
39/* default size of block device I/O */
40#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
41#define SQUASHFS_DEVBLK_SIZE 4096
42#else
43#define SQUASHFS_DEVBLK_SIZE 1024
44#endif
45
39#define SQUASHFS_FILE_MAX_SIZE 1048576 46#define SQUASHFS_FILE_MAX_SIZE 1048576
40#define SQUASHFS_FILE_MAX_LOG 20 47#define SQUASHFS_FILE_MAX_LOG 20
41 48
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 7438850c62d..2da1715452a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
95 } 95 }
96 msblk = sb->s_fs_info; 96 msblk = sb->s_fs_info;
97 97
98 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); 98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
99 msblk->devblksize_log2 = ffz(~msblk->devblksize); 99 msblk->devblksize_log2 = ffz(~msblk->devblksize);
100 100
101 mutex_init(&msblk->read_data_mutex); 101 mutex_init(&msblk->read_data_mutex);
diff --git a/fs/statfs.c b/fs/statfs.c
index 8244924dec5..9cf04a11896 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs);
76int user_statfs(const char __user *pathname, struct kstatfs *st) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct path path; 78 struct path path;
79 int error = user_path(pathname, &path); 79 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
80 if (!error) { 80 if (!error) {
81 error = vfs_statfs(&path, st); 81 error = vfs_statfs(&path, st);
82 path_put(&path); 82 path_put(&path);
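
user_statfs() now resolves the path with user_path_at(AT_FDCWD, ..., LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, ...), so statfs() on an automount trigger forces the automount instead of reporting the covering directory's filesystem. Nothing changes syntactically for user space; a trivial example (the path is illustrative):

	#include <stdio.h>
	#include <sys/vfs.h>

	int main(void)
	{
		struct statfs st;

		/* If /net/fileserver is an automount point, this mounts it
		 * before the statistics are reported. */
		if (statfs("/net/fileserver", &st) == 0)
			printf("block size %ld, free blocks %ld\n",
			       (long)st.f_bsize, (long)st.f_bfree);
		return 0;
	}
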
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edf..101b8ef901d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
43 if (wait) 43 if (wait)
44 sync_inodes_sb(sb); 44 sync_inodes_sb(sb);
45 else 45 else
46 writeback_inodes_sb(sb); 46 writeback_inodes_sb(sb, WB_REASON_SYNC);
47 47
48 if (sb->s_op->sync_fs) 48 if (sb->s_op->sync_fs)
49 sb->s_op->sync_fs(sb, wait); 49 sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
98 */ 98 */
99SYSCALL_DEFINE0(sync) 99SYSCALL_DEFINE0(sync)
100{ 100{
101 wakeup_flusher_threads(0); 101 wakeup_flusher_threads(0, WB_REASON_SYNC);
102 sync_filesystems(0); 102 sync_filesystems(0);
103 sync_filesystems(1); 103 sync_filesystems(1);
104 if (unlikely(laptop_mode)) 104 if (unlikely(laptop_mode))
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b..bc4f94b2870 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
63static void shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
64{ 64{
65 down_read(&c->vfs_sb->s_umount); 65 down_read(&c->vfs_sb->s_umount);
66 writeback_inodes_sb(c->vfs_sb); 66 writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
67 up_read(&c->vfs_sb->s_umount); 67 up_read(&c->vfs_sb->s_umount);
68} 68}
69 69
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef109a1a92..b09ba2dd8b6 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
870 spin_unlock(&dbg_lock); 870 spin_unlock(&dbg_lock);
871} 871}
872 872
873void dbg_dump_sleb(const struct ubifs_info *c,
874 const struct ubifs_scan_leb *sleb, int offs)
875{
876 struct ubifs_scan_node *snod;
877
878 printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
879 current->pid, sleb->lnum, offs);
880
881 list_for_each_entry(snod, &sleb->nodes, list) {
882 cond_resched();
883 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
884 snod->offs, snod->len);
885 dbg_dump_node(c, snod->node);
886 }
887}
888
873void dbg_dump_leb(const struct ubifs_info *c, int lnum) 889void dbg_dump_leb(const struct ubifs_info *c, int lnum)
874{ 890{
875 struct ubifs_scan_leb *sleb; 891 struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index feb361e252a..8d9c4681018 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
269void dbg_dump_lprops(struct ubifs_info *c); 269void dbg_dump_lprops(struct ubifs_info *c);
270void dbg_dump_lpt_info(struct ubifs_info *c); 270void dbg_dump_lpt_info(struct ubifs_info *c);
271void dbg_dump_leb(const struct ubifs_info *c, int lnum); 271void dbg_dump_leb(const struct ubifs_info *c, int lnum);
272void dbg_dump_sleb(const struct ubifs_info *c,
273 const struct ubifs_scan_leb *sleb, int offs);
272void dbg_dump_znode(const struct ubifs_info *c, 274void dbg_dump_znode(const struct ubifs_info *c,
273 const struct ubifs_znode *znode); 275 const struct ubifs_znode *znode);
274void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); 276void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
@@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
387static inline void dbg_dump_leb(const struct ubifs_info *c, 389static inline void dbg_dump_leb(const struct ubifs_info *c,
388 int lnum) { return; } 390 int lnum) { return; }
389static inline void 391static inline void
392dbg_dump_sleb(const struct ubifs_info *c,
393 const struct ubifs_scan_leb *sleb, int offs) { return; }
394static inline void
390dbg_dump_znode(const struct ubifs_info *c, 395dbg_dump_znode(const struct ubifs_info *c,
391 const struct ubifs_znode *znode) { return; } 396 const struct ubifs_znode *znode) { return; }
392static inline void dbg_dump_heap(struct ubifs_info *c, 397static inline void dbg_dump_heap(struct ubifs_info *c,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index af02790d932..ee4f43f4bb9 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
983} 983}
984 984
985/** 985/**
986 * clean_an_unclean_leb - read and write a LEB to remove corruption. 986 * clean_an_unclean_leb - read and write a LEB to remove corruption.
987 * @c: UBIFS file-system description object 987 * @c: UBIFS file-system description object
988 * @ucleb: unclean LEB information 988 * @ucleb: unclean LEB information
989 * @sbuf: LEB-sized buffer to use 989 * @sbuf: LEB-sized buffer to use
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 93d938ad3d2..6094c5a5d7a 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c)
247 mst->total_dirty = cpu_to_le64(tmp64); 247 mst->total_dirty = cpu_to_le64(tmp64);
248 248
249 /* The indexing LEB does not contribute to dark space */ 249 /* The indexing LEB does not contribute to dark space */
250 tmp64 = (c->main_lebs - 1) * c->dark_wm; 250 tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
251 mst->total_dark = cpu_to_le64(tmp64); 251 mst->total_dark = cpu_to_le64(tmp64);
252 252
253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); 253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
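
The one-line sb.c change casts the LEB count to long long before the multiply: both operands are ints, so with a large main_lebs the product can exceed INT_MAX and wrap before it is ever widened, even though tmp64 itself is 64 bits. A small stand-alone illustration with made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		int main_lebs = 70000, dark_wm = 65536;

		/* int * int is evaluated in 32 bits; the true product
		 * (4,587,454,464) overflows before the widening assignment. */
		long long wrong = (main_lebs - 1) * dark_wm;
		long long right = (long long)(main_lebs - 1) * dark_wm;

		printf("wrong=%lld right=%lld\n", wrong, right);
		return 0;
	}
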
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6c4b3795c4..76e4266d2e7 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
42 int count, i; 42 int count, i;
43 43
44 count = be32_to_cpu(aclp->acl_cnt); 44 count = be32_to_cpu(aclp->acl_cnt);
45 if (count > XFS_ACL_MAX_ENTRIES)
46 return ERR_PTR(-EFSCORRUPTED);
45 47
46 acl = posix_acl_alloc(count, GFP_KERNEL); 48 acl = posix_acl_alloc(count, GFP_KERNEL);
47 if (!acl) 49 if (!acl)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 33b13310ee0..574d4ee9b62 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_end_io(
189 int error = 0; 189 int error = 0;
190 190
191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
192 error = -EIO; 192 ioend->io_error = -EIO;
193 goto done; 193 goto done;
194 } 194 }
195 if (ioend->io_error) 195 if (ioend->io_error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d4906e7c978..c1b55e59655 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
110/* 110/*
111 * Query whether the requested number of additional bytes of extended 111 * Query whether the requested number of additional bytes of extended
112 * attribute space will be able to fit inline. 112 * attribute space will be able to fit inline.
113 *
113 * Returns zero if not, else the di_forkoff fork offset to be used in the 114 * Returns zero if not, else the di_forkoff fork offset to be used in the
114 * literal area for attribute data once the new bytes have been added. 115 * literal area for attribute data once the new bytes have been added.
115 * 116 *
@@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
122 int offset; 123 int offset;
123 int minforkoff; /* lower limit on valid forkoff locations */ 124 int minforkoff; /* lower limit on valid forkoff locations */
124 int maxforkoff; /* upper limit on valid forkoff locations */ 125 int maxforkoff; /* upper limit on valid forkoff locations */
125 int dsize; 126 int dsize;
126 xfs_mount_t *mp = dp->i_mount; 127 xfs_mount_t *mp = dp->i_mount;
127 128
128 offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ 129 offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
@@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
136 return (offset >= minforkoff) ? minforkoff : 0; 137 return (offset >= minforkoff) ? minforkoff : 0;
137 } 138 }
138 139
139 if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { 140 /*
140 if (bytes <= XFS_IFORK_ASIZE(dp)) 141 * If the requested numbers of bytes is smaller or equal to the
141 return dp->i_d.di_forkoff; 142 * current attribute fork size we can always proceed.
143 *
144 * Note that if_bytes in the data fork might actually be larger than
145 * the current data fork size is due to delalloc extents. In that
146 * case either the extent count will go down when they are converted
147 * to real extents, or the delalloc conversion will take care of the
148 * literal area rebalancing.
149 */
150 if (bytes <= XFS_IFORK_ASIZE(dp))
151 return dp->i_d.di_forkoff;
152
153 /*
154 * For attr2 we can try to move the forkoff if there is space in the
155 * literal area, but for the old format we are done if there is no
156 * space in the fixed attribute fork.
157 */
158 if (!(mp->m_flags & XFS_MOUNT_ATTR2))
142 return 0; 159 return 0;
143 }
144 160
145 dsize = dp->i_df.if_bytes; 161 dsize = dp->i_df.if_bytes;
146 162
147 switch (dp->i_d.di_format) { 163 switch (dp->i_d.di_format) {
148 case XFS_DINODE_FMT_EXTENTS: 164 case XFS_DINODE_FMT_EXTENTS:
149 /* 165 /*
150 * If there is no attr fork and the data fork is extents, 166 * If there is no attr fork and the data fork is extents,
151 * determine if creating the default attr fork will result 167 * determine if creating the default attr fork will result
152 * in the extents form migrating to btree. If so, the 168 * in the extents form migrating to btree. If so, the
153 * minimum offset only needs to be the space required for 169 * minimum offset only needs to be the space required for
154 * the btree root. 170 * the btree root.
155 */ 171 */
156 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > 172 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
157 xfs_default_attroffset(dp)) 173 xfs_default_attroffset(dp))
158 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); 174 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
159 break; 175 break;
160
161 case XFS_DINODE_FMT_BTREE: 176 case XFS_DINODE_FMT_BTREE:
162 /* 177 /*
163 * If have data btree then keep forkoff if we have one, 178 * If we have a data btree then keep forkoff if we have one,
164 * otherwise we are adding a new attr, so then we set 179 * otherwise we are adding a new attr, so then we set
165 * minforkoff to where the btree root can finish so we have 180 * minforkoff to where the btree root can finish so we have
166 * plenty of room for attrs 181 * plenty of room for attrs
167 */ 182 */
168 if (dp->i_d.di_forkoff) { 183 if (dp->i_d.di_forkoff) {
169 if (offset < dp->i_d.di_forkoff) 184 if (offset < dp->i_d.di_forkoff)
170 return 0; 185 return 0;
171 else 186 return dp->i_d.di_forkoff;
172 return dp->i_d.di_forkoff; 187 }
173 } else 188 dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
174 dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
175 break; 189 break;
176 } 190 }
177 191
178 /* 192 /*
179 * A data fork btree root must have space for at least 193 * A data fork btree root must have space for at least
180 * MINDBTPTRS key/ptr pairs if the data fork is small or empty. 194 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
181 */ 195 */
182 minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); 196 minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
186 maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); 200 maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
187 maxforkoff = maxforkoff >> 3; /* rounded down */ 201 maxforkoff = maxforkoff >> 3; /* rounded down */
188 202
189 if (offset >= minforkoff && offset < maxforkoff)
190 return offset;
191 if (offset >= maxforkoff) 203 if (offset >= maxforkoff)
192 return maxforkoff; 204 return maxforkoff;
205 if (offset >= minforkoff)
206 return offset;
193 return 0; 207 return 0;
194} 208}
195 209
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c68baeb0974..d0ab7883705 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2383,6 +2383,8 @@ xfs_bmap_btalloc(
2383 int tryagain; 2383 int tryagain;
2384 int error; 2384 int error;
2385 2385
2386 ASSERT(ap->length);
2387
2386 mp = ap->ip->i_mount; 2388 mp = ap->ip->i_mount;
2387 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; 2389 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
2388 if (unlikely(align)) { 2390 if (unlikely(align)) {
@@ -4629,6 +4631,8 @@ xfs_bmapi_allocate(
4629 int error; 4631 int error;
4630 int rt; 4632 int rt;
4631 4633
4634 ASSERT(bma->length > 0);
4635
4632 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); 4636 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
4633 4637
4634 /* 4638 /*
@@ -4849,6 +4853,7 @@ xfs_bmapi_write(
4849 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4853 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4850 ASSERT(!(flags & XFS_BMAPI_IGSTATE)); 4854 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4851 ASSERT(tp != NULL); 4855 ASSERT(tp != NULL);
4856 ASSERT(len > 0);
4852 4857
4853 whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4858 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4854 XFS_ATTR_FORK : XFS_DATA_FORK; 4859 XFS_ATTR_FORK : XFS_DATA_FORK;
@@ -4918,9 +4923,22 @@ xfs_bmapi_write(
4918 bma.eof = eof; 4923 bma.eof = eof;
4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT); 4924 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4920 bma.wasdel = wasdelay; 4925 bma.wasdel = wasdelay;
4921 bma.length = len;
4922 bma.offset = bno; 4926 bma.offset = bno;
4923 4927
4928 /*
4929 * There's a 32/64 bit type mismatch between the
4930 * allocation length request (which can be 64 bits in
4931 * length) and the bma length request, which is
4932 * xfs_extlen_t and therefore 32 bits. Hence we have to
4933 * check for 32-bit overflows and handle them here.
4934 */
4935 if (len > (xfs_filblks_t)MAXEXTLEN)
4936 bma.length = MAXEXTLEN;
4937 else
4938 bma.length = len;
4939
4940 ASSERT(len > 0);
4941 ASSERT(bma.length > 0);
4924 error = xfs_bmapi_allocate(&bma, flags); 4942 error = xfs_bmapi_allocate(&bma, flags);
4925 if (error) 4943 if (error)
4926 goto error0; 4944 goto error0;
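
The new comment in xfs_bmapi_write() spells out why the clamp is needed: the caller's len is a 64-bit xfs_filblks_t while bma.length is a 32-bit xfs_extlen_t, so requests above MAXEXTLEN are cut down and the surrounding loop maps the remainder on later passes (subject to the caller's nmap limit) instead of letting the 32-bit assignment silently truncate. As a rough worked example, taking MAXEXTLEN as 2^21 - 1 = 2,097,151 blocks (its value in this era of XFS): a 3,000,000-block request allocates at most 2,097,151 blocks on the first pass and the remaining 902,849 blocks on subsequent passes.
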
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1a3513881bc..eac97ef81e2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -656,7 +656,7 @@ xfs_buf_item_committing(
656/* 656/*
657 * This is the ops vector shared by all buf log items. 657 * This is the ops vector shared by all buf log items.
658 */ 658 */
659static struct xfs_item_ops xfs_buf_item_ops = { 659static const struct xfs_item_ops xfs_buf_item_ops = {
660 .iop_size = xfs_buf_item_size, 660 .iop_size = xfs_buf_item_size,
661 .iop_format = xfs_buf_item_format, 661 .iop_format = xfs_buf_item_format,
662 .iop_pin = xfs_buf_item_pin, 662 .iop_pin = xfs_buf_item_pin,
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index bb3f71d236d..0dee0b71029 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing(
295/* 295/*
296 * This is the ops vector for dquots 296 * This is the ops vector for dquots
297 */ 297 */
298static struct xfs_item_ops xfs_dquot_item_ops = { 298static const struct xfs_item_ops xfs_dquot_item_ops = {
299 .iop_size = xfs_qm_dquot_logitem_size, 299 .iop_size = xfs_qm_dquot_logitem_size,
300 .iop_format = xfs_qm_dquot_logitem_format, 300 .iop_format = xfs_qm_dquot_logitem_format,
301 .iop_pin = xfs_qm_dquot_logitem_pin, 301 .iop_pin = xfs_qm_dquot_logitem_pin,
@@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing(
483{ 483{
484} 484}
485 485
486static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { 486static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
487 .iop_size = xfs_qm_qoff_logitem_size, 487 .iop_size = xfs_qm_qoff_logitem_size,
488 .iop_format = xfs_qm_qoff_logitem_format, 488 .iop_format = xfs_qm_qoff_logitem_format,
489 .iop_pin = xfs_qm_qoff_logitem_pin, 489 .iop_pin = xfs_qm_qoff_logitem_pin,
@@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
498/* 498/*
499 * This is the ops vector shared by all quotaoff-start log items. 499 * This is the ops vector shared by all quotaoff-start log items.
500 */ 500 */
501static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { 501static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
502 .iop_size = xfs_qm_qoff_logitem_size, 502 .iop_size = xfs_qm_qoff_logitem_size,
503 .iop_format = xfs_qm_qoff_logitem_format, 503 .iop_format = xfs_qm_qoff_logitem_format,
504 .iop_pin = xfs_qm_qoff_logitem_pin, 504 .iop_pin = xfs_qm_qoff_logitem_pin,
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index da108977b21..558910f5e3c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -98,22 +98,22 @@ xfs_fs_encode_fh(
98 switch (fileid_type) { 98 switch (fileid_type) {
99 case FILEID_INO32_GEN_PARENT: 99 case FILEID_INO32_GEN_PARENT:
100 spin_lock(&dentry->d_lock); 100 spin_lock(&dentry->d_lock);
101 fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; 101 fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
102 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; 102 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
103 spin_unlock(&dentry->d_lock); 103 spin_unlock(&dentry->d_lock);
104 /*FALLTHRU*/ 104 /*FALLTHRU*/
105 case FILEID_INO32_GEN: 105 case FILEID_INO32_GEN:
106 fid->i32.ino = inode->i_ino; 106 fid->i32.ino = XFS_I(inode)->i_ino;
107 fid->i32.gen = inode->i_generation; 107 fid->i32.gen = inode->i_generation;
108 break; 108 break;
109 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: 109 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
110 spin_lock(&dentry->d_lock); 110 spin_lock(&dentry->d_lock);
111 fid64->parent_ino = dentry->d_parent->d_inode->i_ino; 111 fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
112 fid64->parent_gen = dentry->d_parent->d_inode->i_generation; 112 fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
113 spin_unlock(&dentry->d_lock); 113 spin_unlock(&dentry->d_lock);
114 /*FALLTHRU*/ 114 /*FALLTHRU*/
115 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: 115 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
116 fid64->ino = inode->i_ino; 116 fid64->ino = XFS_I(inode)->i_ino;
117 fid64->gen = inode->i_generation; 117 fid64->gen = inode->i_generation;
118 break; 118 break;
119 } 119 }
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d22e6262343..35c2aff38b2 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -217,7 +217,7 @@ xfs_efi_item_committing(
217/* 217/*
218 * This is the ops vector shared by all efi log items. 218 * This is the ops vector shared by all efi log items.
219 */ 219 */
220static struct xfs_item_ops xfs_efi_item_ops = { 220static const struct xfs_item_ops xfs_efi_item_ops = {
221 .iop_size = xfs_efi_item_size, 221 .iop_size = xfs_efi_item_size,
222 .iop_format = xfs_efi_item_format, 222 .iop_format = xfs_efi_item_format,
223 .iop_pin = xfs_efi_item_pin, 223 .iop_pin = xfs_efi_item_pin,
@@ -477,7 +477,7 @@ xfs_efd_item_committing(
477/* 477/*
478 * This is the ops vector shared by all efd log items. 478 * This is the ops vector shared by all efd log items.
479 */ 479 */
480static struct xfs_item_ops xfs_efd_item_ops = { 480static const struct xfs_item_ops xfs_efd_item_ops = {
481 .iop_size = xfs_efd_item_size, 481 .iop_size = xfs_efd_item_size,
482 .iop_format = xfs_efd_item_format, 482 .iop_format = xfs_efd_item_format,
483 .iop_pin = xfs_efd_item_pin, 483 .iop_pin = xfs_efd_item_pin,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c0237c602f1..755ee816488 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,6 +2835,27 @@ corrupt_out:
2835 return XFS_ERROR(EFSCORRUPTED); 2835 return XFS_ERROR(EFSCORRUPTED);
2836} 2836}
2837 2837
2838void
2839xfs_promote_inode(
2840 struct xfs_inode *ip)
2841{
2842 struct xfs_buf *bp;
2843
2844 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2845
2846 bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
2847 ip->i_imap.im_len, XBF_TRYLOCK);
2848 if (!bp)
2849 return;
2850
2851 if (XFS_BUF_ISDELAYWRITE(bp)) {
2852 xfs_buf_delwri_promote(bp);
2853 wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
2854 }
2855
2856 xfs_buf_relse(bp);
2857}
2858
2838/* 2859/*
2839 * Return a pointer to the extent record at file index idx. 2860 * Return a pointer to the extent record at file index idx.
2840 */ 2861 */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 760140d1dd6..b4cd4739f98 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
498void xfs_iext_realloc(xfs_inode_t *, int, int); 498void xfs_iext_realloc(xfs_inode_t *, int, int);
499void xfs_iunpin_wait(xfs_inode_t *); 499void xfs_iunpin_wait(xfs_inode_t *);
500int xfs_iflush(xfs_inode_t *, uint); 500int xfs_iflush(xfs_inode_t *, uint);
501void xfs_promote_inode(struct xfs_inode *);
501void xfs_lock_inodes(xfs_inode_t **, int, uint); 502void xfs_lock_inodes(xfs_inode_t **, int, uint);
502void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 503void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
503 504
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index b7cf21ba240..abaafdbb3e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -795,7 +795,7 @@ xfs_inode_item_committing(
795/* 795/*
796 * This is the ops vector shared by all buf log items. 796 * This is the ops vector shared by all buf log items.
797 */ 797 */
798static struct xfs_item_ops xfs_inode_item_ops = { 798static const struct xfs_item_ops xfs_inode_item_ops = {
799 .iop_size = xfs_inode_item_size, 799 .iop_size = xfs_inode_item_size,
800 .iop_format = xfs_inode_item_format, 800 .iop_format = xfs_inode_item_format,
801 .iop_pin = xfs_inode_item_pin, 801 .iop_pin = xfs_inode_item_pin,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2758a6277c5..34817adf4b9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -150,6 +150,117 @@ xlog_grant_add_space(
150 } while (head_val != old); 150 } while (head_val != old);
151} 151}
152 152
153STATIC bool
154xlog_reserveq_wake(
155 struct log *log,
156 int *free_bytes)
157{
158 struct xlog_ticket *tic;
159 int need_bytes;
160
161 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
162 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
163 need_bytes = tic->t_unit_res * tic->t_cnt;
164 else
165 need_bytes = tic->t_unit_res;
166
167 if (*free_bytes < need_bytes)
168 return false;
169 *free_bytes -= need_bytes;
170
171 trace_xfs_log_grant_wake_up(log, tic);
172 wake_up(&tic->t_wait);
173 }
174
175 return true;
176}
177
178STATIC bool
179xlog_writeq_wake(
180 struct log *log,
181 int *free_bytes)
182{
183 struct xlog_ticket *tic;
184 int need_bytes;
185
186 list_for_each_entry(tic, &log->l_writeq, t_queue) {
187 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
188
189 need_bytes = tic->t_unit_res;
190
191 if (*free_bytes < need_bytes)
192 return false;
193 *free_bytes -= need_bytes;
194
195 trace_xfs_log_regrant_write_wake_up(log, tic);
196 wake_up(&tic->t_wait);
197 }
198
199 return true;
200}
201
202STATIC int
203xlog_reserveq_wait(
204 struct log *log,
205 struct xlog_ticket *tic,
206 int need_bytes)
207{
208 list_add_tail(&tic->t_queue, &log->l_reserveq);
209
210 do {
211 if (XLOG_FORCED_SHUTDOWN(log))
212 goto shutdown;
213 xlog_grant_push_ail(log, need_bytes);
214
215 XFS_STATS_INC(xs_sleep_logspace);
216 trace_xfs_log_grant_sleep(log, tic);
217
218 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
219 trace_xfs_log_grant_wake(log, tic);
220
221 spin_lock(&log->l_grant_reserve_lock);
222 if (XLOG_FORCED_SHUTDOWN(log))
223 goto shutdown;
224 } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
225
226 list_del_init(&tic->t_queue);
227 return 0;
228shutdown:
229 list_del_init(&tic->t_queue);
230 return XFS_ERROR(EIO);
231}
232
233STATIC int
234xlog_writeq_wait(
235 struct log *log,
236 struct xlog_ticket *tic,
237 int need_bytes)
238{
239 list_add_tail(&tic->t_queue, &log->l_writeq);
240
241 do {
242 if (XLOG_FORCED_SHUTDOWN(log))
243 goto shutdown;
244 xlog_grant_push_ail(log, need_bytes);
245
246 XFS_STATS_INC(xs_sleep_logspace);
247 trace_xfs_log_regrant_write_sleep(log, tic);
248
249 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
250 trace_xfs_log_regrant_write_wake(log, tic);
251
252 spin_lock(&log->l_grant_write_lock);
253 if (XLOG_FORCED_SHUTDOWN(log))
254 goto shutdown;
255 } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
256
257 list_del_init(&tic->t_queue);
258 return 0;
259shutdown:
260 list_del_init(&tic->t_queue);
261 return XFS_ERROR(EIO);
262}
263
153static void 264static void
154xlog_tic_reset_res(xlog_ticket_t *tic) 265xlog_tic_reset_res(xlog_ticket_t *tic)
155{ 266{
@@ -350,8 +461,19 @@ xfs_log_reserve(
350 retval = xlog_grant_log_space(log, internal_ticket); 461 retval = xlog_grant_log_space(log, internal_ticket);
351 } 462 }
352 463
464 if (unlikely(retval)) {
465 /*
466 * If we are failing, make sure the ticket doesn't have any
467 * current reservations. We don't want to add this back
468 * when the ticket/ transaction gets cancelled.
469 */
470 internal_ticket->t_curr_res = 0;
471 /* ungrant will give back unit_res * t_cnt. */
472 internal_ticket->t_cnt = 0;
473 }
474
353 return retval; 475 return retval;
354} /* xfs_log_reserve */ 476}
355 477
356 478
357/* 479/*
@@ -626,7 +748,7 @@ xfs_log_item_init(
626 struct xfs_mount *mp, 748 struct xfs_mount *mp,
627 struct xfs_log_item *item, 749 struct xfs_log_item *item,
628 int type, 750 int type,
629 struct xfs_item_ops *ops) 751 const struct xfs_item_ops *ops)
630{ 752{
631 item->li_mountp = mp; 753 item->li_mountp = mp;
632 item->li_ailp = mp->m_ail; 754 item->li_ailp = mp->m_ail;
@@ -2481,8 +2603,8 @@ restart:
2481/* 2603/*
2482 * Atomically get the log space required for a log ticket. 2604 * Atomically get the log space required for a log ticket.
2483 * 2605 *
2484 * Once a ticket gets put onto the reserveq, it will only return after 2606 * Once a ticket gets put onto the reserveq, it will only return after the
2485 * the needed reservation is satisfied. 2607 * needed reservation is satisfied.
2486 * 2608 *
2487 * This function is structured so that it has a lock free fast path. This is 2609 * This function is structured so that it has a lock free fast path. This is
2488 * necessary because every new transaction reservation will come through this 2610 * necessary because every new transaction reservation will come through this
@@ -2490,113 +2612,53 @@ restart:
2490 * every pass. 2612 * every pass.
2491 * 2613 *
2492 * As tickets are only ever moved on and off the reserveq under the 2614 * As tickets are only ever moved on and off the reserveq under the
2493 * l_grant_reserve_lock, we only need to take that lock if we are going 2615 * l_grant_reserve_lock, we only need to take that lock if we are going to add
2494 * to add the ticket to the queue and sleep. We can avoid taking the lock if the 2616 * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
2495 * ticket was never added to the reserveq because the t_queue list head will be 2617 * was never added to the reserveq because the t_queue list head will be empty
2496 * empty and we hold the only reference to it so it can safely be checked 2618 * and we hold the only reference to it so it can safely be checked unlocked.
2497 * unlocked.
2498 */ 2619 */
2499STATIC int 2620STATIC int
2500xlog_grant_log_space(xlog_t *log, 2621xlog_grant_log_space(
2501 xlog_ticket_t *tic) 2622 struct log *log,
2623 struct xlog_ticket *tic)
2502{ 2624{
2503 int free_bytes; 2625 int free_bytes, need_bytes;
2504 int need_bytes; 2626 int error = 0;
2505 2627
2506#ifdef DEBUG 2628 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2507 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2508 panic("grant Recovery problem");
2509#endif
2510 2629
2511 trace_xfs_log_grant_enter(log, tic); 2630 trace_xfs_log_grant_enter(log, tic);
2512 2631
2632 /*
2633 * If there are other waiters on the queue then give them a chance at
 2634 * logspace before us. Wake up the first waiters; if we do not wake
2635 * up all the waiters then go to sleep waiting for more free space,
2636 * otherwise try to get some space for this transaction.
2637 */
2513 need_bytes = tic->t_unit_res; 2638 need_bytes = tic->t_unit_res;
2514 if (tic->t_flags & XFS_LOG_PERM_RESERV) 2639 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2515 need_bytes *= tic->t_ocnt; 2640 need_bytes *= tic->t_ocnt;
2516
2517 /* something is already sleeping; insert new transaction at end */
2518 if (!list_empty_careful(&log->l_reserveq)) {
2519 spin_lock(&log->l_grant_reserve_lock);
2520 /* recheck the queue now we are locked */
2521 if (list_empty(&log->l_reserveq)) {
2522 spin_unlock(&log->l_grant_reserve_lock);
2523 goto redo;
2524 }
2525 list_add_tail(&tic->t_queue, &log->l_reserveq);
2526
2527 trace_xfs_log_grant_sleep1(log, tic);
2528
2529 /*
2530 * Gotta check this before going to sleep, while we're
2531 * holding the grant lock.
2532 */
2533 if (XLOG_FORCED_SHUTDOWN(log))
2534 goto error_return;
2535
2536 XFS_STATS_INC(xs_sleep_logspace);
2537 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2538
2539 /*
2540 * If we got an error, and the filesystem is shutting down,
2541 * we'll catch it down below. So just continue...
2542 */
2543 trace_xfs_log_grant_wake1(log, tic);
2544 }
2545
2546redo:
2547 if (XLOG_FORCED_SHUTDOWN(log))
2548 goto error_return_unlocked;
2549
2550 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); 2641 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2551 if (free_bytes < need_bytes) { 2642 if (!list_empty_careful(&log->l_reserveq)) {
2552 spin_lock(&log->l_grant_reserve_lock); 2643 spin_lock(&log->l_grant_reserve_lock);
2553 if (list_empty(&tic->t_queue)) 2644 if (!xlog_reserveq_wake(log, &free_bytes) ||
2554 list_add_tail(&tic->t_queue, &log->l_reserveq); 2645 free_bytes < need_bytes)
2555 2646 error = xlog_reserveq_wait(log, tic, need_bytes);
2556 trace_xfs_log_grant_sleep2(log, tic); 2647 spin_unlock(&log->l_grant_reserve_lock);
2557 2648 } else if (free_bytes < need_bytes) {
2558 if (XLOG_FORCED_SHUTDOWN(log))
2559 goto error_return;
2560
2561 xlog_grant_push_ail(log, need_bytes);
2562
2563 XFS_STATS_INC(xs_sleep_logspace);
2564 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2565
2566 trace_xfs_log_grant_wake2(log, tic);
2567 goto redo;
2568 }
2569
2570 if (!list_empty(&tic->t_queue)) {
2571 spin_lock(&log->l_grant_reserve_lock); 2649 spin_lock(&log->l_grant_reserve_lock);
2572 list_del_init(&tic->t_queue); 2650 error = xlog_reserveq_wait(log, tic, need_bytes);
2573 spin_unlock(&log->l_grant_reserve_lock); 2651 spin_unlock(&log->l_grant_reserve_lock);
2574 } 2652 }
2653 if (error)
2654 return error;
2575 2655
2576 /* we've got enough space */
2577 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); 2656 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2578 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); 2657 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2579 trace_xfs_log_grant_exit(log, tic); 2658 trace_xfs_log_grant_exit(log, tic);
2580 xlog_verify_grant_tail(log); 2659 xlog_verify_grant_tail(log);
2581 return 0; 2660 return 0;
2582 2661}
2583error_return_unlocked:
2584 spin_lock(&log->l_grant_reserve_lock);
2585error_return:
2586 list_del_init(&tic->t_queue);
2587 spin_unlock(&log->l_grant_reserve_lock);
2588 trace_xfs_log_grant_error(log, tic);
2589
2590 /*
2591 * If we are failing, make sure the ticket doesn't have any
2592 * current reservations. We don't want to add this back when
2593 * the ticket/transaction gets cancelled.
2594 */
2595 tic->t_curr_res = 0;
2596 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2597 return XFS_ERROR(EIO);
2598} /* xlog_grant_log_space */
2599
2600 2662
2601/* 2663/*
2602 * Replenish the byte reservation required by moving the grant write head. 2664 * Replenish the byte reservation required by moving the grant write head.
@@ -2605,10 +2667,12 @@ error_return:
2605 * free fast path. 2667 * free fast path.
2606 */ 2668 */
2607STATIC int 2669STATIC int
2608xlog_regrant_write_log_space(xlog_t *log, 2670xlog_regrant_write_log_space(
2609 xlog_ticket_t *tic) 2671 struct log *log,
2672 struct xlog_ticket *tic)
2610{ 2673{
2611 int free_bytes, need_bytes; 2674 int free_bytes, need_bytes;
2675 int error = 0;
2612 2676
2613 tic->t_curr_res = tic->t_unit_res; 2677 tic->t_curr_res = tic->t_unit_res;
2614 xlog_tic_reset_res(tic); 2678 xlog_tic_reset_res(tic);
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log,
2616 if (tic->t_cnt > 0) 2680 if (tic->t_cnt > 0)
2617 return 0; 2681 return 0;
2618 2682
2619#ifdef DEBUG 2683 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2620 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2621 panic("regrant Recovery problem");
2622#endif
2623 2684
2624 trace_xfs_log_regrant_write_enter(log, tic); 2685 trace_xfs_log_regrant_write_enter(log, tic);
2625 if (XLOG_FORCED_SHUTDOWN(log))
2626 goto error_return_unlocked;
2627 2686
2628 /* If there are other waiters on the queue then give them a 2687 /*
2629 * chance at logspace before us. Wake up the first waiters, 2688 * If there are other waiters on the queue then give them a chance at
2630 * if we do not wake up all the waiters then go to sleep waiting 2689 * logspace before us. Wake up the first waiters, if we do not wake
2631 * for more free space, otherwise try to get some space for 2690 * up all the waiters then go to sleep waiting for more free space,
2632 * this transaction. 2691 * otherwise try to get some space for this transaction.
2633 */ 2692 */
2634 need_bytes = tic->t_unit_res; 2693 need_bytes = tic->t_unit_res;
2635 if (!list_empty_careful(&log->l_writeq)) {
2636 struct xlog_ticket *ntic;
2637
2638 spin_lock(&log->l_grant_write_lock);
2639 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2640 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2641 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2642
2643 if (free_bytes < ntic->t_unit_res)
2644 break;
2645 free_bytes -= ntic->t_unit_res;
2646 wake_up(&ntic->t_wait);
2647 }
2648
2649 if (ntic != list_first_entry(&log->l_writeq,
2650 struct xlog_ticket, t_queue)) {
2651 if (list_empty(&tic->t_queue))
2652 list_add_tail(&tic->t_queue, &log->l_writeq);
2653 trace_xfs_log_regrant_write_sleep1(log, tic);
2654
2655 xlog_grant_push_ail(log, need_bytes);
2656
2657 XFS_STATS_INC(xs_sleep_logspace);
2658 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2659 trace_xfs_log_regrant_write_wake1(log, tic);
2660 } else
2661 spin_unlock(&log->l_grant_write_lock);
2662 }
2663
2664redo:
2665 if (XLOG_FORCED_SHUTDOWN(log))
2666 goto error_return_unlocked;
2667
2668 free_bytes = xlog_space_left(log, &log->l_grant_write_head); 2694 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2669 if (free_bytes < need_bytes) { 2695 if (!list_empty_careful(&log->l_writeq)) {
2670 spin_lock(&log->l_grant_write_lock); 2696 spin_lock(&log->l_grant_write_lock);
2671 if (list_empty(&tic->t_queue)) 2697 if (!xlog_writeq_wake(log, &free_bytes) ||
2672 list_add_tail(&tic->t_queue, &log->l_writeq); 2698 free_bytes < need_bytes)
2673 2699 error = xlog_writeq_wait(log, tic, need_bytes);
2674 if (XLOG_FORCED_SHUTDOWN(log)) 2700 spin_unlock(&log->l_grant_write_lock);
2675 goto error_return; 2701 } else if (free_bytes < need_bytes) {
2676
2677 xlog_grant_push_ail(log, need_bytes);
2678
2679 XFS_STATS_INC(xs_sleep_logspace);
2680 trace_xfs_log_regrant_write_sleep2(log, tic);
2681 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2682
2683 trace_xfs_log_regrant_write_wake2(log, tic);
2684 goto redo;
2685 }
2686
2687 if (!list_empty(&tic->t_queue)) {
2688 spin_lock(&log->l_grant_write_lock); 2702 spin_lock(&log->l_grant_write_lock);
2689 list_del_init(&tic->t_queue); 2703 error = xlog_writeq_wait(log, tic, need_bytes);
2690 spin_unlock(&log->l_grant_write_lock); 2704 spin_unlock(&log->l_grant_write_lock);
2691 } 2705 }
2692 2706
2693 /* we've got enough space */ 2707 if (error)
2708 return error;
2709
2694 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); 2710 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2695 trace_xfs_log_regrant_write_exit(log, tic); 2711 trace_xfs_log_regrant_write_exit(log, tic);
2696 xlog_verify_grant_tail(log); 2712 xlog_verify_grant_tail(log);
2697 return 0; 2713 return 0;
2698 2714}
2699
2700 error_return_unlocked:
2701 spin_lock(&log->l_grant_write_lock);
2702 error_return:
2703 list_del_init(&tic->t_queue);
2704 spin_unlock(&log->l_grant_write_lock);
2705 trace_xfs_log_regrant_write_error(log, tic);
2706
2707 /*
2708 * If we are failing, make sure the ticket doesn't have any
2709 * current reservations. We don't want to add this back when
2710 * the ticket/transaction gets cancelled.
2711 */
2712 tic->t_curr_res = 0;
2713 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2714 return XFS_ERROR(EIO);
2715} /* xlog_regrant_write_log_space */
2716
2717 2715
2718/* The first cnt-1 times through here we don't need to 2716/* The first cnt-1 times through here we don't need to
2719 * move the grant write head because the permanent 2717 * move the grant write head because the permanent
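
The rewritten xlog_regrant_write_log_space() above pushes all queueing and sleeping into two new helpers, xlog_writeq_wake() and xlog_writeq_wait(), which this patch adds earlier in xfs_log.c and which are not visible in the hunks shown here. A minimal sketch of their likely shape, reconstructed from the old inline sleep/wake code being removed (illustrative only, not the verbatim patch):

/* Sketch: wake queued write-grant waiters for as long as space remains. */
STATIC bool
xlog_writeq_wake(
	struct log		*log,
	int			*free_bytes)
{
	struct xlog_ticket	*tic;

	list_for_each_entry(tic, &log->l_writeq, t_queue) {
		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);

		if (*free_bytes < tic->t_unit_res)
			return false;	/* ran out before waking everyone */
		*free_bytes -= tic->t_unit_res;

		trace_xfs_log_regrant_write_wake_up(log, tic);
		wake_up(&tic->t_wait);
	}
	return true;
}

/* Sketch: queue the ticket and sleep until enough write grant space frees up. */
STATIC int
xlog_writeq_wait(
	struct log		*log,
	struct xlog_ticket	*tic,
	int			need_bytes)
{
	list_add_tail(&tic->t_queue, &log->l_writeq);

	do {
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
		xlog_grant_push_ail(log, need_bytes);

		XFS_STATS_INC(xs_sleep_logspace);
		trace_xfs_log_regrant_write_sleep(log, tic);

		/* xlog_wait() drops l_grant_write_lock while sleeping ... */
		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
		trace_xfs_log_regrant_write_wake(log, tic);

		/* ... so it is re-taken before rechecking the grant head. */
		spin_lock(&log->l_grant_write_lock);
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
	} while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);

	list_del_init(&tic->t_queue);
	return 0;
shutdown:
	list_del_init(&tic->t_queue);
	return XFS_ERROR(EIO);
}

With the helpers factored out, the only failure the caller has to handle is the error value propagated back from xlog_writeq_wait(), which is why the single "if (error) return error;" check replaces the old error_return labels.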
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 78c9039994a..3f7bf451c03 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -137,7 +137,7 @@ struct xfs_trans;
 void	xfs_log_item_init(struct xfs_mount	*mp,
 			struct xfs_log_item	*item,
 			int			type,
-			struct xfs_item_ops	*ops);
+			const struct xfs_item_ops *ops);
 
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 			struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5cff443f6cd..0bbb1a41998 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one(
 	 * disk and we didn't ask it to allocate;
 	 * ESRCH if quotas got turned off suddenly.
 	 */
-	error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+	error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+			     doalloc | XFS_QMOPT_DOWARN, &dqp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index aa3dc1a4d53..be5c51d8f75 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -770,6 +770,17 @@ restart:
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
 			goto out;
+
+		/*
+		 * If we only have a single dirty inode in a cluster there is
+		 * a fair chance that the AIL push may have pushed it into
+		 * the buffer, but xfsbufd won't touch it until 30 seconds
+		 * from now, and thus we will lock up here.
+		 *
+		 * Promote the inode buffer to the front of the delwri list
+		 * and wake up xfsbufd now.
+		 */
+		xfs_promote_inode(ip);
 		xfs_iflock(ip);
 	}
 
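
xfs_promote_inode() itself is added by this patch in a hunk not shown here. Going by the comment above, it looks up the inode's backing cluster buffer and, if that buffer sits on the delayed-write list, moves it to the front and kicks xfsbufd. A rough sketch of that behaviour (an approximation for illustration, not the exact implementation from the patch):

STATIC void
xfs_promote_inode(
	struct xfs_inode	*ip)
{
	struct xfs_buf		*bp;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));

	/* Find the in-core inode cluster buffer without blocking. */
	bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
			ip->i_imap.im_len, XBF_TRYLOCK);
	if (!bp)
		return;

	if (XFS_BUF_ISDELAYWRITE(bp)) {
		xfs_buf_delwri_promote(bp);	/* head of the delwri list */
		wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
	}
	xfs_buf_relse(bp);
}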
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f1d2802b2f0..49403579887 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
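
After this consolidation each grant path carries a single sleep/wake trace pair instead of two numbered variants; for instance, the write-regrant wait loop (see the xfs_log.c changes above) now emits one pair along the lines of:

	XFS_STATS_INC(xs_sleep_logspace);
	trace_xfs_log_regrant_write_sleep(log, tic);	/* was _sleep1 / _sleep2 */
	xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
	trace_xfs_log_regrant_write_wake(log, tic);	/* was _wake1 / _wake2 */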
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 603f3eb5204..3ae713c0abd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -326,7 +326,7 @@ typedef struct xfs_log_item {
 					      struct xfs_log_item *);
 							/* buffer item iodone */
 							/* callback func */
-	struct xfs_item_ops		*li_ops;	/* function list */
+	const struct xfs_item_ops	*li_ops;	/* function list */
 
 						/* delayed logging */
 	struct list_head		li_cil;		/* CIL pointers */
@@ -341,7 +341,7 @@ typedef struct xfs_log_item {
 	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
 	{ XFS_LI_ABORTED,	"ABORTED" }
 
-typedef struct xfs_item_ops {
+struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
@@ -352,7 +352,7 @@ typedef struct xfs_item_ops {
 	void (*iop_push)(xfs_log_item_t *);
 	bool (*iop_pushbuf)(xfs_log_item_t *);
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
+};
 
 #define IOP_SIZE(ip)	(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
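
Because xfs_log_item_init() now takes a const pointer (see the xfs_log.h hunk above) and the typedef is gone, each log item type can keep its ops vector in rodata. A hypothetical item type would now be wired up roughly like this (the xfs_foo_* names and XFS_LI_FOO are placeholders, not real XFS symbols):

static uint xfs_foo_item_size(xfs_log_item_t *lip);
static void xfs_foo_item_format(xfs_log_item_t *lip, struct xfs_log_iovec *vecp);

/* const: the function table can never change at run time */
static const struct xfs_item_ops xfs_foo_item_ops = {
	.iop_size	= xfs_foo_item_size,
	.iop_format	= xfs_foo_item_format,
	/* ... remaining iop_* callbacks ... */
};

	/* in the item constructor: */
	xfs_log_item_init(mp, &foo->foo_item, XFS_LI_FOO, &xfs_foo_item_ops);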
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4ecf2a54906..ce9268a2f56 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -112,7 +112,7 @@ xfs_readlink(
 	char		*link)
 {
 	xfs_mount_t	*mp = ip->i_mount;
-	int		pathlen;
+	xfs_fsize_t	pathlen;
 	int		error = 0;
 
 	trace_xfs_readlink(ip);
@@ -122,13 +122,19 @@ xfs_readlink(
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	ASSERT(S_ISLNK(ip->i_d.di_mode));
-	ASSERT(ip->i_d.di_size <= MAXPATHLEN);
-
 	pathlen = ip->i_d.di_size;
 	if (!pathlen)
 		goto out;
 
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+			 __func__, (unsigned long long) ip->i_ino,
+			 (long long) pathlen);
+		ASSERT(0);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+
 	if (ip->i_df.if_flags & XFS_IFINLINE) {
 		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 		link[pathlen] = '\0';
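
The new bounds check matters because xfs_readlink() callers size the destination buffer from MAXPATHLEN, not from the on-disk di_size, so a corrupted symlink length would otherwise let the memcpy() above (or xfs_readlink_bmap() for out-of-line symlinks) run past the end of that buffer. The calling pattern is roughly the following; an approximate sketch of the in-kernel caller, not a verbatim copy:

	char	*link;
	int	error;

	/* Callers allocate a fixed MAXPATHLEN-sized buffer ... */
	link = kmalloc(MAXPATHLEN + 1, GFP_KERNEL);
	if (!link)
		return -ENOMEM;

	/*
	 * ... so xfs_readlink() must never copy more than MAXPATHLEN bytes.
	 * (XFS returns positive errnos internally, hence the negation.)
	 */
	error = -xfs_readlink(ip, link);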