Merge branch 'for-linus' into for-3.1/core

Conflicts:
	block/blk-throttle.c
	block/cfq-iosched.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
author: Jens Axboe <jaxboe@fusionio.com> 2011-07-01 10:17:13 -0400
committer: Jens Axboe <jaxboe@fusionio.com> 2011-07-01 10:17:13 -0400
commit: 04bf7869ca0fd12009aee301cac2264a36df4d98 (patch)
tree: 66cb81ebf8b76560a31433c2c493dc430c914af9 /fs
parent: d2f31a5fd60d168b00fc4f7617b68a1287b21e90 (diff)
parent: 7b28afe01ab6ffb5f152f47831b44933facd2328 (diff)
118 files changed, 2016 insertions, 1662 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f24927..1b0b19550015 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -584,11 +584,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 success:
        d_add(dentry, inode);
-        _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
+        _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }",
               fid.vnode,
               fid.unique,
               dentry->d_inode->i_ino,
-               (unsigned long long)dentry->d_inode->i_version);
+               dentry->d_inode->i_generation);
        return NULL;
 }
@@ -671,10 +671,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
                 * been deleted and replaced, and the original vnode ID has
                 * been reused */
                if (fid.unique != vnode->fid.unique) {
-                        _debug("%s: file deleted (uq %u -> %u I:%llu)",
+                        _debug("%s: file deleted (uq %u -> %u I:%u)",
                               dentry->d_name.name, fid.unique,
                               vnode->fid.unique,
-                               (unsigned long long)dentry->d_inode->i_version);
+                               dentry->d_inode->i_generation);
                        spin_lock(&vnode->lock);
                        set_bit(AFS_VNODE_DELETED, &vnode->flags);
                        spin_unlock(&vnode->lock);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 4bd0218473a9..346e3289abd7 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -89,7 +89,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
                        i_size_write(&vnode->vfs_inode, size);
                        vnode->vfs_inode.i_uid = status->owner;
                        vnode->vfs_inode.i_gid = status->group;
-                        vnode->vfs_inode.i_version = vnode->fid.unique;
+                        vnode->vfs_inode.i_generation = vnode->fid.unique;
                        vnode->vfs_inode.i_nlink = status->nlink;
                        mode = vnode->vfs_inode.i_mode;
@@ -102,6 +102,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
                vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
                vnode->vfs_inode.i_mtime        = vnode->vfs_inode.i_ctime;
                vnode->vfs_inode.i_atime        = vnode->vfs_inode.i_ctime;
+                vnode->vfs_inode.i_version      = data_version;
        }
        expected_version = status->data_version;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index db66c5201474..0fdab6e03d87 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -75,7 +75,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
        inode->i_ctime.tv_nsec  = 0;
        inode->i_atime          = inode->i_mtime = inode->i_ctime;
        inode->i_blocks         = 0;
-        inode->i_version        = vnode->fid.unique;
+        inode->i_generation     = vnode->fid.unique;
+        inode->i_version        = vnode->status.data_version;
        inode->i_mapping->a_ops = &afs_fs_aops;
        /* check to see whether a symbolic link is really a mountpoint */
@@ -100,7 +101,7 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
        struct afs_iget_data *data = opaque;
        return inode->i_ino == data->fid.vnode &&
-                inode->i_version == data->fid.unique;
+                inode->i_generation == data->fid.unique;
 }
 /*
@@ -122,7 +123,7 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
        struct afs_vnode *vnode = AFS_FS_I(inode);
        inode->i_ino = data->fid.vnode;
-        inode->i_version = data->fid.unique;
+        inode->i_generation = data->fid.unique;
        vnode->fid = data->fid;
        vnode->volume = data->volume;
@@ -380,8 +381,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        inode = dentry->d_inode;
-        _enter("{ ino=%lu v=%llu }", inode->i_ino,
+        _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
-                (unsigned long long)inode->i_version);
        generic_fillattr(inode, stat);
        return 0;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index fb240e8766d6..356dcf0929e8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -31,8 +31,8 @@
 static void afs_i_init_once(void *foo);
 static struct dentry *afs_mount(struct file_system_type *fs_type,
                      int flags, const char *dev_name, void *data);
+static void afs_kill_super(struct super_block *sb);
 static struct inode *afs_alloc_inode(struct super_block *sb);
-static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
 static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -40,7 +40,7 @@ struct file_system_type afs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "afs",
        .mount          = afs_mount,
-        .kill_sb        = kill_anon_super,
+        .kill_sb        = afs_kill_super,
        .fs_flags       = 0,
 };
@@ -50,7 +50,6 @@ static const struct super_operations afs_super_ops = {
        .drop_inode     = afs_drop_inode,
        .destroy_inode  = afs_destroy_inode,
        .evict_inode    = afs_evict_inode,
-        .put_super      = afs_put_super,
        .show_options   = generic_show_options,
 };
@@ -282,19 +281,25 @@ static int afs_parse_device_name(struct afs_mount_params *params,
 */
 static int afs_test_super(struct super_block *sb, void *data)
 {
-        struct afs_mount_params *params = data;
+        struct afs_super_info *as1 = data;
        struct afs_super_info *as = sb->s_fs_info;
-        return as->volume == params->volume;
+        return as->volume == as1->volume;
+}
+static int afs_set_super(struct super_block *sb, void *data)
+{
+        sb->s_fs_info = data;
+        return set_anon_super(sb, NULL);
 }
 /*
 * fill in the superblock
 */
-static int afs_fill_super(struct super_block *sb, void *data)
+static int afs_fill_super(struct super_block *sb,
+                          struct afs_mount_params *params)
 {
-        struct afs_mount_params *params = data;
+        struct afs_super_info *as = sb->s_fs_info;
-        struct afs_super_info *as = NULL;
        struct afs_fid fid;
        struct dentry *root = NULL;
        struct inode *inode = NULL;
@@ -302,23 +307,13 @@ static int afs_fill_super(struct super_block *sb, void *data)
        _enter("");
-        /* allocate a superblock info record */
-        as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
-        if (!as) {
-                _leave(" = -ENOMEM");
-                return -ENOMEM;
-        }
-        afs_get_volume(params->volume);
-        as->volume = params->volume;
        /* fill in the superblock */
        sb->s_blocksize         = PAGE_CACHE_SIZE;
        sb->s_blocksize_bits    = PAGE_CACHE_SHIFT;
        sb->s_magic             = AFS_FS_MAGIC;
        sb->s_op                = &afs_super_ops;
-        sb->s_fs_info           = as;
        sb->s_bdi               = &as->volume->bdi;
+        strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
        /* allocate the root inode and dentry */
        fid.vid         = as->volume->vid;
@@ -326,7 +321,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
        fid.unique      = 1;
        inode = afs_iget(sb, params->key, &fid, NULL, NULL);
        if (IS_ERR(inode))
-                goto error_inode;
+                return PTR_ERR(inode);
        if (params->autocell)
                set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
@@ -342,16 +337,8 @@ static int afs_fill_super(struct super_block *sb, void *data)
        _leave(" = 0");
        return 0;
-error_inode:
-        ret = PTR_ERR(inode);
-        inode = NULL;
 error:
        iput(inode);
-        afs_put_volume(as->volume);
-        kfree(as);
-        sb->s_fs_info = NULL;
        _leave(" = %d", ret);
        return ret;
 }
@@ -367,6 +354,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
        struct afs_volume *vol;
        struct key *key;
        char *new_opts = kstrdup(options, GFP_KERNEL);
+        struct afs_super_info *as;
        int ret;
        _enter(",,%s,%p", dev_name, options);
@@ -399,12 +387,22 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
                ret = PTR_ERR(vol);
                goto error;
        }
-        params.volume = vol;
+        /* allocate a superblock info record */
+        as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
+        if (!as) {
+                ret = -ENOMEM;
+                afs_put_volume(vol);
+                goto error;
+        }
+        as->volume = vol;
        /* allocate a deviceless superblock */
-        sb = sget(fs_type, afs_test_super, set_anon_super, &params);
+        sb = sget(fs_type, afs_test_super, afs_set_super, as);
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
+                afs_put_volume(vol);
+                kfree(as);
                goto error;
        }
@@ -422,16 +420,16 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
        } else {
                _debug("reuse");
                ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+                afs_put_volume(vol);
+                kfree(as);
        }
-        afs_put_volume(params.volume);
        afs_put_cell(params.cell);
        kfree(new_opts);
        _leave(" = 0 [%p]", sb);
        return dget(sb->s_root);
 error:
-        afs_put_volume(params.volume);
        afs_put_cell(params.cell);
        key_put(params.key);
        kfree(new_opts);
@@ -439,18 +437,12 @@ error:
        return ERR_PTR(ret);
 }
-/*
+static void afs_kill_super(struct super_block *sb)
- * finish the unmounting process on the superblock
- */
-static void afs_put_super(struct super_block *sb)
 {
        struct afs_super_info *as = sb->s_fs_info;
+        kill_anon_super(sb);
-        _enter("");
        afs_put_volume(as->volume);
+        kfree(as);
-        _leave("");
 }
 /*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 789b3afb3423..b806285ff853 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -84,23 +84,21 @@ void afs_put_writeback(struct afs_writeback *wb)
 * partly or wholly fill a page that's under preparation for writing
 */
 static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
-                         loff_t pos, unsigned len, struct page *page)
+                         loff_t pos, struct page *page)
 {
        loff_t i_size;
-        unsigned eof;
        int ret;
+        int len;
-        _enter(",,%llu,%u", (unsigned long long)pos, len);
+        _enter(",,%llu", (unsigned long long)pos);
-        ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
        i_size = i_size_read(&vnode->vfs_inode);
-        if (pos + len > i_size)
+        if (pos + PAGE_CACHE_SIZE > i_size)
-                eof = i_size;
+                len = i_size - pos;
        else
-                eof = PAGE_CACHE_SIZE;
+                len = PAGE_CACHE_SIZE;
-        ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
+        ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
        if (ret < 0) {
                if (ret == -ENOENT) {
                        _debug("got NOENT from server"
@@ -153,9 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
        *pagep = page;
        /* page won't leak in error case: it eventually gets cleaned off LRU */
-        if (!PageUptodate(page)) {
+        if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) {
-                _debug("not up to date");
+                ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page);
-                ret = afs_fill_page(vnode, key, pos, len, page);
                if (ret < 0) {
                        kfree(candidate);
                        _leave(" = %d [prep]", ret);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 9ad2369d9e35..bfcb18feb1df 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -231,9 +231,6 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
 static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
        return -EIO;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1f2b19978333..610e8e0b04b8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -762,7 +762,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
        if (!disk)
                return ERR_PTR(-ENXIO);
-        whole = bdget_disk(disk, 0);
+        /*
+         * Normally, @bdev should equal what's returned from bdget_disk()
+         * if partno is 0; however, some drivers (floppy) use multiple
+         * bdev's for the same physical device and @bdev may be one of the
+         * aliases.  Keep @bdev if partno is 0.  This means claimer
+         * tracking is broken for those devices but it has always been that
+         * way.
+         */
+        if (partno)
+                whole = bdget_disk(disk, 0);
+        else
+                whole = bdgrab(bdev);
        module_put(disk->fops->owner);
        put_disk(disk);
        if (!whole)
@@ -1272,8 +1284,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
                 * individual writeable reference is too fragile given the
                 * way @mode is used in blkdev_get/put().
                 */
-                if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+                if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
-                    !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+                    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
                        bdev->bd_write_holder = true;
                        disk_block_events(disk);
                }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 93b1aa932014..52d7eca8c7bf 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -121,9 +121,6 @@ struct btrfs_inode {
         */
        u64 index_cnt;
-        /* the start of block group preferred for allocations. */
-        u64 block_group;
        /* the fsync log has some corner cases that mean we have to check
         * directories to see if any unlinks have been done before
         * the directory was logged.  See tree-log.c for all the
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b0e18d986e0a..2e667868e0d2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -43,8 +43,6 @@ struct btrfs_path *btrfs_alloc_path(void)
 {
        struct btrfs_path *path;
        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-        if (path)
-                path->reada = 1;
        return path;
 }
@@ -1224,11 +1222,13 @@ static void reada_for_search(struct btrfs_root *root,
        u64 search;
        u64 target;
        u64 nread = 0;
+        u64 gen;
        int direction = path->reada;
        struct extent_buffer *eb;
        u32 nr;
        u32 blocksize;
        u32 nscan = 0;
+        bool map = true;
        if (level != 1)
                return;
@@ -1250,7 +1250,19 @@ static void reada_for_search(struct btrfs_root *root,
        nritems = btrfs_header_nritems(node);
        nr = slot;
+        if (node->map_token || path->skip_locking)
+                map = false;
        while (1) {
+                if (map && !node->map_token) {
+                        unsigned long offset = btrfs_node_key_ptr_offset(nr);
+                        map_private_extent_buffer(node, offset,
+                                                  sizeof(struct btrfs_key_ptr),
+                                                  &node->map_token,
+                                                  &node->kaddr,
+                                                  &node->map_start,
+                                                  &node->map_len, KM_USER1);
+                }
                if (direction < 0) {
                        if (nr == 0)
                                break;
@@ -1268,14 +1280,23 @@ static void reada_for_search(struct btrfs_root *root,
                search = btrfs_node_blockptr(node, nr);
                if ((search <= target && target - search <= 65536) ||
                    (search > target && search - target <= 65536)) {
-                        readahead_tree_block(root, search, blocksize,
+                        gen = btrfs_node_ptr_generation(node, nr);
-                                     btrfs_node_ptr_generation(node, nr));
+                        if (map && node->map_token) {
+                                unmap_extent_buffer(node, node->map_token,
+                                                    KM_USER1);
+                                node->map_token = NULL;
+                        }
+                        readahead_tree_block(root, search, blocksize, gen);
                        nread += blocksize;
                }
                nscan++;
                if ((nread > 65536 || nscan > 32))
                        break;
        }
+        if (map && node->map_token) {
+                unmap_extent_buffer(node, node->map_token, KM_USER1);
+                node->map_token = NULL;
+        }
 }
 /*
@@ -1648,9 +1669,6 @@ again:
                }
 cow_done:
                BUG_ON(!cow && ins_len);
-                if (level != btrfs_header_level(b))
-                        WARN_ON(1);
-                level = btrfs_header_level(b);
                p->nodes[level] = b;
                if (!p->skip_locking)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6c093fa98f61..f30ac05dbda7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,7 +19,6 @@
 #ifndef __BTRFS_CTREE__
 #define __BTRFS_CTREE__
-#include <linux/version.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
@@ -930,7 +929,6 @@ struct btrfs_fs_info {
         * is required instead of the faster short fsync log commits
         */
        u64 last_trans_log_full_commit;
-        u64 open_ioctl_trans;
        unsigned long mount_opt:20;
        unsigned long compress_type:4;
        u64 max_inline;
@@ -947,7 +945,6 @@ struct btrfs_fs_info {
        struct super_block *sb;
        struct inode *btree_inode;
        struct backing_dev_info bdi;
-        struct mutex trans_mutex;
        struct mutex tree_log_mutex;
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
@@ -968,6 +965,13 @@ struct btrfs_fs_info {
        struct rw_semaphore subvol_sem;
        struct srcu_struct subvol_srcu;
+        spinlock_t trans_lock;
+        /*
+         * the reloc mutex goes with the trans lock, it is taken
+         * during commit to protect us from the relocation code
+         */
+        struct mutex reloc_mutex;
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
@@ -980,6 +984,7 @@ struct btrfs_fs_info {
        atomic_t async_submit_draining;
        atomic_t nr_async_bios;
        atomic_t async_delalloc_pages;
+        atomic_t open_ioctl_trans;
        /*
         * this is used by the balancing code to wait for all the pending
@@ -1044,6 +1049,7 @@ struct btrfs_fs_info {
        int closing;
        int log_root_recovering;
        int enospc_unlink;
+        int trans_no_join;
        u64 total_pinned;
@@ -1065,7 +1071,6 @@ struct btrfs_fs_info {
        struct reloc_control *reloc_ctl;
        spinlock_t delalloc_lock;
-        spinlock_t new_trans_lock;
        u64 delalloc_bytes;
        /* data_alloc_cluster is only used in ssd mode */
@@ -1172,6 +1177,14 @@ struct btrfs_root {
        u32 type;
        u64 highest_objectid;
+        /* btrfs_record_root_in_trans is a multi-step process,
+         * and it can race with the balancing code.   But the
+         * race is very small, and only the first time the root
+         * is added to each transaction.  So in_trans_setup
+         * is used to tell us when more checks are required
+         */
+        unsigned long in_trans_setup;
        int ref_cows;
        int track_dirty;
        int in_radix;
@@ -1181,7 +1194,6 @@ struct btrfs_root {
        struct btrfs_key defrag_max;
        int defrag_running;
        char *name;
-        int in_sysfs;
        /* the dirty list is only used by non-reference counted roots */
        struct list_head dirty_list;
@@ -1340,6 +1352,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 #define BTRFS_MOUNT_ENOSPC_DEBUG         (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG         (1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE     (1 << 17)
 #define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
@@ -2238,6 +2251,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 void btrfs_block_rsv_release(struct btrfs_root *root,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2350,6 +2366,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
                        struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+        /*
+         * Get synced with close_ctree()
+         */
+        smp_mb();
+        return fs_info->closing;
+}
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
                        struct btrfs_path *path,
@@ -2512,8 +2537,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 int btrfs_writepages(struct address_space *mapping,
                     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *new_root,
+                             struct btrfs_root *new_root, u64 new_dirid);
-                             u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                         size_t size, struct bio *bio, unsigned long bio_flags);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 01e29503a54b..f1cbd028f7b3 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -297,7 +297,6 @@ struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
                item->data_len = data_len;
                item->ins_or_del = 0;
                item->bytes_reserved = 0;
-                item->block_rsv = NULL;
                item->delayed_node = NULL;
                atomic_set(&item->refs, 1);
        }
@@ -593,10 +592,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-        if (!ret) {
+        if (!ret)
                item->bytes_reserved = num_bytes;
-                item->block_rsv = dst_rsv;
-        }
        return ret;
 }
@@ -604,10 +601,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
                                                struct btrfs_delayed_item *item)
 {
+        struct btrfs_block_rsv *rsv;
        if (!item->bytes_reserved)
                return;
-        btrfs_block_rsv_release(root, item->block_rsv,
+        rsv = &root->fs_info->global_block_rsv;
+        btrfs_block_rsv_release(root, rsv,
                                item->bytes_reserved);
 }
@@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
        INIT_LIST_HEAD(&head);
        next = item;
+        nitems = 0;
        /*
         * count the number of the continuous items that we can insert in batch
@@ -1013,6 +1014,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_root *delayed_root;
        struct btrfs_delayed_node *curr_node, *prev_node;
        struct btrfs_path *path;
+        struct btrfs_block_rsv *block_rsv;
        int ret = 0;
        path = btrfs_alloc_path();
@@ -1020,6 +1022,9 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->leave_spinning = 1;
+        block_rsv = trans->block_rsv;
+        trans->block_rsv = &root->fs_info->global_block_rsv;
        delayed_root = btrfs_get_delayed_root(root);
        curr_node = btrfs_first_delayed_node(delayed_root);
@@ -1044,6 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
        }
        btrfs_free_path(path);
+        trans->block_rsv = block_rsv;
        return ret;
 }
@@ -1051,6 +1057,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
                                              struct btrfs_delayed_node *node)
 {
        struct btrfs_path *path;
+        struct btrfs_block_rsv *block_rsv;
        int ret;
        path = btrfs_alloc_path();
@@ -1058,6 +1065,9 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->leave_spinning = 1;
+        block_rsv = trans->block_rsv;
+        trans->block_rsv = &node->root->fs_info->global_block_rsv;
        ret = btrfs_insert_delayed_items(trans, path, node->root, node);
        if (!ret)
                ret = btrfs_delete_delayed_items(trans, path, node->root, node);
@@ -1065,6 +1075,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
                ret = btrfs_update_delayed_inode(trans, node->root, path, node);
        btrfs_free_path(path);
+        trans->block_rsv = block_rsv;
        return ret;
 }
@@ -1115,6 +1126,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
        struct btrfs_path *path;
        struct btrfs_delayed_node *delayed_node = NULL;
        struct btrfs_root *root;
+        struct btrfs_block_rsv *block_rsv;
        unsigned long nr = 0;
        int need_requeue = 0;
        int ret;
@@ -1129,10 +1141,13 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
        delayed_node = async_node->delayed_node;
        root = delayed_node->root;
-        trans = btrfs_join_transaction(root, 0);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                goto free_path;
+        block_rsv = trans->block_rsv;
+        trans->block_rsv = &root->fs_info->global_block_rsv;
        ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
        if (!ret)
                ret = btrfs_delete_delayed_items(trans, path, root,
@@ -1175,6 +1190,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
        nr = trans->blocks_used;
+        trans->block_rsv = block_rsv;
        btrfs_end_transaction_dmeta(trans, root);
        __btrfs_btree_balance_dirty(root, nr);
 free_path:
@@ -1221,6 +1237,13 @@ again:
        return 0;
 }
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
+{
+        struct btrfs_delayed_root *delayed_root;
+        delayed_root = btrfs_get_delayed_root(root);
+        WARN_ON(btrfs_first_delayed_node(delayed_root));
+}
 void btrfs_balance_delayed_items(struct btrfs_root *root)
 {
        struct btrfs_delayed_root *delayed_root;
@@ -1572,8 +1595,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_stack_inode_transid(inode_item, trans->transid);
        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
        btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
-        btrfs_set_stack_inode_block_group(inode_item,
+        btrfs_set_stack_inode_block_group(inode_item, 0);
-                                          BTRFS_I(inode)->block_group);
        btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
                                     inode->i_atime.tv_sec);
@@ -1595,7 +1617,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root, struct inode *inode)
 {
        struct btrfs_delayed_node *delayed_node;
-        int ret;
+        int ret = 0;
        delayed_node = btrfs_get_or_create_delayed_node(inode);
        if (IS_ERR(delayed_node))
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index eb7d240aa648..d1a6a2915c66 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -75,7 +75,6 @@ struct btrfs_delayed_item {
        struct list_head tree_list;     /* used for batch insert/delete items */
        struct list_head readdir_list;  /* used for readdir items */
        u64 bytes_reserved;
-        struct btrfs_block_rsv *block_rsv;
        struct btrfs_delayed_node *delayed_node;
        atomic_t refs;
        int ins_or_del;
@@ -138,4 +137,8 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
 /* for init */
 int __init btrfs_delayed_inode_init(void);
 void btrfs_delayed_inode_exit(void);
+/* for debugging */
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 98b6a71decba..1ac8db5dc0a3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1044,7 +1044,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->last_trans = 0;
        root->highest_objectid = 0;
        root->name = NULL;
-        root->in_sysfs = 0;
        root->inode_tree = RB_ROOT;
        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
        root->block_rsv = NULL;
@@ -1300,19 +1299,21 @@ again:
                return root;
        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
-        if (!root->free_ino_ctl)
-                goto fail;
        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
                                        GFP_NOFS);
-        if (!root->free_ino_pinned)
+        if (!root->free_ino_pinned || !root->free_ino_ctl) {
+                ret = -ENOMEM;
                goto fail;
+        }
        btrfs_init_free_ino_ctl(root);
        mutex_init(&root->fs_commit_mutex);
        spin_lock_init(&root->cache_lock);
        init_waitqueue_head(&root->cache_wait);
-        set_anon_super(&root->anon_super, NULL);
+        ret = set_anon_super(&root->anon_super, NULL);
+        if (ret)
+                goto fail;
        if (btrfs_root_refs(&root->root_item) == 0) {
                ret = -ENOENT;
@@ -1505,24 +1506,24 @@ static int transaction_kthread(void *arg)
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                mutex_lock(&root->fs_info->transaction_kthread_mutex);
-                spin_lock(&root->fs_info->new_trans_lock);
+                spin_lock(&root->fs_info->trans_lock);
                cur = root->fs_info->running_transaction;
                if (!cur) {
-                        spin_unlock(&root->fs_info->new_trans_lock);
+                        spin_unlock(&root->fs_info->trans_lock);
                        goto sleep;
                }
                now = get_seconds();
                if (!cur->blocked &&
                    (now < cur->start_time || now - cur->start_time < 30)) {
-                        spin_unlock(&root->fs_info->new_trans_lock);
+                        spin_unlock(&root->fs_info->trans_lock);
                        delay = HZ * 5;
                        goto sleep;
                }
                transid = cur->transid;
-                spin_unlock(&root->fs_info->new_trans_lock);
+                spin_unlock(&root->fs_info->trans_lock);
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
                if (transid == trans->transid) {
                        ret = btrfs_commit_transaction(trans, root);
@@ -1613,11 +1614,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->ordered_operations);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_lock);
-        spin_lock_init(&fs_info->new_trans_lock);
+        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
+        mutex_init(&fs_info->reloc_mutex);
        init_completion(&fs_info->kobj_unregister);
        fs_info->tree_root = tree_root;
@@ -1645,6 +1647,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
+        fs_info->trans_no_join = 0;
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
@@ -1667,8 +1670,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        init_waitqueue_head(&fs_info->scrub_pause_wait);
        init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
-        btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-                           fs_info->thread_pool_size, &fs_info->generic_worker);
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
@@ -1709,7 +1710,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->do_barriers = 1;
-        mutex_init(&fs_info->trans_mutex);
        mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->chunk_mutex);
@@ -2479,13 +2479,13 @@ int btrfs_commit_super(struct btrfs_root *root)
        down_write(&root->fs_info->cleanup_work_sem);
        up_write(&root->fs_info->cleanup_work_sem);
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
        ret = btrfs_commit_transaction(trans, root);
        BUG_ON(ret);
        /* run commit again to drop the original snapshot */
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
        btrfs_commit_transaction(trans, root);
@@ -2911,9 +2911,8 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
        INIT_LIST_HEAD(&splice);
-        list_splice_init(&root->fs_info->delalloc_inodes, &splice);
        spin_lock(&root->fs_info->delalloc_lock);
+        list_splice_init(&root->fs_info->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -3024,10 +3023,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
        WARN_ON(1);
-        mutex_lock(&root->fs_info->trans_mutex);
        mutex_lock(&root->fs_info->transaction_kthread_mutex);
+        spin_lock(&root->fs_info->trans_lock);
        list_splice_init(&root->fs_info->trans_list, &list);
+        root->fs_info->trans_no_join = 1;
+        spin_unlock(&root->fs_info->trans_lock);
        while (!list_empty(&list)) {
                t = list_entry(list.next, struct btrfs_transaction, list);
                if (!t)
@@ -3052,23 +3054,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
                t->blocked = 0;
                if (waitqueue_active(&root->fs_info->transaction_wait))
                        wake_up(&root->fs_info->transaction_wait);
-                mutex_unlock(&root->fs_info->trans_mutex);
-                mutex_lock(&root->fs_info->trans_mutex);
                t->commit_done = 1;
                if (waitqueue_active(&t->commit_wait))
                        wake_up(&t->commit_wait);
-                mutex_unlock(&root->fs_info->trans_mutex);
-                mutex_lock(&root->fs_info->trans_mutex);
                btrfs_destroy_pending_snapshots(t);
                btrfs_destroy_delalloc_inodes(root);
-                spin_lock(&root->fs_info->new_trans_lock);
+                spin_lock(&root->fs_info->trans_lock);
                root->fs_info->running_transaction = NULL;
-                spin_unlock(&root->fs_info->new_trans_lock);
+                spin_unlock(&root->fs_info->trans_lock);
                btrfs_destroy_marked_extents(root, &t->dirty_pages,
                                             EXTENT_DIRTY);
@@ -3082,8 +3079,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
                kmem_cache_free(btrfs_transaction_cachep, t);
        }
+        spin_lock(&root->fs_info->trans_lock);
+        root->fs_info->trans_no_join = 0;
+        spin_unlock(&root->fs_info->trans_lock);
        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
-        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 169bd62ce776..1f61bf5b4960 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -348,7 +348,7 @@ static int caching_kthread(void *data)
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
-        path->reada = 2;
+        path->reada = 1;
        key.objectid = last;
        key.offset = 0;
@@ -366,8 +366,7 @@ again:
        nritems = btrfs_header_nritems(leaf);
        while (1) {
-                smp_mb();
+                if (btrfs_fs_closing(fs_info) > 1) {
-                if (fs_info->closing > 1) {
                        last = (u64)-1;
                        break;
                }
@@ -379,15 +378,18 @@ again:
                        if (ret)
                                break;
-                        caching_ctl->progress = last;
+                        if (need_resched() ||
-                        btrfs_release_path(path);
+                            btrfs_next_leaf(extent_root, path)) {
-                        up_read(&fs_info->extent_commit_sem);
+                                caching_ctl->progress = last;
-                        mutex_unlock(&caching_ctl->mutex);
+                                btrfs_release_path(path);
-                        if (btrfs_transaction_in_commit(fs_info))
+                                up_read(&fs_info->extent_commit_sem);
-                                schedule_timeout(1);
+                                mutex_unlock(&caching_ctl->mutex);
-                        else
                                cond_resched();
-                        goto again;
+                                goto again;
+                        }
+                        leaf = path->nodes[0];
+                        nritems = btrfs_header_nritems(leaf);
+                        continue;
                }
                if (key.objectid < block_group->key.objectid) {
@@ -3065,7 +3067,7 @@ again:
                        spin_unlock(&data_sinfo->lock);
 alloc:
                        alloc_target = btrfs_get_alloc_profile(root, 1);
-                        trans = btrfs_join_transaction(root, 1);
+                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
@@ -3087,13 +3089,21 @@ alloc:
                        }
                        goto again;
                }
+                /*
+                 * If we have less pinned bytes than we want to allocate then
+                 * don't bother committing the transaction, it won't help us.
+                 */
+                if (data_sinfo->bytes_pinned < bytes)
+                        committed = 1;
                spin_unlock(&data_sinfo->lock);
                /* commit the current transaction and try again */
 commit_trans:
-                if (!committed && !root->fs_info->open_ioctl_trans) {
+                if (!committed &&
+                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
                        committed = 1;
-                        trans = btrfs_join_transaction(root, 1);
+                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        ret = btrfs_commit_transaction(trans, root);
@@ -3304,10 +3314,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
        if (reserved == 0)
                return 0;
-        /* nothing to shrink - nothing to reclaim */
-        if (root->fs_info->delalloc_bytes == 0)
-                return 0;
        max_reclaim = min(reserved, to_reclaim);
        while (loops < 1024) {
@@ -3472,7 +3478,7 @@ again:
                goto out;
        ret = -ENOSPC;
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                goto out;
        ret = btrfs_commit_transaction(trans, root);
@@ -3699,7 +3705,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
                if (trans)
                        return -EAGAIN;
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
                ret = btrfs_commit_transaction(trans, root);
                return 0;
@@ -3837,6 +3843,37 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 }
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_block_rsv *rsv)
+{
+        struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
+        u64 num_bytes;
+        int ret;
+        /*
+         * Top up @rsv to the size btrfs_calc_trans_metadata_size() reports
+         * for 2 items, migrating any shortfall from
+         * root->fs_info->trans_block_rsv.  @trans is not used in this body.
+         * Always returns 0: a non-zero result from block_rsv_migrate_bytes()
+         * hits BUG_ON() instead of being passed back to the caller.
+         *
+         * Truncate should be freeing data, but give us 2 items just in case it
+         * needs to use some space.  We may want to be smarter about this in the
+         * future.
+         */
+        num_bytes = btrfs_calc_trans_metadata_size(root, 2);
+        /* We already have enough bytes, just return */
+        if (rsv->reserved >= num_bytes)
+                return 0;
+        num_bytes -= rsv->reserved;     /* migrate only the shortfall */
+        /*
+         * You should have reserved enough space beforehand to do this, so this
+         * should not fail.
+         */
+        ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
+        BUG_ON(ret);
+        return 0;
+}
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 int num_items)
@@ -3877,23 +3914,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
        /*
-         * one for deleting orphan item, one for updating inode and
+         * We need to hold space in order to delete our orphan item once we've
-         * two for calling btrfs_truncate_inode_items.
+         * added it, so this takes the reservation so we can release it later
-         *
+         * when we are truly done with the orphan item.
-         * btrfs_truncate_inode_items is a delete operation, it frees
-         * more space than it uses in most cases. So two units of
-         * metadata space should be enough for calling it many times.
-         * If all of the metadata space is used, we can commit
-         * transaction and use space it freed.
         */
-        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 void btrfs_orphan_release_metadata(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
@@ -4987,6 +5019,15 @@ have_block_group:
                if (unlikely(block_group->ro))
                        goto loop;
+                spin_lock(&block_group->free_space_ctl->tree_lock);
+                if (cached &&
+                    block_group->free_space_ctl->free_space <
+                    num_bytes + empty_size) {
+                        spin_unlock(&block_group->free_space_ctl->tree_lock);
+                        goto loop;
+                }
+                spin_unlock(&block_group->free_space_ctl->tree_lock);
                /*
                 * Ok we want to try and use the cluster allocator, so lets look
                 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
@@ -5150,6 +5191,7 @@ checks:
                        btrfs_add_free_space(block_group, offset,
                                             search_start - offset);
                BUG_ON(offset > search_start);
+                btrfs_put_block_group(block_group);
                break;
 loop:
                failed_cluster_refill = false;
@@ -5172,9 +5214,7 @@ loop:
         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
         *                      again
         */
-        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
+        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
-            (found_uncached_bg || empty_size || empty_cluster ||
-             allowed_chunk_alloc)) {
                index = 0;
                if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
                        found_uncached_bg = false;
@@ -5214,42 +5254,39 @@ loop:
                        goto search;
                }
-                if (loop < LOOP_CACHING_WAIT) {
+                loop++;
-                        loop++;
-                        goto search;
-                }
                if (loop == LOOP_ALLOC_CHUNK) {
-                        empty_size = 0;
+                       if (allowed_chunk_alloc) {
-                        empty_cluster = 0;
+                                ret = do_chunk_alloc(trans, root, num_bytes +
-                }
+                                                     2 * 1024 * 1024, data,
+                                                     CHUNK_ALLOC_LIMITED);
+                                allowed_chunk_alloc = 0;
+                                if (ret == 1)
+                                        done_chunk_alloc = 1;
+                        } else if (!done_chunk_alloc &&
+                                   space_info->force_alloc ==
+                                   CHUNK_ALLOC_NO_FORCE) {
+                                space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+                        }
-                if (allowed_chunk_alloc) {
+                       /*
-                        ret = do_chunk_alloc(trans, root, num_bytes +
+                        * We didn't allocate a chunk, go ahead and drop the
-                                             2 * 1024 * 1024, data,
+                        * empty size and loop again.
-                                             CHUNK_ALLOC_LIMITED);
+                        */
-                        allowed_chunk_alloc = 0;
+                       if (!done_chunk_alloc)
-                        done_chunk_alloc = 1;
+                               loop = LOOP_NO_EMPTY_SIZE;
-                } else if (!done_chunk_alloc &&
-                           space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
-                        space_info->force_alloc = CHUNK_ALLOC_LIMITED;
                }
-                if (loop < LOOP_NO_EMPTY_SIZE) {
+                if (loop == LOOP_NO_EMPTY_SIZE) {
-                        loop++;
+                        empty_size = 0;
-                        goto search;
+                        empty_cluster = 0;
                }
-                ret = -ENOSPC;
+                goto search;
        } else if (!ins->objectid) {
                ret = -ENOSPC;
-        }
+        } else if (ins->objectid) {
-        /* we found what we needed */
-        if (ins->objectid) {
-                if (!(data & BTRFS_BLOCK_GROUP_DATA))
-                        trans->block_group = block_group->key.objectid;
-                btrfs_put_block_group(block_group);
                ret = 0;
        }
@@ -6526,7 +6563,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        BUG_ON(cache->ro);
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
        alloc_flags = update_block_group_flags(root, cache->flags);
@@ -6882,6 +6919,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
        if (cache_gen != 0 &&
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c5d9fbb92bc3..7055d11c1efd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1476,7 +1476,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
                        if (total_bytes >= max_bytes)
                                break;
                        if (!found) {
-                                *start = state->start;
+                                *start = max(cur_start, state->start);
                                found = 1;
                        }
                        last = state->end;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4e8445a4757c..a11a92ee2d30 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -126,9 +126,9 @@ struct extent_buffer {
        unsigned long map_len;
        struct page *first_page;
        unsigned long bflags;
-        atomic_t refs;
        struct list_head leak_list;
        struct rcu_head rcu_head;
+        atomic_t refs;
        /* the spinlock is used to protect most operations */
        spinlock_t lock;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c6a22d783c35..fa4ef18b66b1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        if (!btrfs_test_opt(root, AUTO_DEFRAG))
                return 0;
-        if (root->fs_info->closing)
+        if (btrfs_fs_closing(root->fs_info))
                return 0;
        if (BTRFS_I(inode)->in_defrag)
@@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        if (!defrag)
                return -ENOMEM;
-        defrag->ino = inode->i_ino;
+        defrag->ino = btrfs_ino(inode);
        defrag->transid = transid;
        defrag->root = root->root_key.objectid;
@@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
                first_ino = defrag->ino + 1;
                rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-                if (fs_info->closing)
+                if (btrfs_fs_closing(fs_info))
                        goto next_free;
                spin_unlock(&fs_info->defrag_inodes_lock);
@@ -1480,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync)
         * the current transaction, we can bail out now without any
         * syncing
         */
-        mutex_lock(&root->fs_info->trans_mutex);
+        smp_mb();
        if (BTRFS_I(inode)->last_trans <=
            root->fs_info->last_trans_committed) {
                BTRFS_I(inode)->last_trans = 0;
-                mutex_unlock(&root->fs_info->trans_mutex);
                goto out;
        }
-        mutex_unlock(&root->fs_info->trans_mutex);
        /*
         * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 70d45795d758..9f985a429877 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
                return inode;
        spin_lock(&block_group->lock);
-        if (!root->fs_info->closing) {
+        if (!btrfs_fs_closing(root->fs_info)) {
                block_group->inode = igrab(inode);
                block_group->iref = 1;
        }
@@ -250,7 +250,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
        pgoff_t index = 0;
        unsigned long first_page_offset;
        int num_checksums;
-        int ret = 0, ret2;
+        int ret = 0;
        INIT_LIST_HEAD(&bitmaps);
@@ -402,7 +402,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                                spin_lock(&ctl->tree_lock);
                                ret = link_free_space(ctl, e);
                                spin_unlock(&ctl->tree_lock);
-                                BUG_ON(ret);
+                                if (ret) {
+                                        printk(KERN_ERR "Duplicate entries in "
+                                               "free space cache, dumping\n");
+                                        kunmap(page);
+                                        unlock_page(page);
+                                        page_cache_release(page);
+                                        goto free_cache;
+                                }
                        } else {
                                e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
                                if (!e->bitmap) {
@@ -414,10 +421,18 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                                        goto free_cache;
                                }
                                spin_lock(&ctl->tree_lock);
-                                ret2 = link_free_space(ctl, e);
+                                ret = link_free_space(ctl, e);
                                ctl->total_bitmaps++;
                                ctl->op->recalc_thresholds(ctl);
                                spin_unlock(&ctl->tree_lock);
+                                if (ret) {
+                                        printk(KERN_ERR "Duplicate entries in "
+                                               "free space cache, dumping\n");
+                                        kunmap(page);
+                                        unlock_page(page);
+                                        page_cache_release(page);
+                                        goto free_cache;
+                                }
                                list_add_tail(&e->list, &bitmaps);
                        }
@@ -478,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
         * If we're unmounting then just return, since this does a search on the
         * normal root and not the commit root and we could deadlock.
         */
-        smp_mb();
+        if (btrfs_fs_closing(fs_info))
-        if (fs_info->closing)
                return 0;
        /*
@@ -575,10 +589,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
                PAGE_CACHE_SHIFT;
+        /* Since the first page has all of our checksums and our generation we
+         * need to calculate the offset into the page that we can start writing
+         * our entries.
+         */
+        first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
        filemap_write_and_wait(inode->i_mapping);
        btrfs_wait_ordered_range(inode, inode->i_size &
                                 ~(root->sectorsize - 1), (u64)-1);
+        /* make sure we don't overflow that first page */
+        if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
+                /* this is really the same as running out of space, where we also return 0 */
+                printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
+                ret = 0;
+                goto out_update;
+        }
        /* We need a checksum per page. */
        crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
        if (!crc)
@@ -590,12 +619,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                return -1;
        }
-        /* Since the first page has all of our checksums and our generation we
-         * need to calculate the offset into the page that we can start writing
-         * our entries.
-         */
-        first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list))
                cluster = list_entry(block_group->cluster_list.next,
@@ -857,12 +880,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        ret = 1;
 out_free:
+        kfree(checksums);
+        kfree(pages);
+out_update:
        if (ret != 1) {
                invalidate_inode_pages2_range(inode->i_mapping, 0, index);
                BTRFS_I(inode)->generation = 0;
        }
-        kfree(checksums);
-        kfree(pages);
        btrfs_update_inode(trans, root, inode);
        return ret;
 }
@@ -963,10 +988,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
                         * logically.
                         */
                        if (bitmap) {
-                                WARN_ON(info->bitmap);
+                                if (info->bitmap) {
+                                        WARN_ON_ONCE(1);
+                                        return -EEXIST;
+                                }
                                p = &(*p)->rb_right;
                        } else {
-                                WARN_ON(!info->bitmap);
+                                if (!info->bitmap) {
+                                        WARN_ON_ONCE(1);
+                                        return -EEXIST;
+                                }
                                p = &(*p)->rb_left;
                        }
                }
@@ -1386,6 +1417,23 @@ again:
        return 0;
 }
+static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
+                               struct btrfs_free_space *info, u64 offset,
+                               u64 bytes)
+{
+        u64 bytes_to_set = 0;   /* portion of @bytes that gets set in @info */
+        u64 end;                /* BITS_PER_BITMAP units past info->offset */
+        end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+        bytes_to_set = min(end - offset, bytes);        /* clamp at @end; NOTE(review): u64 math wraps if offset > end - confirm callers */
+        bitmap_set_bits(ctl, info, offset, bytes_to_set);
+        return bytes_to_set;    /* may be less than @bytes; callers advance offset/bytes by it */
+}
 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
                      struct btrfs_free_space *info)
 {
@@ -1422,12 +1470,18 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
        return true;
 }
+static struct btrfs_free_space_op free_space_op = {
+        .recalc_thresholds      = recalculate_thresholds,
+        .use_bitmap             = use_bitmap,
+};      /* defined ahead of insert_into_bitmap(), which compares ctl->op against it */
 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info)
 {
        struct btrfs_free_space *bitmap_info;
+        struct btrfs_block_group_cache *block_group = NULL;
        int added = 0;
-        u64 bytes, offset, end;
+        u64 bytes, offset, bytes_added;
        int ret;
        bytes = info->bytes;
@@ -1436,7 +1490,49 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
        if (!ctl->op->use_bitmap(ctl, info))
                return 0;
+        if (ctl->op == &free_space_op)
+                block_group = ctl->private;
 again:
+        /*
+         * Since we link bitmaps right into the cluster we need to see if we
+         * have a cluster here, and if so and it has our bitmap we need to add
+         * the free space to that bitmap.
+         */
+        if (block_group && !list_empty(&block_group->cluster_list)) {
+                struct btrfs_free_cluster *cluster;
+                struct rb_node *node;
+                struct btrfs_free_space *entry;
+                cluster = list_entry(block_group->cluster_list.next,
+                                     struct btrfs_free_cluster,
+                                     block_group_list);
+                spin_lock(&cluster->lock);
+                node = rb_first(&cluster->root);
+                if (!node) {
+                        spin_unlock(&cluster->lock);
+                        goto no_cluster_bitmap;
+                }
+                entry = rb_entry(node, struct btrfs_free_space, offset_index);
+                if (!entry->bitmap) {
+                        spin_unlock(&cluster->lock);
+                        goto no_cluster_bitmap;
+                }
+                if (entry->offset == offset_to_bitmap(ctl, offset)) {
+                        bytes_added = add_bytes_to_bitmap(ctl, entry,
+                                                          offset, bytes);
+                        bytes -= bytes_added;
+                        offset += bytes_added;
+                }
+                spin_unlock(&cluster->lock);
+                if (!bytes) {
+                        ret = 1;
+                        goto out;
+                }
+        }
+no_cluster_bitmap:
        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                         1, 0);
        if (!bitmap_info) {
@@ -1444,19 +1540,10 @@ again:
                goto new_bitmap;
        }
-        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+        bytes -= bytes_added;
-        if (offset >= bitmap_info->offset && offset + bytes > end) {
+        offset += bytes_added;
-                bitmap_set_bits(ctl, bitmap_info, offset, end - offset);
+        added = 0;
-                bytes -= end - offset;
-                offset = end;
-                added = 0;
-        } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
-                bitmap_set_bits(ctl, bitmap_info, offset, bytes);
-                bytes = 0;
-        } else {
-                BUG();
-        }
        if (!bytes) {
                ret = 1;
@@ -1735,11 +1822,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
               "\n", count);
 }
-static struct btrfs_free_space_op free_space_op = {
-        .recalc_thresholds      = recalculate_thresholds,
-        .use_bitmap             = use_bitmap,
-};
 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -2111,9 +2193,11 @@ again:
 /*
 * This searches the block group for just extents to fill the cluster with.
 */
-static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+static noinline int
-                                   struct btrfs_free_cluster *cluster,
+setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
-                                   u64 offset, u64 bytes, u64 min_bytes)
+                        struct btrfs_free_cluster *cluster,
+                        struct list_head *bitmaps, u64 offset, u64 bytes,
+                        u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *first = NULL;
@@ -2135,6 +2219,8 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
         * extent entry.
         */
        while (entry->bitmap) {
+                if (list_empty(&entry->list))
+                        list_add_tail(&entry->list, bitmaps);
                node = rb_next(&entry->offset_index);
                if (!node)
                        return -ENOSPC;
@@ -2154,8 +2240,12 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
                        return -ENOSPC;
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
-                if (entry->bitmap)
+                if (entry->bitmap) {
+                        if (list_empty(&entry->list))
+                                list_add_tail(&entry->list, bitmaps);
                        continue;
+                }
                /*
                 * we haven't filled the empty size and the window is
                 * very large.  reset and try again
@@ -2207,9 +2297,11 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 * This specifically looks for bitmaps that may work in the cluster, we assume
 * that we have already failed to find extents that will work.
 */
-static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+static noinline int
-                                struct btrfs_free_cluster *cluster,
+setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
-                                u64 offset, u64 bytes, u64 min_bytes)
+                     struct btrfs_free_cluster *cluster,
+                     struct list_head *bitmaps, u64 offset, u64 bytes,
+                     u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
@@ -2219,10 +2311,39 @@ static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
        if (ctl->total_bitmaps == 0)
                return -ENOSPC;
+        /*
+         * First check our cached list of bitmaps and see if there is an entry
+         * here that will work.
+         */
+        list_for_each_entry(entry, bitmaps, list) {
+                if (entry->bytes < min_bytes)
+                        continue;
+                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
+                                           bytes, min_bytes);
+                if (!ret)
+                        return 0;
+        }
+        /*
+         * If we do have entries on our list and we are here then we didn't find
+         * anything, so go ahead and get the next entry after the last entry in
+         * this list and start the search from there.
+         */
+        if (!list_empty(bitmaps)) {
+                entry = list_entry(bitmaps->prev, struct btrfs_free_space,
+                                   list);
+                node = rb_next(&entry->offset_index);
+                if (!node)
+                        return -ENOSPC;
+                entry = rb_entry(node, struct btrfs_free_space, offset_index);
+                goto search;
+        }
        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
        if (!entry)
                return -ENOSPC;
+search:
        node = &entry->offset_index;
        do {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
@@ -2253,6 +2374,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                             u64 offset, u64 bytes, u64 empty_size)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+        struct list_head bitmaps;
+        struct btrfs_free_space *entry, *tmp;
        u64 min_bytes;
        int ret;
@@ -2291,11 +2414,16 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                goto out;
        }
-        ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes,
+        INIT_LIST_HEAD(&bitmaps);
-                                      min_bytes);
+        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
+                                      bytes, min_bytes);
        if (ret)
-                ret = setup_cluster_bitmap(block_group, cluster, offset,
+                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-                                           bytes, min_bytes);
+                                           offset, bytes, min_bytes);
+        /* Clear our temporary list */
+        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
+                list_del_init(&entry->list);
        if (!ret) {
                atomic_inc(&block_group->count);
@@ -2481,7 +2609,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
                return inode;
        spin_lock(&root->cache_lock);
-        if (!root->fs_info->closing)
+        if (!btrfs_fs_closing(root->fs_info))
                root->cache_inode = igrab(inode);
        spin_unlock(&root->cache_lock);
@@ -2504,12 +2632,14 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
        int ret = 0;
        u64 root_gen = btrfs_root_generation(&root->root_item);
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        /*
         * If we're unmounting then just return, since this does a search on the
         * normal root and not the commit root and we could deadlock.
         */
-        smp_mb();
+        if (btrfs_fs_closing(fs_info))
-        if (fs_info->closing)
                return 0;
        path = btrfs_alloc_path();
@@ -2543,6 +2673,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
        struct inode *inode;
        int ret;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        inode = lookup_free_ino_inode(root, path);
        if (IS_ERR(inode))
                return 0;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 3262cd17a12f..b4087e0fa871 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,6 +38,9 @@ static int caching_kthread(void *data)
        int slot;
        int ret;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -59,8 +62,7 @@ again:
                goto out;
        while (1) {
-                smp_mb();
+                if (btrfs_fs_closing(fs_info))
-                if (fs_info->closing)
                        goto out;
                leaf = path->nodes[0];
@@ -141,6 +143,9 @@ static void start_caching(struct btrfs_root *root)
        int ret;
        u64 objectid;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return;
        spin_lock(&root->cache_lock);
        if (root->cached != BTRFS_CACHE_NO) {
                spin_unlock(&root->cache_lock);
@@ -178,6 +183,9 @@ static void start_caching(struct btrfs_root *root)
 int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
 {
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return btrfs_find_free_objectid(root, objectid);
 again:
        *objectid = btrfs_find_ino_for_alloc(root);
@@ -201,6 +209,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
 {
        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
        struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return;
 again:
        if (root->cached == BTRFS_CACHE_FINISHED) {
                __btrfs_add_free_space(ctl, objectid, 1);
@@ -250,6 +262,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
        struct rb_node *n;
        u64 count;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return;
        while (1) {
                n = rb_first(rbroot);
                if (!n)
@@ -388,9 +403,24 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
        int prealloc;
        bool retry = false;
+        /* only fs tree and subvol/snap needs ino cache */
+        if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
+            (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
+             root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
+                return 0;
+        /* Don't save inode cache if we are deleting this root */
+        if (btrfs_root_refs(&root->root_item) == 0 &&
+            root != root->fs_info->tree_root)
+                return 0;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 again:
        inode = lookup_free_ino_inode(root, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 39a9d5750efd..0a9b10c5b0a7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -138,7 +138,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->leave_spinning = 1;
-        btrfs_set_trans_block_group(trans, inode);
        key.objectid = btrfs_ino(inode);
        key.offset = start;
@@ -426,9 +425,8 @@ again:
                }
        }
        if (start == 0) {
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
-                btrfs_set_trans_block_group(trans, inode);
                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                /* lets try to make an inline extent */
@@ -623,8 +621,9 @@ retry:
                            async_extent->start + async_extent->ram_size - 1,
                            GFP_NOFS);
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
+                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                ret = btrfs_reserve_extent(trans, root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
@@ -793,9 +792,8 @@ static noinline int cow_file_range(struct inode *inode,
        int ret = 0;
        BUG_ON(is_free_space_inode(root, inode));
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
-        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -1077,10 +1075,12 @@ static noinline int run_delalloc_nocow(struct inode *inode,
        nolock = is_free_space_inode(root, inode);
        if (nolock)
-                trans = btrfs_join_transaction_nolock(root, 1);
+                trans = btrfs_join_transaction_nolock(root);
        else
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
+        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        cow_start = (u64)-1;
        cur_offset = start;
@@ -1519,8 +1519,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 {
        struct btrfs_ordered_sum *sum;
-        btrfs_set_trans_block_group(trans, inode);
        list_for_each_entry(sum, list, list) {
                btrfs_csum_file_blocks(trans,
                       BTRFS_I(inode)->root->fs_info->csum_root, sum);
@@ -1735,11 +1733,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (!ret) {
                        if (nolock)
-                                trans = btrfs_join_transaction_nolock(root, 1);
+                                trans = btrfs_join_transaction_nolock(root);
                        else
-                                trans = btrfs_join_transaction(root, 1);
+                                trans = btrfs_join_transaction(root);
                        BUG_ON(IS_ERR(trans));
-                        btrfs_set_trans_block_group(trans, inode);
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode(trans, root, inode);
                        BUG_ON(ret);
@@ -1752,11 +1749,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                         0, &cached_state, GFP_NOFS);
        if (nolock)
-                trans = btrfs_join_transaction_nolock(root, 1);
+                trans = btrfs_join_transaction_nolock(root);
        else
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
-        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -1990,7 +1986,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        }
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-                return 0;
+                goto good;
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2431,7 +2427,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                        (u64)-1);
        if (root->orphan_block_rsv || root->orphan_item_inserted) {
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                if (!IS_ERR(trans))
                        btrfs_end_transaction(trans, root);
        }
@@ -2511,12 +2507,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key location;
        int maybe_acls;
-        u64 alloc_group_block;
        u32 rdev;
        int ret;
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        path->leave_spinning = 1;
        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2526,6 +2522,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
        leaf = path->nodes[0];
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_inode_item);
+        if (!leaf->map_token)
+                map_private_extent_buffer(leaf, (unsigned long)inode_item,
+                                          sizeof(struct btrfs_inode_item),
+                                          &leaf->map_token, &leaf->kaddr,
+                                          &leaf->map_start, &leaf->map_len,
+                                          KM_USER1);
        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
        inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
@@ -2555,8 +2557,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
        BTRFS_I(inode)->index_cnt = (u64)-1;
        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-        alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
        /*
         * try to precache a NULL acl entry for files that don't have
         * any xattrs or acls
@@ -2566,8 +2566,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
        if (!maybe_acls)
                cache_no_acl(inode);
-        BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
+        if (leaf->map_token) {
-                                                alloc_group_block, 0);
+                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                leaf->map_token = NULL;
+        }
        btrfs_free_path(path);
        inode_item = NULL;
@@ -2647,7 +2650,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_inode_transid(leaf, item, trans->transid);
        btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
        btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-        btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+        btrfs_set_inode_block_group(leaf, item, 0);
        if (leaf->map_token) {
                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
@@ -3004,8 +3007,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
@@ -3075,6 +3076,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
        ret = btrfs_update_inode(trans, root, dir);
        BUG_ON(ret);
+        btrfs_free_path(path);
        return 0;
 }
@@ -3094,8 +3096,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
                err = btrfs_unlink_subvol(trans, root, dir,
                                          BTRFS_I(inode)->location.objectid,
@@ -3514,7 +3514,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                err = PTR_ERR(trans);
                                break;
                        }
-                        btrfs_set_trans_block_group(trans, inode);
                        err = btrfs_drop_extents(trans, inode, cur_offset,
                                                 cur_offset + hole_size,
@@ -3648,9 +3647,8 @@ void btrfs_evict_inode(struct inode *inode)
        btrfs_i_size_write(inode, 0);
        while (1) {
-                trans = btrfs_start_transaction(root, 0);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
-                btrfs_set_trans_block_group(trans, inode);
                trans->block_rsv = root->orphan_block_rsv;
                ret = btrfs_block_rsv_check(trans, root,
@@ -4133,7 +4131,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        path->reada = 2;
+        path->reada = 1;
        if (key_type == BTRFS_DIR_INDEX_KEY) {
                INIT_LIST_HEAD(&ins_list);
@@ -4268,18 +4267,16 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
        if (BTRFS_I(inode)->dummy_inode)
                return 0;
-        smp_mb();
+        if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
-        if (root->fs_info->closing && is_free_space_inode(root, inode))
                nolock = true;
        if (wbc->sync_mode == WB_SYNC_ALL) {
                if (nolock)
-                        trans = btrfs_join_transaction_nolock(root, 1);
+                        trans = btrfs_join_transaction_nolock(root);
                else
-                        trans = btrfs_join_transaction(root, 1);
+                        trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
-                btrfs_set_trans_block_group(trans, inode);
                if (nolock)
                        ret = btrfs_end_transaction_nolock(trans, root);
                else
@@ -4303,9 +4300,8 @@ void btrfs_dirty_inode(struct inode *inode, int flags)
        if (BTRFS_I(inode)->dummy_inode)
                return;
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
-        btrfs_set_trans_block_group(trans, inode);
        ret = btrfs_update_inode(trans, root, inode);
        if (ret && ret == -ENOSPC) {
@@ -4319,7 +4315,6 @@ void btrfs_dirty_inode(struct inode *inode, int flags)
                                       PTR_ERR(trans));
                        return;
                }
-                btrfs_set_trans_block_group(trans, inode);
                ret = btrfs_update_inode(trans, root, inode);
                if (ret) {
@@ -4418,8 +4413,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct inode *dir,
                                     const char *name, int name_len,
-                                     u64 ref_objectid, u64 objectid,
+                                     u64 ref_objectid, u64 objectid, int mode,
-                                     u64 alloc_hint, int mode, u64 *index)
+                                     u64 *index)
 {
        struct inode *inode;
        struct btrfs_inode_item *inode_item;
@@ -4472,8 +4467,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                owner = 0;
        else
                owner = 1;
-        BTRFS_I(inode)->block_group =
-                        btrfs_find_block_group(root, 0, alloc_hint, owner);
        key[0].objectid = objectid;
        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4629,15 +4622,13 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, mode, &index);
+                                mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
@@ -4649,7 +4640,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        btrfs_set_trans_block_group(trans, inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
@@ -4658,8 +4648,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                init_special_inode(inode, inode->i_mode, rdev);
                btrfs_update_inode(trans, root, inode);
        }
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
@@ -4692,15 +4680,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, mode, &index);
+                                mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
@@ -4712,7 +4698,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        btrfs_set_trans_block_group(trans, inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
@@ -4723,8 +4708,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                inode->i_op = &btrfs_file_inode_operations;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
@@ -4771,8 +4754,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        btrfs_inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME;
-        btrfs_set_trans_block_group(trans, dir);
        ihold(inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
@@ -4781,7 +4762,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                drop_inode = 1;
        } else {
                struct dentry *parent = dget_parent(dentry);
-                btrfs_update_inode_block_group(trans, dir);
                err = btrfs_update_inode(trans, root, inode);
                BUG_ON(err);
                btrfs_log_new_name(trans, inode, NULL, parent);
@@ -4818,7 +4798,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        trans = btrfs_start_transaction(root, 5);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
@@ -4826,8 +4805,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, S_IFDIR | mode,
+                                S_IFDIR | mode, &index);
-                                &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_fail;
@@ -4841,7 +4819,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        inode->i_op = &btrfs_dir_inode_operations;
        inode->i_fop = &btrfs_dir_file_operations;
-        btrfs_set_trans_block_group(trans, inode);
        btrfs_i_size_write(inode, 0);
        err = btrfs_update_inode(trans, root, inode);
@@ -4855,8 +4832,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
        drop_on_err = 0;
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
 out_fail:
        nr = trans->blocks_used;
@@ -4989,7 +4964,15 @@ again:
        if (!path) {
                path = btrfs_alloc_path();
-                BUG_ON(!path);
+                if (!path) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+                /*
+                 * Chances are we'll be called again, so go ahead and do
+                 * readahead
+                 */
+                path->reada = 1;
        }
        ret = btrfs_lookup_file_extent(trans, root, path,
@@ -5130,8 +5113,10 @@ again:
                                kunmap(page);
                                free_extent_map(em);
                                em = NULL;
                                btrfs_release_path(path);
-                                trans = btrfs_join_transaction(root, 1);
+                                trans = btrfs_join_transaction(root);
                                if (IS_ERR(trans))
                                        return ERR_CAST(trans);
                                goto again;
@@ -5375,7 +5360,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
        }
-        trans = btrfs_join_transaction(root, 0);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return ERR_CAST(trans);
@@ -5611,7 +5596,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                 * to make sure the current transaction stays open
                 * while we look for nocow cross refs
                 */
-                trans = btrfs_join_transaction(root, 0);
+                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        goto must_cow;
@@ -5750,7 +5735,7 @@ again:
        BUG_ON(!ordered);
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                err = -ENOMEM;
                goto out;
@@ -6500,6 +6485,7 @@ out:
 static int btrfs_truncate(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_block_rsv *rsv;
        int ret;
        int err = 0;
        struct btrfs_trans_handle *trans;
@@ -6513,28 +6499,80 @@ static int btrfs_truncate(struct inode *inode)
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-        trans = btrfs_start_transaction(root, 5);
+        /*
-        if (IS_ERR(trans))
+         * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
-                return PTR_ERR(trans);
+         * 3 things going on here
+         *
+         * 1) We need to reserve space for our orphan item and the space to
+         * delete our orphan item.  Lord knows we don't want to have a dangling
+         * orphan item because we didn't reserve space to remove it.
+         *
+         * 2) We need to reserve space to update our inode.
+         *
+         * 3) We need to have something to cache all the space that is going to
+         * be free'd up by the truncate operation, but also have some slack
+         * space reserved in case it uses space during the truncate (thank you
+         * very much snapshotting).
+         *
+         * And we need these to all be separate.  The fact is we can use a lot of
+         * space doing the truncate, and we have no earthly idea how much space
+         * we will use, so we need the truncate reservation to be separate so it
+         * doesn't end up using space reserved for updating the inode or
+         * removing the orphan item.  We also need to be able to stop the
+         * transaction and start a new one, which means we need to be able to
+         * update the inode several times, and we have no way of knowing how
+         * many times that will be, so we can't just reserve 1 item for the
+         * entirety of the operation, so that has to be done separately as well.
+         * Then there is the orphan item, which does indeed need to be held on
+         * to for the whole operation, and we need nobody to touch this reserved
+         * space except the orphan code.
+         *
+         * So that leaves us with
+         *
+         * 1) root->orphan_block_rsv - for the orphan deletion.
+         * 2) rsv - for the truncate reservation, which we will steal from the
+         * transaction reservation.
+         * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
+         * updating the inode.
+         */
+        rsv = btrfs_alloc_block_rsv(root);
+        if (!rsv)
+                return -ENOMEM;
+        btrfs_add_durable_block_rsv(root->fs_info, rsv);
-        btrfs_set_trans_block_group(trans, inode);
+        trans = btrfs_start_transaction(root, 4);
+        if (IS_ERR(trans)) {
+                err = PTR_ERR(trans);
+                goto out;
+        }
+        /*
+         * Reserve space for the truncate process.  Truncate should be adding
+         * space, but if there are snapshots it may end up using space.
+         */
+        ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
+        BUG_ON(ret);
        ret = btrfs_orphan_add(trans, inode);
        if (ret) {
                btrfs_end_transaction(trans, root);
-                return ret;
+                goto out;
        }
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
-        /* Now start a transaction for the truncate */
+        /*
-        trans = btrfs_start_transaction(root, 0);
+         * Ok so we've already migrated our bytes over for the truncate, so here
-        if (IS_ERR(trans))
+         * just reserve the one slot we need for updating the inode.
-                return PTR_ERR(trans);
+         */
-        btrfs_set_trans_block_group(trans, inode);
+        trans = btrfs_start_transaction(root, 1);
-        trans->block_rsv = root->orphan_block_rsv;
+        if (IS_ERR(trans)) {
+                err = PTR_ERR(trans);
+                goto out;
+        }
+        trans->block_rsv = rsv;
        /*
         * setattr is responsible for setting the ordered_data_close flag,
@@ -6558,24 +6596,17 @@ static int btrfs_truncate(struct inode *inode)
        while (1) {
                if (!trans) {
-                        trans = btrfs_start_transaction(root, 0);
+                        trans = btrfs_start_transaction(root, 3);
-                        if (IS_ERR(trans))
+                        if (IS_ERR(trans)) {
-                                return PTR_ERR(trans);
+                                err = PTR_ERR(trans);
-                        btrfs_set_trans_block_group(trans, inode);
+                                goto out;
-                        trans->block_rsv = root->orphan_block_rsv;
+                        }
-                }
-                ret = btrfs_block_rsv_check(trans, root,
+                        ret = btrfs_truncate_reserve_metadata(trans, root,
-                                            root->orphan_block_rsv, 0, 5);
+                                                              rsv);
-                if (ret == -EAGAIN) {
+                        BUG_ON(ret);
-                        ret = btrfs_commit_transaction(trans, root);
-                        if (ret)
+                        trans->block_rsv = rsv;
-                                return ret;
-                        trans = NULL;
-                        continue;
-                } else if (ret) {
-                        err = ret;
-                        break;
                }
                ret = btrfs_truncate_inode_items(trans, root, inode,
@@ -6586,6 +6617,7 @@ static int btrfs_truncate(struct inode *inode)
                        break;
                }
+                trans->block_rsv = &root->fs_info->trans_block_rsv;
                ret = btrfs_update_inode(trans, root, inode);
                if (ret) {
                        err = ret;
@@ -6599,6 +6631,7 @@ static int btrfs_truncate(struct inode *inode)
        }
        if (ret == 0 && inode->i_nlink > 0) {
+                trans->block_rsv = root->orphan_block_rsv;
                ret = btrfs_orphan_del(trans, inode);
                if (ret)
                        err = ret;
@@ -6610,15 +6643,20 @@ static int btrfs_truncate(struct inode *inode)
                ret = btrfs_orphan_del(NULL, inode);
        }
+        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
        if (ret && !err)
                err = ret;
        nr = trans->blocks_used;
        ret = btrfs_end_transaction_throttle(trans, root);
+        btrfs_btree_balance_dirty(root, nr);
+out:
+        btrfs_free_block_rsv(root, rsv);
        if (ret && !err)
                err = ret;
-        btrfs_btree_balance_dirty(root, nr);
        return err;
 }
@@ -6627,15 +6665,14 @@ static int btrfs_truncate(struct inode *inode)
 * create a new subvolume directory/inode (helper for the ioctl).
 */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *new_root,
+                             struct btrfs_root *new_root, u64 new_dirid)
-                             u64 new_dirid, u64 alloc_hint)
 {
        struct inode *inode;
        int err;
        u64 index = 0;
        inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-                                new_dirid, alloc_hint, S_IFDIR | 0700, &index);
+                                new_dirid, S_IFDIR | 0700, &index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);
        inode->i_op = &btrfs_dir_inode_operations;
@@ -6748,21 +6785,6 @@ void btrfs_destroy_inode(struct inode *inode)
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
-        if (root == root->fs_info->tree_root) {
-                struct btrfs_block_group_cache *block_group;
-                block_group = btrfs_lookup_block_group(root->fs_info,
-                                                BTRFS_I(inode)->block_group);
-                if (block_group && block_group->inode == inode) {
-                        spin_lock(&block_group->lock);
-                        block_group->inode = NULL;
-                        spin_unlock(&block_group->lock);
-                        btrfs_put_block_group(block_group);
-                } else if (block_group) {
-                        btrfs_put_block_group(block_group);
-                }
-        }
        spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
@@ -6948,8 +6970,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                goto out_notrans;
        }
-        btrfs_set_trans_block_group(trans, new_dir);
        if (dest != root)
                btrfs_record_root_in_trans(trans, dest);
@@ -7131,16 +7151,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
+                                S_IFLNK|S_IRWXUGO, &index);
-                                &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
@@ -7152,7 +7169,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        btrfs_set_trans_block_group(trans, inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
@@ -7163,8 +7179,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                inode->i_op = &btrfs_file_inode_operations;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
        if (drop_inode)
                goto out_unlock;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 85e818ce00c5..a3c4751e07db 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -243,7 +243,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
        }
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
        ret = btrfs_update_inode(trans, root, inode);
@@ -414,8 +414,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        btrfs_record_root_in_trans(trans, new_root);
-        ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+        ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
-                                       BTRFS_I(dir)->block_group);
        /*
         * insert the directory item
         */
@@ -483,8 +482,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
        BUG_ON(ret);
+        spin_lock(&root->fs_info->trans_lock);
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
+        spin_unlock(&root->fs_info->trans_lock);
        if (async_transid) {
                *async_transid = trans->transid;
                ret = btrfs_commit_transaction_async(trans,
@@ -707,16 +708,17 @@ static int find_new_extents(struct btrfs_root *root,
        struct btrfs_file_extent_item *extent;
        int type;
        int ret;
+        u64 ino = btrfs_ino(inode);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        min_key.objectid = inode->i_ino;
+        min_key.objectid = ino;
        min_key.type = BTRFS_EXTENT_DATA_KEY;
        min_key.offset = *off;
-        max_key.objectid = inode->i_ino;
+        max_key.objectid = ino;
        max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
@@ -727,7 +729,7 @@ static int find_new_extents(struct btrfs_root *root,
                                           path, 0, newer_than);
                if (ret != 0)
                        goto none;
-                if (min_key.objectid != inode->i_ino)
+                if (min_key.objectid != ino)
                        goto none;
                if (min_key.type != BTRFS_EXTENT_DATA_KEY)
                        goto none;
@@ -2054,29 +2056,34 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 {
-        struct btrfs_ioctl_fs_info_args fi_args;
+        struct btrfs_ioctl_fs_info_args *fi_args;
        struct btrfs_device *device;
        struct btrfs_device *next;
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+        int ret = 0;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        fi_args.num_devices = fs_devices->num_devices;
+        fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
-        fi_args.max_id = 0;
+        if (!fi_args)
-        memcpy(&fi_args.fsid, root->fs_info->fsid, sizeof(fi_args.fsid));
+                return -ENOMEM;
+        fi_args->num_devices = fs_devices->num_devices;
+        memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
-                if (device->devid > fi_args.max_id)
+                if (device->devid > fi_args->max_id)
-                        fi_args.max_id = device->devid;
+                        fi_args->max_id = device->devid;
        }
        mutex_unlock(&fs_devices->device_list_mutex);
-        if (copy_to_user(arg, &fi_args, sizeof(fi_args)))
+        if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
-                return -EFAULT;
+                ret = -EFAULT;
-        return 0;
+        kfree(fi_args);
+        return ret;
 }
 static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
@@ -2489,12 +2496,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
        if (ret)
                goto out;
-        mutex_lock(&root->fs_info->trans_mutex);
+        atomic_inc(&root->fs_info->open_ioctl_trans);
-        root->fs_info->open_ioctl_trans++;
-        mutex_unlock(&root->fs_info->trans_mutex);
        ret = -ENOMEM;
-        trans = btrfs_start_ioctl_transaction(root, 0);
+        trans = btrfs_start_ioctl_transaction(root);
        if (IS_ERR(trans))
                goto out_drop;
@@ -2502,9 +2507,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
        return 0;
 out_drop:
-        mutex_lock(&root->fs_info->trans_mutex);
+        atomic_dec(&root->fs_info->open_ioctl_trans);
-        root->fs_info->open_ioctl_trans--;
-        mutex_unlock(&root->fs_info->trans_mutex);
        mnt_drop_write(file->f_path.mnt);
 out:
        return ret;
@@ -2738,9 +2741,7 @@ long btrfs_ioctl_trans_end(struct file *file)
        btrfs_end_transaction(trans, root);
-        mutex_lock(&root->fs_info->trans_mutex);
+        atomic_dec(&root->fs_info->open_ioctl_trans);
-        root->fs_info->open_ioctl_trans--;
-        mutex_unlock(&root->fs_info->trans_mutex);
        mnt_drop_write(file->f_path.mnt);
        return 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ca38eca70af0..5e0a3dc79a45 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -677,6 +677,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
                err = -ENOMEM;
                goto out;
        }
+        path1->reada = 1;
+        path2->reada = 2;
        node = alloc_backref_node(cache);
        if (!node) {
@@ -1366,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
        int ret;
        if (!root->reloc_root)
-                return 0;
+                goto out;
        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;
@@ -1388,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
        ret = btrfs_update_root(trans, root->fs_info->tree_root,
                                &reloc_root->root_key, root_item);
        BUG_ON(ret);
+out:
        return 0;
 }
@@ -1999,6 +2003,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;
@@ -2139,10 +2144,11 @@ int prepare_to_merge(struct reloc_control *rc, int err)
        u64 num_bytes = 0;
        int ret;
-        mutex_lock(&root->fs_info->trans_mutex);
+        mutex_lock(&root->fs_info->reloc_mutex);
        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
        rc->merging_rsv_size += rc->nodes_relocated * 2;
-        mutex_unlock(&root->fs_info->trans_mutex);
+        mutex_unlock(&root->fs_info->reloc_mutex);
 again:
        if (!err) {
                num_bytes = rc->merging_rsv_size;
@@ -2152,7 +2158,7 @@ again:
                        err = ret;
        }
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                if (!err)
                        btrfs_block_rsv_release(rc->extent_root,
@@ -2211,9 +2217,16 @@ int merge_reloc_roots(struct reloc_control *rc)
        int ret;
 again:
        root = rc->extent_root;
-        mutex_lock(&root->fs_info->trans_mutex);
+        /*
+         * this serializes us with btrfs_record_root_in_trans,
+         * we have to make sure nobody is in the middle of
+         * adding their roots to the list while we are
+         * doing this splice
+         */
+        mutex_lock(&root->fs_info->reloc_mutex);
        list_splice_init(&rc->reloc_roots, &reloc_roots);
-        mutex_unlock(&root->fs_info->trans_mutex);
+        mutex_unlock(&root->fs_info->reloc_mutex);
        while (!list_empty(&reloc_roots)) {
                found = 1;
@@ -3236,7 +3249,7 @@ truncate:
                goto out;
        }
-        trans = btrfs_join_transaction(root, 0);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                ret = PTR_ERR(trans);
@@ -3300,6 +3313,7 @@ static int find_data_references(struct reloc_control *rc,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        root = read_fs_root(rc->extent_root->fs_info, ref_root);
        if (IS_ERR(root)) {
@@ -3586,17 +3600,19 @@ next:
 static void set_reloc_control(struct reloc_control *rc)
 {
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-        mutex_lock(&fs_info->trans_mutex);
+        mutex_lock(&fs_info->reloc_mutex);
        fs_info->reloc_ctl = rc;
-        mutex_unlock(&fs_info->trans_mutex);
+        mutex_unlock(&fs_info->reloc_mutex);
 }
 static void unset_reloc_control(struct reloc_control *rc)
 {
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-        mutex_lock(&fs_info->trans_mutex);
+        mutex_lock(&fs_info->reloc_mutex);
        fs_info->reloc_ctl = NULL;
-        mutex_unlock(&fs_info->trans_mutex);
+        mutex_unlock(&fs_info->reloc_mutex);
 }
 static int check_extent_flags(u64 flags)
@@ -3645,7 +3661,7 @@ int prepare_to_relocate(struct reloc_control *rc)
        rc->create_reloc_tree = 1;
        set_reloc_control(rc);
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        BUG_ON(IS_ERR(trans));
        btrfs_commit_transaction(trans, rc->extent_root);
        return 0;
@@ -3668,6 +3684,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        ret = prepare_to_relocate(rc);
        if (ret) {
@@ -3834,7 +3851,7 @@ restart:
        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
        /* get rid of pinned extents */
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans))
                err = PTR_ERR(trans);
        else
@@ -4093,6 +4110,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = -1;
        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4159,7 +4177,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        set_reloc_control(rc);
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                unset_reloc_control(rc);
                err = PTR_ERR(trans);
@@ -4193,7 +4211,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        unset_reloc_control(rc);
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans))
                err = PTR_ERR(trans);
        else
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dfed0c27ac3..a8d03d5efb5d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -16,13 +16,7 @@
 * Boston, MA 021110-1307, USA.
 */
-#include <linux/sched.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
 #include <linux/blkdev.h>
-#include <linux/rbtree.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "disk-io.h"
@@ -117,33 +111,37 @@ static void scrub_free_csums(struct scrub_dev *sdev)
        }
 }
+static void scrub_free_bio(struct bio *bio)
+{
+        int i;
+        struct page *last_page = NULL;
+        if (!bio)
+                return;
+        for (i = 0; i < bio->bi_vcnt; ++i) {
+                if (bio->bi_io_vec[i].bv_page == last_page)
+                        continue;
+                last_page = bio->bi_io_vec[i].bv_page;
+                __free_page(last_page);
+        }
+        bio_put(bio);
+}
 static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
 {
        int i;
-        int j;
-        struct page *last_page;
        if (!sdev)
                return;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio = sdev->bios[i];
-                struct bio *bio;
                if (!sbio)
                        break;
-                bio = sbio->bio;
+                scrub_free_bio(sbio->bio);
-                if (bio) {
-                        last_page = NULL;
-                        for (j = 0; j < bio->bi_vcnt; ++j) {
-                                if (bio->bi_io_vec[j].bv_page == last_page)
-                                        continue;
-                                last_page = bio->bi_io_vec[j].bv_page;
-                                __free_page(last_page);
-                        }
-                        bio_put(bio);
-                }
                kfree(sbio);
        }
@@ -156,8 +154,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 {
        struct scrub_dev *sdev;
        int             i;
-        int             j;
-        int             ret;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
        sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
@@ -165,7 +161,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
                goto nomem;
        sdev->dev = dev;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-                struct bio *bio;
                struct scrub_bio *sbio;
                sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
@@ -173,32 +168,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
                        goto nomem;
                sdev->bios[i] = sbio;
-                bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
-                if (!bio)
-                        goto nomem;
                sbio->index = i;
                sbio->sdev = sdev;
-                sbio->bio = bio;
                sbio->count = 0;
                sbio->work.func = scrub_checksum;
-                bio->bi_private = sdev->bios[i];
-                bio->bi_end_io = scrub_bio_end_io;
-                bio->bi_sector = 0;
-                bio->bi_bdev = dev->bdev;
-                bio->bi_size = 0;
-                for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
-                        struct page *page;
-                        page = alloc_page(GFP_NOFS);
-                        if (!page)
-                                goto nomem;
-                        ret = bio_add_page(bio, page, PAGE_SIZE, 0);
-                        if (!ret)
-                                goto nomem;
-                }
-                WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
                if (i != SCRUB_BIOS_PER_DEV-1)
                        sdev->bios[i]->next_free = i + 1;
@@ -369,9 +342,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
        int ret;
        DECLARE_COMPLETION_ONSTACK(complete);
-        /* we are going to wait on this IO */
-        rw |= REQ_SYNC;
        bio = bio_alloc(GFP_NOFS, 1);
        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
@@ -380,6 +350,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
        bio->bi_private = &complete;
        submit_bio(rw, bio);
+        /* this will also unplug the queue */
        wait_for_completion(&complete);
        ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -394,6 +365,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
        struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
        sbio->err = err;
+        sbio->bio = bio;
        btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 }
@@ -453,6 +425,8 @@ static void scrub_checksum(struct btrfs_work *work)
        }
 out:
+        scrub_free_bio(sbio->bio);
+        sbio->bio = NULL;
        spin_lock(&sdev->list_lock);
        sbio->next_free = sdev->first_free;
        sdev->first_free = sbio->index;
@@ -583,25 +557,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 static int scrub_submit(struct scrub_dev *sdev)
 {
        struct scrub_bio *sbio;
+        struct bio *bio;
+        int i;
        if (sdev->curr == -1)
                return 0;
        sbio = sdev->bios[sdev->curr];
-        sbio->bio->bi_sector = sbio->physical >> 9;
+        bio = bio_alloc(GFP_NOFS, sbio->count);
-        sbio->bio->bi_size = sbio->count * PAGE_SIZE;
+        if (!bio)
-        sbio->bio->bi_next = NULL;
+                goto nomem;
-        sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-        sbio->bio->bi_comp_cpu = -1;
+        bio->bi_private = sbio;
-        sbio->bio->bi_bdev = sdev->dev->bdev;
+        bio->bi_end_io = scrub_bio_end_io;
+        bio->bi_bdev = sdev->dev->bdev;
+        bio->bi_sector = sbio->physical >> 9;
+        for (i = 0; i < sbio->count; ++i) {
+                struct page *page;
+                int ret;
+                page = alloc_page(GFP_NOFS);
+                if (!page)
+                        goto nomem;
+                ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+                if (!ret) {
+                        __free_page(page);
+                        goto nomem;
+                }
+        }
        sbio->err = 0;
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
-        submit_bio(0, sbio->bio);
+        submit_bio(READ, bio);
        return 0;
+nomem:
+        scrub_free_bio(bio);
+        return -ENOMEM;
 }
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -633,7 +632,11 @@ again:
                sbio->logical = logical;
        } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
                   sbio->logical + sbio->count * PAGE_SIZE != logical) {
-                scrub_submit(sdev);
+                int ret;
+                ret = scrub_submit(sdev);
+                if (ret)
+                        return ret;
                goto again;
        }
        sbio->spag[sbio->count].flags = flags;
@@ -645,8 +648,13 @@ again:
                memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
        }
        ++sbio->count;
-        if (sbio->count == SCRUB_PAGES_PER_BIO || force)
+        if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
-                scrub_submit(sdev);
+                int ret;
+                ret = scrub_submit(sdev);
+                if (ret)
+                        return ret;
+        }
        return 0;
 }
@@ -727,6 +735,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_root *csum_root = fs_info->csum_root;
        struct btrfs_extent_item *extent;
+        struct blk_plug plug;
        u64 flags;
        int ret;
        int slot;
@@ -789,18 +798,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
-                        goto out;
+                        goto out_noplug;
-                l = path->nodes[0];
-                slot = path->slots[0];
-                btrfs_item_key_to_cpu(l, &key, slot);
-                if (key.objectid != logical) {
-                        ret = btrfs_previous_item(root, path, 0,
-                                                  BTRFS_EXTENT_ITEM_KEY);
-                        if (ret < 0)
-                                goto out;
-                }
+                /*
+                 * we might miss half an extent here, but that doesn't matter,
+                 * as it's only the prefetch
+                 */
                while (1) {
                        l = path->nodes[0];
                        slot = path->slots[0];
@@ -809,7 +812,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
                                if (ret == 0)
                                        continue;
                                if (ret < 0)
-                                        goto out;
+                                        goto out_noplug;
                                break;
                        }
@@ -831,6 +834,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
         * the scrub. This might currently (crc32) end up to be about 1MB
         */
        start_stripe = 0;
+        blk_start_plug(&plug);
 again:
        logical = base + offset + start_stripe * increment;
        for (i = start_stripe; i < nstripes; ++i) {
@@ -890,15 +894,20 @@ again:
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        goto out;
+                if (ret > 0) {
-                l = path->nodes[0];
-                slot = path->slots[0];
-                btrfs_item_key_to_cpu(l, &key, slot);
-                if (key.objectid != logical) {
                        ret = btrfs_previous_item(root, path, 0,
                                                  BTRFS_EXTENT_ITEM_KEY);
                        if (ret < 0)
                                goto out;
+                        if (ret > 0) {
+                                /* there's no smaller item, so stick with the
+                                 * larger one */
+                                btrfs_release_path(path);
+                                ret = btrfs_search_slot(NULL, root, &key,
+                                                        path, 0, 0);
+                                if (ret < 0)
+                                        goto out;
+                        }
                }
                while (1) {
@@ -972,6 +981,8 @@ next:
        scrub_submit(sdev);
 out:
+        blk_finish_plug(&plug);
+out_noplug:
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
 }
@@ -1047,8 +1058,15 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
-                        goto out;
+                        break;
-                ret = 0;
+                if (ret > 0) {
+                        if (path->slots[0] >=
+                            btrfs_header_nritems(path->nodes[0])) {
+                                ret = btrfs_next_leaf(root, path);
+                                if (ret)
+                                        break;
+                        }
+                }
                l = path->nodes[0];
                slot = path->slots[0];
@@ -1058,7 +1076,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
                if (found_key.objectid != sdev->dev->devid)
                        break;
-                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+                if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
                        break;
                if (found_key.offset >= end)
@@ -1087,7 +1105,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
                if (!cache) {
                        ret = -ENOENT;
-                        goto out;
+                        break;
                }
                ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
                                  chunk_offset, length);
@@ -1099,9 +1117,13 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
                btrfs_release_path(path);
        }
-out:
        btrfs_free_path(path);
-        return ret;
+        /*
+         * ret can still be 1 from search_slot or next_leaf,
+         * that's not an error
+         */
+        return ret < 0 ? ret : 0;
 }
 static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
@@ -1138,8 +1160,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
        struct btrfs_fs_info *fs_info = root->fs_info;
        mutex_lock(&fs_info->scrub_lock);
-        if (fs_info->scrub_workers_refcnt == 0)
+        if (fs_info->scrub_workers_refcnt == 0) {
+                btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+                           fs_info->thread_pool_size, &fs_info->generic_worker);
+                fs_info->scrub_workers.idle_thresh = 4;
                btrfs_start_workers(&fs_info->scrub_workers, 1);
+        }
        ++fs_info->scrub_workers_refcnt;
        mutex_unlock(&fs_info->scrub_lock);
@@ -1166,7 +1192,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
        int ret;
        struct btrfs_device *dev;
-        if (root->fs_info->closing)
+        if (btrfs_fs_closing(root->fs_info))
                return -EINVAL;
        /*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9b2e7e5bc3ef..0bb4ebbb71b7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -161,7 +161,8 @@ enum {
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
+        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
+        Opt_inode_cache, Opt_err,
 };
 static match_table_t tokens = {
@@ -193,6 +194,7 @@ static match_table_t tokens = {
        {Opt_enospc_debug, "enospc_debug"},
        {Opt_subvolrootid, "subvolrootid=%d"},
        {Opt_defrag, "autodefrag"},
+        {Opt_inode_cache, "inode_cache"},
        {Opt_err, NULL},
 };
@@ -361,6 +363,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        printk(KERN_INFO "btrfs: enabling disk space caching\n");
                        btrfs_set_opt(info->mount_opt, SPACE_CACHE);
                        break;
+                case Opt_inode_cache:
+                        printk(KERN_INFO "btrfs: enabling inode map caching\n");
+                        btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
+                        break;
                case Opt_clear_cache:
                        printk(KERN_INFO "btrfs: force clearing of disk cache\n");
                        btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -819,7 +825,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        } else {
                char b[BDEVNAME_SIZE];
-                s->s_flags = flags;
+                s->s_flags = flags | MS_NOSEC;
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
                error = btrfs_fill_super(s, fs_devices, data,
                                         flags & MS_SILENT ? 1 : 0);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c3c223ae6691..daac9ae6d731 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -28,152 +28,6 @@
 #include "disk-io.h"
 #include "transaction.h"
-static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
-{
-        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                (unsigned long long)btrfs_root_used(&root->root_item));
-}
-static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
-{
-        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                (unsigned long long)btrfs_root_limit(&root->root_item));
-}
-static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
-{
-        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
-}
-static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
-{
-        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
-}
-static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
-{
-        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
-}
-/* this is for root attrs (subvols/snapshots) */
-struct btrfs_root_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct btrfs_root *, char *);
-        ssize_t (*store)(struct btrfs_root *, const char *, size_t);
-};
-#define ROOT_ATTR(name, mode, show, store) \
-static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
-                                                              show, store)
-ROOT_ATTR(blocks_used,  0444,   root_blocks_used_show,  NULL);
-ROOT_ATTR(block_limit,  0644,   root_block_limit_show,  NULL);
-static struct attribute *btrfs_root_attrs[] = {
-        &btrfs_root_attr_blocks_used.attr,
-        &btrfs_root_attr_block_limit.attr,
-        NULL,
-};
-/* this is for super attrs (actual full fs) */
-struct btrfs_super_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct btrfs_fs_info *, char *);
-        ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
-};
-#define SUPER_ATTR(name, mode, show, store) \
-static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
-                                                                show, store)
-SUPER_ATTR(blocks_used,         0444,   super_blocks_used_show,         NULL);
-SUPER_ATTR(total_blocks,        0444,   super_total_blocks_show,        NULL);
-SUPER_ATTR(blocksize,           0444,   super_blocksize_show,           NULL);
-static struct attribute *btrfs_super_attrs[] = {
-        &btrfs_super_attr_blocks_used.attr,
-        &btrfs_super_attr_total_blocks.attr,
-        &btrfs_super_attr_blocksize.attr,
-        NULL,
-};
-static ssize_t btrfs_super_attr_show(struct kobject *kobj,
-                                    struct attribute *attr, char *buf)
-{
-        struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
-                                                super_kobj);
-        struct btrfs_super_attr *a = container_of(attr,
-                                                  struct btrfs_super_attr,
-                                                  attr);
-        return a->show ? a->show(fs, buf) : 0;
-}
-static ssize_t btrfs_super_attr_store(struct kobject *kobj,
-                                     struct attribute *attr,
-                                     const char *buf, size_t len)
-{
-        struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
-                                                super_kobj);
-        struct btrfs_super_attr *a = container_of(attr,
-                                                  struct btrfs_super_attr,
-                                                  attr);
-        return a->store ? a->store(fs, buf, len) : 0;
-}
-static ssize_t btrfs_root_attr_show(struct kobject *kobj,
-                                    struct attribute *attr, char *buf)
-{
-        struct btrfs_root *root = container_of(kobj, struct btrfs_root,
-                                                root_kobj);
-        struct btrfs_root_attr *a = container_of(attr,
-                                                 struct btrfs_root_attr,
-                                                 attr);
-        return a->show ? a->show(root, buf) : 0;
-}
-static ssize_t btrfs_root_attr_store(struct kobject *kobj,
-                                     struct attribute *attr,
-                                     const char *buf, size_t len)
-{
-        struct btrfs_root *root = container_of(kobj, struct btrfs_root,
-                                                root_kobj);
-        struct btrfs_root_attr *a = container_of(attr,
-                                                 struct btrfs_root_attr,
-                                                 attr);
-        return a->store ? a->store(root, buf, len) : 0;
-}
-static void btrfs_super_release(struct kobject *kobj)
-{
-        struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
-                                                super_kobj);
-        complete(&fs->kobj_unregister);
-}
-static void btrfs_root_release(struct kobject *kobj)
-{
-        struct btrfs_root *root = container_of(kobj, struct btrfs_root,
-                                                root_kobj);
-        complete(&root->kobj_unregister);
-}
-static const struct sysfs_ops btrfs_super_attr_ops = {
-        .show   = btrfs_super_attr_show,
-        .store  = btrfs_super_attr_store,
-};
-static const struct sysfs_ops btrfs_root_attr_ops = {
-        .show   = btrfs_root_attr_show,
-        .store  = btrfs_root_attr_store,
-};
 /* /sys/fs/btrfs/ entry */
 static struct kset *btrfs_kset;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dc80f7156923..51dcec86757f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,6 +35,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
+                BUG_ON(!list_empty(&transaction->list));
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@ -49,46 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
 * either allocate a new transaction or hop into the existing one
 */
-static noinline int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root, int nofail)
 {
        struct btrfs_transaction *cur_trans;
+        spin_lock(&root->fs_info->trans_lock);
+        if (root->fs_info->trans_no_join) {
+                if (!nofail) {
+                        spin_unlock(&root->fs_info->trans_lock);
+                        return -EBUSY;
+                }
+        }
        cur_trans = root->fs_info->running_transaction;
-        if (!cur_trans) {
+        if (cur_trans) {
-                cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
+                atomic_inc(&cur_trans->use_count);
-                                             GFP_NOFS);
+                atomic_inc(&cur_trans->num_writers);
-                if (!cur_trans)
+                cur_trans->num_joined++;
-                        return -ENOMEM;
+                spin_unlock(&root->fs_info->trans_lock);
-                root->fs_info->generation++;
+                return 0;
-                atomic_set(&cur_trans->num_writers, 1);
+        }
-                cur_trans->num_joined = 0;
+        spin_unlock(&root->fs_info->trans_lock);
-                cur_trans->transid = root->fs_info->generation;
-                init_waitqueue_head(&cur_trans->writer_wait);
+        cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
-                init_waitqueue_head(&cur_trans->commit_wait);
+        if (!cur_trans)
-                cur_trans->in_commit = 0;
+                return -ENOMEM;
-                cur_trans->blocked = 0;
+        spin_lock(&root->fs_info->trans_lock);
-                atomic_set(&cur_trans->use_count, 1);
+        if (root->fs_info->running_transaction) {
-                cur_trans->commit_done = 0;
+                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-                cur_trans->start_time = get_seconds();
+                cur_trans = root->fs_info->running_transaction;
+                atomic_inc(&cur_trans->use_count);
-                cur_trans->delayed_refs.root = RB_ROOT;
-                cur_trans->delayed_refs.num_entries = 0;
-                cur_trans->delayed_refs.num_heads_ready = 0;
-                cur_trans->delayed_refs.num_heads = 0;
-                cur_trans->delayed_refs.flushing = 0;
-                cur_trans->delayed_refs.run_delayed_start = 0;
-                spin_lock_init(&cur_trans->delayed_refs.lock);
-                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-                extent_io_tree_init(&cur_trans->dirty_pages,
-                                     root->fs_info->btree_inode->i_mapping);
-                spin_lock(&root->fs_info->new_trans_lock);
-                root->fs_info->running_transaction = cur_trans;
-                spin_unlock(&root->fs_info->new_trans_lock);
-        } else {
                atomic_inc(&cur_trans->num_writers);
                cur_trans->num_joined++;
+                spin_unlock(&root->fs_info->trans_lock);
+                return 0;
        }
+        atomic_set(&cur_trans->num_writers, 1);
+        cur_trans->num_joined = 0;
+        init_waitqueue_head(&cur_trans->writer_wait);
+        init_waitqueue_head(&cur_trans->commit_wait);
+        cur_trans->in_commit = 0;
+        cur_trans->blocked = 0;
+        /*
+         * One for this trans handle, one so it will live on until we
+         * commit the transaction.
+         */
+        atomic_set(&cur_trans->use_count, 2);
+        cur_trans->commit_done = 0;
+        cur_trans->start_time = get_seconds();
+        cur_trans->delayed_refs.root = RB_ROOT;
+        cur_trans->delayed_refs.num_entries = 0;
+        cur_trans->delayed_refs.num_heads_ready = 0;
+        cur_trans->delayed_refs.num_heads = 0;
+        cur_trans->delayed_refs.flushing = 0;
+        cur_trans->delayed_refs.run_delayed_start = 0;
+        spin_lock_init(&cur_trans->commit_lock);
+        spin_lock_init(&cur_trans->delayed_refs.lock);
+        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+        list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+        extent_io_tree_init(&cur_trans->dirty_pages,
+                             root->fs_info->btree_inode->i_mapping);
+        root->fs_info->generation++;
+        cur_trans->transid = root->fs_info->generation;
+        root->fs_info->running_transaction = cur_trans;
+        spin_unlock(&root->fs_info->trans_lock);
        return 0;
 }
@@ -99,36 +126,82 @@ static noinline int join_transaction(struct btrfs_root *root)
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
-static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
-                                         struct btrfs_root *root)
+                               struct btrfs_root *root)
 {
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
                WARN_ON(root->commit_root != root->node);
+                /*
+                 * see below for in_trans_setup usage rules
+                 * we have the reloc mutex held now, so there
+                 * is only one writer in this function
+                 */
+                root->in_trans_setup = 1;
+                /* make sure readers find in_trans_setup before
+                 * they find our root->last_trans update
+                 */
+                smp_wmb();
+                spin_lock(&root->fs_info->fs_roots_radix_lock);
+                if (root->last_trans == trans->transid) {
+                        spin_unlock(&root->fs_info->fs_roots_radix_lock);
+                        return 0;
+                }
                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                           (unsigned long)root->root_key.objectid,
                           BTRFS_ROOT_TRANS_TAG);
+                spin_unlock(&root->fs_info->fs_roots_radix_lock);
                root->last_trans = trans->transid;
+                /* this is pretty tricky.  We don't want to
+                 * take the relocation lock in btrfs_record_root_in_trans
+                 * unless we're really doing the first setup for this root in
+                 * this transaction.
+                 *
+                 * Normally we'd use root->last_trans as a flag to decide
+                 * if we want to take the expensive mutex.
+                 *
+                 * But, we have to set root->last_trans before we
+                 * init the relocation root, otherwise, we trip over warnings
+                 * in ctree.c.  The solution used here is to flag ourselves
+                 * with root->in_trans_setup.  When this is 1, we're still
+                 * fixing up the reloc trees and everyone must wait.
+                 *
+                 * When this is zero, they can trust root->last_trans and fly
+                 * through btrfs_record_root_in_trans without having to take the
+                 * lock.  smp_wmb() makes sure that all the writes above are
+                 * done before we pop in the zero below
+                 */
                btrfs_init_reloc_root(trans, root);
+                smp_wmb();
+                root->in_trans_setup = 0;
        }
        return 0;
 }
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
        if (!root->ref_cows)
                return 0;
-        mutex_lock(&root->fs_info->trans_mutex);
+        /*
-        if (root->last_trans == trans->transid) {
+         * see record_root_in_trans for comments about in_trans_setup usage
-                mutex_unlock(&root->fs_info->trans_mutex);
+         * and barriers
+         */
+        smp_rmb();
+        if (root->last_trans == trans->transid &&
+            !root->in_trans_setup)
                return 0;
-        }
+        mutex_lock(&root->fs_info->reloc_mutex);
        record_root_in_trans(trans, root);
-        mutex_unlock(&root->fs_info->trans_mutex);
+        mutex_unlock(&root->fs_info->reloc_mutex);
        return 0;
 }
@@ -140,21 +213,23 @@ static void wait_current_trans(struct btrfs_root *root)
 {
        struct btrfs_transaction *cur_trans;
+        spin_lock(&root->fs_info->trans_lock);
        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
                atomic_inc(&cur_trans->use_count);
+                spin_unlock(&root->fs_info->trans_lock);
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (!cur_trans->blocked)
                                break;
-                        mutex_unlock(&root->fs_info->trans_mutex);
                        schedule();
-                        mutex_lock(&root->fs_info->trans_mutex);
                }
                finish_wait(&root->fs_info->transaction_wait, &wait);
                put_transaction(cur_trans);
+        } else {
+                spin_unlock(&root->fs_info->trans_lock);
        }
 }
@@ -167,10 +242,16 @@ enum btrfs_trans_type {
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
-        if (!root->fs_info->log_root_recovering &&
+        if (root->fs_info->log_root_recovering)
-            ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+                return 0;
-             type == TRANS_USERSPACE))
+        if (type == TRANS_USERSPACE)
                return 1;
+        if (type == TRANS_START &&
+            !atomic_read(&root->fs_info->open_ioctl_trans))
+                return 1;
        return 0;
 }
@@ -184,36 +265,44 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
                return ERR_PTR(-EROFS);
+        if (current->journal_info) {
+                WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+                h = current->journal_info;
+                h->use_count++;
+                h->orig_rsv = h->block_rsv;
+                h->block_rsv = NULL;
+                goto got_it;
+        }
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
                return ERR_PTR(-ENOMEM);
-        if (type != TRANS_JOIN_NOLOCK)
-                mutex_lock(&root->fs_info->trans_mutex);
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
-        ret = join_transaction(root);
+        do {
+                ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+                if (ret == -EBUSY)
+                        wait_current_trans(root);
+        } while (ret == -EBUSY);
        if (ret < 0) {
                kmem_cache_free(btrfs_trans_handle_cachep, h);
-                if (type != TRANS_JOIN_NOLOCK)
-                        mutex_unlock(&root->fs_info->trans_mutex);
                return ERR_PTR(ret);
        }
        cur_trans = root->fs_info->running_transaction;
-        atomic_inc(&cur_trans->use_count);
-        if (type != TRANS_JOIN_NOLOCK)
-                mutex_unlock(&root->fs_info->trans_mutex);
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
        h->blocks_used = 0;
-        h->block_group = 0;
        h->bytes_reserved = 0;
        h->delayed_ref_updates = 0;
+        h->use_count = 1;
        h->block_rsv = NULL;
+        h->orig_rsv = NULL;
        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -241,11 +330,8 @@ again:
                }
        }
-        if (type != TRANS_JOIN_NOLOCK)
+got_it:
-                mutex_lock(&root->fs_info->trans_mutex);
+        btrfs_record_root_in_trans(h, root);
-        record_root_in_trans(h, root);
-        if (type != TRANS_JOIN_NOLOCK)
-                mutex_unlock(&root->fs_info->trans_mutex);
        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;
@@ -257,22 +343,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 {
        return start_transaction(root, num_items, TRANS_START);
 }
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
-                                                   int num_blocks)
 {
        return start_transaction(root, 0, TRANS_JOIN);
 }
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
-                                                          int num_blocks)
 {
        return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
 }
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
-                                                         int num_blocks)
 {
-        return start_transaction(r, 0, TRANS_USERSPACE);
+        return start_transaction(root, 0, TRANS_USERSPACE);
 }
 /* wait for a transaction commit to be fully complete */
@@ -280,17 +363,13 @@ static noinline int wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
 {
        DEFINE_WAIT(wait);
-        mutex_lock(&root->fs_info->trans_mutex);
        while (!commit->commit_done) {
                prepare_to_wait(&commit->commit_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit->commit_done)
                        break;
-                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-                mutex_lock(&root->fs_info->trans_mutex);
        }
-        mutex_unlock(&root->fs_info->trans_mutex);
        finish_wait(&commit->commit_wait, &wait);
        return 0;
 }
@@ -300,59 +379,56 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
        struct btrfs_transaction *cur_trans = NULL, *t;
        int ret;
-        mutex_lock(&root->fs_info->trans_mutex);
        ret = 0;
        if (transid) {
                if (transid <= root->fs_info->last_trans_committed)
-                        goto out_unlock;
+                        goto out;
                /* find specified transaction */
+                spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry(t, &root->fs_info->trans_list, list) {
                        if (t->transid == transid) {
                                cur_trans = t;
+                                atomic_inc(&cur_trans->use_count);
                                break;
                        }
                        if (t->transid > transid)
                                break;
                }
+                spin_unlock(&root->fs_info->trans_lock);
                ret = -EINVAL;
                if (!cur_trans)
-                        goto out_unlock;  /* bad transid */
+                        goto out;  /* bad transid */
        } else {
                /* find newest transaction that is committing | committed */
+                spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry_reverse(t, &root->fs_info->trans_list,
                                            list) {
                        if (t->in_commit) {
                                if (t->commit_done)
-                                        goto out_unlock;
+                                        break;
                                cur_trans = t;
+                                atomic_inc(&cur_trans->use_count);
                                break;
                        }
                }
+                spin_unlock(&root->fs_info->trans_lock);
                if (!cur_trans)
-                        goto out_unlock;  /* nothing committing|committed */
+                        goto out;  /* nothing committing|committed */
        }
-        atomic_inc(&cur_trans->use_count);
-        mutex_unlock(&root->fs_info->trans_mutex);
        wait_for_commit(root, cur_trans);
-        mutex_lock(&root->fs_info->trans_mutex);
        put_transaction(cur_trans);
        ret = 0;
-out_unlock:
+out:
-        mutex_unlock(&root->fs_info->trans_mutex);
        return ret;
 }
 void btrfs_throttle(struct btrfs_root *root)
 {
-        mutex_lock(&root->fs_info->trans_mutex);
+        if (!atomic_read(&root->fs_info->open_ioctl_trans))
-        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
-        mutex_unlock(&root->fs_info->trans_mutex);
 }
 static int should_end_transaction(struct btrfs_trans_handle *trans,
@@ -370,6 +446,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_transaction *cur_trans = trans->transaction;
        int updates;
+        smp_mb();
        if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
                return 1;
@@ -388,6 +465,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;
+        if (--trans->use_count) {
+                trans->block_rsv = trans->orig_rsv;
+                return 0;
+        }
        while (count < 4) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
@@ -410,9 +492,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        btrfs_trans_release_metadata(trans, root);
-        if (lock && !root->fs_info->open_ioctl_trans &&
+        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
-            should_end_transaction(trans, root))
+            should_end_transaction(trans, root)) {
                trans->transaction->blocked = 1;
+                smp_wmb();
+        }
        if (lock && cur_trans->blocked && !cur_trans->in_commit) {
                if (throttle)
@@ -703,9 +787,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 */
 int btrfs_add_dead_root(struct btrfs_root *root)
 {
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&root->fs_info->trans_lock);
        list_add(&root->root_list, &root->fs_info->dead_roots);
-        mutex_unlock(&root->fs_info->trans_mutex);
+        spin_unlock(&root->fs_info->trans_lock);
        return 0;
 }
@@ -721,6 +805,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
        int ret;
        int err = 0;
+        spin_lock(&fs_info->fs_roots_radix_lock);
        while (1) {
                ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
                                                 (void **)gang, 0,
@@ -733,6 +818,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                        (unsigned long)root->root_key.objectid,
                                        BTRFS_ROOT_TRANS_TAG);
+                        spin_unlock(&fs_info->fs_roots_radix_lock);
                        btrfs_free_log(trans, root);
                        btrfs_update_reloc_root(trans, root);
@@ -753,10 +839,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        err = btrfs_update_root(trans, fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
+                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
                }
        }
+        spin_unlock(&fs_info->fs_roots_radix_lock);
        return err;
 }
@@ -786,7 +874,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();
-                if (root->fs_info->closing || ret != -EAGAIN)
+                if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
@@ -869,6 +957,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);
+        /*
+         * pull in the delayed directory update
+         * and the delayed inode item
+         * otherwise we corrupt the FS during
+         * snapshot
+         */
+        ret = btrfs_run_delayed_items(trans, root);
+        BUG_ON(ret);
        record_root_in_trans(trans, root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
@@ -930,14 +1027,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
        int ret;
        list_for_each_entry(pending, head, list) {
-                /*
-                 * We must deal with the delayed items before creating
-                 * snapshots, or we will create a snapthot with inconsistent
-                 * information.
-                */
-                ret = btrfs_run_delayed_items(trans, fs_info->fs_root);
-                BUG_ON(ret);
                ret = create_pending_snapshot(trans, fs_info, pending);
                BUG_ON(ret);
        }
@@ -967,20 +1056,20 @@ static void update_super_roots(struct btrfs_root *root)
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 {
        int ret = 0;
-        spin_lock(&info->new_trans_lock);
+        spin_lock(&info->trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->in_commit;
-        spin_unlock(&info->new_trans_lock);
+        spin_unlock(&info->trans_lock);
        return ret;
 }
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 {
        int ret = 0;
-        spin_lock(&info->new_trans_lock);
+        spin_lock(&info->trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->blocked;
-        spin_unlock(&info->new_trans_lock);
+        spin_unlock(&info->trans_lock);
        return ret;
 }
@@ -1004,9 +1093,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
                                    &wait);
                        break;
                }
-                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
        }
 }
@@ -1032,9 +1119,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
                                    &wait);
                        break;
                }
-                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_wait,
                            &wait);
        }
@@ -1072,7 +1157,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        INIT_DELAYED_WORK(&ac->work, do_async_commit);
        ac->root = root;
-        ac->newtrans = btrfs_join_transaction(root, 0);
+        ac->newtrans = btrfs_join_transaction(root);
        if (IS_ERR(ac->newtrans)) {
                int err = PTR_ERR(ac->newtrans);
                kfree(ac);
@@ -1080,23 +1165,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        }
        /* take transaction reference */
-        mutex_lock(&root->fs_info->trans_mutex);
        cur_trans = trans->transaction;
        atomic_inc(&cur_trans->use_count);
-        mutex_unlock(&root->fs_info->trans_mutex);
        btrfs_end_transaction(trans, root);
        schedule_delayed_work(&ac->work, 0);
        /* wait for transaction to start and unblock */
-        mutex_lock(&root->fs_info->trans_mutex);
        if (wait_for_unblock)
                wait_current_trans_commit_start_and_unblock(root, cur_trans);
        else
                wait_current_trans_commit_start(root, cur_trans);
-        put_transaction(cur_trans);
-        mutex_unlock(&root->fs_info->trans_mutex);
+        if (current->journal_info == trans)
+                current->journal_info = NULL;
+        put_transaction(cur_trans);
        return 0;
 }
@@ -1139,38 +1223,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&cur_trans->commit_lock);
        if (cur_trans->in_commit) {
+                spin_unlock(&cur_trans->commit_lock);
                atomic_inc(&cur_trans->use_count);
-                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);
                ret = wait_for_commit(root, cur_trans);
                BUG_ON(ret);
-                mutex_lock(&root->fs_info->trans_mutex);
                put_transaction(cur_trans);
-                mutex_unlock(&root->fs_info->trans_mutex);
                return 0;
        }
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
+        spin_unlock(&cur_trans->commit_lock);
        wake_up(&root->fs_info->transaction_blocked_wait);
+        spin_lock(&root->fs_info->trans_lock);
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
                        atomic_inc(&prev_trans->use_count);
-                        mutex_unlock(&root->fs_info->trans_mutex);
+                        spin_unlock(&root->fs_info->trans_lock);
                        wait_for_commit(root, prev_trans);
-                        mutex_lock(&root->fs_info->trans_mutex);
                        put_transaction(prev_trans);
+                } else {
+                        spin_unlock(&root->fs_info->trans_lock);
                }
+        } else {
+                spin_unlock(&root->fs_info->trans_lock);
        }
        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1178,12 +1265,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
                if (!list_empty(&trans->transaction->pending_snapshots))
                        snap_pending = 1;
                WARN_ON(cur_trans != trans->transaction);
-                mutex_unlock(&root->fs_info->trans_mutex);
                if (flush_on_commit || snap_pending) {
                        btrfs_start_delalloc_inodes(root, 1);
@@ -1206,26 +1293,48 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
-                smp_mb();
                if (atomic_read(&cur_trans->num_writers) > 1)
                        schedule_timeout(MAX_SCHEDULE_TIMEOUT);
                else if (should_grow)
                        schedule_timeout(1);
-                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));
-        ret = create_pending_snapshots(trans, root->fs_info);
+        /*
-        BUG_ON(ret);
+         * Ok now we need to make sure to block out any other joins while we
+         * commit the transaction.  We could have started a join before setting
+         * no_join so make sure to wait for num_writers to == 1 again.
+         */
+        spin_lock(&root->fs_info->trans_lock);
+        root->fs_info->trans_no_join = 1;
+        spin_unlock(&root->fs_info->trans_lock);
+        wait_event(cur_trans->writer_wait,
+                   atomic_read(&cur_trans->num_writers) == 1);
+        /*
+         * the reloc mutex makes sure that we stop
+         * the balancing code from coming in and moving
+         * extents around in the middle of the commit
+         */
+        mutex_lock(&root->fs_info->reloc_mutex);
        ret = btrfs_run_delayed_items(trans, root);
        BUG_ON(ret);
+        ret = create_pending_snapshots(trans, root->fs_info);
+        BUG_ON(ret);
        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);
+        /*
+         * make sure none of the code above managed to slip in a
+         * delayed item
+         */
+        btrfs_assert_delayed_root_empty(root);
        WARN_ON(cur_trans != trans->transaction);
        btrfs_scrub_pause(root);
@@ -1258,9 +1367,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_prepare_extent_commit(trans, root);
        cur_trans = root->fs_info->running_transaction;
-        spin_lock(&root->fs_info->new_trans_lock);
-        root->fs_info->running_transaction = NULL;
-        spin_unlock(&root->fs_info->new_trans_lock);
        btrfs_set_root_node(&root->fs_info->tree_root->root_item,
                            root->fs_info->tree_root->node);
@@ -1281,10 +1387,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
               sizeof(root->fs_info->super_copy));
        trans->transaction->blocked = 0;
+        spin_lock(&root->fs_info->trans_lock);
+        root->fs_info->running_transaction = NULL;
+        root->fs_info->trans_no_join = 0;
+        spin_unlock(&root->fs_info->trans_lock);
+        mutex_unlock(&root->fs_info->reloc_mutex);
        wake_up(&root->fs_info->transaction_wait);
-        mutex_unlock(&root->fs_info->trans_mutex);
        ret = btrfs_write_and_wait_transaction(trans, root);
        BUG_ON(ret);
        write_ctree_super(trans, root, 0);
@@ -1297,22 +1407,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_finish_extent_commit(trans, root);
-        mutex_lock(&root->fs_info->trans_mutex);
        cur_trans->commit_done = 1;
        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);
+        spin_lock(&root->fs_info->trans_lock);
        list_del_init(&cur_trans->list);
+        spin_unlock(&root->fs_info->trans_lock);
        put_transaction(cur_trans);
        put_transaction(cur_trans);
        trace_btrfs_transaction_commit(root);
-        mutex_unlock(&root->fs_info->trans_mutex);
        btrfs_scrub_continue(root);
        if (current->journal_info == trans)
@@ -1334,9 +1443,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
        LIST_HEAD(list);
        struct btrfs_fs_info *fs_info = root->fs_info;
-        mutex_lock(&fs_info->trans_mutex);
+        spin_lock(&fs_info->trans_lock);
        list_splice_init(&fs_info->dead_roots, &list);
-        mutex_unlock(&fs_info->trans_mutex);
+        spin_unlock(&fs_info->trans_lock);
        while (!list_empty(&list)) {
                root = list_entry(list.next, struct btrfs_root, root_list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 804c88639e5d..02564e6230ac 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -28,10 +28,12 @@ struct btrfs_transaction {
         * transaction can end
         */
        atomic_t num_writers;
+        atomic_t use_count;
        unsigned long num_joined;
+        spinlock_t commit_lock;
        int in_commit;
-        atomic_t use_count;
        int commit_done;
        int blocked;
        struct list_head list;
@@ -45,13 +47,14 @@ struct btrfs_transaction {
 struct btrfs_trans_handle {
        u64 transid;
-        u64 block_group;
        u64 bytes_reserved;
+        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
        unsigned long delayed_ref_updates;
        struct btrfs_transaction *transaction;
        struct btrfs_block_rsv *block_rsv;
+        struct btrfs_block_rsv *orig_rsv;
 };
 struct btrfs_pending_snapshot {
@@ -66,19 +69,6 @@ struct btrfs_pending_snapshot {
        struct list_head list;
 };
-static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
-                                               struct inode *inode)
-{
-        trans->block_group = BTRFS_I(inode)->block_group;
-}
-static inline void btrfs_update_inode_block_group(
-                                          struct btrfs_trans_handle *trans,
-                                          struct inode *inode)
-{
-        BTRFS_I(inode)->block_group = trans->block_group;
-}
 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
                                              struct inode *inode)
 {
@@ -92,12 +82,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items);
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
-                                                  int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
-                                                          int num_blocks);
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
-                                                         int num_blocks);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 592396c6dc47..4ce8a9f41d1e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3177,7 +3177,7 @@ again:
                tmp_key.offset = (u64)-1;
                wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
-                BUG_ON(!wc.replay_dest);
+                BUG_ON(IS_ERR_OR_NULL(wc.replay_dest));
                wc.replay_dest->log_root = log;
                btrfs_record_root_in_trans(trans, wc.replay_dest);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c48214ef5c09..1efa56e18f9b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                BUG_ON(!new_device);
                memcpy(new_device, device, sizeof(*new_device));
                new_device->name = kstrdup(device->name, GFP_NOFS);
-                BUG_ON(!new_device->name);
+                BUG_ON(device->name && !new_device->name);
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
@@ -689,12 +689,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
-        else {
+        else
-                /* FIXME, make a readl uuid parser */
+                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
-                printk(KERN_INFO "device fsid %llx-%llx ",
-                       *(unsigned long long *)disk_super->fsid,
-                       *(unsigned long long *)(disk_super->fsid + 8));
-        }
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f3107e4b4d56..5366fe452ab0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, inode);
        ret = do_setxattr(trans, inode, name, value, size, flags);
        if (ret)
                goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 49c9aada0374..1a80b048ade8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1902,10 +1902,8 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }
-        if (unlikely(err)) {
+        if (unlikely(err))
                page_zero_new_buffers(page, from, to);
-                ClearPageUptodate(page);
-        }
        return err;
 }
 EXPORT_SYMBOL(__block_write_begin);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 33da49dc3cc6..5a3953db8118 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -453,7 +453,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
        int err;
        struct inode *inode = page->mapping->host;
        BUG_ON(!inode);
-        igrab(inode);
+        ihold(inode);
        err = writepage_nounlock(page, wbc);
        unlock_page(page);
        iput(inode);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1f72b00447c4..f605753c8fe9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2940,14 +2940,12 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
        while (!list_empty(&mdsc->cap_dirty)) {
                ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
                                      i_dirty_item);
-                inode = igrab(&ci->vfs_inode);
+                inode = &ci->vfs_inode;
+                ihold(inode);
                dout("flush_dirty_caps %p\n", inode);
                spin_unlock(&mdsc->cap_dirty_lock);
-                if (inode) {
+                ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
-                        ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+                iput(inode);
-                                        NULL);
-                        iput(inode);
-                }
                spin_lock(&mdsc->cap_dirty_lock);
        }
        spin_unlock(&mdsc->cap_dirty_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 33729e822bb9..ef8f08c343e8 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -308,7 +308,8 @@ more:
                req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
                if (IS_ERR(req))
                        return PTR_ERR(req);
-                req->r_inode = igrab(inode);
+                req->r_inode = inode;
+                ihold(inode);
                req->r_dentry = dget(filp->f_dentry);
                /* hints to request -> mds selection code */
                req->r_direct_mode = USE_AUTH_MDS;
@@ -787,10 +788,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        err = ceph_mdsc_do_request(mdsc, dir, req);
-        if (err)
+        if (err) {
                d_drop(dentry);
-        else if (!req->r_reply_info.head->is_dentry)
+        } else if (!req->r_reply_info.head->is_dentry) {
-                d_instantiate(dentry, igrab(old_dentry->d_inode));
+                ihold(old_dentry->d_inode);
+                d_instantiate(dentry, old_dentry->d_inode);
+        }
        ceph_mdsc_put_request(req);
        return err;
 }
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index a610d3d67488..f67b687550de 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -109,7 +109,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
                err = ceph_mdsc_do_request(mdsc, NULL, req);
                inode = req->r_target_inode;
                if (inode)
-                        igrab(inode);
+                        ihold(inode);
                ceph_mdsc_put_request(req);
                if (!inode)
                        return ERR_PTR(-ESTALE);
@@ -167,7 +167,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
                err = ceph_mdsc_do_request(mdsc, NULL, req);
                inode = req->r_target_inode;
                if (inode)
-                        igrab(inode);
+                        ihold(inode);
                ceph_mdsc_put_request(req);
                if (!inode)
                        return ERR_PTR(err ? err : -ESTALE);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 203252d88d9f..9542f07d0b93 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -191,7 +191,8 @@ int ceph_open(struct inode *inode, struct file *file)
                err = PTR_ERR(req);
                goto out;
        }
-        req->r_inode = igrab(inode);
+        req->r_inode = inode;
+        ihold(inode);
        req->r_num_caps = 1;
        err = ceph_mdsc_do_request(mdsc, parent_inode, req);
        if (!err)
@@ -282,7 +283,7 @@ int ceph_release(struct inode *inode, struct file *file)
 static int striped_read(struct inode *inode,
                        u64 off, u64 len,
                        struct page **pages, int num_pages,
-                        int *checkeof, bool align_to_pages,
+                        int *checkeof, bool o_direct,
                        unsigned long buf_align)
 {
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -307,7 +308,7 @@ static int striped_read(struct inode *inode,
        io_align = off & ~PAGE_MASK;
 more:
-        if (align_to_pages)
+        if (o_direct)
                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
        else
                page_align = pos & ~PAGE_MASK;
@@ -317,10 +318,10 @@ more:
                                  ci->i_truncate_seq,
                                  ci->i_truncate_size,
                                  page_pos, pages_left, page_align);
-        hit_stripe = this_len < left;
-        was_short = ret >= 0 && ret < this_len;
        if (ret == -ENOENT)
                ret = 0;
+        hit_stripe = this_len < left;
+        was_short = ret >= 0 && ret < this_len;
        dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
             ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
@@ -345,20 +346,22 @@ more:
        }
        if (was_short) {
-                /* was original extent fully inside i_size? */
+                /* did we bounce off eof? */
-                if (pos + left <= inode->i_size) {
+                if (pos + left > inode->i_size)
-                        dout("zero tail\n");
+                        *checkeof = 1;
-                        ceph_zero_page_vector_range(page_off + read, len - read,
+                /* zero trailing bytes (inside i_size) */
+                if (left > 0 && pos < inode->i_size) {
+                        if (pos + left > inode->i_size)
+                                left = inode->i_size - pos;
+                        dout("zero tail %d\n", left);
+                        ceph_zero_page_vector_range(page_off + read, left,
                                                    pages);
-                        read = len;
+                        read += left;
-                        goto out;
                }
-                /* check i_size */
-                *checkeof = 1;
        }
-out:
        if (ret >= 0)
                ret = read;
        dout("striped_read returns %d\n", ret);
@@ -658,7 +661,7 @@ out:
                /* hit EOF or hole? */
                if (statret == 0 && *ppos < inode->i_size) {
-                        dout("aio_read sync_read hit hole, reading more\n");
+                        dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
                        read += ret;
                        base += ret;
                        len -= ret;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 70b6a4839c38..d8858e96ab18 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1101,10 +1101,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                                goto done;
                        }
                        req->r_dentry = dn;  /* may have spliced */
-                        igrab(in);
+                        ihold(in);
                } else if (ceph_ino(in) == vino.ino &&
                           ceph_snap(in) == vino.snap) {
-                        igrab(in);
+                        ihold(in);
                } else {
                        dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
                             dn, in, ceph_ino(in), ceph_snap(in),
@@ -1144,7 +1144,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        goto done;
                }
                req->r_dentry = dn;  /* may have spliced */
-                igrab(in);
+                ihold(in);
                rinfo->head->is_dentry = 1;  /* fool notrace handlers */
        }
@@ -1328,7 +1328,7 @@ void ceph_queue_writeback(struct inode *inode)
        if (queue_work(ceph_inode_to_client(inode)->wb_wq,
                       &ceph_inode(inode)->i_wb_work)) {
                dout("ceph_queue_writeback %p\n", inode);
-                igrab(inode);
+                ihold(inode);
        } else {
                dout("ceph_queue_writeback %p failed\n", inode);
        }
@@ -1353,7 +1353,7 @@ void ceph_queue_invalidate(struct inode *inode)
        if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
                       &ceph_inode(inode)->i_pg_inv_work)) {
                dout("ceph_queue_invalidate %p\n", inode);
-                igrab(inode);
+                ihold(inode);
        } else {
                dout("ceph_queue_invalidate %p failed\n", inode);
        }
@@ -1477,7 +1477,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
        if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
                       &ci->i_vmtruncate_work)) {
                dout("ceph_queue_vmtruncate %p\n", inode);
-                igrab(inode);
+                ihold(inode);
        } else {
                dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
                     inode, ci->i_truncate_pending);
@@ -1738,7 +1738,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
                __mark_inode_dirty(inode, inode_dirty_flags);
        if (mask) {
-                req->r_inode = igrab(inode);
+                req->r_inode = inode;
+                ihold(inode);
                req->r_inode_drop = release;
                req->r_args.setattr.mask = cpu_to_le32(mask);
                req->r_num_caps = 1;
@@ -1779,7 +1780,8 @@ int ceph_do_getattr(struct inode *inode, int mask)
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
-        req->r_inode = igrab(inode);
+        req->r_inode = inode;
+        ihold(inode);
        req->r_num_caps = 1;
        req->r_args.getattr.mask = cpu_to_le32(mask);
        err = ceph_mdsc_do_request(mdsc, NULL, req);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8888c9ba68db..ef0b5f48e13a 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -73,7 +73,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
                                       USE_AUTH_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
-        req->r_inode = igrab(inode);
+        req->r_inode = inode;
+        ihold(inode);
        req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
        req->r_args.setlayout.layout.fl_stripe_unit =
@@ -135,7 +136,8 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
        if (IS_ERR(req))
                return PTR_ERR(req);
-        req->r_inode = igrab(inode);
+        req->r_inode = inode;
+        ihold(inode);
        req->r_args.setlayout.layout.fl_stripe_unit =
                        cpu_to_le32(l.stripe_unit);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 476b329867d4..80576d05d687 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -23,7 +23,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
        req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
-        req->r_inode = igrab(inode);
+        req->r_inode = inode;
+        ihold(inode);
        /* mds requires start and length rather than start and end */
        if (LLONG_MAX == fl->fl_end)
@@ -32,11 +33,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
                length = fl->fl_end - fl->fl_start + 1;
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
-             "length: %llu, wait: %d, type`: %d", (int)lock_type,
+             "length: %llu, wait: %d, type: %d", (int)lock_type,
             (int)operation, (u64)fl->fl_pid, fl->fl_start,
             length, wait, fl->fl_type);
        req->r_args.filelock_change.rule = lock_type;
        req->r_args.filelock_change.type = cmd;
        req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
@@ -70,7 +70,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
        }
        ceph_mdsc_put_request(req);
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
-             "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
+             "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
             (int)operation, (u64)fl->fl_pid, fl->fl_start,
             length, wait, fl->fl_type, err);
        return err;
@@ -109,16 +109,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
                        dout("mds locked, locking locally");
                        err = posix_lock_file(file, fl, NULL);
                        if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
-                                /* undo! This should only happen if the kernel detects
+                                /* undo! This should only happen if
-                                 * local deadlock. */
+                                 * the kernel detects local
+                                 * deadlock. */
                                ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
                                                  CEPH_LOCK_UNLOCK, 0, fl);
-                                dout("got %d on posix_lock_file, undid lock", err);
+                                dout("got %d on posix_lock_file, undid lock",
+                                     err);
                        }
                }
-        } else {
+        } else if (err == -ERESTARTSYS) {
-                dout("mds returned error code %d", err);
+                dout("undoing lock\n");
+                ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+                                  CEPH_LOCK_UNLOCK, 0, fl);
        }
        return err;
 }
@@ -155,8 +159,11 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                                          file, CEPH_LOCK_UNLOCK, 0, fl);
                        dout("got %d on flock_lock_file_wait, undid lock", err);
                }
-        } else {
+        } else if (err == -ERESTARTSYS) {
-                dout("mds error code %d", err);
+                dout("undoing lock\n");
+                ceph_lock_message(CEPH_LOCK_FLOCK,
+                                  CEPH_MDS_OP_SETFILELOCK,
+                                  file, CEPH_LOCK_UNLOCK, 0, fl);
        }
        return err;
 }
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 24067d68a554..54b14de2e729 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -722,7 +722,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
                ci = list_first_entry(&mdsc->snap_flush_list,
                                struct ceph_inode_info, i_snap_flush_item);
                inode = &ci->vfs_inode;
-                igrab(inode);
+                ihold(inode);
                spin_unlock(&mdsc->snap_flush_lock);
                spin_lock(&inode->i_lock);
                __ceph_flush_snaps(ci, &session, 0);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f2b628696180..f42d730f1b66 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -665,7 +665,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                err = PTR_ERR(req);
                goto out;
        }
-        req->r_inode = igrab(inode);
+        req->r_inode = inode;
+        ihold(inode);
        req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
        req->r_num_caps = 1;
        req->r_args.setxattr.flags = cpu_to_le32(flags);
@@ -795,7 +796,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
                                       USE_AUTH_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
-        req->r_inode = igrab(inode);
+        req->r_inode = inode;
+        ihold(inode);
        req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
        req->r_num_caps = 1;
        req->r_path2 = kstrdup(name, GFP_NOFS);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 1cd4c3a1862d..f66cc1625150 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -7,6 +7,7 @@ config CIFS
        select CRYPTO_MD5
        select CRYPTO_HMAC
        select CRYPTO_ARC4
+        select CRYPTO_ECB
        select CRYPTO_DES
        help
          This is the client VFS module for the Common Internet File System
@@ -148,13 +149,13 @@ config CIFS_FSCACHE
 config CIFS_ACL
          bool "Provide CIFS ACL support (EXPERIMENTAL)"
-          depends on EXPERIMENTAL && CIFS_XATTR
+          depends on EXPERIMENTAL && CIFS_XATTR && KEYS
          help
            Allows to fetch CIFS/NTFS ACL from the server.  The DACL blob
            is handed over to the application/caller.
 config CIFS_NFSD_EXPORT
          bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
-          depends on CIFS && EXPERIMENTAL
+          depends on CIFS && EXPERIMENTAL && BROKEN
          help
           Allows NFS server to export a CIFS mounted share (nfsd over cifs)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index dd8584d35a14..545509c3313b 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
                break;
        default:
-                cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
+                cERROR(1, "Unknown network family '%d'", sa->sa_family);
                key_len = 0;
                break;
        }
@@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
        sharename = extract_sharename(tcon->treeName);
        if (IS_ERR(sharename)) {
-                cFYI(1, "CIFS: couldn't extract sharename\n");
+                cFYI(1, "%s: couldn't extract sharename\n", __func__);
                sharename = NULL;
                return 0;
        }
@@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
        pagevec_init(&pvec, 0);
        first = 0;
-        cFYI(1, "cifs inode 0x%p now uncached", cifsi);
+        cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi);
        for (;;) {
                nr_pages = pagevec_lookup(&pvec,
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index ffb1459dc6ec..7260e11e21f8 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -42,6 +42,7 @@
 #define CIFS_MOUNT_MULTIUSER    0x20000 /* multiuser mount */
 #define CIFS_MOUNT_STRICT_IO    0x40000 /* strict cache mode */
 #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
+#define CIFS_MOUNT_POSIXACL     0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
 struct cifs_sb_info {
        struct rb_root tlink_tree;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index dfbd9f1f373d..5a0ee7f2af06 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -184,7 +184,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
        if (cifs_pdu == NULL || server == NULL)
                return -EINVAL;
-        if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
+        if (!server->session_estab)
                return 0;
        if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 989442dcfb45..35f9154615fa 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -104,8 +104,7 @@ cifs_sb_deactive(struct super_block *sb)
 }
 static int
-cifs_read_super(struct super_block *sb, struct smb_vol *volume_info,
+cifs_read_super(struct super_block *sb)
-                const char *devname, int silent)
 {
        struct inode *inode;
        struct cifs_sb_info *cifs_sb;
@@ -113,22 +112,16 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info,
        cifs_sb = CIFS_SB(sb);
-        spin_lock_init(&cifs_sb->tlink_tree_lock);
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
-        cifs_sb->tlink_tree = RB_ROOT;
+                sb->s_flags |= MS_POSIXACL;
-        rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+        if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES)
-        if (rc)
+                sb->s_maxbytes = MAX_LFS_FILESIZE;
-                return rc;
+        else
+                sb->s_maxbytes = MAX_NON_LFS;
-        cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
-        rc = cifs_mount(sb, cifs_sb, volume_info, devname);
+        /* BB FIXME fix time_gran to be larger for LANMAN sessions */
+        sb->s_time_gran = 100;
-        if (rc) {
-                if (!silent)
-                        cERROR(1, "cifs_mount failed w/return code = %d", rc);
-                goto out_mount_failed;
-        }
        sb->s_magic = CIFS_MAGIC_NUMBER;
        sb->s_op = &cifs_super_ops;
@@ -170,37 +163,14 @@ out_no_root:
        if (inode)
                iput(inode);
-        cifs_umount(sb, cifs_sb);
-out_mount_failed:
-        bdi_destroy(&cifs_sb->bdi);
        return rc;
 }
-static void
+static void cifs_kill_sb(struct super_block *sb)
-cifs_put_super(struct super_block *sb)
 {
-        int rc = 0;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        struct cifs_sb_info *cifs_sb;
+        kill_anon_super(sb);
+        cifs_umount(cifs_sb);
-        cFYI(1, "In cifs_put_super");
-        cifs_sb = CIFS_SB(sb);
-        if (cifs_sb == NULL) {
-                cFYI(1, "Empty cifs superblock info passed to unmount");
-                return;
-        }
-        rc = cifs_umount(sb, cifs_sb);
-        if (rc)
-                cERROR(1, "cifs_umount failed with return code %d", rc);
-        if (cifs_sb->mountdata) {
-                kfree(cifs_sb->mountdata);
-                cifs_sb->mountdata = NULL;
-        }
-        unload_nls(cifs_sb->local_nls);
-        bdi_destroy(&cifs_sb->bdi);
-        kfree(cifs_sb);
 }
 static int
@@ -257,9 +227,6 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct cifs_sb_info *cifs_sb;
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
        cifs_sb = CIFS_SB(inode->i_sb);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -352,6 +319,37 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
        }
 }
+static void
+cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
+{
+        seq_printf(s, ",sec=");
+        switch (server->secType) {
+        case LANMAN:
+                seq_printf(s, "lanman");
+                break;
+        case NTLMv2:
+                seq_printf(s, "ntlmv2");
+                break;
+        case NTLM:
+                seq_printf(s, "ntlm");
+                break;
+        case Kerberos:
+                seq_printf(s, "krb5");
+                break;
+        case RawNTLMSSP:
+                seq_printf(s, "ntlmssp");
+                break;
+        default:
+                /* shouldn't ever happen */
+                seq_printf(s, "unknown");
+                break;
+        }
+        if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+                seq_printf(s, "i");
+}
 /*
 * cifs_show_options() is for displaying mount options in /proc/mounts.
 * Not all settable options are displayed but most of the important
@@ -365,6 +363,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
        struct sockaddr *srcaddr;
        srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
+        cifs_show_security(s, tcon->ses->server);
        seq_printf(s, ",unc=%s", tcon->treeName);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
@@ -518,7 +518,6 @@ static int cifs_drop_inode(struct inode *inode)
 }
 static const struct super_operations cifs_super_ops = {
-        .put_super = cifs_put_super,
        .statfs = cifs_statfs,
        .alloc_inode = cifs_alloc_inode,
        .destroy_inode = cifs_destroy_inode,
@@ -555,7 +554,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
        full_path = cifs_build_path_to_root(vol, cifs_sb,
                                            cifs_sb_master_tcon(cifs_sb));
        if (full_path == NULL)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
        cFYI(1, "Get root dentry for %s", full_path);
@@ -584,7 +583,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
                        dchild = d_alloc(dparent, &name);
                        if (dchild == NULL) {
                                dput(dparent);
-                                dparent = NULL;
+                                dparent = ERR_PTR(-ENOMEM);
                                goto out;
                        }
                }
@@ -602,7 +601,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
                        if (rc) {
                                dput(dchild);
                                dput(dparent);
-                                dparent = NULL;
+                                dparent = ERR_PTR(rc);
                                goto out;
                        }
                        alias = d_materialise_unique(dchild, inode);
@@ -610,7 +609,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
                                dput(dchild);
                                if (IS_ERR(alias)) {
                                        dput(dparent);
-                                        dparent = NULL;
+                                        dparent = ERR_PTR(-EINVAL); /* XXX */
                                        goto out;
                                }
                                dchild = alias;
@@ -630,6 +629,13 @@ out:
        return dparent;
 }
+static int cifs_set_super(struct super_block *sb, void *data)
+{
+        struct cifs_mnt_data *mnt_data = data;
+        sb->s_fs_info = mnt_data->cifs_sb;
+        return set_anon_super(sb, NULL);
+}
 static struct dentry *
 cifs_do_mount(struct file_system_type *fs_type,
              int flags, const char *dev_name, void *data)
@@ -650,75 +656,73 @@ cifs_do_mount(struct file_system_type *fs_type,
        cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
        if (cifs_sb == NULL) {
                root = ERR_PTR(-ENOMEM);
-                goto out;
+                goto out_nls;
+        }
+        cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
+        if (cifs_sb->mountdata == NULL) {
+                root = ERR_PTR(-ENOMEM);
+                goto out_cifs_sb;
        }
        cifs_setup_cifs_sb(volume_info, cifs_sb);
+        rc = cifs_mount(cifs_sb, volume_info);
+        if (rc) {
+                if (!(flags & MS_SILENT))
+                        cERROR(1, "cifs_mount failed w/return code = %d", rc);
+                root = ERR_PTR(rc);
+                goto out_mountdata;
+        }
        mnt_data.vol = volume_info;
        mnt_data.cifs_sb = cifs_sb;
        mnt_data.flags = flags;
-        sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data);
+        sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data);
        if (IS_ERR(sb)) {
                root = ERR_CAST(sb);
-                goto out_cifs_sb;
+                cifs_umount(cifs_sb);
+                goto out;
        }
-        if (sb->s_fs_info) {
+        if (sb->s_root) {
                cFYI(1, "Use existing superblock");
-                goto out_shared;
+                cifs_umount(cifs_sb);
-        }
+        } else {
+                sb->s_flags = flags;
-        /*
+                /* BB should we make this contingent on mount parm? */
-         * Copy mount params for use in submounts. Better to do
+                sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
-         * the copy here and deal with the error before cleanup gets
-         * complicated post-mount.
+                rc = cifs_read_super(sb);
-         */
+                if (rc) {
-        cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
+                        root = ERR_PTR(rc);
-        if (cifs_sb->mountdata == NULL) {
+                        goto out_super;
-                root = ERR_PTR(-ENOMEM);
+                }
-                goto out_super;
-        }
-        sb->s_flags = flags;
-        /* BB should we make this contingent on mount parm? */
-        sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
-        sb->s_fs_info = cifs_sb;
-        rc = cifs_read_super(sb, volume_info, dev_name,
+                sb->s_flags |= MS_ACTIVE;
-                             flags & MS_SILENT ? 1 : 0);
-        if (rc) {
-                root = ERR_PTR(rc);
-                goto out_super;
        }
-        sb->s_flags |= MS_ACTIVE;
        root = cifs_get_root(volume_info, sb);
-        if (root == NULL)
+        if (IS_ERR(root))
                goto out_super;
        cFYI(1, "dentry root is: %p", root);
        goto out;
-out_shared:
-        root = cifs_get_root(volume_info, sb);
-        if (root)
-                cFYI(1, "dentry root is: %p", root);
-        goto out;
 out_super:
-        kfree(cifs_sb->mountdata);
        deactivate_locked_super(sb);
-out_cifs_sb:
-        unload_nls(cifs_sb->local_nls);
-        kfree(cifs_sb);
 out:
        cifs_cleanup_volume_info(&volume_info);
        return root;
+out_mountdata:
+        kfree(cifs_sb->mountdata);
+out_cifs_sb:
+        kfree(cifs_sb);
+out_nls:
+        unload_nls(volume_info->local_nls);
+        goto out;
 }
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -807,7 +811,7 @@ struct file_system_type cifs_fs_type = {
        .owner = THIS_MODULE,
        .name = "cifs",
        .mount = cifs_do_mount,
-        .kill_sb = kill_anon_super,
+        .kill_sb = cifs_kill_sb,
        /*  .fs_flags */
 };
 const struct inode_operations cifs_dir_inode_ops = {
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 64313f778ebf..0900e1658c96 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -129,5 +129,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CIFS_NFSD_EXPORT */
-#define CIFS_VERSION   "1.72"
+#define CIFS_VERSION   "1.73"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 953f84413c77..257f312ede42 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -157,9 +157,8 @@ extern int cifs_match_super(struct super_block *, void *);
 extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info);
 extern int cifs_setup_volume_info(struct smb_vol **pvolume_info,
                                  char *mount_data, const char *devname);
-extern int cifs_mount(struct super_block *, struct cifs_sb_info *,
+extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
-                      struct smb_vol *, const char *);
+extern void cifs_umount(struct cifs_sb_info *);
-extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
 extern void cifs_dfs_release_automount_timer(void);
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
@@ -218,7 +217,8 @@ extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo,
                        struct dfs_info3_param **preferrals,
                        int remap);
 extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
-                                 struct super_block *sb, struct smb_vol *vol);
+                                 struct cifs_sb_info *cifs_sb,
+                                 struct smb_vol *vol);
 extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon,
                        struct kstatfs *FSData);
 extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6d88b82537c3..7f540df52527 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -152,7 +152,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
                mid_entry->callback(mid_entry);
        }
-        while (server->tcpStatus == CifsNeedReconnect) {
+        do {
                try_to_freeze();
                /* we should try only the port we connected to before */
@@ -167,7 +167,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
                                server->tcpStatus = CifsNeedNegotiate;
                        spin_unlock(&GlobalMid_Lock);
                }
-        }
+        } while (server->tcpStatus == CifsNeedReconnect);
        return rc;
 }
@@ -784,7 +784,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                         struct smb_vol *vol)
 {
        char *value, *data, *end;
-        char *mountdata_copy, *options;
+        char *mountdata_copy = NULL, *options;
        unsigned int  temp_len, i, j;
        char separator[2];
        short int override_uid = -1;
@@ -1391,7 +1391,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                                "/proc/fs/cifs/LookupCacheEnabled to 0\n");
                } else if (strnicmp(data, "fsc", 3) == 0) {
 #ifndef CONFIG_CIFS_FSCACHE
-                        cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
+                        cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE "
                                  "kernel config option set");
                        goto cifs_parse_mount_err;
 #endif
@@ -1976,7 +1976,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
                warned_on_ntlm = true;
                cERROR(1, "default security mechanism requested.  The default "
                        "security mechanism will be upgraded from ntlm to "
-                        "ntlmv2 in kernel release 2.6.41");
+                        "ntlmv2 in kernel release 3.1");
        }
        ses->overrideSecFlg = volume_info->secFlg;
@@ -2149,7 +2149,10 @@ cifs_put_tlink(struct tcon_link *tlink)
 }
 static inline struct tcon_link *
-cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb);
+cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
+{
+        return cifs_sb->master_tlink;
+}
 static int
 compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
@@ -2543,7 +2546,7 @@ ip_connect(struct TCP_Server_Info *server)
 }
 void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
-                          struct super_block *sb, struct smb_vol *vol_info)
+                          struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info)
 {
        /* if we are reconnecting then should we check to see if
         * any requested capabilities changed locally e.g. via
@@ -2597,22 +2600,23 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
                        cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
                else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
                        cFYI(1, "negotiated posix acl support");
-                        if (sb)
+                        if (cifs_sb)
-                                sb->s_flags |= MS_POSIXACL;
+                                cifs_sb->mnt_cifs_flags |=
+                                        CIFS_MOUNT_POSIXACL;
                }
                if (vol_info && vol_info->posix_paths == 0)
                        cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
                else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
                        cFYI(1, "negotiate posix pathnames");
-                        if (sb)
+                        if (cifs_sb)
-                                CIFS_SB(sb)->mnt_cifs_flags |=
+                                cifs_sb->mnt_cifs_flags |=
                                        CIFS_MOUNT_POSIX_PATHS;
                }
-                if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
+                if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) {
                        if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
-                                CIFS_SB(sb)->rsize = 127 * 1024;
+                                cifs_sb->rsize = 127 * 1024;
                                cFYI(DBG2, "larger reads not supported by srv");
                        }
                }
@@ -2659,6 +2663,9 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 {
        INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+        spin_lock_init(&cifs_sb->tlink_tree_lock);
+        cifs_sb->tlink_tree = RB_ROOT;
        if (pvolume_info->rsize > CIFSMaxBufSize) {
                cERROR(1, "rsize %d too large, using MaxBufSize",
                        pvolume_info->rsize);
@@ -2747,21 +2754,21 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 /*
 * When the server supports very large writes via POSIX extensions, we can
- * allow up to 2^24 - PAGE_CACHE_SIZE.
+ * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including
+ * the RFC1001 length.
 *
 * Note that this might make for "interesting" allocation problems during
- * writeback however (as we have to allocate an array of pointers for the
+ * writeback however as we have to allocate an array of pointers for the
- * pages). A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
 */
-#define CIFS_MAX_WSIZE ((1<<24) - PAGE_CACHE_SIZE)
+#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
 /*
- * When the server doesn't allow large posix writes, default to a wsize of
+ * When the server doesn't allow large posix writes, only allow a wsize of
- * 128k - PAGE_CACHE_SIZE -- one page less than the largest frame size
+ * 128k minus the size of the WRITE_AND_X header. That allows for a write up
- * described in RFC1001. This allows space for the header without going over
+ * to the maximum size described by RFC1002.
- * that by default.
 */
-#define CIFS_MAX_RFC1001_WSIZE (128 * 1024 - PAGE_CACHE_SIZE)
+#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4)
 /*
 * The default wsize is 1M. find_get_pages seems to return a maximum of 256
@@ -2780,11 +2787,18 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
        /* can server support 24-bit write sizes? (via UNIX extensions) */
        if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
-                wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1001_WSIZE);
+                wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE);
-        /* no CAP_LARGE_WRITE_X? Limit it to 16 bits */
+        /*
-        if (!(server->capabilities & CAP_LARGE_WRITE_X))
+         * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set?
-                wsize = min_t(unsigned int, wsize, USHRT_MAX);
+         * Limit it to max buffer offered by the server, minus the size of the
+         * WRITEX header, not including the 4 byte RFC1001 length.
+         */
+        if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
+            (!(server->capabilities & CAP_UNIX) &&
+             (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
+                wsize = min_t(unsigned int, wsize,
+                                server->maxBuf - sizeof(WRITE_REQ) + 4);
        /* hard limit of CIFS_MAX_WSIZE */
        wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
@@ -2934,7 +2948,11 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data,
        if (volume_info->nullauth) {
                cFYI(1, "null user");
-                volume_info->username = "";
+                volume_info->username = kzalloc(1, GFP_KERNEL);
+                if (volume_info->username == NULL) {
+                        rc = -ENOMEM;
+                        goto out;
+                }
        } else if (volume_info->username) {
                /* BB fixme parse for domain name here */
                cFYI(1, "Username: %s", volume_info->username);
@@ -2968,8 +2986,7 @@ out:
 }
 int
-cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
+cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
-           struct smb_vol *volume_info, const char *devname)
 {
        int rc = 0;
        int xid;
@@ -2980,6 +2997,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        struct tcon_link *tlink;
 #ifdef CONFIG_CIFS_DFS_UPCALL
        int referral_walks_count = 0;
+        rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+        if (rc)
+                return rc;
+        cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
 try_mount_again:
        /* cleanup activities if we're chasing a referral */
        if (referral_walks_count) {
@@ -3004,6 +3028,7 @@ try_mount_again:
        srvTcp = cifs_get_tcp_session(volume_info);
        if (IS_ERR(srvTcp)) {
                rc = PTR_ERR(srvTcp);
+                bdi_destroy(&cifs_sb->bdi);
                goto out;
        }
@@ -3015,14 +3040,6 @@ try_mount_again:
                goto mount_fail_check;
        }
-        if (pSesInfo->capabilities & CAP_LARGE_FILES)
-                sb->s_maxbytes = MAX_LFS_FILESIZE;
-        else
-                sb->s_maxbytes = MAX_NON_LFS;
-        /* BB FIXME fix time_gran to be larger for LANMAN sessions */
-        sb->s_time_gran = 100;
        /* search for existing tcon to this server share */
        tcon = cifs_get_tcon(pSesInfo, volume_info);
        if (IS_ERR(tcon)) {
@@ -3035,7 +3052,7 @@ try_mount_again:
        if (tcon->ses->capabilities & CAP_UNIX) {
                /* reset of caps checks mount to see if unix extensions
                   disabled for just this mount */
-                reset_cifs_unix_caps(xid, tcon, sb, volume_info);
+                reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info);
                if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
                    (le64_to_cpu(tcon->fsUnixInfo.Capability) &
                     CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
@@ -3158,6 +3175,7 @@ mount_fail_check:
                        cifs_put_smb_ses(pSesInfo);
                else
                        cifs_put_tcp_session(srvTcp);
+                bdi_destroy(&cifs_sb->bdi);
                goto out;
        }
@@ -3171,6 +3189,10 @@ out:
        return rc;
 }
+/*
+ * Issue a TREE_CONNECT request. Note that for IPC$ shares, the tcon
+ * pointer may be NULL.
+ */
 int
 CIFSTCon(unsigned int xid, struct cifs_ses *ses,
         const char *tree, struct cifs_tcon *tcon,
@@ -3205,7 +3227,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
        pSMB->AndXCommand = 0xFF;
        pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
        bcc_ptr = &pSMB->Password[0];
-        if ((ses->server->sec_mode) & SECMODE_USER) {
+        if (!tcon || (ses->server->sec_mode & SECMODE_USER)) {
                pSMB->PasswordLength = cpu_to_le16(1);  /* minimum */
                *bcc_ptr = 0; /* password is null byte */
                bcc_ptr++;              /* skip password */
@@ -3328,8 +3350,8 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
        return rc;
 }
-int
+void
-cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
+cifs_umount(struct cifs_sb_info *cifs_sb)
 {
        struct rb_root *root = &cifs_sb->tlink_tree;
        struct rb_node *node;
@@ -3350,7 +3372,10 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
        }
        spin_unlock(&cifs_sb->tlink_tree_lock);
-        return 0;
+        bdi_destroy(&cifs_sb->bdi);
+        kfree(cifs_sb->mountdata);
+        unload_nls(cifs_sb->local_nls);
+        kfree(cifs_sb);
 }
 int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
@@ -3371,7 +3396,7 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
        }
        if (rc == 0) {
                spin_lock(&GlobalMid_Lock);
-                if (server->tcpStatus != CifsExiting)
+                if (server->tcpStatus == CifsNeedNegotiate)
                        server->tcpStatus = CifsGood;
                else
                        rc = -EHOSTDOWN;
@@ -3484,12 +3509,6 @@ out:
        return tcon;
 }
-static inline struct tcon_link *
-cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
-{
-        return cifs_sb->master_tlink;
-}
 struct cifs_tcon *
 cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
 {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index d368a47ba5eb..816696621ec9 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
        server->fscache =
                fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
                                &cifs_fscache_server_index_def, server);
-        cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server,
+        cFYI(1, "%s: (0x%p/0x%p)", __func__, server,
-                                server->fscache);
+                        server->fscache);
 }
 void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
 {
-        cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server,
+        cFYI(1, "%s: (0x%p/0x%p)", __func__, server,
-                                server->fscache);
+                        server->fscache);
        fscache_relinquish_cookie(server->fscache, 0);
        server->fscache = NULL;
 }
@@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
        tcon->fscache =
                fscache_acquire_cookie(server->fscache,
                                &cifs_fscache_super_index_def, tcon);
-        cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
+        cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache,
-                                server->fscache, tcon->fscache);
+                        tcon->fscache);
 }
 void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
 {
-        cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
+        cFYI(1, "%s: (0x%p)", __func__, tcon->fscache);
        fscache_relinquish_cookie(tcon->fscache, 0);
        tcon->fscache = NULL;
 }
@@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
                cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
                                &cifs_fscache_inode_object_def, cifsi);
-                cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+                cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__,
-                                cifsi->fscache);
+                                tcon->fscache, cifsi->fscache);
        }
 }
@@ -80,8 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        if (cifsi->fscache) {
-                cFYI(1, "CIFS releasing inode cookie (0x%p)",
+                cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
-                                cifsi->fscache);
                fscache_relinquish_cookie(cifsi->fscache, 0);
                cifsi->fscache = NULL;
        }
@@ -92,8 +91,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        if (cifsi->fscache) {
-                cFYI(1, "CIFS disabling inode cookie (0x%p)",
+                cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
-                                cifsi->fscache);
                fscache_relinquish_cookie(cifsi->fscache, 1);
                cifsi->fscache = NULL;
        }
@@ -121,8 +119,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
                                        cifs_sb_master_tcon(cifs_sb)->fscache,
                                        &cifs_fscache_inode_object_def,
                                        cifsi);
-                cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
+                cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p",
-                                cifsi->fscache, old);
+                                __func__, cifsi->fscache, old);
        }
 }
@@ -132,8 +130,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
                struct inode *inode = page->mapping->host;
                struct cifsInodeInfo *cifsi = CIFS_I(inode);
-                cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
+                cFYI(1, "%s: (0x%p/0x%p)", __func__, page,
-                                page, cifsi->fscache);
+                                cifsi->fscache);
                if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
                        return 0;
        }
@@ -144,8 +142,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
 static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
                                                int error)
 {
-        cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)",
+        cFYI(1, "%s: (0x%p/%d)", __func__, page, error);
-                        page, error);
        if (!error)
                SetPageUptodate(page);
        unlock_page(page);
@@ -158,7 +155,7 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
 {
        int ret;
-        cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p",
+        cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__,
                        CIFS_I(inode)->fscache, page, inode);
        ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
                                         cifs_readpage_from_fscache_complete,
@@ -167,11 +164,11 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
        switch (ret) {
        case 0: /* page found in fscache, read submitted */
-                cFYI(1, "CIFS: readpage_from_fscache: submitted");
+                cFYI(1, "%s: submitted", __func__);
                return ret;
        case -ENOBUFS:  /* page won't be cached */
        case -ENODATA:  /* page not in cache */
-                cFYI(1, "CIFS: readpage_from_fscache %d", ret);
+                cFYI(1, "%s: %d", __func__, ret);
                return 1;
        default:
@@ -190,7 +187,7 @@ int __cifs_readpages_from_fscache(struct inode *inode,
 {
        int ret;
-        cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
+        cFYI(1, "%s: (0x%p/%u/0x%p)", __func__,
                        CIFS_I(inode)->fscache, *nr_pages, inode);
        ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
                                          pages, nr_pages,
@@ -199,12 +196,12 @@ int __cifs_readpages_from_fscache(struct inode *inode,
                                          mapping_gfp_mask(mapping));
        switch (ret) {
        case 0: /* read submitted to the cache for all pages */
-                cFYI(1, "CIFS: readpages_from_fscache: submitted");
+                cFYI(1, "%s: submitted", __func__);
                return ret;
        case -ENOBUFS:  /* some pages are not cached and can't be */
        case -ENODATA:  /* some pages are not cached */
-                cFYI(1, "CIFS: readpages_from_fscache: no page");
+                cFYI(1, "%s: no page", __func__);
                return 1;
        default:
@@ -218,7 +215,7 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
 {
        int ret;
-        cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p",
+        cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__,
                        CIFS_I(inode)->fscache, page, inode);
        ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
        if (ret != 0)
@@ -230,7 +227,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        struct fscache_cookie *cookie = cifsi->fscache;
-        cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie);
+        cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie);
        fscache_wait_on_page_write(cookie, page);
        fscache_uncache_page(cookie, page);
 }
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 1525d5e662b6..1c5b770c3141 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -90,12 +90,10 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
        sg_init_one(&sgout, out, 8);
        rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
-        if (rc) {
+        if (rc)
                cERROR(1, "could not encrypt crypt key rc: %d\n", rc);
-                crypto_free_blkcipher(tfm_des);
-                goto smbhash_err;
-        }
+        crypto_free_blkcipher(tfm_des);
 smbhash_err:
        return rc;
 }
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 6cbb3afb36dc..cb140ef293e4 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -43,8 +43,6 @@ const struct file_operations coda_ioctl_operations = {
 /* the coda pioctl inode ops */
 static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
        return (mask & MAY_EXEC) ? -EACCES : 0;
 }
diff --git a/fs/dcookies.c b/fs/dcookies.c
index a21cabdbd87b..dda0dc702d1b 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -178,6 +178,8 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
        /* FIXME: (deleted) ? */
        path = d_path(&dcs->path, kbuf, PAGE_SIZE);
+        mutex_unlock(&dcookie_mutex);
        if (IS_ERR(path)) {
                err = PTR_ERR(path);
                goto out_free;
@@ -194,6 +196,7 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
 out_free:
        kfree(kbuf);
+        return err;
 out:
        mutex_unlock(&dcookie_mutex);
        return err;
diff --git a/fs/exec.c b/fs/exec.c
index ea5f748906a8..6075a1e727ae 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1093,6 +1093,7 @@ int flush_old_exec(struct linux_binprm * bprm)
        bprm->mm = NULL;                /* We're using it now */
+        set_fs(USER_DS);
        current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
        flush_thread();
        current->personality &= ~bprm->per_clear;
@@ -1357,10 +1358,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
        if (retval)
                return retval;
-        /* kernel module loader fixup */
-        /* so we don't try to load run modprobe in kernel space. */
-        set_fs(USER_DS);
        retval = audit_bprm(bprm);
        if (retval)
                return retval;
@@ -1999,7 +1996,7 @@ static void wait_for_dump_helpers(struct file *file)
 * is a special value that we use to trap recursive
 * core dumps
 */
-static int umh_pipe_setup(struct subprocess_info *info)
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 {
        struct file *rp, *wp;
        struct fdtable *fdt;
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 2e29abb30f76..095c36f3b612 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -125,7 +125,7 @@ struct ext4_ext_path {
 * positive retcode - signal for ext4_ext_walk_space(), see below
 * callback must return valid extent (passed or newly created)
 */
-typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
+typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
                                        struct ext4_ext_cache *,
                                        struct ext4_extent *, void *);
@@ -133,8 +133,11 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
 #define EXT_BREAK      1
 #define EXT_REPEAT     2
-/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
+/*
-#define EXT_MAX_BLOCK   0xffffffff
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS  0xffffffff
 /*
 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5199bac7fc62..f815cc81e7a2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1408,7 +1408,7 @@ got_index:
 /*
 * ext4_ext_next_allocated_block:
- * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
+ * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
@@ -1422,7 +1422,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
        depth = path->p_depth;
        if (depth == 0 && path->p_ext == NULL)
-                return EXT_MAX_BLOCK;
+                return EXT_MAX_BLOCKS;
        while (depth >= 0) {
                if (depth == path->p_depth) {
@@ -1439,12 +1439,12 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
                depth--;
        }
-        return EXT_MAX_BLOCK;
+        return EXT_MAX_BLOCKS;
 }
 /*
 * ext4_ext_next_leaf_block:
- * returns first allocated block from next leaf or EXT_MAX_BLOCK
+ * returns first allocated block from next leaf or EXT_MAX_BLOCKS
 */
 static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
                                        struct ext4_ext_path *path)
@@ -1456,7 +1456,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
        /* zero-tree has no leaf blocks at all */
        if (depth == 0)
-                return EXT_MAX_BLOCK;
+                return EXT_MAX_BLOCKS;
        /* go to index block */
        depth--;
@@ -1469,7 +1469,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
                depth--;
        }
-        return EXT_MAX_BLOCK;
+        return EXT_MAX_BLOCKS;
 }
 /*
@@ -1677,13 +1677,13 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
         */
        if (b2 < b1) {
                b2 = ext4_ext_next_allocated_block(path);
-                if (b2 == EXT_MAX_BLOCK)
+                if (b2 == EXT_MAX_BLOCKS)
                        goto out;
        }
        /* check for wrap through zero on extent logical start block*/
        if (b1 + len1 < b1) {
-                len1 = EXT_MAX_BLOCK - b1;
+                len1 = EXT_MAX_BLOCKS - b1;
                newext->ee_len = cpu_to_le16(len1);
                ret = 1;
        }
@@ -1767,7 +1767,7 @@ repeat:
        fex = EXT_LAST_EXTENT(eh);
        next = ext4_ext_next_leaf_block(inode, path);
        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
-            && next != EXT_MAX_BLOCK) {
+            && next != EXT_MAX_BLOCKS) {
                ext_debug("next leaf block - %d\n", next);
                BUG_ON(npath != NULL);
                npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1887,7 +1887,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
        BUG_ON(func == NULL);
        BUG_ON(inode == NULL);
-        while (block < last && block != EXT_MAX_BLOCK) {
+        while (block < last && block != EXT_MAX_BLOCKS) {
                num = last - block;
                /* find extent for this block */
                down_read(&EXT4_I(inode)->i_data_sem);
@@ -1958,7 +1958,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                        err = -EIO;
                        break;
                }
-                err = func(inode, path, &cbex, ex, cbdata);
+                err = func(inode, next, &cbex, ex, cbdata);
                ext4_ext_drop_refs(path);
                if (err < 0)
@@ -2020,7 +2020,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
        if (ex == NULL) {
                /* there is no extent yet, so gap is [0;-] */
                lblock = 0;
-                len = EXT_MAX_BLOCK;
+                len = EXT_MAX_BLOCKS;
                ext_debug("cache gap(whole file):");
        } else if (block < le32_to_cpu(ex->ee_block)) {
                lblock = block;
@@ -2350,7 +2350,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                         * never happen because at least one of the end points
                         * needs to be on the edge of the extent.
                         */
-                        if (end == EXT_MAX_BLOCK) {
+                        if (end == EXT_MAX_BLOCKS - 1) {
                                ext_debug("  bad truncate %u:%u\n",
                                                start, end);
                                block = 0;
@@ -2398,7 +2398,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                         * If this is a truncate, this condition
                         * should never happen
                         */
-                        if (end == EXT_MAX_BLOCK) {
+                        if (end == EXT_MAX_BLOCKS - 1) {
                                ext_debug("  bad truncate %u:%u\n",
                                        start, end);
                                err = -EIO;
@@ -2478,7 +2478,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                 * we need to remove it from the leaf
                 */
                if (num == 0) {
-                        if (end != EXT_MAX_BLOCK) {
+                        if (end != EXT_MAX_BLOCKS - 1) {
                                /*
                                 * For hole punching, we need to scoot all the
                                 * extents up when an extent is removed so that
@@ -3699,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
        last_block = (inode->i_size + sb->s_blocksize - 1)
                        >> EXT4_BLOCK_SIZE_BITS(sb);
-        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
+        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
        /* In a multi-transaction truncate, we only make the final
         * transaction synchronous.
@@ -3914,14 +3914,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 /*
 * Callback function called for each extent to gather FIEMAP information.
 */
-static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
                       struct ext4_ext_cache *newex, struct ext4_extent *ex,
                       void *data)
 {
        __u64   logical;
        __u64   physical;
        __u64   length;
-        loff_t  size;
        __u32   flags = 0;
        int             ret = 0;
        struct fiemap_extent_info *fieinfo = data;
@@ -4103,8 +4102,7 @@ found_delayed_extent:
        if (ex && ext4_ext_is_uninitialized(ex))
                flags |= FIEMAP_EXTENT_UNWRITTEN;
-        size = i_size_read(inode);
+        if (next == EXT_MAX_BLOCKS)
-        if (logical + length >= size)
                flags |= FIEMAP_EXTENT_LAST;
        ret = fiemap_fill_next_extent(fieinfo, logical, physical,
@@ -4347,8 +4345,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                start_blk = start >> inode->i_sb->s_blocksize_bits;
                last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
-                if (last_blk >= EXT_MAX_BLOCK)
+                if (last_blk >= EXT_MAX_BLOCKS)
-                        last_blk = EXT_MAX_BLOCK-1;
+                        last_blk = EXT_MAX_BLOCKS-1;
                len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
                /*
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a5763e3505ba..e3126c051006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2634,7 +2634,7 @@ static int ext4_writepage(struct page *page,
        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
-        trace_ext4_writepage(inode, page);
+        trace_ext4_writepage(page);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 859f2ae8864e..6ed859d56850 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3578,8 +3578,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                free += next - bit;
                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
-                trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
+                trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
-                                               grp_blk_start + bit, next - bit);
+                                               next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
@@ -3608,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
        ext4_group_t group;
        ext4_grpblk_t bit;
-        trace_ext4_mb_release_group_pa(sb, pa);
+        trace_ext4_mb_release_group_pa(pa);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -4448,7 +4448,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 * @inode:              inode
 * @block:              start physical block to free
 * @count:              number of blocks to count
- * @metadata:           Are these metadata blocks
+ * @flags:              flags used by ext4_free_blocks
 */
 void ext4_free_blocks(handle_t *handle, struct inode *inode,
                      struct buffer_head *bh, ext4_fsblk_t block,
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2b8304bf3c50..f57455a1b1b2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1002,12 +1002,12 @@ mext_check_arguments(struct inode *orig_inode,
                return -EINVAL;
        }
-        if ((orig_start > EXT_MAX_BLOCK) ||
+        if ((orig_start >= EXT_MAX_BLOCKS) ||
-            (donor_start > EXT_MAX_BLOCK) ||
+            (donor_start >= EXT_MAX_BLOCKS) ||
-            (*len > EXT_MAX_BLOCK) ||
+            (*len > EXT_MAX_BLOCKS) ||
-            (orig_start + *len > EXT_MAX_BLOCK))  {
+            (orig_start + *len >= EXT_MAX_BLOCKS))  {
                ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
-                        "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
+                        "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
                        orig_inode->i_ino, donor_inode->i_ino);
                return -EINVAL;
        }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cc5c157aa11d..9ea71aa864b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2243,6 +2243,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
 * so that won't be a limiting factor.
 *
+ * However, there is another limiting factor. We store extents in the form
+ * of starting block and length, hence the resulting length of the extent
+ * covering the maximum file size must fit into the on-disk format containers
+ * as well. Given that the length is always 1 unit bigger than the max unit
+ * (because we count 0 as well), we have to lower s_maxbytes by one fs block.
+ *
 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
 */
 static loff_t ext4_max_size(int blkbits, int has_huge_files)
@@ -2264,10 +2270,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
                upper_limit <<= blkbits;
        }
-        /* 32-bit extent-start container, ee_block */
+        /*
-        res = 1LL << 32;
+         * 32-bit extent-start container, ee_block. We lower the maxbytes
+         * by one fs block, so ee_len can cover the extent of maximum file
+         * size
+         */
+        res = (1LL << 32) - 1;
        res <<= blkbits;
-        res -= 1;
        /* Sanity check against vm- & vfs- imposed limits */
        if (res > upper_limit)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 7257752b6d5d..7018e1d8902d 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -102,7 +102,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
                if (attr & ATTR_SYS)
                        inode->i_flags |= S_IMMUTABLE;
                else
-                        inode->i_flags &= S_IMMUTABLE;
+                        inode->i_flags &= ~S_IMMUTABLE;
        }
        fat_save_attrs(inode, attr);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cc6ec4b2f0ff..38f84cd48b67 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -921,6 +921,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        if (sb->s_flags & MS_MANDLOCK)
                goto err;
+        sb->s_flags &= ~MS_NOSEC;
        if (!parse_fuse_opt((char *) data, &d, is_bdev))
                goto err;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 2792a790e50b..1c1336e7b3b2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -663,14 +663,19 @@ static void glock_work_func(struct work_struct *work)
                drop_ref = 1;
        }
        spin_lock(&gl->gl_spin);
-        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+        if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
            gl->gl_state != LM_ST_UNLOCKED &&
            gl->gl_demote_state != LM_ST_EXCLUSIVE) {
                unsigned long holdtime, now = jiffies;
                holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
                if (time_before(now, holdtime))
                        delay = holdtime - now;
-                set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
+                if (!delay) {
+                        clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
+                        set_bit(GLF_DEMOTE, &gl->gl_flags);
+                }
        }
        run_queue(gl, 0);
        spin_unlock(&gl->gl_spin);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3db5ba4568fc..b3cc8586984e 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -974,7 +974,7 @@ out_no_inode:
 out_no_read:
        printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n",
                __func__, s->s_id, iso_blknum, block);
-        goto out_freesbi;
+        goto out_freebh;
 out_bad_zone_size:
        printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n",
                sbi->s_log_zone_size);
@@ -989,6 +989,7 @@ out_unknown_format:
 out_freebh:
        brelse(bh);
+        brelse(pri_bh);
 out_freesbi:
        kfree(opt.iocharset);
        kfree(sbi);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6a79fd0a1a32..2c62c5aae82f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -97,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
        if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
            !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
+                /*
+                 * Get our reference so that bh cannot be freed before
+                 * we unlock it
+                 */
+                get_bh(bh);
                JBUFFER_TRACE(jh, "remove from checkpoint list");
                ret = __jbd2_journal_remove_checkpoint(jh) + 1;
                jbd_unlock_bh_state(bh);
-                jbd2_journal_remove_journal_head(bh);
                BUFFER_TRACE(bh, "release");
                __brelse(bh);
        } else {
@@ -223,8 +227,8 @@ restart:
                        spin_lock(&journal->j_list_lock);
                        goto restart;
                }
+                get_bh(bh);
                if (buffer_locked(bh)) {
-                        atomic_inc(&bh->b_count);
                        spin_unlock(&journal->j_list_lock);
                        jbd_unlock_bh_state(bh);
                        wait_on_buffer(bh);
@@ -243,7 +247,6 @@ restart:
                 */
                released = __jbd2_journal_remove_checkpoint(jh);
                jbd_unlock_bh_state(bh);
-                jbd2_journal_remove_journal_head(bh);
                __brelse(bh);
        }
@@ -284,7 +287,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
        int ret = 0;
        if (buffer_locked(bh)) {
-                atomic_inc(&bh->b_count);
+                get_bh(bh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                wait_on_buffer(bh);
@@ -316,12 +319,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                ret = 1;
                if (unlikely(buffer_write_io_error(bh)))
                        ret = -EIO;
+                get_bh(bh);
                J_ASSERT_JH(jh, !buffer_jbddirty(bh));
                BUFFER_TRACE(bh, "remove from checkpoint");
                __jbd2_journal_remove_checkpoint(jh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
-                jbd2_journal_remove_journal_head(bh);
                __brelse(bh);
        } else {
                /*
@@ -554,7 +557,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 /*
 * journal_clean_one_cp_list
 *
- * Find all the written-back checkpoint buffers in the given list and release them.
+ * Find all the written-back checkpoint buffers in the given list and
+ * release them.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
@@ -663,8 +667,8 @@ out:
 * checkpoint lists.
 *
 * The function returns 1 if it frees the transaction, 0 otherwise.
+ * The function can free jh and bh.
 *
- * This function is called with the journal locked.
 * This function is called with j_list_lock held.
 * This function is called with jbd_lock_bh_state(jh2bh(jh))
 */
@@ -684,13 +688,14 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
        }
        journal = transaction->t_journal;
+        JBUFFER_TRACE(jh, "removing from transaction");
        __buffer_unlink(jh);
        jh->b_cp_transaction = NULL;
+        jbd2_journal_put_journal_head(jh);
        if (transaction->t_checkpoint_list != NULL ||
            transaction->t_checkpoint_io_list != NULL)
                goto out;
-        JBUFFER_TRACE(jh, "transaction has no more buffers");
        /*
         * There is one special case to worry about: if we have just pulled the
@@ -701,10 +706,8 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
         * The locking here around t_state is a bit sleazy.
         * See the comment at the end of jbd2_journal_commit_transaction().
         */
-        if (transaction->t_state != T_FINISHED) {
+        if (transaction->t_state != T_FINISHED)
-                JBUFFER_TRACE(jh, "belongs to running/committing transaction");
                goto out;
-        }
        /* OK, that was the last buffer for the transaction: we can now
           safely remove this transaction from the log */
@@ -723,7 +726,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
        wake_up(&journal->j_wait_logspace);
        ret = 1;
 out:
-        JBUFFER_TRACE(jh, "exit");
        return ret;
 }
@@ -742,6 +744,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
        J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
        J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+        /* Get reference for checkpointing transaction */
+        jbd2_journal_grab_journal_head(jh2bh(jh));
        jh->b_cp_transaction = transaction;
        if (!transaction->t_checkpoint_list) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7f21cf3aaf92..eef6979821a4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -848,10 +848,16 @@ restart_loop:
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
+                int try_to_free = 0;
                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
+                /*
+                 * Get a reference so that bh cannot be freed before we are
+                 * done with it.
+                 */
+                get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
@@ -914,28 +920,27 @@ restart_loop:
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
-                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
-                        __jbd2_journal_refile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
-                        /* The buffer on BJ_Forget list and not jbddirty means
+                        /*
+                         * The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
-                         * list. */
+                         * list.
-                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+                         */
-                        __jbd2_journal_refile_buffer(jh);
+                        if (!jh->b_next_transaction)
-                        if (!jh->b_transaction) {
+                                try_to_free = 1;
-                                jbd_unlock_bh_state(bh);
-                                 /* needs a brelse */
-                                jbd2_journal_remove_journal_head(bh);
-                                release_buffer_page(bh);
-                        } else
-                                jbd_unlock_bh_state(bh);
                }
+                JBUFFER_TRACE(jh, "refile or unfile buffer");
+                __jbd2_journal_refile_buffer(jh);
+                jbd_unlock_bh_state(bh);
+                if (try_to_free)
+                        release_buffer_page(bh);        /* Drops bh reference */
+                else
+                        __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9a7826990304..0dfa5b598e68 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2078,10 +2078,9 @@ static void journal_free_journal_head(struct journal_head *jh)
 * When a buffer has its BH_JBD bit set it is immune from being released by
 * core kernel code, mainly via ->b_count.
 *
- * A journal_head may be detached from its buffer_head when the journal_head's
+ * A journal_head is detached from its buffer_head when the journal_head's
- * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
+ * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
- * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
+ * transaction (b_cp_transaction) hold their references to b_jcount.
- * journal_head can be dropped if needed.
 *
 * Various places in the kernel want to attach a journal_head to a buffer_head
 * _before_ attaching the journal_head to a transaction.  To protect the
@@ -2094,17 +2093,16 @@ static void journal_free_journal_head(struct journal_head *jh)
 *      (Attach a journal_head if needed.  Increments b_jcount)
 *      struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 *      ...
+ *      (Get another reference for transaction)
+ *      jbd2_journal_grab_journal_head(bh);
 *      jh->b_transaction = xxx;
+ *      (Put original reference)
 *      jbd2_journal_put_journal_head(jh);
- *
- * Now, the journal_head's b_jcount is zero, but it is safe from being released
- * because it has a non-zero b_transaction.
 */
 /*
 * Give a buffer_head a journal_head.
 *
- * Doesn't need the journal lock.
 * May sleep.
 */
 struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
@@ -2168,61 +2166,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
        struct journal_head *jh = bh2jh(bh);
        J_ASSERT_JH(jh, jh->b_jcount >= 0);
+        J_ASSERT_JH(jh, jh->b_transaction == NULL);
-        get_bh(bh);
+        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-        if (jh->b_jcount == 0) {
+        J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
-                if (jh->b_transaction == NULL &&
+        J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
-                                jh->b_next_transaction == NULL &&
+        J_ASSERT_BH(bh, buffer_jbd(bh));
-                                jh->b_cp_transaction == NULL) {
+        J_ASSERT_BH(bh, jh2bh(jh) == bh);
-                        J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+        BUFFER_TRACE(bh, "remove journal_head");
-                        J_ASSERT_BH(bh, buffer_jbd(bh));
+        if (jh->b_frozen_data) {
-                        J_ASSERT_BH(bh, jh2bh(jh) == bh);
+                printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
-                        BUFFER_TRACE(bh, "remove journal_head");
+                jbd2_free(jh->b_frozen_data, bh->b_size);
-                        if (jh->b_frozen_data) {
-                                printk(KERN_WARNING "%s: freeing "
-                                                "b_frozen_data\n",
-                                                __func__);
-                                jbd2_free(jh->b_frozen_data, bh->b_size);
-                        }
-                        if (jh->b_committed_data) {
-                                printk(KERN_WARNING "%s: freeing "
-                                                "b_committed_data\n",
-                                                __func__);
-                                jbd2_free(jh->b_committed_data, bh->b_size);
-                        }
-                        bh->b_private = NULL;
-                        jh->b_bh = NULL;        /* debug, really */
-                        clear_buffer_jbd(bh);
-                        __brelse(bh);
-                        journal_free_journal_head(jh);
-                } else {
-                        BUFFER_TRACE(bh, "journal_head was locked");
-                }
        }
+        if (jh->b_committed_data) {
+                printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
+                jbd2_free(jh->b_committed_data, bh->b_size);
+        }
+        bh->b_private = NULL;
+        jh->b_bh = NULL;        /* debug, really */
+        clear_buffer_jbd(bh);
+        journal_free_journal_head(jh);
 }
 /*
- * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
+ * Drop a reference on the passed journal_head.  If it fell to zero then
- * and has a zero b_jcount then remove and release its journal_head.   If we did
- * see that the buffer is not used by any transaction we also "logically"
- * decrement ->b_count.
- *
- * We in fact take an additional increment on ->b_count as a convenience,
- * because the caller usually wants to do additional things with the bh
- * after calling here.
- * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
- * time.  Once the caller has run __brelse(), the buffer is eligible for
- * reaping by try_to_free_buffers().
- */
-void jbd2_journal_remove_journal_head(struct buffer_head *bh)
-{
-        jbd_lock_bh_journal_head(bh);
-        __journal_remove_journal_head(bh);
-        jbd_unlock_bh_journal_head(bh);
-}
-/*
- * Drop a reference on the passed journal_head.  If it fell to zero then try to
 * release the journal_head from the buffer_head.
 */
 void jbd2_journal_put_journal_head(struct journal_head *jh)
@@ -2232,11 +2198,12 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
        jbd_lock_bh_journal_head(bh);
        J_ASSERT_JH(jh, jh->b_jcount > 0);
        --jh->b_jcount;
-        if (!jh->b_jcount && !jh->b_transaction) {
+        if (!jh->b_jcount) {
                __journal_remove_journal_head(bh);
+                jbd_unlock_bh_journal_head(bh);
                __brelse(bh);
-        }
+        } else
-        jbd_unlock_bh_journal_head(bh);
+                jbd_unlock_bh_journal_head(bh);
 }
 /*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 3eec82d32fd4..2d7109414cdd 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
+static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
 /*
 * jbd2_get_transaction: obtain a new transaction_t object.
@@ -764,7 +765,6 @@ repeat:
        if (!jh->b_transaction) {
                JBUFFER_TRACE(jh, "no transaction");
                J_ASSERT_JH(jh, !jh->b_next_transaction);
-                jh->b_transaction = transaction;
                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                spin_lock(&journal->j_list_lock);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
@@ -814,7 +814,6 @@ out:
 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
 *
 * Returns an error code or 0 on success.
 *
@@ -896,8 +895,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
                 * committed and so it's safe to clear the dirty bit.
                 */
                clear_buffer_dirty(jh2bh(jh));
-                jh->b_transaction = transaction;
                /* first access by this transaction */
                jh->b_modified = 0;
@@ -932,7 +929,6 @@ out:
 *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
- * @credits: store the number of taken credits here (if not NULL)
 *
 * Sometimes there is a need to distinguish between metadata which has
 * been committed to disk and that which has not.  The ext3fs code uses
@@ -1232,8 +1228,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
-                        jbd2_journal_remove_journal_head(bh);
-                        __brelse(bh);
                        if (!buffer_jbd(bh)) {
                                spin_unlock(&journal->j_list_lock);
                                jbd_unlock_bh_state(bh);
@@ -1556,19 +1550,32 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
                mark_buffer_dirty(bh);  /* Expose it to the VM */
 }
-void __jbd2_journal_unfile_buffer(struct journal_head *jh)
+/*
+ * Remove buffer from all transactions.
+ *
+ * Called with bh_state lock and j_list_lock
+ *
+ * jh and bh may be already freed when this function returns.
+ */
+static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
 {
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
+        jbd2_journal_put_journal_head(jh);
 }
 void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
 {
-        jbd_lock_bh_state(jh2bh(jh));
+        struct buffer_head *bh = jh2bh(jh);
+        /* Get reference so that buffer cannot be freed before we unlock it */
+        get_bh(bh);
+        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
-        jbd_unlock_bh_state(jh2bh(jh));
+        jbd_unlock_bh_state(bh);
+        __brelse(bh);
 }
 /*
@@ -1595,8 +1602,6 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
                if (jh->b_jlist == BJ_None) {
                        JBUFFER_TRACE(jh, "remove from checkpoint list");
                        __jbd2_journal_remove_checkpoint(jh);
-                        jbd2_journal_remove_journal_head(bh);
-                        __brelse(bh);
                }
        }
        spin_unlock(&journal->j_list_lock);
@@ -1659,7 +1664,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
                /*
                 * We take our own ref against the journal_head here to avoid
                 * having to add tons of locking around each instance of
-                 * jbd2_journal_remove_journal_head() and
                 * jbd2_journal_put_journal_head().
                 */
                jh = jbd2_journal_grab_journal_head(bh);
@@ -1697,10 +1701,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        int may_free = 1;
        struct buffer_head *bh = jh2bh(jh);
-        __jbd2_journal_unfile_buffer(jh);
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+                __jbd2_journal_temp_unlink_buffer(jh);
                /*
                 * We don't want to write the buffer anymore, clear the
                 * bit so that we don't confuse checks in
@@ -1711,8 +1714,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
-                jbd2_journal_remove_journal_head(bh);
+                __jbd2_journal_unfile_buffer(jh);
-                __brelse(bh);
        }
        return may_free;
 }
@@ -1990,6 +1992,8 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        if (jh->b_transaction)
                __jbd2_journal_temp_unlink_buffer(jh);
+        else
+                jbd2_journal_grab_journal_head(bh);
        jh->b_transaction = transaction;
        switch (jlist) {
@@ -2041,9 +2045,10 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
 * already started to be used by a subsequent transaction, refile the
 * buffer on that transaction's metadata list.
 *
- * Called under journal->j_list_lock
+ * Called under j_list_lock
- *
 * Called under jbd_lock_bh_state(jh2bh(jh))
+ *
+ * jh and bh may be already freed when this function returns
 */
 void __jbd2_journal_refile_buffer(struct journal_head *jh)
 {
@@ -2067,6 +2072,11 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
        was_dirty = test_clear_buffer_jbddirty(bh);
        __jbd2_journal_temp_unlink_buffer(jh);
+        /*
+         * We set b_transaction here because b_next_transaction will inherit
+         * our jh reference and thus __jbd2_journal_file_buffer() must not
+         * take a new one.
+         */
        jh->b_transaction = jh->b_next_transaction;
        jh->b_next_transaction = NULL;
        if (buffer_freed(bh))
@@ -2083,30 +2093,21 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
 }
 /*
- * For the unlocked version of this call, also make sure that any
+ * __jbd2_journal_refile_buffer() with necessary locking added. We take our
- * hanging journal_head is cleaned up if necessary.
+ * bh reference so that we can safely unlock bh.
- *
+ *
- * __jbd2_journal_refile_buffer is usually called as part of a single locked
+ * The jh and bh may be freed by this call.
- * operation on a buffer_head, in which the caller is probably going to
- * be hooking the journal_head onto other lists.  In that case it is up
- * to the caller to remove the journal_head if necessary.  For the
- * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
- * doing anything else to the buffer so we need to do the cleanup
- * ourselves to avoid a jh leak.
- *
- * *** The journal_head may be freed by this call! ***
 */
 void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 {
        struct buffer_head *bh = jh2bh(jh);
+        /* Get reference so that buffer cannot be freed before we unlock it */
+        get_bh(bh);
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_refile_buffer(jh);
        jbd_unlock_bh_state(bh);
-        jbd2_journal_remove_journal_head(bh);
        spin_unlock(&journal->j_list_lock);
        __brelse(bh);
 }
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index c5ce6c1d1ff4..2f3f531f3606 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -66,9 +66,9 @@ static int jfs_open(struct inode *inode, struct file *file)
                struct jfs_inode_info *ji = JFS_IP(inode);
                spin_lock_irq(&ji->ag_lock);
                if (ji->active_ag == -1) {
-                        ji->active_ag = ji->agno;
+                        struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb);
-                        atomic_inc(
+                        ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb);
-                            &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]);
+                        atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]);
                }
                spin_unlock_irq(&ji->ag_lock);
        }
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index ed53a4740168..b78b2f978f04 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -397,7 +397,7 @@ int diRead(struct inode *ip)
        release_metapage(mp);
        /* set the ag for the inode */
-        JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
+        JFS_IP(ip)->agstart = agstart;
        JFS_IP(ip)->active_ag = -1;
        return (rc);
@@ -901,7 +901,7 @@ int diFree(struct inode *ip)
        /* get the allocation group for this ino.
         */
-        agno = JFS_IP(ip)->agno;
+        agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb));
        /* Lock the AG specific inode map information
         */
@@ -1315,12 +1315,11 @@ int diFree(struct inode *ip)
 static inline void
 diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
 {
-        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
        ip->i_ino = (iagno << L2INOSPERIAG) + ino;
        jfs_ip->ixpxd = iagp->inoext[extno];
-        jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+        jfs_ip->agstart = le64_to_cpu(iagp->agstart);
        jfs_ip->active_ag = -1;
 }
@@ -1379,7 +1378,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
         */
        /* get the ag number of this iag */
-        agno = JFS_IP(pip)->agno;
+        agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb));
        if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
                /*
@@ -2921,10 +2920,9 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
                        continue;
                }
-                /* agstart that computes to the same ag is treated as same; */
                agstart = le64_to_cpu(iagp->agstart);
-                /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */
                n = agstart >> mp->db_agl2size;
+                iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size);
                /* compute backed inodes */
                numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 1439f119ec83..584a4a1a6e81 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -50,8 +50,9 @@ struct jfs_inode_info {
        short   btindex;        /* btpage entry index*/
        struct inode *ipimap;   /* inode map                    */
        unsigned long cflag;    /* commit flags         */
+        u64     agstart;        /* agstart of the containing IAG */
        u16     bxflag;         /* xflag of pseudo buffer?      */
-        unchar  agno;           /* ag number                    */
+        unchar  pad;
        signed char active_ag;  /* ag currently allocating from */
        lid_t   blid;           /* lid of pseudo buffer?        */
        lid_t   atlhead;        /* anonymous tlock list head    */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 278e3fb40b71..583636f745e5 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1123,7 +1123,7 @@ int lmLogOpen(struct super_block *sb)
        bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
                                 log);
        if (IS_ERR(bdev)) {
-                rc = -PTR_ERR(bdev);
+                rc = PTR_ERR(bdev);
                goto free;
        }
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8ea5efb5a34e..8d0c1c7c0820 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -80,7 +80,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
        int log_formatted = 0;
        struct inode *iplist[1];
        struct jfs_superblock *j_sb, *j_sb2;
-        uint old_agsize;
+        s64 old_agsize;
        int agsizechanged = 0;
        struct buffer_head *bh, *bh2;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index adb45ec9038c..e374050a911c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -708,7 +708,13 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
        if (task->tk_status < 0) {
                dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status);
-                goto retry_rebind;
+                switch (task->tk_status) {
+                case -EACCES:
+                case -EIO:
+                        goto die;
+                default:
+                        goto retry_rebind;
+                }
        }
        if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
                rpc_delay(task, NLMCLNT_GRACE_WAIT);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1663f8..1afae26cf236 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,13 +555,6 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
        return __logfs_create(dir, dentry, inode, target, destlen);
 }
-static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
-{
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
-        return generic_permission(inode, mask, flags, NULL);
-}
 static int logfs_link(struct dentry *old_dentry, struct inode *dir,
                struct dentry *dentry)
 {
@@ -820,7 +813,6 @@ const struct inode_operations logfs_dir_iops = {
        .mknod          = logfs_mknod,
        .rename         = logfs_rename,
        .rmdir          = logfs_rmdir,
-        .permission     = logfs_permission,
        .symlink        = logfs_symlink,
        .unlink         = logfs_unlink,
 };
diff --git a/fs/namei.c b/fs/namei.c
index e2e4e8d032ee..0223c41fb114 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -238,7 +238,8 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
        /*
         * Read/write DACs are always overridable.
-         * Executable DACs are overridable if at least one exec bit is set.
+         * Executable DACs are overridable for all directories and
+         * for non-directories that have at least one exec bit set.
         */
        if (!(mask & MAY_EXEC) || execute_ok(inode))
                if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
@@ -812,6 +813,11 @@ static int follow_automount(struct path *path, unsigned flags,
        if (!mnt) /* mount collision */
                return 0;
+        if (!*need_mntput) {
+                /* lock_mount() may release path->mnt on error */
+                mntget(path->mnt);
+                *need_mntput = true;
+        }
        err = finish_automount(mnt, path);
        switch (err) {
@@ -819,12 +825,9 @@ static int follow_automount(struct path *path, unsigned flags,
                /* Someone else made a mount here whilst we were busy */
                return 0;
        case 0:
-                dput(path->dentry);
+                path_put(path);
-                if (*need_mntput)
-                        mntput(path->mnt);
                path->mnt = mnt;
                path->dentry = dget(mnt->mnt_root);
-                *need_mntput = true;
                return 0;
        default:
                return err;
@@ -844,9 +847,10 @@ static int follow_automount(struct path *path, unsigned flags,
 */
 static int follow_managed(struct path *path, unsigned flags)
 {
+        struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
        unsigned managed;
        bool need_mntput = false;
-        int ret;
+        int ret = 0;
        /* Given that we're not holding a lock here, we retain the value in a
         * local variable for each dentry as we look at it so that we don't see
@@ -861,7 +865,7 @@ static int follow_managed(struct path *path, unsigned flags)
                        BUG_ON(!path->dentry->d_op->d_manage);
                        ret = path->dentry->d_op->d_manage(path->dentry, false);
                        if (ret < 0)
-                                return ret == -EISDIR ? 0 : ret;
+                                break;
                }
                /* Transit to a mounted filesystem. */
@@ -887,14 +891,19 @@ static int follow_managed(struct path *path, unsigned flags)
                if (managed & DCACHE_NEED_AUTOMOUNT) {
                        ret = follow_automount(path, flags, &need_mntput);
                        if (ret < 0)
-                                return ret == -EISDIR ? 0 : ret;
+                                break;
                        continue;
                }
                /* We didn't change the current path point */
                break;
        }
-        return 0;
+        if (need_mntput && path->mnt == mnt)
+                mntput(path->mnt);
+        if (ret == -EISDIR)
+                ret = 0;
+        return ret;
 }
 int follow_down_one(struct path *path)
@@ -1003,9 +1012,6 @@ failed:
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
- *
- * Care must be taken as namespace_sem may be held (indicated by mounting_here
- * being true).
 */
 int follow_down(struct path *path)
 {
@@ -2624,6 +2630,10 @@ static long do_rmdir(int dfd, const char __user *pathname)
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
+        if (!dentry->d_inode) {
+                error = -ENOENT;
+                goto exit3;
+        }
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto exit3;
@@ -2712,8 +2722,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                if (nd.last.name[nd.last.len])
                        goto slashes;
                inode = dentry->d_inode;
-                if (inode)
+                if (!inode)
-                        ihold(inode);
+                        goto slashes;
+                ihold(inode);
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 144f2a3c7185..6f4850deb272 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -256,7 +256,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
        nfs_attr_check_mountpoint(sb, fattr);
-        if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0)
+        if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) &&
+            !nfs_attr_use_mounted_on_fileid(fattr))
                goto out_no_inode;
        if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
                goto out_no_inode;
@@ -1294,7 +1295,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                if (new_isize != cur_isize) {
                        /* Do we perhaps have any outstanding writes, or has
                         * the file grown beyond our last write? */
-                        if (nfsi->npages == 0 || new_isize > cur_isize) {
+                        if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) ||
+                             new_isize > cur_isize) {
                                i_size_write(inode, new_isize);
                                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
                        }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b9056cbe68d6..2a55347a2daa 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -45,6 +45,17 @@ static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct
                fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
 }
+static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
+{
+        if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) ||
+            (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
+             ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
+                return 0;
+        fattr->fileid = fattr->mounted_on_fileid;
+        return 1;
+}
 struct nfs_clone_mount {
        const struct super_block *sb;
        const struct dentry *dentry;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 426908809c97..0bafcc91c27f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -30,6 +30,7 @@
 */
 #include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
 #include "internal.h"
 #include "nfs4filelayout.h"
@@ -552,13 +553,18 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
                __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
                fl->pattern_offset);
-        if (!fl->num_fh)
+        /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
+         * Further checking is done in filelayout_check_layout */
+        if (fl->num_fh < 0 || fl->num_fh >
+            max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
                goto out_err;
-        fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
+        if (fl->num_fh > 0) {
-                               gfp_flags);
+                fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
-        if (!fl->fh_array)
+                                       gfp_flags);
-                goto out_err;
+                if (!fl->fh_array)
+                        goto out_err;
+        }
        for (i = 0; i < fl->num_fh; i++) {
                /* Do we want to use a mempool here? */
@@ -661,8 +667,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
        u64 p_stripe, r_stripe;
        u32 stripe_unit;
-        if (!pnfs_generic_pg_test(pgio, prev, req))
+        if (!pnfs_generic_pg_test(pgio, prev, req) ||
-                return 0;
+            !nfs_generic_pg_test(pgio, prev, req))
+                return false;
        if (!pgio->pg_lseg)
                return 1;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2c4b59c896d..5879b23e0c99 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2265,12 +2265,14 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
        return nfs4_map_errors(status);
 }
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 /*
 * Get locations and (maybe) other attributes of a referral.
 * Note that we'll actually follow the referral later when
 * we detect fsid mismatch in inode revalidation
 */
-static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
+static int nfs4_get_referral(struct inode *dir, const struct qstr *name,
+                             struct nfs_fattr *fattr, struct nfs_fh *fhandle)
 {
        int status = -ENOMEM;
        struct page *page = NULL;
@@ -2288,15 +2290,16 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
                goto out;
        /* Make sure server returned a different fsid for the referral */
        if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
-                dprintk("%s: server did not return a different fsid for a referral at %s\n", __func__, name->name);
+                dprintk("%s: server did not return a different fsid for"
+                        " a referral at %s\n", __func__, name->name);
                status = -EIO;
                goto out;
        }
+        /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
+        nfs_fixup_referral_attributes(&locations->fattr);
+        /* replace the lookup nfs_fattr with the locations nfs_fattr */
        memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
-        fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
-        if (!fattr->mode)
-                fattr->mode = S_IFDIR;
        memset(fhandle, 0, sizeof(struct nfs_fh));
 out:
        if (page)
@@ -4667,11 +4670,15 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
        return len;
 }
+/*
+ * nfs_fhget will use either the mounted_on_fileid or the fileid
+ */
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
 {
-        if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
+        if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) ||
-                (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+               (fattr->valid & NFS_ATTR_FATTR_FILEID)) &&
-                (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+              (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+              (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
                return;
        fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
@@ -4686,7 +4693,6 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
        struct nfs_server *server = NFS_SERVER(dir);
        u32 bitmask[2] = {
                [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
-                [1] = FATTR4_WORD1_MOUNTED_ON_FILEID,
        };
        struct nfs4_fs_locations_arg args = {
                .dir_fh = NFS_FH(dir),
@@ -4705,11 +4711,18 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
        int status;
        dprintk("%s: start\n", __func__);
+        /* Ask for the fileid of the absent filesystem if mounted_on_fileid
+         * is not supported */
+        if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+                bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+        else
+                bitmask[0] |= FATTR4_WORD0_FILEID;
        nfs_fattr_init(&fs_locations->fattr);
        fs_locations->server = server;
        fs_locations->nlocations = 0;
        status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
-        nfs_fixup_referral_attributes(&fs_locations->fattr);
        dprintk("%s: returned status = %d\n", __func__, status);
        return status;
 }
@@ -5098,7 +5111,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
        if (mxresp_sz == 0)
                mxresp_sz = NFS_MAX_FILE_IO_SIZE;
        /* Fore channel attributes */
-        args->fc_attrs.headerpadsz = 0;
        args->fc_attrs.max_rqst_sz = mxrqst_sz;
        args->fc_attrs.max_resp_sz = mxresp_sz;
        args->fc_attrs.max_ops = NFS4_MAX_OPS;
@@ -5111,7 +5123,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
                args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
        /* Back channel attributes */
-        args->bc_attrs.headerpadsz = 0;
        args->bc_attrs.max_rqst_sz = PAGE_SIZE;
        args->bc_attrs.max_resp_sz = PAGE_SIZE;
        args->bc_attrs.max_resp_sz_cached = 0;
@@ -5131,8 +5142,6 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
        struct nfs4_channel_attrs *sent = &args->fc_attrs;
        struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
-        if (rcvd->headerpadsz > sent->headerpadsz)
-                return -EINVAL;
        if (rcvd->max_resp_sz > sent->max_resp_sz)
                return -EINVAL;
        /*
@@ -5697,6 +5706,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_layoutreturn *lrp = calldata;
        struct nfs_server *server;
+        struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
        dprintk("--> %s\n", __func__);
@@ -5708,16 +5718,15 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
                nfs_restart_rpc(task, lrp->clp);
                return;
        }
+        spin_lock(&lo->plh_inode->i_lock);
        if (task->tk_status == 0) {
-                struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
                if (lrp->res.lrs_present) {
-                        spin_lock(&lo->plh_inode->i_lock);
                        pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-                        spin_unlock(&lo->plh_inode->i_lock);
                } else
                        BUG_ON(!list_empty(&lo->plh_segs));
        }
+        lo->plh_block_lgets--;
+        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d869a5e5464b..6870bc61ceec 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -255,7 +255,7 @@ static int nfs4_stat_to_errno(int);
 #define decode_fs_locations_maxsz \
                                (0)
 #define encode_secinfo_maxsz    (op_encode_hdr_maxsz + nfs4_name_maxsz)
-#define decode_secinfo_maxsz    (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)))
+#define decode_secinfo_maxsz    (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
 #if defined(CONFIG_NFS_V4_1)
 #define NFS4_MAX_MACHINE_NAME_LEN (64)
@@ -1725,7 +1725,7 @@ static void encode_create_session(struct xdr_stream *xdr,
        *p++ = cpu_to_be32(args->flags);                        /*flags */
        /* Fore Channel */
-        *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
+        *p++ = cpu_to_be32(0);                          /* header padding size */
        *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
        *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
        *p++ = cpu_to_be32(max_resp_sz_cached);         /* Max resp sz cached */
@@ -1734,7 +1734,7 @@ static void encode_create_session(struct xdr_stream *xdr,
        *p++ = cpu_to_be32(0);                          /* rdmachannel_attrs */
        /* Back Channel */
-        *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
+        *p++ = cpu_to_be32(0);                          /* header padding size */
        *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
        *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
        *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached);  /* Max resp sz cached */
@@ -3098,7 +3098,7 @@ out_overflow:
        return -EIO;
 }
-static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
 {
        __be32 *p;
@@ -3109,7 +3109,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
                if (unlikely(!p))
                        goto out_overflow;
                bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
-                return -be32_to_cpup(p);
+                *res = -be32_to_cpup(p);
        }
        return 0;
 out_overflow:
@@ -4070,6 +4070,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
        int status;
        umode_t fmode = 0;
        uint32_t type;
+        int32_t err;
        status = decode_attr_type(xdr, bitmap, &type);
        if (status < 0)
@@ -4095,13 +4096,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
                goto xdr_error;
        fattr->valid |= status;
-        status = decode_attr_error(xdr, bitmap);
+        err = 0;
-        if (status == -NFS4ERR_WRONGSEC) {
+        status = decode_attr_error(xdr, bitmap, &err);
-                nfs_fixup_secinfo_attributes(fattr, fh);
-                status = 0;
-        }
        if (status < 0)
                goto xdr_error;
+        if (err == -NFS4ERR_WRONGSEC)
+                nfs_fixup_secinfo_attributes(fattr, fh);
        status = decode_attr_filehandle(xdr, bitmap, fh);
        if (status < 0)
@@ -4997,12 +4997,14 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
                             struct nfs4_channel_attrs *attrs)
 {
        __be32 *p;
-        u32 nr_attrs;
+        u32 nr_attrs, val;
        p = xdr_inline_decode(xdr, 28);
        if (unlikely(!p))
                goto out_overflow;
-        attrs->headerpadsz = be32_to_cpup(p++);
+        val = be32_to_cpup(p++);        /* headerpadsz */
+        if (val)
+                return -EINVAL;         /* no support for header padding yet */
        attrs->max_rqst_sz = be32_to_cpup(p++);
        attrs->max_resp_sz = be32_to_cpup(p++);
        attrs->max_resp_sz_cached = be32_to_cpup(p++);
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9cf208df1f25..8ff2ea3f10ef 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -108,7 +108,6 @@ _dev_list_add(const struct nfs_server *nfss,
                de = n;
        }
-        atomic_inc(&de->id_node.ref);
        return de;
 }
@@ -1001,6 +1000,9 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
        if (!pnfs_generic_pg_test(pgio, prev, req))
                return false;
+        if (pgio->pg_lseg == NULL)
+                return true;
        return pgio->pg_count + req->wb_bytes <=
                        OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
 }
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index dc3956c0de80..1d06f8e2adea 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -291,7 +291,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
        struct nfs_read_data *rdata;
        state->status = status;
-        dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+        dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof);
        rdata = state->rpcdata;
        rdata->task.tk_status = status;
        if (status >= 0) {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7913961aff22..009855716286 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,7 +204,7 @@ nfs_wait_on_request(struct nfs_page *req)
                        TASK_UNINTERRUPTIBLE);
 }
-static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
 {
        /*
         * FIXME: ideally we should be able to coalesce all requests
@@ -218,6 +218,7 @@ static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_p
        return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
 }
+EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
 /**
 * nfs_pageio_init - initialise a page io descriptor
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8c1309d852a6..29c0ca7fc347 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -634,14 +634,16 @@ _pnfs_return_layout(struct inode *ino)
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
-        if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
+        if (!lo) {
                spin_unlock(&ino->i_lock);
-                dprintk("%s: no layout segments to return\n", __func__);
+                dprintk("%s: no layout to return\n", __func__);
-                goto out;
+                return status;
        }
        stateid = nfsi->layout->plh_stateid;
        /* Reference matched in nfs4_layoutreturn_release */
        get_layout_hdr(lo);
+        mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+        lo->plh_block_lgets++;
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
@@ -650,6 +652,9 @@ _pnfs_return_layout(struct inode *ino)
        lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
+                set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags);
+                set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags);
+                put_layout_hdr(lo);
                goto out;
        }
@@ -887,7 +892,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
                        ret = get_lseg(lseg);
                        break;
                }
-                if (cmp_layout(range, &lseg->pls_range) > 0)
+                if (lseg->pls_range.offset > range->offset)
                        break;
        }
@@ -1059,23 +1064,36 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
                gfp_flags = GFP_NOFS;
        }
-        if (pgio->pg_count == prev->wb_bytes) {
+        if (pgio->pg_lseg == NULL) {
+                if (pgio->pg_count != prev->wb_bytes)
+                        return true;
                /* This is the first coalesce call for a series of nfs_pages */
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   prev->wb_context,
-                                                   req_offset(req),
+                                                   req_offset(prev),
                                                   pgio->pg_count,
                                                   access_type,
                                                   gfp_flags);
-                return true;
+                if (pgio->pg_lseg == NULL)
+                        return true;
        }
-        if (pgio->pg_lseg &&
+        /*
-            req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+         * Test if a nfs_page is fully contained in the pnfs_layout_range.
-                                         pgio->pg_lseg->pls_range.length))
+         * Note that this test makes several assumptions:
-                return false;
+         * - that the previous nfs_page in the struct nfs_pageio_descriptor
+         *   is known to lie within the range.
-        return true;
+         *   - that the nfs_page being tested is known to be contiguous with the
+         *   previous nfs_page.
+         *   - Layout ranges are page aligned, so we only have to test the
+         *   start offset of the request.
+         *
+         * Please also note that 'end_offset' is actually the offset of the
+         * first byte that lies outside the pnfs_layout_range. FIXME?
+         *
+         */
+        return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
+                                         pgio->pg_lseg->pls_range.length);
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 48d0a8e4d062..96bf4e6f45be 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -186,6 +186,7 @@ int pnfs_ld_read_done(struct nfs_read_data *);
 /* pnfs_dev.c */
 struct nfs4_deviceid_node {
        struct hlist_node               node;
+        struct hlist_node               tmpnode;
        const struct pnfs_layoutdriver_type *ld;
        const struct nfs_client         *nfs_client;
        struct nfs4_deviceid            deviceid;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index c65e133ce9c0..f0f8e1e22f6c 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -174,6 +174,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
                        const struct nfs4_deviceid *id)
 {
        INIT_HLIST_NODE(&d->node);
+        INIT_HLIST_NODE(&d->tmpnode);
        d->ld = ld;
        d->nfs_client = nfs_client;
        d->deviceid = *id;
@@ -208,6 +209,7 @@ nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
        hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
        spin_unlock(&nfs4_deviceid_lock);
+        atomic_inc(&new->ref);
        return new;
 }
@@ -238,24 +240,29 @@ static void
 _deviceid_purge_client(const struct nfs_client *clp, long hash)
 {
        struct nfs4_deviceid_node *d;
-        struct hlist_node *n, *next;
+        struct hlist_node *n;
        HLIST_HEAD(tmp);
+        spin_lock(&nfs4_deviceid_lock);
        rcu_read_lock();
        hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
                if (d->nfs_client == clp && atomic_read(&d->ref)) {
                        hlist_del_init_rcu(&d->node);
-                        hlist_add_head(&d->node, &tmp);
+                        hlist_add_head(&d->tmpnode, &tmp);
                }
        rcu_read_unlock();
+        spin_unlock(&nfs4_deviceid_lock);
        if (hlist_empty(&tmp))
                return;
        synchronize_rcu();
-        hlist_for_each_entry_safe(d, n, next, &tmp, node)
+        while (!hlist_empty(&tmp)) {
+                d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode);
+                hlist_del(&d->tmpnode);
                if (atomic_dec_and_test(&d->ref))
                        d->ld->free_deviceid_node(d);
+        }
 }
 void
@@ -263,8 +270,8 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp)
 {
        long h;
-        spin_lock(&nfs4_deviceid_lock);
+        if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+                return;
        for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
                _deviceid_purge_client(clp, h);
-        spin_unlock(&nfs4_deviceid_lock);
 }
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 18b3e8975fe0..fbb2a5ef5817 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,7 @@ config NFSD_V4
        select NFSD_V3
        select FS_POSIX_ACL
        select SUNRPC_GSS
+        select CRYPTO
        help
          This option enables support in your system's NFS server for
          version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 1f5eae40f34e..2b1449dd2f49 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -13,6 +13,7 @@
 #include <linux/lockd/lockd.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/gss_krb5_enctypes.h>
 #include "idmap.h"
 #include "nfsd.h"
@@ -189,18 +190,10 @@ static struct file_operations export_features_operations = {
        .release        = single_release,
 };
-#ifdef CONFIG_SUNRPC_GSS
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
 static int supported_enctypes_show(struct seq_file *m, void *v)
 {
-        struct gss_api_mech *k5mech;
+        seq_printf(m, KRB5_SUPPORTED_ENCTYPES);
-        k5mech = gss_mech_get_by_name("krb5");
-        if (k5mech == NULL)
-                goto out;
-        if (k5mech->gm_upcall_enctypes != NULL)
-                seq_printf(m, k5mech->gm_upcall_enctypes);
-        gss_mech_put(k5mech);
-out:
        return 0;
 }
@@ -215,7 +208,7 @@ static struct file_operations supported_enctypes_ops = {
        .llseek         = seq_lseek,
        .release        = single_release,
 };
-#endif /* CONFIG_SUNRPC_GSS */
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
@@ -1427,9 +1420,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
                [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
                [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
                [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
-#ifdef CONFIG_SUNRPC_GSS
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
                [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
-#endif /* CONFIG_SUNRPC_GSS */
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
 #ifdef CONFIG_NFSD_V4
                [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
                [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d5718273bb32..fd0acca5370a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -696,7 +696,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 }
 #endif /* CONFIG_NFSD_V3 */
+static int nfsd_open_break_lease(struct inode *inode, int access)
+{
+        unsigned int mode;
+        if (access & NFSD_MAY_NOT_BREAK_LEASE)
+                return 0;
+        mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY;
+        return break_lease(inode, mode | O_NONBLOCK);
+}
 /*
 * Open an existing file or directory.
@@ -744,12 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        if (!inode->i_fop)
                goto out;
-        /*
+        host_err = nfsd_open_break_lease(inode, access);
-         * Check to see if there are any leases on this file.
-         * This may block while leases are broken.
-         */
-        if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
-                host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
        if (host_err) /* NOMEM or WOULDBLOCK */
                goto out_nfserr;
@@ -1660,8 +1663,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
        if (!dold->d_inode)
                goto out_drop_write;
        host_err = nfsd_break_lease(dold->d_inode);
-        if (host_err)
+        if (host_err) {
+                err = nfserrno(host_err);
                goto out_drop_write;
+        }
        host_err = vfs_link(dold, dirp, dnew);
        if (!host_err) {
                err = nfserrno(commit_metadata(ffhp));
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7eafe468a29c..b2e3ff347620 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1346,6 +1346,11 @@ static void nilfs_btree_shrink(struct nilfs_bmap *btree,
        path[level].bp_bh = NULL;
 }
+static void nilfs_btree_nop(struct nilfs_bmap *btree,
+                            struct nilfs_btree_path *path,
+                            int level, __u64 *keyp, __u64 *ptrp)
+{
+}
 static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
                                      struct nilfs_btree_path *path,
@@ -1356,20 +1361,19 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
        struct buffer_head *bh;
        struct nilfs_btree_node *node, *parent, *sib;
        __u64 sibptr;
-        int pindex, level, ncmin, ncmax, ncblk, ret;
+        int pindex, dindex, level, ncmin, ncmax, ncblk, ret;
        ret = 0;
        stats->bs_nblocks = 0;
        ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
        ncblk = nilfs_btree_nchildren_per_block(btree);
-        for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+        for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index;
             level < nilfs_btree_height(btree) - 1;
             level++) {
                node = nilfs_btree_get_nonroot_node(path, level);
                path[level].bp_oldreq.bpr_ptr =
-                        nilfs_btree_node_get_ptr(node, path[level].bp_index,
+                        nilfs_btree_node_get_ptr(node, dindex, ncblk);
-                                                 ncblk);
                ret = nilfs_bmap_prepare_end_ptr(btree,
                                                 &path[level].bp_oldreq, dat);
                if (ret < 0)
@@ -1383,6 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
                parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
                pindex = path[level + 1].bp_index;
+                dindex = pindex;
                if (pindex > 0) {
                        /* left sibling */
@@ -1421,6 +1426,14 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
                                path[level].bp_sib_bh = bh;
                                path[level].bp_op = nilfs_btree_concat_right;
                                stats->bs_nblocks++;
+                                /*
+                                 * When merging right sibling node
+                                 * into the current node, pointer to
+                                 * the right sibling node must be
+                                 * terminated instead.  The adjustment
+                                 * below is required for that.
+                                 */
+                                dindex = pindex + 1;
                                /* continue; */
                        }
                } else {
@@ -1431,29 +1444,31 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
                            NILFS_BTREE_ROOT_NCHILDREN_MAX) {
                                path[level].bp_op = nilfs_btree_shrink;
                                stats->bs_nblocks += 2;
+                                level++;
+                                path[level].bp_op = nilfs_btree_nop;
+                                goto shrink_root_child;
                        } else {
                                path[level].bp_op = nilfs_btree_do_delete;
                                stats->bs_nblocks++;
+                                goto out;
                        }
-                        goto out;
                }
        }
+        /* child of the root node is deleted */
+        path[level].bp_op = nilfs_btree_do_delete;
+        stats->bs_nblocks++;
+shrink_root_child:
        node = nilfs_btree_get_root(btree);
        path[level].bp_oldreq.bpr_ptr =
-                nilfs_btree_node_get_ptr(node, path[level].bp_index,
+                nilfs_btree_node_get_ptr(node, dindex,
                                         NILFS_BTREE_ROOT_NCHILDREN_MAX);
        ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
        if (ret < 0)
                goto err_out_child_node;
-        /* child of the root node is deleted */
-        path[level].bp_op = nilfs_btree_do_delete;
-        stats->bs_nblocks++;
        /* success */
 out:
        *levelp = level;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b954878ad6ce..b9b45fc2903e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -801,12 +801,7 @@ out_err:
 int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        struct nilfs_root *root;
+        struct nilfs_root *root = NILFS_I(inode)->i_root;
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
-        root = NILFS_I(inode)->i_root;
        if ((mask & MAY_WRITE) && root &&
            root->cno != NILFS_CPTREE_CURRENT_CNO)
                return -EROFS; /* snapshot is not writable */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 141646e88fb5..bb24ab6c282f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2573,7 +2573,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
        sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
        if (nilfs->ns_interval)
-                sci->sc_interval = nilfs->ns_interval;
+                sci->sc_interval = HZ * nilfs->ns_interval;
        if (nilfs->ns_watermark)
                sci->sc_watermark = nilfs->ns_watermark;
        return sci;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index cdbaf5e97308..56f61027236b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1072,7 +1072,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = OCFS2_SUPER_MAGIC;
-        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+        sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
                ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
        /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d738a7e493dd..2c6d95257a4d 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -4,7 +4,6 @@
 * Released under GPL v2.
 */
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0ead43549431..e3c63d1c5e13 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,13 +255,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
                                           struct device_attribute *attr, char *buf)
 {
        struct hd_struct *p = dev_to_part(dev);
-        struct gendisk *disk = dev_to_disk(dev);
+        return sprintf(buf, "%u\n", p->discard_alignment);
-        unsigned int alignment = 0;
-        if (disk->queue)
-                alignment = queue_limit_discard_alignment(&disk->queue->limits,
-                                                                p->start_sect);
-        return sprintf(buf, "%u\n", alignment);
 }
 ssize_t part_stat_show(struct device *dev,
@@ -455,6 +449,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        p->start_sect = start;
        p->alignment_offset =
                queue_limit_alignment_offset(&disk->queue->limits, start);
+        p->discard_alignment =
+                queue_limit_discard_alignment(&disk->queue->limits, start);
        p->nr_sects = len;
        p->partno = partno;
        p->policy = get_disk_ro(disk);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 14def991d9dd..8a84210ca080 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2169,11 +2169,7 @@ static const struct file_operations proc_fd_operations = {
 */
 static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        int rv;
+        int rv = generic_permission(inode, mask, flags, NULL);
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
-        rv = generic_permission(inode, mask, flags, NULL);
        if (rv == 0)
                return 0;
        if (task_pid(current) == proc_pid(inode))
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 781dec5bd682..be177f702acb 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -38,18 +38,21 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
        struct inode *inode;
        struct proc_inode *ei;
        struct dentry *error = ERR_PTR(-ENOENT);
+        void *ns;
        inode = proc_pid_make_inode(dir->i_sb, task);
        if (!inode)
                goto out;
+        ns = ns_ops->get(task);
+        if (!ns)
+                goto out_iput;
        ei = PROC_I(inode);
        inode->i_mode = S_IFREG|S_IRUSR;
        inode->i_fop  = &ns_file_operations;
        ei->ns_ops    = ns_ops;
-        ei->ns        = ns_ops->get(task);
+        ei->ns        = ns;
-        if (!ei->ns)
-                goto out_iput;
        dentry->d_op = &pid_dentry_operations;
        d_add(dentry, inode);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f50133c11c24..d167de365a8d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -304,9 +304,6 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
        struct ctl_table *table;
        int error;
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
        /* Executable files are not allowed under /proc/sys/ */
        if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
                return -EACCES;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index a9000e9cfee5..d6c3b416529b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -28,11 +28,12 @@ static int proc_test_super(struct super_block *sb, void *data)
 static int proc_set_super(struct super_block *sb, void *data)
 {
-        struct pid_namespace *ns;
+        int err = set_anon_super(sb, NULL);
+        if (!err) {
-        ns = (struct pid_namespace *)data;
+                struct pid_namespace *ns = (struct pid_namespace *)data;
-        sb->s_fs_info = get_pid_ns(ns);
+                sb->s_fs_info = get_pid_ns(ns);
-        return set_anon_super(sb, NULL);
+        }
+        return err;
 }
 static struct dentry *proc_mount(struct file_system_type *fs_type,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e8a62f41b458..d78089690965 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -954,8 +954,6 @@ static int xattr_mount_check(struct super_block *s)
 int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
        /*
         * We don't do permission checks on the internal objects.
         * Permissions are determined by the "owning" object.
diff --git a/fs/super.c b/fs/super.c
index c75593953c52..ab3d672db0de 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -822,7 +822,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
        } else {
                char b[BDEVNAME_SIZE];
-                s->s_flags = flags;
+                s->s_flags = flags | MS_NOSEC;
                s->s_mode = mode;
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
                sb_set_blocksize(s, block_size(bdev));
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 266895783b47..e34f0d99ea4e 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -95,6 +95,14 @@ static int sysfs_set_super(struct super_block *sb, void *data)
        return error;
 }
+static void free_sysfs_super_info(struct sysfs_super_info *info)
+{
+        int type;
+        for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
+                kobj_ns_drop(type, info->ns[type]);
+        kfree(info);
+}
 static struct dentry *sysfs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
@@ -108,11 +116,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
                return ERR_PTR(-ENOMEM);
        for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-                info->ns[type] = kobj_ns_current(type);
+                info->ns[type] = kobj_ns_grab_current(type);
        sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
        if (IS_ERR(sb) || sb->s_fs_info != info)
-                kfree(info);
+                free_sysfs_super_info(info);
        if (IS_ERR(sb))
                return ERR_CAST(sb);
        if (!sb->s_root) {
@@ -131,12 +139,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 static void sysfs_kill_sb(struct super_block *sb)
 {
        struct sysfs_super_info *info = sysfs_info(sb);
        /* Remove the superblock from fs_supers/s_instances
         * so we can't find it, before freeing sysfs_super_info.
         */
        kill_anon_super(sb);
-        kfree(info);
+        free_sysfs_super_info(info);
 }
 static struct file_system_type sysfs_fs_type = {
@@ -145,28 +152,6 @@ static struct file_system_type sysfs_fs_type = {
        .kill_sb        = sysfs_kill_sb,
 };
-void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
-{
-        struct super_block *sb;
-        mutex_lock(&sysfs_mutex);
-        spin_lock(&sb_lock);
-        list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
-                struct sysfs_super_info *info = sysfs_info(sb);
-                /*
-                 * If we see a superblock on the fs_supers/s_instances
-                 * list the unmount has not completed and sb->s_fs_info
-                 * points to a valid struct sysfs_super_info.
-                 */
-                /* Ignore superblocks with the wrong ns */
-                if (info->ns[type] != ns)
-                        continue;
-                info->ns[type] = NULL;
-        }
-        spin_unlock(&sb_lock);
-        mutex_unlock(&sysfs_mutex);
-}
 int __init sysfs_init(void)
 {
        int err = -ENOMEM;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3d28af31d863..2ed2404f3113 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -136,7 +136,7 @@ struct sysfs_addrm_cxt {
 * instance).
 */
 struct sysfs_super_info {
-        const void *ns[KOBJ_NS_TYPES];
+        void *ns[KOBJ_NS_TYPES];
 };
 #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
 extern struct sysfs_dirent sysfs_root;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index f67acbdda5e8..dffeb3795af1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -61,7 +61,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 /*
 * Called when the clock was set to cancel the timers in the cancel
- * list.
+ * list. This will wake up processes waiting on these timers. The
+ * wake-up requires ctx->ticks to be non zero, therefore we increment
+ * it before calling wake_up_locked().
 */
 void timerfd_clock_was_set(void)
 {
@@ -76,6 +78,7 @@ void timerfd_clock_was_set(void)
                spin_lock_irqsave(&ctx->wqh.lock, flags);
                if (ctx->moffs.tv64 != moffs.tv64) {
                        ctx->moffs.tv64 = KTIME_MAX;
+                        ctx->ticks++;
                        wake_up_locked(&ctx->wqh);
                }
                spin_unlock_irqrestore(&ctx->wqh.lock, flags);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 166951e0dcd3..3be645e012c9 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -581,6 +581,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
        ubifs_assert(wbuf->size % c->min_io_size == 0);
        ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
        ubifs_assert(!c->ro_media && !c->ro_mount);
+        ubifs_assert(!c->space_fixup);
        if (c->leb_size - wbuf->offs >= c->max_write_size)
                ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
@@ -759,6 +760,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
        ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
        ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
        ubifs_assert(!c->ro_media && !c->ro_mount);
+        ubifs_assert(!c->space_fixup);
        if (c->ro_error)
                return -EROFS;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 34b1679e6e3a..cef0460f4c54 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -669,6 +669,7 @@ out_free:
 out_release:
        release_head(c, BASEHD);
+        kfree(dent);
 out_ro:
        ubifs_ro_mode(c, err);
        if (last_reference)
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index bd644bf587a8..a5422fffbd69 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -674,7 +674,7 @@ static int kill_orphans(struct ubifs_info *c)
                if (IS_ERR(sleb)) {
                        if (PTR_ERR(sleb) == -EUCLEAN)
                                sleb = ubifs_recover_leb(c, lnum, 0,
-                                                         c->sbuf, 0);
+                                                         c->sbuf, -1);
                        if (IS_ERR(sleb)) {
                                err = PTR_ERR(sleb);
                                break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 731d9e2e7b50..783d8e0beb76 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 /**
- * drop_last_node - drop the last node or group of nodes.
+ * drop_last_group - drop the last group of nodes.
 * @sleb: scanned LEB information
 * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
 *
 * This is a helper function for 'ubifs_recover_leb()' which drops the last
- * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * group of nodes of the scanned LEB.
- * This function returns %1 if a node was dropped and %0 otherwise.
 */
-static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
 {
-        int dropped = 0;
        while (!list_empty(&sleb->nodes)) {
                struct ubifs_scan_node *snod;
                struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
                                  list);
                ch = snod->node;
                if (ch->group_type != UBIFS_IN_NODE_GROUP)
-                        return dropped;
+                        break;
-                dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+                dbg_rcvry("dropping grouped node at %d:%d",
+                          sleb->lnum, snod->offs);
+                *offs = snod->offs;
+                list_del(&snod->list);
+                kfree(snod);
+                sleb->nodes_cnt -= 1;
+        }
+}
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of the dropped node is returned here (left untouched if the
+ *        scanned LEB has no nodes)
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+        struct ubifs_scan_node *snod;
+        if (!list_empty(&sleb->nodes)) {
+                snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+                                  list);
+                dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
                *offs = snod->offs;
                list_del(&snod->list);
                kfree(snod);
                sleb->nodes_cnt -= 1;
-                dropped = 1;
-                if (!grouped)
-                        break;
        }
-        return dropped;
 }
 /**
@@ -604,7 +623,8 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 * @lnum: LEB number
 * @offs: offset
 * @sbuf: LEB-sized buffer to use
- * @grouped: nodes may be grouped for recovery
+ * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
+ *         belong to any journal head)
 *
 * This function does a scan of a LEB, but caters for errors that might have
 * been caused by the unclean unmount from which we are attempting to recover.
@@ -612,13 +632,14 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 * found, and a negative error code in case of failure.
 */
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
-                                         int offs, void *sbuf, int grouped)
+                                         int offs, void *sbuf, int jhead)
 {
        int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
+        int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
        struct ubifs_scan_leb *sleb;
        void *buf = sbuf + offs;
-        dbg_rcvry("%d:%d", lnum, offs);
+        dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
        sleb = ubifs_start_scan(c, lnum, offs, sbuf);
        if (IS_ERR(sleb))
@@ -635,7 +656,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                 * Scan quietly until there is an error from which we cannot
                 * recover
                 */
-                ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+                ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
                if (ret == SCANNED_A_NODE) {
                        /* A valid node, and not a padding node */
                        struct ubifs_ch *ch = buf;
@@ -695,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                 * If nodes are grouped, always drop the incomplete group at
                 * the end.
                 */
-                drop_last_node(sleb, &offs, 1);
+                drop_last_group(sleb, &offs);
-        /*
+        if (jhead == GCHD) {
-         * While we are in the middle of the same min. I/O unit keep dropping
+                /*
-         * nodes. So basically, what we want is to make sure that the last min.
+                 * If this LEB belongs to the GC head then while we are in the
-         * I/O unit where we saw the corruption is dropped completely with all
+                 * middle of the same min. I/O unit keep dropping nodes. So
-         * the uncorrupted node which may possibly sit there.
+                 * basically, what we want is to make sure that the last min.
-         *
+                 * I/O unit where we saw the corruption is dropped completely
-         * In other words, let's name the min. I/O unit where the corruption
+                 * with all the uncorrupted nodes which may possibly sit there.
-         * starts B, and the previous min. I/O unit A. The below code tries to
+                 *
-         * deal with a situation when half of B contains valid nodes or the end
+                 * In other words, let's name the min. I/O unit where the
-         * of a valid node, and the second half of B contains corrupted data or
+                 * corruption starts B, and the previous min. I/O unit A. The
-         * garbage. This means that UBIFS had been writing to B just before the
+                 * below code tries to deal with a situation when half of B
-         * power cut happened. I do not know how realistic is this scenario
+                 * contains valid nodes or the end of a valid node, and the
-         * that half of the min. I/O unit had been written successfully and the
+                 * second half of B contains corrupted data or garbage. This
-         * other half not, but this is possible in our 'failure mode emulation'
+                 * means that UBIFS had been writing to B just before the power
-         * infrastructure at least.
+                 * cut happened. I do not know how realistic is this scenario
-         *
+                 * that half of the min. I/O unit had been written successfully
-         * So what is the problem, why we need to drop those nodes? Whey can't
+                 * and the other half not, but this is possible in our 'failure
-         * we just clean-up the second half of B by putting a padding node
+                 * mode emulation' infrastructure at least.
-         * there? We can, and this works fine with one exception which was
+                 *
-         * reproduced with power cut emulation testing and happens extremely
+                 * So what is the problem, why we need to drop those nodes? Why
-         * rarely. The description follows, but it is worth noting that that is
+                 * can't we just clean-up the second half of B by putting a
-         * only about the GC head, so we could do this trick only if the bud
+                 * padding node there? We can, and this works fine with one
-         * belongs to the GC head, but it does not seem to be worth an
+                 * exception which was reproduced with power cut emulation
-         * additional "if" statement.
+                 * testing and happens extremely rarely.
-         *
+                 *
-         * So, imagine the file-system is full, we run GC which is moving valid
+                 * Imagine the file-system is full, we run GC which starts
-         * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+                 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
-         * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+                 * the current GC head LEB). The @c->gc_lnum is -1, which means
-         * and will try to continue. Imagine that LEB X is currently the
+                 * that GC will retain LEB X and will try to continue. Imagine
-         * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+                 * that LEB X is currently the dirtiest LEB, and the amount of
-         * same as amount of free space in LEB X.
+                 * used space in LEB Y is exactly the same as amount of free
-         *
+                 * space in LEB X.
-         * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+                 *
-         * are here trying to recover LEB Y which is the GC head LEB. We find
+                 * And a power cut happens when nodes are moved from LEB X to
-         * the min. I/O unit B as described above. Then we clean-up LEB Y by
+                 * LEB Y. We are here trying to recover LEB Y which is the GC
-         * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+                 * head LEB. We find the min. I/O unit B as described above.
-         * fails, because it cannot find a dirty LEB which could be GC'd into
+                 * Then we clean-up LEB Y by padding min. I/O unit. And later
-         * LEB Y! Even LEB X does not match because the amount of valid nodes
+                 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
-         * there does not fit the free space in LEB Y any more! And this is
+                 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
-         * because of the padding node which we added to LEB Y. The
+                 * does not match because the amount of valid nodes there does
-         * user-visible effect of this which I once observed and analysed is
+                 * not fit the free space in LEB Y any more! And this is
-         * that we cannot mount the file-system with -ENOSPC error.
+                 * because of the padding node which we added to LEB Y. The
-         *
+                 * user-visible effect of this which I once observed and
-         * So obviously, to make sure that situation does not happen we should
+                 * analysed is that we cannot mount the file-system with
-         * free min. I/O unit B in LEB Y completely and the last used min. I/O
+                 * -ENOSPC error.
-         * unit in LEB Y should be A. This is basically what the below code
+                 *
-         * tries to do.
+                 * So obviously, to make sure that situation does not happen we
-         */
+                 * should free min. I/O unit B in LEB Y completely and the last
-        while (min_io_unit == round_down(offs, c->min_io_size) &&
+                 * used min. I/O unit in LEB Y should be A. This is basically
-               min_io_unit != offs &&
+                 * what the below code tries to do.
-               drop_last_node(sleb, &offs, grouped));
+                 */
+                while (offs > min_io_unit)
+                        drop_last_node(sleb, &offs);
+        }
        buf = sbuf + offs;
        len = c->leb_size - offs;
@@ -881,7 +905,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
                }
                ubifs_scan_destroy(sleb);
        }
-        return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
+        return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
 }
 /**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 6617280d1679..5e97161ce4d3 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -557,8 +557,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
                 * these LEBs could possibly be written to at the power cut
                 * time.
                 */
-                sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
+                sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead);
-                                         b->bud->jhead != GCHD);
        else
                sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
        if (IS_ERR(sleb))
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index ca953a945029..9e1d05666fed 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -284,7 +284,11 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
        long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
        if (nr == 0)
-                return clean_zn_cnt;
+                /*
+                 * Due to the way UBIFS updates the clean znode counter it may
+                 * temporarily be negative.
+                 */
+                return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
        if (!clean_zn_cnt) {
                /*
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1ab0d22e4c94..529be0582029 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -811,15 +811,18 @@ static int alloc_wbufs(struct ubifs_info *c)
                c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
                c->jheads[i].wbuf.jhead = i;
+                c->jheads[i].grouped = 1;
        }
        c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
        /*
         * Garbage Collector head likely contains long-term data and
-         * does not need to be synchronized by timer.
+         * does not need to be synchronized by timer. Also GC head nodes are
+         * not grouped.
         */
        c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
        c->jheads[GCHD].wbuf.no_timer = 1;
+        c->jheads[GCHD].grouped = 0;
        return 0;
 }
@@ -1284,12 +1287,25 @@ static int mount_ubifs(struct ubifs_info *c)
        if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
                ubifs_msg("recovery needed");
                c->need_recovery = 1;
-                if (!c->ro_mount) {
+        }
-                        err = ubifs_recover_inl_heads(c, c->sbuf);
-                        if (err)
+        if (c->need_recovery && !c->ro_mount) {
-                                goto out_master;
+                err = ubifs_recover_inl_heads(c, c->sbuf);
-                }
+                if (err)
-        } else if (!c->ro_mount) {
+                        goto out_master;
+        }
+        err = ubifs_lpt_init(c, 1, !c->ro_mount);
+        if (err)
+                goto out_master;
+        if (!c->ro_mount && c->space_fixup) {
+                err = ubifs_fixup_free_space(c);
+                if (err)
+                        goto out_master;
+        }
+        if (!c->ro_mount) {
                /*
                 * Set the "dirty" flag so that if we reboot uncleanly we
                 * will notice this immediately on the next mount.
@@ -1297,13 +1313,9 @@ static int mount_ubifs(struct ubifs_info *c)
                c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
                err = ubifs_write_master(c);
                if (err)
-                        goto out_master;
+                        goto out_lpt;
        }
-        err = ubifs_lpt_init(c, 1, !c->ro_mount);
-        if (err)
-                goto out_lpt;
        err = dbg_check_idx_size(c, c->bi.old_idx_sz);
        if (err)
                goto out_lpt;
@@ -1396,12 +1408,6 @@ static int mount_ubifs(struct ubifs_info *c)
        } else
                ubifs_assert(c->lst.taken_empty_lebs > 0);
-        if (!c->ro_mount && c->space_fixup) {
-                err = ubifs_fixup_free_space(c);
-                if (err)
-                        goto out_infos;
-        }
        err = dbg_check_filesystem(c);
        if (err)
                goto out_infos;
@@ -1842,7 +1848,6 @@ static void ubifs_put_super(struct super_block *sb)
        bdi_destroy(&c->bdi);
        ubi_close_volume(c->ubi);
        mutex_unlock(&c->umount_mutex);
-        kfree(c);
 }
 static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1965,61 +1970,65 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
        return ERR_PTR(-EINVAL);
 }
-static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
+static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
 {
-        struct ubi_volume_desc *ubi = sb->s_fs_info;
        struct ubifs_info *c;
-        struct inode *root;
-        int err;
        c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
-        if (!c)
+        if (c) {
-                return -ENOMEM;
+                spin_lock_init(&c->cnt_lock);
+                spin_lock_init(&c->cs_lock);
+                spin_lock_init(&c->buds_lock);
+                spin_lock_init(&c->space_lock);
+                spin_lock_init(&c->orphan_lock);
+                init_rwsem(&c->commit_sem);
+                mutex_init(&c->lp_mutex);
+                mutex_init(&c->tnc_mutex);
+                mutex_init(&c->log_mutex);
+                mutex_init(&c->mst_mutex);
+                mutex_init(&c->umount_mutex);
+                mutex_init(&c->bu_mutex);
+                mutex_init(&c->write_reserve_mutex);
+                init_waitqueue_head(&c->cmt_wq);
+                c->buds = RB_ROOT;
+                c->old_idx = RB_ROOT;
+                c->size_tree = RB_ROOT;
+                c->orph_tree = RB_ROOT;
+                INIT_LIST_HEAD(&c->infos_list);
+                INIT_LIST_HEAD(&c->idx_gc);
+                INIT_LIST_HEAD(&c->replay_list);
+                INIT_LIST_HEAD(&c->replay_buds);
+                INIT_LIST_HEAD(&c->uncat_list);
+                INIT_LIST_HEAD(&c->empty_list);
+                INIT_LIST_HEAD(&c->freeable_list);
+                INIT_LIST_HEAD(&c->frdi_idx_list);
+                INIT_LIST_HEAD(&c->unclean_leb_list);
+                INIT_LIST_HEAD(&c->old_buds);
+                INIT_LIST_HEAD(&c->orph_list);
+                INIT_LIST_HEAD(&c->orph_new);
+                c->no_chk_data_crc = 1;
+                c->highest_inum = UBIFS_FIRST_INO;
+                c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
+                ubi_get_volume_info(ubi, &c->vi);
+                ubi_get_device_info(c->vi.ubi_num, &c->di);
+        }
+        return c;
+}
-        spin_lock_init(&c->cnt_lock);
+static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
-        spin_lock_init(&c->cs_lock);
+{
-        spin_lock_init(&c->buds_lock);
+        struct ubifs_info *c = sb->s_fs_info;
-        spin_lock_init(&c->space_lock);
+        struct inode *root;
-        spin_lock_init(&c->orphan_lock);
+        int err;
-        init_rwsem(&c->commit_sem);
-        mutex_init(&c->lp_mutex);
-        mutex_init(&c->tnc_mutex);
-        mutex_init(&c->log_mutex);
-        mutex_init(&c->mst_mutex);
-        mutex_init(&c->umount_mutex);
-        mutex_init(&c->bu_mutex);
-        mutex_init(&c->write_reserve_mutex);
-        init_waitqueue_head(&c->cmt_wq);
-        c->buds = RB_ROOT;
-        c->old_idx = RB_ROOT;
-        c->size_tree = RB_ROOT;
-        c->orph_tree = RB_ROOT;
-        INIT_LIST_HEAD(&c->infos_list);
-        INIT_LIST_HEAD(&c->idx_gc);
-        INIT_LIST_HEAD(&c->replay_list);
-        INIT_LIST_HEAD(&c->replay_buds);
-        INIT_LIST_HEAD(&c->uncat_list);
-        INIT_LIST_HEAD(&c->empty_list);
-        INIT_LIST_HEAD(&c->freeable_list);
-        INIT_LIST_HEAD(&c->frdi_idx_list);
-        INIT_LIST_HEAD(&c->unclean_leb_list);
-        INIT_LIST_HEAD(&c->old_buds);
-        INIT_LIST_HEAD(&c->orph_list);
-        INIT_LIST_HEAD(&c->orph_new);
-        c->no_chk_data_crc = 1;
        c->vfs_sb = sb;
-        c->highest_inum = UBIFS_FIRST_INO;
-        c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
-        ubi_get_volume_info(ubi, &c->vi);
-        ubi_get_device_info(c->vi.ubi_num, &c->di);
        /* Re-open the UBI device in read-write mode */
        c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
        if (IS_ERR(c->ubi)) {
                err = PTR_ERR(c->ubi);
-                goto out_free;
+                goto out;
        }
        /*
@@ -2085,24 +2094,29 @@ out_bdi:
        bdi_destroy(&c->bdi);
 out_close:
        ubi_close_volume(c->ubi);
-out_free:
+out:
-        kfree(c);
        return err;
 }
 static int sb_test(struct super_block *sb, void *data)
 {
-        dev_t *dev = data;
+        struct ubifs_info *c1 = data;
        struct ubifs_info *c = sb->s_fs_info;
-        return c->vi.cdev == *dev;
+        return c->vi.cdev == c1->vi.cdev;
+}
+static int sb_set(struct super_block *sb, void *data)
+{
+        sb->s_fs_info = data;
+        return set_anon_super(sb, NULL);
 }
 static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
                        const char *name, void *data)
 {
        struct ubi_volume_desc *ubi;
-        struct ubi_volume_info vi;
+        struct ubifs_info *c;
        struct super_block *sb;
        int err;
@@ -2119,19 +2133,25 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
                        name, (int)PTR_ERR(ubi));
                return ERR_CAST(ubi);
        }
-        ubi_get_volume_info(ubi, &vi);
-        dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
+        c = alloc_ubifs_info(ubi);
+        if (!c) {
+                err = -ENOMEM;
+                goto out_close;
+        }
+        dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
-        sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev);
+        sb = sget(fs_type, sb_test, sb_set, c);
        if (IS_ERR(sb)) {
                err = PTR_ERR(sb);
+                kfree(c);
                goto out_close;
        }
        if (sb->s_root) {
                struct ubifs_info *c1 = sb->s_fs_info;
+                kfree(c);
                /* A new mount point for already mounted UBIFS */
                dbg_gen("this ubi volume is already mounted");
                if (!!(flags & MS_RDONLY) != c1->ro_mount) {
@@ -2140,11 +2160,6 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
                }
        } else {
                sb->s_flags = flags;
-                /*
-                 * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is
-                 * replaced by 'c'.
-                 */
-                sb->s_fs_info = ubi;
                err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
                if (err)
                        goto out_deact;
@@ -2164,11 +2179,18 @@ out_close:
        return ERR_PTR(err);
 }
+static void kill_ubifs_super(struct super_block *s)
+{
+        struct ubifs_info *c = s->s_fs_info;
+        kill_anon_super(s);
+        kfree(c);
+}
 static struct file_system_type ubifs_fs_type = {
        .name    = "ubifs",
        .owner   = THIS_MODULE,
        .mount   = ubifs_mount,
-        .kill_sb = kill_anon_super,
+        .kill_sb = kill_ubifs_super,
 };
 /*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8119b1fd8d94..91b4213dde84 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2876,12 +2876,13 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
 */
 void ubifs_tnc_close(struct ubifs_info *c)
 {
-        long clean_freed;
        tnc_destroy_cnext(c);
        if (c->zroot.znode) {
-                clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
+                long n;
-                atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
+                ubifs_destroy_tnc_subtree(c->zroot.znode);
+                n = atomic_long_read(&c->clean_zn_cnt);
+                atomic_long_sub(n, &ubifs_clean_zn_cnt);
        }
        kfree(c->gap_lebs);
        kfree(c->ilebs);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a70d7b4ffb25..f79983d6f860 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -722,12 +722,14 @@ struct ubifs_bud {
 * struct ubifs_jhead - journal head.
 * @wbuf: head's write-buffer
 * @buds_list: list of bud LEBs belonging to this journal head
+ * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
 *
 * Note, the @buds list is protected by the @c->buds_lock.
 */
 struct ubifs_jhead {
        struct ubifs_wbuf wbuf;
        struct list_head buds_list;
+        unsigned int grouped:1;
 };
 /**
@@ -1742,7 +1744,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
 int ubifs_recover_master_node(struct ubifs_info *c);
 int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
-                                         int offs, void *sbuf, int grouped);
+                                         int offs, void *sbuf, int jhead);
 struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
                                             int offs, void *sbuf);
 int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f4213ba1ff85..7f782af286bf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -131,19 +131,34 @@ xfs_file_fsync(
 {
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        int                     error = 0;
        int                     log_flushed = 0;
        trace_xfs_file_fsync(ip);
-        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+        if (XFS_FORCED_SHUTDOWN(mp))
                return -XFS_ERROR(EIO);
        xfs_iflags_clear(ip, XFS_ITRUNCATED);
        xfs_ioend_wait(ip);
+        if (mp->m_flags & XFS_MOUNT_BARRIER) {
+                /*
+                 * If we have an RT and/or log subvolume we need to make sure
+                 * to flush the write cache of the device used for file data
+                 * first.  This is to ensure newly written file data make
+                 * it to disk before logging the new inode size in case of
+                 * an extending write.
+                 */
+                if (XFS_IS_REALTIME_INODE(ip))
+                        xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+                else if (mp->m_logdev_targp != mp->m_ddev_targp)
+                        xfs_blkdev_issue_flush(mp->m_ddev_targp);
+        }
        /*
         * We always need to make sure that the required inode state is safe on
         * disk.  The inode might be clean but we still might need to force the
@@ -175,9 +190,9 @@ xfs_file_fsync(
                 * updates.  The sync transaction will also force the log.
                 */
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+                tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
                error = xfs_trans_reserve(tp, 0,
-                                XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+                                XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
                if (error) {
                        xfs_trans_cancel(tp, 0);
                        return -error;
@@ -209,28 +224,25 @@ xfs_file_fsync(
                 * force the log.
                 */
                if (xfs_ipincount(ip)) {
-                        error = _xfs_log_force_lsn(ip->i_mount,
+                        error = _xfs_log_force_lsn(mp,
                                        ip->i_itemp->ili_last_lsn,
                                        XFS_LOG_SYNC, &log_flushed);
                }
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
        }
-        if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+        /*
-                /*
+         * If we only have a single device, and the log force above was
-                 * If the log write didn't issue an ordered tag we need
+         * a no-op we might have to flush the data device cache here.
-                 * to flush the disk cache for the data device now.
+         * This can only happen for fdatasync/O_DSYNC if we were overwriting
-                 */
+         * an already allocated file and thus do not have any metadata to
-                if (!log_flushed)
+         * commit.
-                        xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+         */
+        if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
-                /*
+            mp->m_logdev_targp == mp->m_ddev_targp &&
-                 * If this inode is on the RT dev we need to flush that
+            !XFS_IS_REALTIME_INODE(ip) &&
-                 * cache as well.
+            !log_flushed)
-                 */
+                xfs_blkdev_issue_flush(mp->m_ddev_targp);
-                if (XFS_IS_REALTIME_INODE(ip))
-                        xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
-        }
        return -error;
 }
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index dd21784525a8..d44d92cd12b1 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -182,7 +182,7 @@ xfs_vn_mknod(
        if (IS_POSIXACL(dir)) {
                default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
                if (IS_ERR(default_acl))
-                        return -PTR_ERR(default_acl);
+                        return PTR_ERR(default_acl);
                if (!default_acl)
                        mode &= ~current_umask();
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 1e3a7ce804dc..a1a881e68a9a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -627,68 +627,6 @@ xfs_blkdev_put(
                blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
-/*
- * Try to write out the superblock using barriers.
- */
-STATIC int
-xfs_barrier_test(
-        xfs_mount_t     *mp)
-{
-        xfs_buf_t       *sbp = xfs_getsb(mp, 0);
-        int             error;
-        XFS_BUF_UNDONE(sbp);
-        XFS_BUF_UNREAD(sbp);
-        XFS_BUF_UNDELAYWRITE(sbp);
-        XFS_BUF_WRITE(sbp);
-        XFS_BUF_UNASYNC(sbp);
-        XFS_BUF_ORDERED(sbp);
-        xfsbdstrat(mp, sbp);
-        error = xfs_buf_iowait(sbp);
-        /*
-         * Clear all the flags we set and possible error state in the
-         * buffer.  We only did the write to try out whether barriers
-         * worked and shouldn't leave any traces in the superblock
-         * buffer.
-         */
-        XFS_BUF_DONE(sbp);
-        XFS_BUF_ERROR(sbp, 0);
-        XFS_BUF_UNORDERED(sbp);
-        xfs_buf_relse(sbp);
-        return error;
-}
-STATIC void
-xfs_mountfs_check_barriers(xfs_mount_t *mp)
-{
-        int error;
-        if (mp->m_logdev_targp != mp->m_ddev_targp) {
-                xfs_notice(mp,
-                  "Disabling barriers, not supported with external log device");
-                mp->m_flags &= ~XFS_MOUNT_BARRIER;
-                return;
-        }
-        if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
-                xfs_notice(mp,
-                        "Disabling barriers, underlying device is readonly");
-                mp->m_flags &= ~XFS_MOUNT_BARRIER;
-                return;
-        }
-        error = xfs_barrier_test(mp);
-        if (error) {
-                xfs_notice(mp,
-                        "Disabling barriers, trial barrier write failed");
-                mp->m_flags &= ~XFS_MOUNT_BARRIER;
-                return;
-        }
-}
 void
 xfs_blkdev_issue_flush(
        xfs_buftarg_t           *buftarg)
@@ -1240,14 +1178,6 @@ xfs_fs_remount(
                switch (token) {
                case Opt_barrier:
                        mp->m_flags |= XFS_MOUNT_BARRIER;
-                        /*
-                         * Test if barriers are actually working if we can,
-                         * else delay this check until the filesystem is
-                         * marked writeable.
-                         */
-                        if (!(mp->m_flags & XFS_MOUNT_RDONLY))
-                                xfs_mountfs_check_barriers(mp);
                        break;
                case Opt_nobarrier:
                        mp->m_flags &= ~XFS_MOUNT_BARRIER;
@@ -1282,8 +1212,6 @@ xfs_fs_remount(
        /* ro -> rw */
        if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
                mp->m_flags &= ~XFS_MOUNT_RDONLY;
-                if (mp->m_flags & XFS_MOUNT_BARRIER)
-                        xfs_mountfs_check_barriers(mp);
                /*
                 * If this is the first remount to writeable state we
@@ -1465,9 +1393,6 @@ xfs_fs_fill_super(
        if (error)
                goto out_free_sb;
-        if (mp->m_flags & XFS_MOUNT_BARRIER)
-                xfs_mountfs_check_barriers(mp);
        error = xfs_filestream_mount(mp);
        if (error)
                goto out_free_sb;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 211930246f20..41d5b8f2bf92 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1372,8 +1372,17 @@ xlog_sync(xlog_t		*log,
        XFS_BUF_ASYNC(bp);
        bp->b_flags |= XBF_LOG_BUFFER;
-        if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+        if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
+                /*
+                 * If we have an external log device, flush the data device
+                 * before flushing the log to make sure all meta data
+                 * written back from the AIL actually made it to disk
+                 * before writing out the new log tail LSN in the log buffer.
+                 */
+                if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+                        xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
                XFS_BUF_ORDERED(bp);
+        }
        ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
        ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
author	Jens Axboe <jaxboe@fusionio.com>	2011-07-01 10:17:13 -0400
committer	Jens Axboe <jaxboe@fusionio.com>	2011-07-01 10:17:13 -0400
commit	04bf7869ca0fd12009aee301cac2264a36df4d98 (patch)
tree	66cb81ebf8b76560a31433c2c493dc430c914af9 /fs
parent	d2f31a5fd60d168b00fc4f7617b68a1287b21e90 (diff)
parent	7b28afe01ab6ffb5f152f47831b44933facd2328 (diff)