author		Ingo Molnar <mingo@elte.hu>	2011-04-22 04:19:26 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-04-22 04:19:30 -0400
commit		eff430de53be6f3328c3eebe93755f1ecf499e37 (patch)
tree		c8e5ae958fe3e6656b4e96c83bbda17e649321a2 /fs
parent		9cbdb702092a2d82f909312f4ec3eeded77bb82e (diff)
parent		91e8549bde9e5cc88c5a2e8c8114389279e240b5 (diff)

Merge branch 'linus' into perf/core

Merge reason: Pick up upstream fixes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/fid.c	15
-rw-r--r--	fs/9p/v9fs.h	1
-rw-r--r--	fs/9p/vfs_dentry.c	4
-rw-r--r--	fs/9p/vfs_inode_dotl.c	2
-rw-r--r--	fs/9p/vfs_super.c	80
-rw-r--r--	fs/binfmt_elf.c	6
-rw-r--r--	fs/btrfs/acl.c	9
-rw-r--r--	fs/btrfs/ctree.h	9
-rw-r--r--	fs/btrfs/disk-io.c	2
-rw-r--r--	fs/btrfs/extent-tree.c	125
-rw-r--r--	fs/btrfs/extent_io.c	82
-rw-r--r--	fs/btrfs/extent_io.h	2
-rw-r--r--	fs/btrfs/file.c	21
-rw-r--r--	fs/btrfs/free-space-cache.c	119
-rw-r--r--	fs/btrfs/inode.c	165
-rw-r--r--	fs/btrfs/ioctl.c	2
-rw-r--r--	fs/btrfs/super.c	42
-rw-r--r--	fs/btrfs/transaction.c	48
-rw-r--r--	fs/btrfs/transaction.h	4
-rw-r--r--	fs/btrfs/xattr.c	33
-rw-r--r--	fs/cifs/README	16
-rw-r--r--	fs/cifs/cache.c	2
-rw-r--r--	fs/cifs/cifs_debug.c	43
-rw-r--r--	fs/cifs/cifs_spnego.c	4
-rw-r--r--	fs/cifs/cifs_unicode.c	35
-rw-r--r--	fs/cifs/cifs_unicode.h	2
-rw-r--r--	fs/cifs/cifsencrypt.c	21
-rw-r--r--	fs/cifs/cifsfs.c	6
-rw-r--r--	fs/cifs/cifsglob.h	13
-rw-r--r--	fs/cifs/cifssmb.c	14
-rw-r--r--	fs/cifs/connect.c	68
-rw-r--r--	fs/cifs/file.c	70
-rw-r--r--	fs/cifs/link.c	4
-rw-r--r--	fs/cifs/misc.c	3
-rw-r--r--	fs/cifs/sess.c	23
-rw-r--r--	fs/dcache.c	2
-rw-r--r--	fs/ext3/inode.c	2
-rw-r--r--	fs/ext4/ext4_jbd2.h	4
-rw-r--r--	fs/ext4/fsync.c	17
-rw-r--r--	fs/ext4/inode.c	35
-rw-r--r--	fs/ext4/super.c	74
-rw-r--r--	fs/fhandle.c	1
-rw-r--r--	fs/filesystems.c	3
-rw-r--r--	fs/gfs2/aops.c	2
-rw-r--r--	fs/gfs2/dir.c	2
-rw-r--r--	fs/gfs2/file.c	58
-rw-r--r--	fs/gfs2/glops.c	4
-rw-r--r--	fs/gfs2/inode.c	56
-rw-r--r--	fs/gfs2/inode.h	3
-rw-r--r--	fs/gfs2/ops_fstype.c	2
-rw-r--r--	fs/gfs2/rgrp.c	4
-rw-r--r--	fs/gfs2/super.c	14
-rw-r--r--	fs/jbd2/commit.c	4
-rw-r--r--	fs/jbd2/journal.c	3
-rw-r--r--	fs/namei.c	1
-rw-r--r--	fs/namespace.c	16
-rw-r--r--	fs/nfs/namespace.c	58
-rw-r--r--	fs/nfs/nfs4proc.c	4
-rw-r--r--	fs/nfs/write.c	6
-rw-r--r--	fs/nfsd/lockd.c	1
-rw-r--r--	fs/nfsd/nfs4state.c	8
-rw-r--r--	fs/nfsd/vfs.c	9
-rw-r--r--	fs/partitions/ldm.c	16
-rw-r--r--	fs/proc/base.c	9
-rw-r--r--	fs/quota/dquot.c	13
-rw-r--r--	fs/ramfs/file-nommu.c	1
-rw-r--r--	fs/ubifs/debug.h	152
-rw-r--r--	fs/ubifs/file.c	3
-rw-r--r--	fs/xattr.c	2
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.c	24
-rw-r--r--	fs/xfs/linux-2.6/xfs_message.c	31
-rw-r--r--	fs/xfs/linux-2.6/xfs_message.h	24
-rw-r--r--	fs/xfs/linux-2.6/xfs_super.c	129
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	228
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.h	2
-rw-r--r--	fs/xfs/quota/xfs_qm.c	7
-rw-r--r--	fs/xfs/quota/xfs_qm.h	5
-rw-r--r--	fs/xfs/quota/xfs_qm_syscalls.c	2
-rw-r--r--	fs/xfs/xfs_alloc.c	30
-rw-r--r--	fs/xfs/xfs_inode_item.c	67
-rw-r--r--	fs/xfs/xfs_itable.c	2
-rw-r--r--	fs/xfs/xfs_log.c	38
-rw-r--r--	fs/xfs/xfs_log_priv.h	1
-rw-r--r--	fs/xfs/xfs_mount.h	9
-rw-r--r--	fs/xfs/xfs_trans_ail.c	421
-rw-r--r--	fs/xfs/xfs_trans_priv.h	22
86 files changed, 1558 insertions(+), 1168 deletions(-)
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 0ee594569dcc..85b67ffa2a43 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -286,11 +286,9 @@ static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
 
 struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 {
-	int err, flags;
+	int err;
 	struct p9_fid *fid;
-	struct v9fs_session_info *v9ses;
 
-	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_clone_with_uid(dentry, 0);
 	if (IS_ERR(fid))
 		goto error_out;
@@ -299,17 +297,8 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 	 * dirty pages. We always request for the open fid in read-write
 	 * mode so that a partial page write which result in page
 	 * read can work.
-	 *
-	 * we don't have a tsyncfs operation for older version
-	 * of protocol. So make sure the write back fid is
-	 * opened in O_SYNC mode.
 	 */
-	if (!v9fs_proto_dotl(v9ses))
-		flags = O_RDWR | O_SYNC;
-	else
-		flags = O_RDWR;
-
-	err = p9_client_open(fid, flags);
+	err = p9_client_open(fid, O_RDWR);
 	if (err < 0) {
 		p9_client_clunk(fid);
 		fid = ERR_PTR(err);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 9665c2b840e6..e5ebedfc5ed8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,7 +116,6 @@ struct v9fs_session_info {
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct backing_dev_info bdi;
 	struct rw_semaphore rename_sem;
-	struct p9_fid *root_fid; /* Used for file system sync */
 };
 
 /* cache_validity flags */
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index b6a3b9f7fe4d..e022890c6f40 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -126,7 +126,9 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 			retval = v9fs_refresh_inode_dotl(fid, inode);
 		else
 			retval = v9fs_refresh_inode(fid, inode);
-		if (retval <= 0)
+		if (retval == -ENOENT)
+			return 0;
+		if (retval < 0)
 			return retval;
 	}
 out_valid:
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index ffbb113d5f33..82a7c38ddad0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -811,7 +811,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid)) {
 		__putname(link);
-		link = ERR_PTR(PTR_ERR(fid));
+		link = ERR_CAST(fid);
 		goto ndset;
 	}
 	retval = p9_client_readlink(fid, &target);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f3eed3383e4f..feef6cdc1fd2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -154,6 +154,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(inode);
 		goto release_sb;
 	}
+
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
@@ -185,21 +186,10 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		p9stat_free(st);
 		kfree(st);
 	}
-	v9fs_fid_add(root, fid);
 	retval = v9fs_get_acl(inode, fid);
 	if (retval)
 		goto release_sb;
-	/*
-	 * Add the root fid to session info. This is used
-	 * for file system sync. We want a cloned fid here
-	 * so that we can do a sync_filesystem after a
-	 * shrink_dcache_for_umount
-	 */
-	v9ses->root_fid = v9fs_fid_clone(root);
-	if (IS_ERR(v9ses->root_fid)) {
-		retval = PTR_ERR(v9ses->root_fid);
-		goto release_sb;
-	}
+	v9fs_fid_add(root, fid);
 
 	P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	return dget(sb->s_root);
@@ -210,11 +200,15 @@ close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	return ERR_PTR(retval);
+
 release_sb:
 	/*
-	 * we will do the session_close and root dentry
-	 * release in the below call.
+	 * we will do the session_close and root dentry release
+	 * in the below call. But we need to clunk fid, because we haven't
+	 * attached the fid to dentry so it won't get clunked
+	 * automatically.
 	 */
+	p9_client_clunk(fid);
 	deactivate_locked_super(sb);
 	return ERR_PTR(retval);
 }
@@ -232,7 +226,7 @@ static void v9fs_kill_super(struct super_block *s)
 	P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
 
 	kill_anon_super(s);
-	p9_client_clunk(v9ses->root_fid);
+
 	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
@@ -285,14 +279,6 @@ done:
 	return res;
 }
 
-static int v9fs_sync_fs(struct super_block *sb, int wait)
-{
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb);
-	return p9_client_sync_fs(v9ses->root_fid);
-}
-
 static int v9fs_drop_inode(struct inode *inode)
 {
 	struct v9fs_session_info *v9ses;
@@ -307,6 +293,51 @@ static int v9fs_drop_inode(struct inode *inode)
 	return 1;
 }
 
+static int v9fs_write_inode(struct inode *inode,
+			    struct writeback_control *wbc)
+{
+	int ret;
+	struct p9_wstat wstat;
+	struct v9fs_inode *v9inode;
+	/*
+	 * send an fsync request to server irrespective of
+	 * wbc->sync_mode.
+	 */
+	P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+	v9inode = V9FS_I(inode);
+	if (!v9inode->writeback_fid)
+		return 0;
+	v9fs_blank_wstat(&wstat);
+
+	ret = p9_client_wstat(v9inode->writeback_fid, &wstat);
+	if (ret < 0) {
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		return ret;
+	}
+	return 0;
+}
+
+static int v9fs_write_inode_dotl(struct inode *inode,
+				 struct writeback_control *wbc)
+{
+	int ret;
+	struct v9fs_inode *v9inode;
+	/*
+	 * send an fsync request to server irrespective of
+	 * wbc->sync_mode.
+	 */
+	P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+	v9inode = V9FS_I(inode);
+	if (!v9inode->writeback_fid)
+		return 0;
+	ret = p9_client_fsync(v9inode->writeback_fid, 0);
+	if (ret < 0) {
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		return ret;
+	}
+	return 0;
+}
+
 static const struct super_operations v9fs_super_ops = {
 	.alloc_inode = v9fs_alloc_inode,
 	.destroy_inode = v9fs_destroy_inode,
@@ -314,17 +345,18 @@ static const struct super_operations v9fs_super_ops = {
 	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
+	.write_inode = v9fs_write_inode,
 };
 
 static const struct super_operations v9fs_super_ops_dotl = {
 	.alloc_inode = v9fs_alloc_inode,
 	.destroy_inode = v9fs_destroy_inode,
-	.sync_fs = v9fs_sync_fs,
 	.statfs = v9fs_statfs,
 	.drop_inode = v9fs_drop_inode,
 	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
+	.write_inode = v9fs_write_inode_dotl,
 };
 
 struct file_system_type v9fs_fs_type = {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f34078d702d3..303983fabfd6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -941,9 +941,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	current->mm->start_stack = bprm->p;
 
 #ifdef arch_randomize_brk
-	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1))
+	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
 		current->mm->brk = current->mm->start_brk =
 			arch_randomize_brk(current->mm);
+#ifdef CONFIG_COMPAT_BRK
+		current->brk_randomized = 1;
+#endif
+	}
 #endif
 
 	if (current->personality & MMAP_PAGE_ZERO) {
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index de34bfad9ec3..5d505aaa72fb 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -178,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
-		if (acl == NULL) {
-			value = NULL;
-			size = 0;
+		if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto out;
 		} else if (IS_ERR(acl)) {
 			return PTR_ERR(acl);
 		}
 	}
 
 	ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-
+out:
 	posix_acl_release(acl);
 
 	return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3458b5725540..2e61fe1b6b8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -740,8 +740,10 @@ struct btrfs_space_info {
 	 */
 	unsigned long reservation_progress;
 
-	int full;		/* indicates that we cannot allocate any more
+	int full:1;		/* indicates that we cannot allocate any more
 				   chunks for this space */
+	int chunk_alloc:1;	/* set if we are allocating a chunk */
+
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
 
@@ -2576,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
+void btrfs_drop_pages(struct page **pages, size_t num_pages);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+		      struct page **pages, size_t num_pages,
+		      loff_t pos, size_t write_bytes,
+		      struct extent_state **cached);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8f1d44ba332f..68c84c8c24bd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3057,7 +3057,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 		btrfs_destroy_pinned_extent(root,
 					    root->fs_info->pinned_extents);
 
-		t->use_count = 0;
+		atomic_set(&t->use_count, 0);
 		list_del_init(&t->list);
 		memset(t, 0, sizeof(*t));
 		kmem_cache_free(btrfs_transaction_cachep, t);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f619c3cb13b7..31f33ba56fe8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,25 @@
 #include "locking.h"
 #include "free-space-cache.h"
 
+/* control flags for do_chunk_alloc's force field
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
+ * if we really need one.
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one
+ * if we have very few chunks already allocated.  This is
+ * used as part of the clustering code to help make sure
+ * we have a good pool of storage to cluster in, without
+ * filling the FS with empty chunks
+ *
+ */
+enum {
+	CHUNK_ALLOC_NO_FORCE = 0,
+	CHUNK_ALLOC_FORCE = 1,
+	CHUNK_ALLOC_LIMITED = 2,
+};
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc);
@@ -3019,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_readonly = 0;
 	found->bytes_may_use = 0;
 	found->full = 0;
-	found->force_alloc = 0;
+	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
+	found->chunk_alloc = 0;
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
 	atomic_set(&found->caching_threads, 0);
@@ -3150,7 +3170,7 @@ again:
 	if (!data_sinfo->full && alloc_chunk) {
 		u64 alloc_target;
 
-		data_sinfo->force_alloc = 1;
+		data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
 		spin_unlock(&data_sinfo->lock);
 alloc:
 		alloc_target = btrfs_get_alloc_profile(root, 1);
@@ -3160,7 +3180,8 @@ alloc:
 
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 				     bytes + 2 * 1024 * 1024,
-				     alloc_target, 0);
+				     alloc_target,
+				     CHUNK_ALLOC_NO_FORCE);
 		btrfs_end_transaction(trans, root);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
@@ -3239,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
-			found->force_alloc = 1;
+			found->force_alloc = CHUNK_ALLOC_FORCE;
 	}
 	rcu_read_unlock();
 }
 
 static int should_alloc_chunk(struct btrfs_root *root,
-			      struct btrfs_space_info *sinfo, u64 alloc_bytes)
+			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
+			      int force)
 {
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
 	u64 thresh;
 
-	if (sinfo->bytes_used + sinfo->bytes_reserved +
-	    alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+	if (force == CHUNK_ALLOC_FORCE)
+		return 1;
+
+	/*
+	 * in limited mode, we want to have some free space up to
+	 * about 1% of the FS size.
+	 */
+	if (force == CHUNK_ALLOC_LIMITED) {
+		thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+		thresh = max_t(u64, 64 * 1024 * 1024,
+			       div_factor_fine(thresh, 1));
+
+		if (num_bytes - num_allocated < thresh)
+			return 1;
+	}
+
+	/*
+	 * we have two similar checks here, one based on percentage
+	 * and once based on a hard number of 256MB.  The idea
+	 * is that if we have a good amount of free
+	 * room, don't allocate a chunk.  A good mount is
+	 * less than 80% utilized of the chunks we have allocated,
+	 * or more than 256MB free
+	 */
+	if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
 		return 0;
 
-	if (sinfo->bytes_used + sinfo->bytes_reserved +
-	    alloc_bytes < div_factor(num_bytes, 8))
+	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
 	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+
+	/* 256MB or 5% of the FS */
 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
 
 	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
 		return 0;
-
 	return 1;
 }
 
@@ -3273,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
+	int wait_for_alloc = 0;
 	int ret = 0;
 
-	mutex_lock(&fs_info->chunk_mutex);
-
 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3287,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(!space_info);
 
+again:
 	spin_lock(&space_info->lock);
 	if (space_info->force_alloc)
-		force = 1;
+		force = space_info->force_alloc;
 	if (space_info->full) {
 		spin_unlock(&space_info->lock);
-		goto out;
+		return 0;
 	}
 
-	if (!force && !should_alloc_chunk(extent_root, space_info,
-					  alloc_bytes)) {
+	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
 		spin_unlock(&space_info->lock);
-		goto out;
+		return 0;
+	} else if (space_info->chunk_alloc) {
+		wait_for_alloc = 1;
+	} else {
+		space_info->chunk_alloc = 1;
 	}
+
 	spin_unlock(&space_info->lock);
 
+	mutex_lock(&fs_info->chunk_mutex);
+
+	/*
+	 * The chunk_mutex is held throughout the entirety of a chunk
+	 * allocation, so once we've acquired the chunk_mutex we know that the
+	 * other guy is done and we need to recheck and see if we should
+	 * allocate.
+	 */
+	if (wait_for_alloc) {
+		mutex_unlock(&fs_info->chunk_mutex);
+		wait_for_alloc = 0;
+		goto again;
+	}
+
 	/*
 	 * If we have mixed data/metadata chunks we want to make sure we keep
 	 * allocating mixed chunks instead of individual chunks.
@@ -3327,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 		space_info->full = 1;
 	else
 		ret = 1;
-	space_info->force_alloc = 0;
+
+	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+	space_info->chunk_alloc = 0;
 	spin_unlock(&space_info->lock);
-out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
 }
@@ -5303,11 +5368,13 @@ loop:
 
 		if (allowed_chunk_alloc) {
 			ret = do_chunk_alloc(trans, root, num_bytes +
-					     2 * 1024 * 1024, data, 1);
+					     2 * 1024 * 1024, data,
+					     CHUNK_ALLOC_LIMITED);
 			allowed_chunk_alloc = 0;
 			done_chunk_alloc = 1;
-		} else if (!done_chunk_alloc) {
-			space_info->force_alloc = 1;
+		} else if (!done_chunk_alloc &&
+			   space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
+			space_info->force_alloc = CHUNK_ALLOC_LIMITED;
 		}
 
 		if (loop < LOOP_NO_EMPTY_SIZE) {
@@ -5393,7 +5460,8 @@ again:
 	 */
 	if (empty_size || root->ref_cows)
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data, 0);
+				     num_bytes + 2 * 1024 * 1024, data,
+				     CHUNK_ALLOC_NO_FORCE);
 
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5405,7 +5473,7 @@ again:
 		num_bytes = num_bytes & ~(root->sectorsize - 1);
 		num_bytes = max(num_bytes, min_alloc_size);
 		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       num_bytes, data, 1);
+			       num_bytes, data, CHUNK_ALLOC_FORCE);
 		goto again;
 	}
 	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
@@ -8109,13 +8177,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
 	alloc_flags = update_block_group_flags(root, cache->flags);
 	if (alloc_flags != cache->flags)
-		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+			       CHUNK_ALLOC_FORCE);
 
 	ret = set_block_group_ro(cache);
 	if (!ret)
 		goto out;
 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+			     CHUNK_ALLOC_FORCE);
 	if (ret < 0)
 		goto out;
 	ret = set_block_group_ro(cache);
@@ -8128,7 +8198,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 type)
 {
 	u64 alloc_flags = get_alloc_profile(root, type);
-	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+			      CHUNK_ALLOC_FORCE);
 }
 
 /*
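
For reference, the allocation heuristic that the CHUNK_ALLOC_* rework above implements can be restated outside the kernel. Below is a hedged userspace C sketch: the CHUNK_ALLOC_* values and the 64MB/256MB/80% thresholds come from the hunks above, while div_factor()/div_factor_fine() are re-derived here from their usual btrfs meaning (scale by tenths and hundredths). It is an illustration of the decision logic, not btrfs code.

/* Userspace sketch of the should_alloc_chunk() heuristic above.
 * div_factor helpers are re-derived assumptions (x*n/10 and x*n/100). */
#include <stdint.h>
#include <stdio.h>

enum { CHUNK_ALLOC_NO_FORCE = 0, CHUNK_ALLOC_FORCE = 1, CHUNK_ALLOC_LIMITED = 2 };

static uint64_t div_factor(uint64_t num, int factor)      { return num * factor / 10; }
static uint64_t div_factor_fine(uint64_t num, int factor) { return num * factor / 100; }

static int should_alloc_chunk(uint64_t fs_total, uint64_t num_bytes,
			      uint64_t num_allocated, uint64_t bytes_used,
			      uint64_t alloc_bytes, int force)
{
	uint64_t thresh;

	if (force == CHUNK_ALLOC_FORCE)
		return 1;

	/* limited mode: keep roughly 1% (at least 64MB) of the FS unallocated */
	if (force == CHUNK_ALLOC_LIMITED) {
		thresh = div_factor_fine(fs_total, 1);
		if (thresh < (64ULL << 20))
			thresh = 64ULL << 20;
		if (num_bytes - num_allocated < thresh)
			return 1;
	}

	/* don't allocate while >256MB would still be free afterwards... */
	if (num_allocated + alloc_bytes + (256ULL << 20) < num_bytes)
		return 0;
	/* ...or while the existing chunks are under 80% utilized */
	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
		return 0;

	thresh = div_factor_fine(fs_total, 5);	/* 5% of the FS... */
	if (thresh < (256ULL << 20))
		thresh = 256ULL << 20;		/* ...but at least 256MB */
	if (num_bytes > thresh && bytes_used < div_factor(num_bytes, 3))
		return 0;
	return 1;
}

int main(void)
{
	/* 10GB space info, 7GB allocated/6GB used, asking for a 1GB chunk:
	 * more than 256MB stays free, so no new chunk (prints 0) */
	printf("%d\n", should_alloc_chunk(10ULL << 30, 10ULL << 30,
					  7ULL << 30, 6ULL << 30,
					  1ULL << 30, CHUNK_ALLOC_NO_FORCE));
	return 0;
}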
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 20ddb28602a8..315138605088 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state,
 	}
 }
 
+static void uncache_state(struct extent_state **cached_ptr)
+{
+	if (cached_ptr && (*cached_ptr)) {
+		struct extent_state *state = *cached_ptr;
+		*cached_ptr = NULL;
+		free_extent_state(state);
+	}
+}
+
 /*
  * set some bits on a range in the tree.  This may require allocations or
  * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-			      NULL, mask);
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
+			      NULL, cached_state, mask);
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
 			 mask);
 }
 
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
-		  gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
 				mask);
@@ -1735,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 
 	do {
 		struct page *page = bvec->bv_page;
+		struct extent_state *cached = NULL;
+		struct extent_state *state;
+
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1749,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
+		if (state && state->start == start) {
+			/*
+			 * take a reference on the state, unlock will drop
+			 * the ref
+			 */
+			cache_state(state, &cached);
+		}
+		spin_unlock(&tree->lock);
+
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
-							      NULL);
+							      state);
 			if (ret)
 				uptodate = 0;
 		}
@@ -1764,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
+				uncache_state(&cached);
 				continue;
 			}
 		}
 
 		if (uptodate) {
-			set_extent_uptodate(tree, start, end,
+			set_extent_uptodate(tree, start, end, &cached,
 					    GFP_ATOMIC);
 		}
-		unlock_extent(tree, start, end, GFP_ATOMIC);
+		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
 
 		if (whole_page) {
 			if (uptodate) {
@@ -1811,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
 
 	do {
 		struct page *page = bvec->bv_page;
+		struct extent_state *cached = NULL;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1821,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate) {
-			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			set_extent_uptodate(tree, start, end, &cached,
+					    GFP_ATOMIC);
 		} else {
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
 
-		unlock_extent(tree, start, end, GFP_ATOMIC);
+		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
 
 	} while (bvec >= bio->bi_io_vec);
 
@@ -2016,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			char *userpage;
+			struct extent_state *cached = NULL;
+
 			iosize = PAGE_CACHE_SIZE - page_offset;
 			userpage = kmap_atomic(page, KM_USER0);
 			memset(userpage + page_offset, 0, iosize);
 			flush_dcache_page(page);
 			kunmap_atomic(userpage, KM_USER0);
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    GFP_NOFS);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+					    &cached, GFP_NOFS);
+			unlock_extent_cached(tree, cur, cur + iosize - 1,
+					     &cached, GFP_NOFS);
 			break;
 		}
 		em = get_extent(inode, page, page_offset, cur,
@@ -2063,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		/* we've found a hole, just zero and go on */
 		if (block_start == EXTENT_MAP_HOLE) {
 			char *userpage;
+			struct extent_state *cached = NULL;
+
 			userpage = kmap_atomic(page, KM_USER0);
 			memset(userpage + page_offset, 0, iosize);
 			flush_dcache_page(page);
 			kunmap_atomic(userpage, KM_USER0);
 
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    GFP_NOFS);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+					    &cached, GFP_NOFS);
+			unlock_extent_cached(tree, cur, cur + iosize - 1,
+					     &cached, GFP_NOFS);
 			cur = cur + iosize;
 			page_offset += iosize;
 			continue;
@@ -2789,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
-			set_extent_uptodate(tree, block_start, cur_end,
+			struct extent_state *cached = NULL;
+
+			set_extent_uptodate(tree, block_start, cur_end, &cached,
 					    GFP_NOFS);
-			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+			unlock_extent_cached(tree, block_start, cur_end,
+					     &cached, GFP_NOFS);
 			block_start = cur_end + 1;
 		}
 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
@@ -3457,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 	num_pages = num_extent_pages(eb->start, eb->len);
 
 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			    GFP_NOFS);
+			    NULL, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3885,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
 	kunmap_atomic(dst_kaddr, KM_USER0);
 }
 
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
+{
+	unsigned long distance = (src > dst) ? src - dst : dst - src;
+	return distance < len;
+}
+
 static void copy_pages(struct page *dst_page, struct page *src_page,
 		       unsigned long dst_off, unsigned long src_off,
 		       unsigned long len)
@@ -3892,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
 	char *src_kaddr;
 
-	if (dst_page != src_page)
+	if (dst_page != src_page) {
 		src_kaddr = kmap_atomic(src_page, KM_USER1);
-	else
+	} else {
 		src_kaddr = dst_kaddr;
+		BUG_ON(areas_overlap(src_off, dst_off, len));
+	}
 
 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 	kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3970,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		       "len %lu len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
-	if (dst_offset < src_offset) {
+	if (!areas_overlap(src_offset, dst_offset, len)) {
 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
 		return;
 	}
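
The new areas_overlap() helper above replaces the old dst_offset < src_offset ordering test in memmove_extent_buffer(), so overlap is now detected in either direction before copy_pages() falls through to memcpy(). A minimal standalone check of that logic (plain userspace C, not btrfs code):

/* Standalone check of the areas_overlap() test added above: two ranges
 * of length len collide exactly when their starts are closer than len. */
#include <assert.h>
#include <stdbool.h>

static bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;
	return distance < len;
}

int main(void)
{
	assert(!areas_overlap(0, 100, 100));	/* adjacent ranges: memcpy is safe */
	assert(areas_overlap(0, 99, 100));	/* one byte of overlap */
	assert(areas_overlap(50, 0, 100));	/* overlap works in either direction */
	return 0;
}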
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f62c5442835d..af2d7179c372 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -208,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   int bits, int exclusive_bits, u64 *failed_start,
 		   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e621ea54a3fd..75899a01dded 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -104,7 +104,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -127,16 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
  * this also makes the decision about creating an inline extent vs
  * doing real data extents, marking pages dirty and delalloc as required.
  */
-static noinline int dirty_and_release_pages(struct btrfs_root *root,
-					    struct file *file,
-					    struct page **pages,
-					    size_t num_pages,
-					    loff_t pos,
-					    size_t write_bytes)
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+		      struct page **pages, size_t num_pages,
+		      loff_t pos, size_t write_bytes,
+		      struct extent_state **cached)
 {
 	int err = 0;
 	int i;
-	struct inode *inode = fdentry(file)->d_inode;
 	u64 num_bytes;
 	u64 start_pos;
 	u64 end_of_last_block;
@@ -149,7 +146,7 @@ static noinline int dirty_and_release_pages(struct btrfs_root *root,
 
 	end_of_last_block = start_pos + num_bytes - 1;
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-					NULL);
+					cached);
 	if (err)
 		return err;
 
@@ -992,9 +989,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		if (copied > 0) {
-			ret = dirty_and_release_pages(root, file, pages,
-						      dirty_pages, pos,
-						      copied);
+			ret = btrfs_dirty_pages(root, inode, pages,
+						dirty_pages, pos, copied,
+						NULL);
 			if (ret) {
 				btrfs_delalloc_release_space(inode,
 					dirty_pages << PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f561c953205b..11d2e9cea09e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -508,6 +508,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	struct inode *inode;
 	struct rb_node *node;
 	struct list_head *pos, *n;
+	struct page **pages;
 	struct page *page;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_free_cluster *cluster = NULL;
@@ -517,13 +518,13 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	u64 start, end, len;
 	u64 bytes = 0;
 	u32 *crc, *checksums;
-	pgoff_t index = 0, last_index = 0;
 	unsigned long first_page_offset;
-	int num_checksums;
+	int index = 0, num_pages = 0;
 	int entries = 0;
 	int bitmaps = 0;
 	int ret = 0;
 	bool next_page = false;
+	bool out_of_space = false;
 
 	root = root->fs_info->tree_root;
 
@@ -551,24 +552,31 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		return 0;
 	}
 
-	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+		PAGE_CACHE_SHIFT;
 	filemap_write_and_wait(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, inode->i_size &
 				 ~(root->sectorsize - 1), (u64)-1);
 
 	/* We need a checksum per page. */
-	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
-	crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+	crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
 	if (!crc) {
 		iput(inode);
 		return 0;
 	}
 
+	pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
+	if (!pages) {
+		kfree(crc);
+		iput(inode);
+		return 0;
+	}
+
 	/* Since the first page has all of our checksums and our generation we
 	 * need to calculate the offset into the page that we can start writing
 	 * our entries.
 	 */
-	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+	first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
 
 	/* Get the cluster for this block_group if it exists */
 	if (!list_empty(&block_group->cluster_list))
@@ -590,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	 * after find_get_page at this point.  Just putting this here so people
 	 * know and don't freak out.
 	 */
-	while (index <= last_index) {
+	while (index < num_pages) {
 		page = grab_cache_page(inode->i_mapping, index);
 		if (!page) {
-			pgoff_t i = 0;
+			int i;
 
-			while (i < index) {
-				page = find_get_page(inode->i_mapping, i);
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(page);
-				i++;
+			for (i = 0; i < num_pages; i++) {
+				unlock_page(pages[i]);
+				page_cache_release(pages[i]);
 			}
 			goto out_free;
 		}
+		pages[index] = page;
 		index++;
 	}
 
@@ -631,7 +637,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 			offset = start_offset;
 		}
 
-		page = find_get_page(inode->i_mapping, index);
+		if (index >= num_pages) {
+			out_of_space = true;
+			break;
+		}
+
+		page = pages[index];
 
 		addr = kmap(page);
 		entry = addr + start_offset;
@@ -708,23 +719,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 
 		bytes += PAGE_CACHE_SIZE;
 
-		ClearPageChecked(page);
-		set_page_extent_mapped(page);
-		SetPageUptodate(page);
-		set_page_dirty(page);
-
-		/*
-		 * We need to release our reference we got for grab_cache_page,
-		 * except for the first page which will hold our checksums, we
-		 * do that below.
-		 */
-		if (index != 0) {
-			unlock_page(page);
-			page_cache_release(page);
-		}
-
-		page_cache_release(page);
-
 		index++;
 	} while (node || next_page);
 
@@ -734,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		struct btrfs_free_space *entry =
 			list_entry(pos, struct btrfs_free_space, list);
 
-		page = find_get_page(inode->i_mapping, index);
+		if (index >= num_pages) {
+			out_of_space = true;
+			break;
+		}
+		page = pages[index];
 
 		addr = kmap(page);
 		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
@@ -745,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		crc++;
 		bytes += PAGE_CACHE_SIZE;
 
-		ClearPageChecked(page);
-		set_page_extent_mapped(page);
-		SetPageUptodate(page);
-		set_page_dirty(page);
-		unlock_page(page);
-		page_cache_release(page);
-		page_cache_release(page);
 		list_del_init(&entry->list);
 		index++;
 	}
 
+	if (out_of_space) {
+		btrfs_drop_pages(pages, num_pages);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+				     i_size_read(inode) - 1, &cached_state,
+				     GFP_NOFS);
+		ret = 0;
+		goto out_free;
+	}
+
 	/* Zero out the rest of the pages just to make sure */
-	while (index <= last_index) {
+	while (index < num_pages) {
 		void *addr;
 
-		page = find_get_page(inode->i_mapping, index);
-
+		page = pages[index];
 		addr = kmap(page);
 		memset(addr, 0, PAGE_CACHE_SIZE);
 		kunmap(page);
-		ClearPageChecked(page);
-		set_page_extent_mapped(page);
-		SetPageUptodate(page);
-		set_page_dirty(page);
-		unlock_page(page);
-		page_cache_release(page);
-		page_cache_release(page);
 		bytes += PAGE_CACHE_SIZE;
 		index++;
 	}
 
-	btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
-
 	/* Write the checksums and trans id to the first page */
 	{
 		void *addr;
 		u64 *gen;
 
-		page = find_get_page(inode->i_mapping, 0);
+		page = pages[0];
 
 		addr = kmap(page);
-		memcpy(addr, checksums, sizeof(u32) * num_checksums);
-		gen = addr + (sizeof(u32) * num_checksums);
+		memcpy(addr, checksums, sizeof(u32) * num_pages);
+		gen = addr + (sizeof(u32) * num_pages);
 		*gen = trans->transid;
 		kunmap(page);
-		ClearPageChecked(page);
-		set_page_extent_mapped(page);
-		SetPageUptodate(page);
-		set_page_dirty(page);
-		unlock_page(page);
-		page_cache_release(page);
-		page_cache_release(page);
 	}
-	BTRFS_I(inode)->generation = trans->transid;
 
+	ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
+				bytes, &cached_state);
+	btrfs_drop_pages(pages, num_pages);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
 
+	if (ret) {
+		ret = 0;
+		goto out_free;
+	}
+
+	BTRFS_I(inode)->generation = trans->transid;
+
 	filemap_write_and_wait(inode->i_mapping);
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -853,6 +845,7 @@ out_free:
 		BTRFS_I(inode)->generation = 0;
 	}
 	kfree(checksums);
+	kfree(pages);
 	btrfs_update_inode(trans, root, inode);
 	iput(inode);
 	return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5cc64ab9c485..fcd66b6a8086 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1770,9 +1770,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-	ret = btrfs_update_inode(trans, root, inode);
-	BUG_ON(ret);
+	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	if (!ret) {
+		ret = btrfs_update_inode(trans, root, inode);
+		BUG_ON(ret);
+	}
+	ret = 0;
 out:
 	if (nolock) {
 		if (trans)
@@ -2590,6 +2593,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_inode_item *item,
 			    struct inode *inode)
 {
+	if (!leaf->map_token)
+		map_private_extent_buffer(leaf, (unsigned long)item,
+					  sizeof(struct btrfs_inode_item),
+					  &leaf->map_token, &leaf->kaddr,
+					  &leaf->map_start, &leaf->map_len,
+					  KM_USER1);
+
 	btrfs_set_inode_uid(leaf, item, inode->i_uid);
 	btrfs_set_inode_gid(leaf, item, inode->i_gid);
 	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2618,6 +2628,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
 	btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+
+	if (leaf->map_token) {
+		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+		leaf->map_token = NULL;
+	}
 }
 
 /*
@@ -4207,10 +4222,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	struct btrfs_key found_key;
 	struct btrfs_path *path;
 	int ret;
-	u32 nritems;
 	struct extent_buffer *leaf;
 	int slot;
-	int advance;
 	unsigned char d_type;
 	int over = 0;
 	u32 di_cur;
@@ -4253,27 +4266,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	advance = 0;
 
 	while (1) {
 		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
 		slot = path->slots[0];
-		if (advance || slot >= nritems) {
-			if (slot >= nritems - 1) {
-				ret = btrfs_next_leaf(root, path);
-				if (ret)
-					break;
-				leaf = path->nodes[0];
-				nritems = btrfs_header_nritems(leaf);
-				slot = path->slots[0];
-			} else {
-				slot++;
-				path->slots[0]++;
-			}
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto err;
+			else if (ret > 0)
+				break;
+			continue;
 		}
 
-		advance = 1;
 		item = btrfs_item_nr(leaf, slot);
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
@@ -4282,7 +4287,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 		if (btrfs_key_type(&found_key) != key_type)
 			break;
 		if (found_key.offset < filp->f_pos)
-			continue;
+			goto next;
 
 		filp->f_pos = found_key.offset;
 
@@ -4335,6 +4340,8 @@ skip:
 			di_cur += di_len;
 			di = (struct btrfs_dir_item *)((char *)di + di_len);
 		}
+next:
+		path->slots[0]++;
 	}
 
 	/* Reached end of directory/root. Bump pos past the last item. */
@@ -4527,14 +4534,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 
 	inode = new_inode(root->fs_info->sb);
-	if (!inode)
+	if (!inode) {
+		btrfs_free_path(path);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	if (dir) {
 		trace_btrfs_inode_request(dir);
 
 		ret = btrfs_set_inode_index(dir, index);
 		if (ret) {
+			btrfs_free_path(path);
 			iput(inode);
 			return ERR_PTR(ret);
 		}
@@ -4834,9 +4844,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4834 if (inode->i_nlink == ~0U) 4844 if (inode->i_nlink == ~0U)
4835 return -EMLINK; 4845 return -EMLINK;
4836 4846
4837 btrfs_inc_nlink(inode);
4838 inode->i_ctime = CURRENT_TIME;
4839
4840 err = btrfs_set_inode_index(dir, &index); 4847 err = btrfs_set_inode_index(dir, &index);
4841 if (err) 4848 if (err)
4842 goto fail; 4849 goto fail;
@@ -4852,6 +4859,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4852 goto fail; 4859 goto fail;
4853 } 4860 }
4854 4861
4862 btrfs_inc_nlink(inode);
4863 inode->i_ctime = CURRENT_TIME;
4864
4855 btrfs_set_trans_block_group(trans, dir); 4865 btrfs_set_trans_block_group(trans, dir);
4856 ihold(inode); 4866 ihold(inode);
4857 4867
@@ -5221,7 +5231,7 @@ again:
5221 btrfs_mark_buffer_dirty(leaf); 5231 btrfs_mark_buffer_dirty(leaf);
5222 } 5232 }
5223 set_extent_uptodate(io_tree, em->start, 5233 set_extent_uptodate(io_tree, em->start,
5224 extent_map_end(em) - 1, GFP_NOFS); 5234 extent_map_end(em) - 1, NULL, GFP_NOFS);
5225 goto insert; 5235 goto insert;
5226 } else { 5236 } else {
5227 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5237 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5428,17 +5438,30 @@ out:
5428} 5438}
5429 5439
5430static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5440static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5441 struct extent_map *em,
5431 u64 start, u64 len) 5442 u64 start, u64 len)
5432{ 5443{
5433 struct btrfs_root *root = BTRFS_I(inode)->root; 5444 struct btrfs_root *root = BTRFS_I(inode)->root;
5434 struct btrfs_trans_handle *trans; 5445 struct btrfs_trans_handle *trans;
5435 struct extent_map *em;
5436 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5446 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5437 struct btrfs_key ins; 5447 struct btrfs_key ins;
5438 u64 alloc_hint; 5448 u64 alloc_hint;
5439 int ret; 5449 int ret;
5450 bool insert = false;
5440 5451
5441 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5452 /*
5453	 * Ok: if the extent map we looked up is a hole and covers the exact
5454	 * range we want, there is no reason to allocate a new one. However,
5455	 * if it is not right, we need to free this one and drop the cache
5456	 * for our range.
5457 */
5458 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5459 em->len != len) {
5460 free_extent_map(em);
5461 em = NULL;
5462 insert = true;
5463 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5464 }
5442 5465
5443 trans = btrfs_join_transaction(root, 0); 5466 trans = btrfs_join_transaction(root, 0);
5444 if (IS_ERR(trans)) 5467 if (IS_ERR(trans))
@@ -5454,10 +5477,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5454 goto out; 5477 goto out;
5455 } 5478 }
5456 5479
5457 em = alloc_extent_map(GFP_NOFS);
5458 if (!em) { 5480 if (!em) {
5459 em = ERR_PTR(-ENOMEM); 5481 em = alloc_extent_map(GFP_NOFS);
5460 goto out; 5482 if (!em) {
5483 em = ERR_PTR(-ENOMEM);
5484 goto out;
5485 }
5461 } 5486 }
5462 5487
5463 em->start = start; 5488 em->start = start;
@@ -5467,9 +5492,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5467 em->block_start = ins.objectid; 5492 em->block_start = ins.objectid;
5468 em->block_len = ins.offset; 5493 em->block_len = ins.offset;
5469 em->bdev = root->fs_info->fs_devices->latest_bdev; 5494 em->bdev = root->fs_info->fs_devices->latest_bdev;
5495
5496 /*
5497 * We need to do this because if we're using the original em we searched
5498 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5499 */
5500 em->flags = 0;
5470 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5501 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5471 5502
5472 while (1) { 5503 while (insert) {
5473 write_lock(&em_tree->lock); 5504 write_lock(&em_tree->lock);
5474 ret = add_extent_mapping(em_tree, em); 5505 ret = add_extent_mapping(em_tree, em);
5475 write_unlock(&em_tree->lock); 5506 write_unlock(&em_tree->lock);
@@ -5687,8 +5718,7 @@ must_cow:
5687 * it above 5718 * it above
5688 */ 5719 */
5689 len = bh_result->b_size; 5720 len = bh_result->b_size;
5690 free_extent_map(em); 5721 em = btrfs_new_extent_direct(inode, em, start, len);
5691 em = btrfs_new_extent_direct(inode, start, len);
5692 if (IS_ERR(em)) 5722 if (IS_ERR(em))
5693 return PTR_ERR(em); 5723 return PTR_ERR(em);
5694 len = min(len, em->len - (start - em->start)); 5724 len = min(len, em->len - (start - em->start));
@@ -5851,8 +5881,10 @@ again:
5851 } 5881 }
5852 5882
5853 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5883 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5854 btrfs_ordered_update_i_size(inode, 0, ordered); 5884 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5855 btrfs_update_inode(trans, root, inode); 5885 if (!ret)
5886 btrfs_update_inode(trans, root, inode);
5887 ret = 0;
5856out_unlock: 5888out_unlock:
5857 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5889 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5858 ordered->file_offset + ordered->len - 1, 5890 ordered->file_offset + ordered->len - 1,
@@ -5938,7 +5970,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5938 5970
5939static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5971static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5940 int rw, u64 file_offset, int skip_sum, 5972 int rw, u64 file_offset, int skip_sum,
5941 u32 *csums) 5973 u32 *csums, int async_submit)
5942{ 5974{
5943 int write = rw & REQ_WRITE; 5975 int write = rw & REQ_WRITE;
5944 struct btrfs_root *root = BTRFS_I(inode)->root; 5976 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5949,13 +5981,24 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5949 if (ret) 5981 if (ret)
5950 goto err; 5982 goto err;
5951 5983
5952 if (write && !skip_sum) { 5984 if (skip_sum)
5985 goto map;
5986
5987 if (write && async_submit) {
5953 ret = btrfs_wq_submit_bio(root->fs_info, 5988 ret = btrfs_wq_submit_bio(root->fs_info,
5954 inode, rw, bio, 0, 0, 5989 inode, rw, bio, 0, 0,
5955 file_offset, 5990 file_offset,
5956 __btrfs_submit_bio_start_direct_io, 5991 __btrfs_submit_bio_start_direct_io,
5957 __btrfs_submit_bio_done); 5992 __btrfs_submit_bio_done);
5958 goto err; 5993 goto err;
5994 } else if (write) {
5995 /*
5996 * If we aren't doing async submit, calculate the csum of the
5997 * bio now.
5998 */
5999 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6000 if (ret)
6001 goto err;
5959 } else if (!skip_sum) { 6002 } else if (!skip_sum) {
5960 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, 6003 ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5961 file_offset, csums); 6004 file_offset, csums);
@@ -5963,7 +6006,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5963 goto err; 6006 goto err;
5964 } 6007 }
5965 6008
5966 ret = btrfs_map_bio(root, rw, bio, 0, 1); 6009map:
6010 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5967err: 6011err:
5968 bio_put(bio); 6012 bio_put(bio);
5969 return ret; 6013 return ret;
@@ -5985,15 +6029,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5985 int nr_pages = 0; 6029 int nr_pages = 0;
5986 u32 *csums = dip->csums; 6030 u32 *csums = dip->csums;
5987 int ret = 0; 6031 int ret = 0;
6032 int async_submit = 0;
5988 int write = rw & REQ_WRITE; 6033 int write = rw & REQ_WRITE;
5989 6034
5990 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5991 if (!bio)
5992 return -ENOMEM;
5993 bio->bi_private = dip;
5994 bio->bi_end_io = btrfs_end_dio_bio;
5995 atomic_inc(&dip->pending_bios);
5996
5997 map_length = orig_bio->bi_size; 6035 map_length = orig_bio->bi_size;
5998 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6036 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5999 &map_length, NULL, 0); 6037 &map_length, NULL, 0);
@@ -6002,6 +6040,19 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6002 return -EIO; 6040 return -EIO;
6003 } 6041 }
6004 6042
6043 if (map_length >= orig_bio->bi_size) {
6044 bio = orig_bio;
6045 goto submit;
6046 }
6047
6048 async_submit = 1;
6049 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6050 if (!bio)
6051 return -ENOMEM;
6052 bio->bi_private = dip;
6053 bio->bi_end_io = btrfs_end_dio_bio;
6054 atomic_inc(&dip->pending_bios);
6055
6005 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6056 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
6006 if (unlikely(map_length < submit_len + bvec->bv_len || 6057 if (unlikely(map_length < submit_len + bvec->bv_len ||
6007 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6058 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -6015,7 +6066,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6015 atomic_inc(&dip->pending_bios); 6066 atomic_inc(&dip->pending_bios);
6016 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6067 ret = __btrfs_submit_dio_bio(bio, inode, rw,
6017 file_offset, skip_sum, 6068 file_offset, skip_sum,
6018 csums); 6069 csums, async_submit);
6019 if (ret) { 6070 if (ret) {
6020 bio_put(bio); 6071 bio_put(bio);
6021 atomic_dec(&dip->pending_bios); 6072 atomic_dec(&dip->pending_bios);
@@ -6052,8 +6103,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6052 } 6103 }
6053 } 6104 }
6054 6105
6106submit:
6055 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6107 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6056 csums); 6108 csums, async_submit);
6057 if (!ret) 6109 if (!ret)
6058 return 0; 6110 return 0;
6059 6111
@@ -6148,6 +6200,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
6148 unsigned long nr_segs) 6200 unsigned long nr_segs)
6149{ 6201{
6150 int seg; 6202 int seg;
6203 int i;
6151 size_t size; 6204 size_t size;
6152 unsigned long addr; 6205 unsigned long addr;
6153 unsigned blocksize_mask = root->sectorsize - 1; 6206 unsigned blocksize_mask = root->sectorsize - 1;
@@ -6162,8 +6215,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
6162 addr = (unsigned long)iov[seg].iov_base; 6215 addr = (unsigned long)iov[seg].iov_base;
6163 size = iov[seg].iov_len; 6216 size = iov[seg].iov_len;
6164 end += size; 6217 end += size;
6165 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6218 if ((addr & blocksize_mask) || (size & blocksize_mask))
6166 goto out; 6219 goto out;
6220
6221 /* If this is a write we don't need to check anymore */
6222 if (rw & WRITE)
6223 continue;
6224
6225 /*
6226 * Check to make sure we don't have duplicate iov_base's in this
6227	 * iovec; if so, return EINVAL, otherwise we'll get csum errors
6228 * when reading back.
6229 */
6230 for (i = seg + 1; i < nr_segs; i++) {
6231 if (iov[seg].iov_base == iov[i].iov_base)
6232 goto out;
6233 }
6167 } 6234 }
6168 retval = 0; 6235 retval = 0;
6169out: 6236out:
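
The duplicate-iov check added to check_direct_IO() above only applies to reads: if two segments of an O_DIRECT read alias the same buffer, the pages backing it are filled more than once and, per the comment in the hunk, the read-back runs into csum errors, so such iovecs are now rejected up front. A hypothetical userspace demonstration (fd is assumed to be a file on btrfs opened with O_DIRECT):

	#include <sys/uio.h>

	char buf[4096] __attribute__((aligned(4096)));
	struct iovec iov[2] = {
		{ .iov_base = buf, .iov_len = 4096 },
		{ .iov_base = buf, .iov_len = 4096 },	/* aliases segment 0 */
	};
	ssize_t n = readv(fd, iov, 2);	/* now fails with EINVAL */
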
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cfc264fefdb0..ffb48d6c5433 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2287,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2287 struct btrfs_ioctl_space_info space; 2287 struct btrfs_ioctl_space_info space;
2288 struct btrfs_ioctl_space_info *dest; 2288 struct btrfs_ioctl_space_info *dest;
2289 struct btrfs_ioctl_space_info *dest_orig; 2289 struct btrfs_ioctl_space_info *dest_orig;
2290 struct btrfs_ioctl_space_info *user_dest; 2290 struct btrfs_ioctl_space_info __user *user_dest;
2291 struct btrfs_space_info *info; 2291 struct btrfs_space_info *info;
2292 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 2292 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2293 BTRFS_BLOCK_GROUP_SYSTEM, 2293 BTRFS_BLOCK_GROUP_SYSTEM,
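
The one-word ioctl.c change is a sparse annotation fix: user_dest points into the caller's address space, so it must carry __user and be touched only through the copy helpers. A minimal sketch of the convention (the variable names are illustrative):

	struct btrfs_ioctl_space_info space = {};		/* kernel copy */
	struct btrfs_ioctl_space_info __user *user_dest = arg;	/* userspace ptr */

	if (copy_to_user(user_dest, &space, sizeof(space)))
		return -EFAULT;	/* a direct *user_dest = space would be a bug */
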
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 58e7de9cc90c..0ac712efcdf2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -159,7 +159,7 @@ enum {
159 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 159 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
160 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 160 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
161 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 161 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
162 Opt_enospc_debug, Opt_err, 162 Opt_enospc_debug, Opt_subvolrootid, Opt_err,
163}; 163};
164 164
165static match_table_t tokens = { 165static match_table_t tokens = {
@@ -189,6 +189,7 @@ static match_table_t tokens = {
189 {Opt_clear_cache, "clear_cache"}, 189 {Opt_clear_cache, "clear_cache"},
190 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 190 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
191 {Opt_enospc_debug, "enospc_debug"}, 191 {Opt_enospc_debug, "enospc_debug"},
192 {Opt_subvolrootid, "subvolrootid=%d"},
192 {Opt_err, NULL}, 193 {Opt_err, NULL},
193}; 194};
194 195
@@ -232,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
232 break; 233 break;
233 case Opt_subvol: 234 case Opt_subvol:
234 case Opt_subvolid: 235 case Opt_subvolid:
236 case Opt_subvolrootid:
235 case Opt_device: 237 case Opt_device:
236 /* 238 /*
237 * These are parsed by btrfs_parse_early_options 239 * These are parsed by btrfs_parse_early_options
@@ -388,7 +390,7 @@ out:
388 */ 390 */
389static int btrfs_parse_early_options(const char *options, fmode_t flags, 391static int btrfs_parse_early_options(const char *options, fmode_t flags,
390 void *holder, char **subvol_name, u64 *subvol_objectid, 392 void *holder, char **subvol_name, u64 *subvol_objectid,
391 struct btrfs_fs_devices **fs_devices) 393 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
392{ 394{
393 substring_t args[MAX_OPT_ARGS]; 395 substring_t args[MAX_OPT_ARGS];
394 char *opts, *orig, *p; 396 char *opts, *orig, *p;
@@ -429,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
429 *subvol_objectid = intarg; 431 *subvol_objectid = intarg;
430 } 432 }
431 break; 433 break;
434 case Opt_subvolrootid:
435 intarg = 0;
436 error = match_int(&args[0], &intarg);
437 if (!error) {
438 /* we want the original fs_tree */
439 if (!intarg)
440 *subvol_rootid =
441 BTRFS_FS_TREE_OBJECTID;
442 else
443 *subvol_rootid = intarg;
444 }
445 break;
432 case Opt_device: 446 case Opt_device:
433 error = btrfs_scan_one_device(match_strdup(&args[0]), 447 error = btrfs_scan_one_device(match_strdup(&args[0]),
434 flags, holder, fs_devices); 448 flags, holder, fs_devices);
@@ -736,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
736 fmode_t mode = FMODE_READ; 750 fmode_t mode = FMODE_READ;
737 char *subvol_name = NULL; 751 char *subvol_name = NULL;
738 u64 subvol_objectid = 0; 752 u64 subvol_objectid = 0;
753 u64 subvol_rootid = 0;
739 int error = 0; 754 int error = 0;
740 755
741 if (!(flags & MS_RDONLY)) 756 if (!(flags & MS_RDONLY))
@@ -743,7 +758,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
743 758
744 error = btrfs_parse_early_options(data, mode, fs_type, 759 error = btrfs_parse_early_options(data, mode, fs_type,
745 &subvol_name, &subvol_objectid, 760 &subvol_name, &subvol_objectid,
746 &fs_devices); 761 &subvol_rootid, &fs_devices);
747 if (error) 762 if (error)
748 return ERR_PTR(error); 763 return ERR_PTR(error);
749 764
@@ -807,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
807 s->s_flags |= MS_ACTIVE; 822 s->s_flags |= MS_ACTIVE;
808 } 823 }
809 824
810 root = get_default_root(s, subvol_objectid);
811 if (IS_ERR(root)) {
812 error = PTR_ERR(root);
813 deactivate_locked_super(s);
814 goto error_free_subvol_name;
815 }
816 /* if they gave us a subvolume name bind mount into that */ 825 /* if they gave us a subvolume name bind mount into that */
817 if (strcmp(subvol_name, ".")) { 826 if (strcmp(subvol_name, ".")) {
818 struct dentry *new_root; 827 struct dentry *new_root;
828
829 root = get_default_root(s, subvol_rootid);
830 if (IS_ERR(root)) {
831 error = PTR_ERR(root);
832 deactivate_locked_super(s);
833 goto error_free_subvol_name;
834 }
835
819 mutex_lock(&root->d_inode->i_mutex); 836 mutex_lock(&root->d_inode->i_mutex);
820 new_root = lookup_one_len(subvol_name, root, 837 new_root = lookup_one_len(subvol_name, root,
821 strlen(subvol_name)); 838 strlen(subvol_name));
@@ -836,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
836 } 853 }
837 dput(root); 854 dput(root);
838 root = new_root; 855 root = new_root;
856 } else {
857 root = get_default_root(s, subvol_objectid);
858 if (IS_ERR(root)) {
859 error = PTR_ERR(root);
860 deactivate_locked_super(s);
861 goto error_free_subvol_name;
862 }
839 } 863 }
840 864
841 kfree(subvol_name); 865 kfree(subvol_name);
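
The subvolrootid plumbing above follows the standard mount-option parser: the token is declared with a %d pattern and pulled out with match_int(), then carried into btrfs_mount() so that a named subvolume is looked up relative to the given tree rather than the default one (the else branch keeps the old subvol_objectid behaviour). A generic sketch of the parser mechanism, using a hypothetical option rather than the real btrfs table:

	enum { Opt_myopt, Opt_err };

	static match_table_t tokens = {
		{ Opt_myopt, "myopt=%d" },
		{ Opt_err, NULL },
	};

	substring_t args[MAX_OPT_ARGS];
	int intarg;

	switch (match_token(p, tokens, args)) {
	case Opt_myopt:
		if (!match_int(&args[0], &intarg))
			apply_myopt(intarg);	/* placeholder */
		break;
	default:
		break;
	}

In use, something like mount -o subvolrootid=257,subvol=snap would resolve "snap" under tree 257 (the value 257 is only an example).
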
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b158da7e0bb..c571734d5e5a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,10 +32,8 @@
32 32
33static noinline void put_transaction(struct btrfs_transaction *transaction) 33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{ 34{
35 WARN_ON(transaction->use_count == 0); 35 WARN_ON(atomic_read(&transaction->use_count) == 0);
36 transaction->use_count--; 36 if (atomic_dec_and_test(&transaction->use_count)) {
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction)); 37 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 38 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 39 }
@@ -60,14 +58,14 @@ static noinline int join_transaction(struct btrfs_root *root)
60 if (!cur_trans) 58 if (!cur_trans)
61 return -ENOMEM; 59 return -ENOMEM;
62 root->fs_info->generation++; 60 root->fs_info->generation++;
63 cur_trans->num_writers = 1; 61 atomic_set(&cur_trans->num_writers, 1);
64 cur_trans->num_joined = 0; 62 cur_trans->num_joined = 0;
65 cur_trans->transid = root->fs_info->generation; 63 cur_trans->transid = root->fs_info->generation;
66 init_waitqueue_head(&cur_trans->writer_wait); 64 init_waitqueue_head(&cur_trans->writer_wait);
67 init_waitqueue_head(&cur_trans->commit_wait); 65 init_waitqueue_head(&cur_trans->commit_wait);
68 cur_trans->in_commit = 0; 66 cur_trans->in_commit = 0;
69 cur_trans->blocked = 0; 67 cur_trans->blocked = 0;
70 cur_trans->use_count = 1; 68 atomic_set(&cur_trans->use_count, 1);
71 cur_trans->commit_done = 0; 69 cur_trans->commit_done = 0;
72 cur_trans->start_time = get_seconds(); 70 cur_trans->start_time = get_seconds();
73 71
@@ -88,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root)
88 root->fs_info->running_transaction = cur_trans; 86 root->fs_info->running_transaction = cur_trans;
89 spin_unlock(&root->fs_info->new_trans_lock); 87 spin_unlock(&root->fs_info->new_trans_lock);
90 } else { 88 } else {
91 cur_trans->num_writers++; 89 atomic_inc(&cur_trans->num_writers);
92 cur_trans->num_joined++; 90 cur_trans->num_joined++;
93 } 91 }
94 92
@@ -145,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root)
145 cur_trans = root->fs_info->running_transaction; 143 cur_trans = root->fs_info->running_transaction;
146 if (cur_trans && cur_trans->blocked) { 144 if (cur_trans && cur_trans->blocked) {
147 DEFINE_WAIT(wait); 145 DEFINE_WAIT(wait);
148 cur_trans->use_count++; 146 atomic_inc(&cur_trans->use_count);
149 while (1) { 147 while (1) {
150 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 148 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
151 TASK_UNINTERRUPTIBLE); 149 TASK_UNINTERRUPTIBLE);
@@ -181,6 +179,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181{ 179{
182 struct btrfs_trans_handle *h; 180 struct btrfs_trans_handle *h;
183 struct btrfs_transaction *cur_trans; 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
184 int ret; 183 int ret;
185 184
186 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -204,7 +203,7 @@ again:
204 } 203 }
205 204
206 cur_trans = root->fs_info->running_transaction; 205 cur_trans = root->fs_info->running_transaction;
207 cur_trans->use_count++; 206 atomic_inc(&cur_trans->use_count);
208 if (type != TRANS_JOIN_NOLOCK) 207 if (type != TRANS_JOIN_NOLOCK)
209 mutex_unlock(&root->fs_info->trans_mutex); 208 mutex_unlock(&root->fs_info->trans_mutex);
210 209
@@ -224,10 +223,18 @@ again:
224 223
225 if (num_items > 0) { 224 if (num_items > 0) {
226 ret = btrfs_trans_reserve_metadata(h, root, num_items); 225 ret = btrfs_trans_reserve_metadata(h, root, num_items);
227 if (ret == -EAGAIN) { 226 if (ret == -EAGAIN && !retries) {
227 retries++;
228 btrfs_commit_transaction(h, root); 228 btrfs_commit_transaction(h, root);
229 goto again; 229 goto again;
230 } else if (ret == -EAGAIN) {
231 /*
232 * We have already retried and got EAGAIN, so really we
233 * don't have space, so set ret to -ENOSPC.
234 */
235 ret = -ENOSPC;
230 } 236 }
237
231 if (ret < 0) { 238 if (ret < 0) {
232 btrfs_end_transaction(h, root); 239 btrfs_end_transaction(h, root);
233 return ERR_PTR(ret); 240 return ERR_PTR(ret);
@@ -327,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
327 goto out_unlock; /* nothing committing|committed */ 334 goto out_unlock; /* nothing committing|committed */
328 } 335 }
329 336
330 cur_trans->use_count++; 337 atomic_inc(&cur_trans->use_count);
331 mutex_unlock(&root->fs_info->trans_mutex); 338 mutex_unlock(&root->fs_info->trans_mutex);
332 339
333 wait_for_commit(root, cur_trans); 340 wait_for_commit(root, cur_trans);
@@ -457,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
457 wake_up_process(info->transaction_kthread); 464 wake_up_process(info->transaction_kthread);
458 } 465 }
459 466
460 if (lock)
461 mutex_lock(&info->trans_mutex);
462 WARN_ON(cur_trans != info->running_transaction); 467 WARN_ON(cur_trans != info->running_transaction);
463 WARN_ON(cur_trans->num_writers < 1); 468 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
464 cur_trans->num_writers--; 469 atomic_dec(&cur_trans->num_writers);
465 470
466 smp_mb(); 471 smp_mb();
467 if (waitqueue_active(&cur_trans->writer_wait)) 472 if (waitqueue_active(&cur_trans->writer_wait))
468 wake_up(&cur_trans->writer_wait); 473 wake_up(&cur_trans->writer_wait);
469 put_transaction(cur_trans); 474 put_transaction(cur_trans);
470 if (lock)
471 mutex_unlock(&info->trans_mutex);
472 475
473 if (current->journal_info == trans) 476 if (current->journal_info == trans)
474 current->journal_info = NULL; 477 current->journal_info = NULL;
@@ -1178,7 +1181,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1178 /* take transaction reference */ 1181 /* take transaction reference */
1179 mutex_lock(&root->fs_info->trans_mutex); 1182 mutex_lock(&root->fs_info->trans_mutex);
1180 cur_trans = trans->transaction; 1183 cur_trans = trans->transaction;
1181 cur_trans->use_count++; 1184 atomic_inc(&cur_trans->use_count);
1182 mutex_unlock(&root->fs_info->trans_mutex); 1185 mutex_unlock(&root->fs_info->trans_mutex);
1183 1186
1184 btrfs_end_transaction(trans, root); 1187 btrfs_end_transaction(trans, root);
@@ -1237,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1237 1240
1238 mutex_lock(&root->fs_info->trans_mutex); 1241 mutex_lock(&root->fs_info->trans_mutex);
1239 if (cur_trans->in_commit) { 1242 if (cur_trans->in_commit) {
1240 cur_trans->use_count++; 1243 atomic_inc(&cur_trans->use_count);
1241 mutex_unlock(&root->fs_info->trans_mutex); 1244 mutex_unlock(&root->fs_info->trans_mutex);
1242 btrfs_end_transaction(trans, root); 1245 btrfs_end_transaction(trans, root);
1243 1246
@@ -1259,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1259 prev_trans = list_entry(cur_trans->list.prev, 1262 prev_trans = list_entry(cur_trans->list.prev,
1260 struct btrfs_transaction, list); 1263 struct btrfs_transaction, list);
1261 if (!prev_trans->commit_done) { 1264 if (!prev_trans->commit_done) {
1262 prev_trans->use_count++; 1265 atomic_inc(&prev_trans->use_count);
1263 mutex_unlock(&root->fs_info->trans_mutex); 1266 mutex_unlock(&root->fs_info->trans_mutex);
1264 1267
1265 wait_for_commit(root, prev_trans); 1268 wait_for_commit(root, prev_trans);
@@ -1300,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1300 TASK_UNINTERRUPTIBLE); 1303 TASK_UNINTERRUPTIBLE);
1301 1304
1302 smp_mb(); 1305 smp_mb();
1303 if (cur_trans->num_writers > 1) 1306 if (atomic_read(&cur_trans->num_writers) > 1)
1304 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1307 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1305 else if (should_grow) 1308 else if (should_grow)
1306 schedule_timeout(1); 1309 schedule_timeout(1);
1307 1310
1308 mutex_lock(&root->fs_info->trans_mutex); 1311 mutex_lock(&root->fs_info->trans_mutex);
1309 finish_wait(&cur_trans->writer_wait, &wait); 1312 finish_wait(&cur_trans->writer_wait, &wait);
1310 } while (cur_trans->num_writers > 1 || 1313 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1311 (should_grow && cur_trans->num_joined != joined)); 1314 (should_grow && cur_trans->num_joined != joined));
1312 1315
1313 ret = create_pending_snapshots(trans, root->fs_info); 1316 ret = create_pending_snapshots(trans, root->fs_info);
@@ -1394,6 +1397,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1394 1397
1395 wake_up(&cur_trans->commit_wait); 1398 wake_up(&cur_trans->commit_wait);
1396 1399
1400 list_del_init(&cur_trans->list);
1397 put_transaction(cur_trans); 1401 put_transaction(cur_trans);
1398 put_transaction(cur_trans); 1402 put_transaction(cur_trans);
1399 1403
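
The transaction.c conversion swaps counters that used to be guarded by trans_mutex for atomic_t, so references can be taken and dropped without holding the mutex; atomic_dec_and_test() makes the drop-and-free step race-free. The generic shape of the pattern (my_object is illustrative, not a btrfs type):

	struct my_object {
		atomic_t use_count;
		/* ... */
	};

	static void put_object(struct my_object *obj)
	{
		WARN_ON(atomic_read(&obj->use_count) == 0);
		if (atomic_dec_and_test(&obj->use_count))
			kfree(obj);	/* last reference frees */
	}

	/* taking a reference needs no lock at all: */
	atomic_inc(&obj->use_count);

Note the knock-on change in the hunks above: list_del_init() moves out of put_transaction() and into btrfs_commit_transaction(), since once the count is atomic, put_transaction() may run without trans_mutex and can no longer safely unlink the transaction from the list.
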
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 229a594cacd5..e441acc6c584 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,11 @@ struct btrfs_transaction {
27 * total writers in this transaction, it must be zero before the 27 * total writers in this transaction, it must be zero before the
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 unsigned long num_writers; 30 atomic_t num_writers;
31 31
32 unsigned long num_joined; 32 unsigned long num_joined;
33 int in_commit; 33 int in_commit;
34 int use_count; 34 atomic_t use_count;
35 int commit_done; 35 int commit_done;
36 int blocked; 36 int blocked;
37 struct list_head list; 37 struct list_head list;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5303b871b13..cfd660550ded 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
180 struct btrfs_path *path; 180 struct btrfs_path *path;
181 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
182 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
183 int ret = 0, slot, advance; 183 int ret = 0, slot;
184 size_t total_size = 0, size_left = size; 184 size_t total_size = 0, size_left = size;
185 unsigned long name_ptr; 185 unsigned long name_ptr;
186 size_t name_len; 186 size_t name_len;
187 u32 nritems;
188 187
189 /* 188 /*
190 * ok we want all objects associated with this id. 189 * ok we want all objects associated with this id.
@@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
204 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 203 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
205 if (ret < 0) 204 if (ret < 0)
206 goto err; 205 goto err;
207 advance = 0; 206
208 while (1) { 207 while (1) {
209 leaf = path->nodes[0]; 208 leaf = path->nodes[0];
210 nritems = btrfs_header_nritems(leaf);
211 slot = path->slots[0]; 209 slot = path->slots[0];
212 210
213 /* this is where we start walking through the path */ 211 /* this is where we start walking through the path */
214 if (advance || slot >= nritems) { 212 if (slot >= btrfs_header_nritems(leaf)) {
215 /* 213 /*
216 * if we've reached the last slot in this leaf we need 214 * if we've reached the last slot in this leaf we need
217 * to go to the next leaf and reset everything 215 * to go to the next leaf and reset everything
218 */ 216 */
219 if (slot >= nritems-1) { 217 ret = btrfs_next_leaf(root, path);
220 ret = btrfs_next_leaf(root, path); 218 if (ret < 0)
221 if (ret) 219 goto err;
222 break; 220 else if (ret > 0)
223 leaf = path->nodes[0]; 221 break;
224 nritems = btrfs_header_nritems(leaf); 222 continue;
225 slot = path->slots[0];
226 } else {
227 /*
228 * just walking through the slots on this leaf
229 */
230 slot++;
231 path->slots[0]++;
232 }
233 } 223 }
234 advance = 1;
235 224
236 btrfs_item_key_to_cpu(leaf, &found_key, slot); 225 btrfs_item_key_to_cpu(leaf, &found_key, slot);
237 226
@@ -250,7 +239,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
250 239
251 /* we are just looking for how big our buffer needs to be */ 240 /* we are just looking for how big our buffer needs to be */
252 if (!size) 241 if (!size)
253 continue; 242 goto next;
254 243
255 if (!buffer || (name_len + 1) > size_left) { 244 if (!buffer || (name_len + 1) > size_left) {
256 ret = -ERANGE; 245 ret = -ERANGE;
@@ -263,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
263 252
264 size_left -= name_len + 1; 253 size_left -= name_len + 1;
265 buffer += name_len + 1; 254 buffer += name_len + 1;
255next:
256 path->slots[0]++;
266 } 257 }
267 ret = total_size; 258 ret = total_size;
268 259
diff --git a/fs/cifs/README b/fs/cifs/README
index fe1683590828..74ab165fc646 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -685,22 +685,6 @@ LinuxExtensionsEnabled If set to one then the client will attempt to
685 support and want to map the uid and gid fields 685 support and want to map the uid and gid fields
686 to values supplied at mount (rather than the 686 to values supplied at mount (rather than the
687 actual values, then set this to zero. (default 1) 687 actual values, then set this to zero. (default 1)
688Experimental When set to 1 used to enable certain experimental
689 features (currently enables multipage writes
690 when signing is enabled, the multipage write
691 performance enhancement was disabled when
692 signing turned on in case buffer was modified
693 just before it was sent, also this flag will
694 be used to use the new experimental directory change
695 notification code). When set to 2 enables
696 an additional experimental feature, "raw ntlmssp"
697 session establishment support (which allows
698 specifying "sec=ntlmssp" on mount). The Linux cifs
699 module will use ntlmv2 authentication encapsulated
700 in "raw ntlmssp" (not using SPNEGO) when
701 "sec=ntlmssp" is specified on mount.
702 This support also requires building cifs with
703 the CONFIG_CIFS_EXPERIMENTAL configuration flag.
704 688
705These experimental features and tracing can be enabled by changing flags in 689These experimental features and tracing can be enabled by changing flags in
706/proc/fs/cifs (after the cifs module has been installed or built into the 690/proc/fs/cifs (after the cifs module has been installed or built into the
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index e654dfd092c3..53d57a3fe427 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -50,7 +50,7 @@ void cifs_fscache_unregister(void)
50 */ 50 */
51struct cifs_server_key { 51struct cifs_server_key {
52 uint16_t family; /* address family */ 52 uint16_t family; /* address family */
53 uint16_t port; /* IP port */ 53 __be16 port; /* IP port */
54 union { 54 union {
55 struct in_addr ipv4_addr; 55 struct in_addr ipv4_addr;
56 struct in6_addr ipv6_addr; 56 struct in6_addr ipv6_addr;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 65829d32128c..30d01bc90855 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -423,7 +423,6 @@ static const struct file_operations cifs_lookup_cache_proc_fops;
423static const struct file_operations traceSMB_proc_fops; 423static const struct file_operations traceSMB_proc_fops;
424static const struct file_operations cifs_multiuser_mount_proc_fops; 424static const struct file_operations cifs_multiuser_mount_proc_fops;
425static const struct file_operations cifs_security_flags_proc_fops; 425static const struct file_operations cifs_security_flags_proc_fops;
426static const struct file_operations cifs_experimental_proc_fops;
427static const struct file_operations cifs_linux_ext_proc_fops; 426static const struct file_operations cifs_linux_ext_proc_fops;
428 427
429void 428void
@@ -441,8 +440,6 @@ cifs_proc_init(void)
441 proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); 440 proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
442 proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); 441 proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
443 proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); 442 proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops);
444 proc_create("Experimental", 0, proc_fs_cifs,
445 &cifs_experimental_proc_fops);
446 proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, 443 proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
447 &cifs_linux_ext_proc_fops); 444 &cifs_linux_ext_proc_fops);
448 proc_create("MultiuserMount", 0, proc_fs_cifs, 445 proc_create("MultiuserMount", 0, proc_fs_cifs,
@@ -469,7 +466,6 @@ cifs_proc_clean(void)
469 remove_proc_entry("OplockEnabled", proc_fs_cifs); 466 remove_proc_entry("OplockEnabled", proc_fs_cifs);
470 remove_proc_entry("SecurityFlags", proc_fs_cifs); 467 remove_proc_entry("SecurityFlags", proc_fs_cifs);
471 remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); 468 remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
472 remove_proc_entry("Experimental", proc_fs_cifs);
473 remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); 469 remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
474 remove_proc_entry("fs/cifs", NULL); 470 remove_proc_entry("fs/cifs", NULL);
475} 471}
@@ -550,45 +546,6 @@ static const struct file_operations cifs_oplock_proc_fops = {
550 .write = cifs_oplock_proc_write, 546 .write = cifs_oplock_proc_write,
551}; 547};
552 548
553static int cifs_experimental_proc_show(struct seq_file *m, void *v)
554{
555 seq_printf(m, "%d\n", experimEnabled);
556 return 0;
557}
558
559static int cifs_experimental_proc_open(struct inode *inode, struct file *file)
560{
561 return single_open(file, cifs_experimental_proc_show, NULL);
562}
563
564static ssize_t cifs_experimental_proc_write(struct file *file,
565 const char __user *buffer, size_t count, loff_t *ppos)
566{
567 char c;
568 int rc;
569
570 rc = get_user(c, buffer);
571 if (rc)
572 return rc;
573 if (c == '0' || c == 'n' || c == 'N')
574 experimEnabled = 0;
575 else if (c == '1' || c == 'y' || c == 'Y')
576 experimEnabled = 1;
577 else if (c == '2')
578 experimEnabled = 2;
579
580 return count;
581}
582
583static const struct file_operations cifs_experimental_proc_fops = {
584 .owner = THIS_MODULE,
585 .open = cifs_experimental_proc_open,
586 .read = seq_read,
587 .llseek = seq_lseek,
588 .release = single_release,
589 .write = cifs_experimental_proc_write,
590};
591
592static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) 549static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
593{ 550{
594 seq_printf(m, "%d\n", linuxExtEnabled); 551 seq_printf(m, "%d\n", linuxExtEnabled);
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 4dfba8283165..33d221394aca 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,7 +113,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
113 MAX_MECH_STR_LEN + 113 MAX_MECH_STR_LEN +
114 UID_KEY_LEN + (sizeof(uid_t) * 2) + 114 UID_KEY_LEN + (sizeof(uid_t) * 2) +
115 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + 115 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
116 USER_KEY_LEN + strlen(sesInfo->userName) + 116 USER_KEY_LEN + strlen(sesInfo->user_name) +
117 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; 117 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
118 118
119 spnego_key = ERR_PTR(-ENOMEM); 119 spnego_key = ERR_PTR(-ENOMEM);
@@ -153,7 +153,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
153 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); 153 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
154 154
155 dp = description + strlen(description); 155 dp = description + strlen(description);
156 sprintf(dp, ";user=%s", sesInfo->userName); 156 sprintf(dp, ";user=%s", sesInfo->user_name);
157 157
158 dp = description + strlen(description); 158 dp = description + strlen(description);
159 sprintf(dp, ";pid=0x%x", current->pid); 159 sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index fc0fd4fde306..23d43cde4306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -90,7 +90,7 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
90 case UNI_COLON: 90 case UNI_COLON:
91 *target = ':'; 91 *target = ':';
92 break; 92 break;
93 case UNI_ASTERIK: 93 case UNI_ASTERISK:
94 *target = '*'; 94 *target = '*';
95 break; 95 break;
96 case UNI_QUESTION: 96 case UNI_QUESTION:
@@ -264,40 +264,40 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
264 * names are little endian 16 bit Unicode on the wire 264 * names are little endian 16 bit Unicode on the wire
265 */ 265 */
266int 266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen, 267cifsConvertToUCS(__le16 *target, const char *source, int srclen,
268 const struct nls_table *cp, int mapChars) 268 const struct nls_table *cp, int mapChars)
269{ 269{
270 int i, j, charlen; 270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char; 271 char src_char;
273 __u16 temp; 272 __le16 dst_char;
273 wchar_t tmp;
274 274
275 if (!mapChars) 275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp); 276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277 277
278 for (i = 0, j = 0; i < maxlen; j++) { 278 for (i = 0, j = 0; i < srclen; j++) {
279 src_char = source[i]; 279 src_char = source[i];
280 switch (src_char) { 280 switch (src_char) {
281 case 0: 281 case 0:
282 put_unaligned_le16(0, &target[j]); 282 put_unaligned(0, &target[j]);
283 goto ctoUCS_out; 283 goto ctoUCS_out;
284 case ':': 284 case ':':
285 temp = UNI_COLON; 285 dst_char = cpu_to_le16(UNI_COLON);
286 break; 286 break;
287 case '*': 287 case '*':
288 temp = UNI_ASTERIK; 288 dst_char = cpu_to_le16(UNI_ASTERISK);
289 break; 289 break;
290 case '?': 290 case '?':
291 temp = UNI_QUESTION; 291 dst_char = cpu_to_le16(UNI_QUESTION);
292 break; 292 break;
293 case '<': 293 case '<':
294 temp = UNI_LESSTHAN; 294 dst_char = cpu_to_le16(UNI_LESSTHAN);
295 break; 295 break;
296 case '>': 296 case '>':
297 temp = UNI_GRTRTHAN; 297 dst_char = cpu_to_le16(UNI_GRTRTHAN);
298 break; 298 break;
299 case '|': 299 case '|':
300 temp = UNI_PIPE; 300 dst_char = cpu_to_le16(UNI_PIPE);
301 break; 301 break;
302 /* 302 /*
303 * FIXME: We can not handle remapping backslash (UNI_SLASH) 303 * FIXME: We can not handle remapping backslash (UNI_SLASH)
@@ -305,17 +305,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
305 * as they use backslash as separator. 305 * as they use backslash as separator.
306 */ 306 */
307 default: 307 default:
308 charlen = cp->char2uni(source+i, len_remaining, 308 charlen = cp->char2uni(source + i, srclen - i, &tmp);
309 &temp); 309 dst_char = cpu_to_le16(tmp);
310
310 /* 311 /*
311 * if no match, use question mark, which at least in 312 * if no match, use question mark, which at least in
312 * some cases serves as wild card 313 * some cases serves as wild card
313 */ 314 */
314 if (charlen < 1) { 315 if (charlen < 1) {
315 temp = 0x003f; 316 dst_char = cpu_to_le16(0x003f);
316 charlen = 1; 317 charlen = 1;
317 } 318 }
318 len_remaining -= charlen;
319 /* 319 /*
320 * character may take more than one byte in the source 320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the 321 * string, but will take exactly two bytes in the
@@ -324,9 +324,8 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
324 i += charlen; 324 i += charlen;
325 continue; 325 continue;
326 } 326 }
327 put_unaligned_le16(temp, &target[j]); 327 put_unaligned(dst_char, &target[j]);
328 i++; /* move to next char in source string */ 328 i++; /* move to next char in source string */
329 len_remaining--;
330 } 329 }
331 330
332ctoUCS_out: 331ctoUCS_out:
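
Two things happen in the cifsConvertToUCS rewrite: the misspelled UNI_ASTERIK constant becomes UNI_ASTERISK, and the endianness handling is made explicit, with each output character built once as a cpu_to_le16() value and stored through put_unaligned(). The remapped characters are the NTFS-reserved ones, shifted into the 0xF000 private-use range; a hypothetical helper showing just that arithmetic (not a function in the driver):

	static __le16 remap_reserved(char c)
	{
		/* ':' '*' '?' '<' '>' '|' travel as c + 0xF000 */
		return cpu_to_le16((__u16)(c + 0xF000));
	}

The loop bound also changes from maxlen to srclen, and the running len_remaining bookkeeping disappears: char2uni() is simply given srclen - i, the bytes actually left in the source string.
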
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 7fe6b52df507..644dd882a560 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -44,7 +44,7 @@
44 * reserved symbols (along with \ and /), otherwise illegal to store 44 * reserved symbols (along with \ and /), otherwise illegal to store
45 * in filenames in NTFS 45 * in filenames in NTFS
46 */ 46 */
47#define UNI_ASTERIK (__u16) ('*' + 0xF000) 47#define UNI_ASTERISK (__u16) ('*' + 0xF000)
48#define UNI_QUESTION (__u16) ('?' + 0xF000) 48#define UNI_QUESTION (__u16) ('?' + 0xF000)
49#define UNI_COLON (__u16) (':' + 0xF000) 49#define UNI_COLON (__u16) (':' + 0xF000)
50#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) 50#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index a51585f9852b..d1a016be73ba 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -30,12 +30,13 @@
30#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/random.h> 31#include <linux/random.h>
32 32
33/* Calculate and return the CIFS signature based on the mac key and SMB PDU */ 33/*
34/* the 16 byte signature must be allocated by the caller */ 34 * Calculate and return the CIFS signature based on the mac key and SMB PDU.
35/* Note we only use the 1st eight bytes */ 35 * The 16 byte signature must be allocated by the caller. Note we only use the
36/* Note that the smb header signature field on input contains the 36 * 1st eight bytes and that the smb header signature field on input contains
37 sequence number before this function is called */ 37 * the sequence number before this function is called. Also, this function
38 38 * should be called with the server->srv_mutex held.
39 */
39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 40static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
40 struct TCP_Server_Info *server, char *signature) 41 struct TCP_Server_Info *server, char *signature)
41{ 42{
@@ -209,8 +210,10 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
209 cpu_to_le32(expected_sequence_number); 210 cpu_to_le32(expected_sequence_number);
210 cifs_pdu->Signature.Sequence.Reserved = 0; 211 cifs_pdu->Signature.Sequence.Reserved = 0;
211 212
213 mutex_lock(&server->srv_mutex);
212 rc = cifs_calculate_signature(cifs_pdu, server, 214 rc = cifs_calculate_signature(cifs_pdu, server,
213 what_we_think_sig_should_be); 215 what_we_think_sig_should_be);
216 mutex_unlock(&server->srv_mutex);
214 217
215 if (rc) 218 if (rc)
216 return rc; 219 return rc;
@@ -469,15 +472,15 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
469 return rc; 472 return rc;
470 } 473 }
471 474
472 /* convert ses->userName to unicode and uppercase */ 475 /* convert ses->user_name to unicode and uppercase */
473 len = strlen(ses->userName); 476 len = strlen(ses->user_name);
474 user = kmalloc(2 + (len * 2), GFP_KERNEL); 477 user = kmalloc(2 + (len * 2), GFP_KERNEL);
475 if (user == NULL) { 478 if (user == NULL) {
476 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); 479 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
477 rc = -ENOMEM; 480 rc = -ENOMEM;
478 goto calc_exit_2; 481 goto calc_exit_2;
479 } 482 }
480 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); 483 len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
481 UniStrupr(user); 484 UniStrupr(user);
482 485
483 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 486 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
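
The verify-path fix above takes srv_mutex around cifs_calculate_signature(), matching the requirement the reworked comment spells out. Signing is stateful: each signed PDU is tied to the server's running sequence number, so two signature computations racing on one TCP_Server_Info could fall out of step with the server. The shape of the call is simply:

	mutex_lock(&server->srv_mutex);
	rc = cifs_calculate_signature(cifs_pdu, server,
				      what_we_think_sig_should_be);
	mutex_unlock(&server->srv_mutex);
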
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f2970136d17d..5c412b33cd7c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -53,7 +53,6 @@ int cifsFYI = 0;
53int cifsERROR = 1; 53int cifsERROR = 1;
54int traceSMB = 0; 54int traceSMB = 0;
55unsigned int oplockEnabled = 1; 55unsigned int oplockEnabled = 1;
56unsigned int experimEnabled = 0;
57unsigned int linuxExtEnabled = 1; 56unsigned int linuxExtEnabled = 1;
58unsigned int lookupCacheEnabled = 1; 57unsigned int lookupCacheEnabled = 1;
59unsigned int multiuser_mount = 0; 58unsigned int multiuser_mount = 0;
@@ -127,6 +126,7 @@ cifs_read_super(struct super_block *sb, void *data,
127 kfree(cifs_sb); 126 kfree(cifs_sb);
128 return rc; 127 return rc;
129 } 128 }
129 cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
130 130
131#ifdef CONFIG_CIFS_DFS_UPCALL 131#ifdef CONFIG_CIFS_DFS_UPCALL
132 /* copy mount params to sb for use in submounts */ 132 /* copy mount params to sb for use in submounts */
@@ -409,8 +409,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
409 409
410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
411 seq_printf(s, ",multiuser"); 411 seq_printf(s, ",multiuser");
412 else if (tcon->ses->userName) 412 else if (tcon->ses->user_name)
413 seq_printf(s, ",username=%s", tcon->ses->userName); 413 seq_printf(s, ",username=%s", tcon->ses->user_name);
414 414
415 if (tcon->ses->domainName) 415 if (tcon->ses->domainName)
416 seq_printf(s, ",domain=%s", tcon->ses->domainName); 416 seq_printf(s, ",domain=%s", tcon->ses->domainName);
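
The userName changes threaded through cifsfs.c, cifsglob.h, and connect.c replace a fixed char userName[MAX_USERNAME_SIZE + 1] array with a kstrdup()'d char *user_name, which is why MAX_USERNAME_SIZE can grow to 256 as a parse-time limit only. A sketch of the lifecycle (the allocation comes from the connect.c hunk further down; the kfree() in session teardown is assumed, as it is not shown in these hunks):

	/* setup: */
	ses->user_name = kstrdup(volume_info->username, GFP_KERNEL);
	if (!ses->user_name)
		goto get_ses_fail;

	/* teardown (assumed location, e.g. sesInfoFree()): */
	kfree(ses->user_name);
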
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 17afb0fbcaed..a5d1106fcbde 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -37,10 +37,9 @@
37 37
38#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) 38#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
39#define MAX_SERVER_SIZE 15 39#define MAX_SERVER_SIZE 15
40#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ 40#define MAX_SHARE_SIZE 80
41#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null 41#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */
42 termination then *2 for unicode versions */ 42#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
43#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
44 43
45#define CIFS_MIN_RCV_POOL 4 44#define CIFS_MIN_RCV_POOL 4
46 45
@@ -92,7 +91,8 @@ enum statusEnum {
92 CifsNew = 0, 91 CifsNew = 0,
93 CifsGood, 92 CifsGood,
94 CifsExiting, 93 CifsExiting,
95 CifsNeedReconnect 94 CifsNeedReconnect,
95 CifsNeedNegotiate
96}; 96};
97 97
98enum securityEnum { 98enum securityEnum {
@@ -274,7 +274,7 @@ struct cifsSesInfo {
274 int capabilities; 274 int capabilities;
275 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for 275 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
276 TCP names - will ipv6 and sctp addresses fit? */ 276 TCP names - will ipv6 and sctp addresses fit? */
277 char userName[MAX_USERNAME_SIZE + 1]; 277 char *user_name;
278 char *domainName; 278 char *domainName;
279 char *password; 279 char *password;
280 struct session_key auth_key; 280 struct session_key auth_key;
@@ -817,7 +817,6 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
817 have the uid/password or Kerberos credential 817 have the uid/password or Kerberos credential
818 or equivalent for current user */ 818 or equivalent for current user */
819GLOBAL_EXTERN unsigned int oplockEnabled; 819GLOBAL_EXTERN unsigned int oplockEnabled;
820GLOBAL_EXTERN unsigned int experimEnabled;
821GLOBAL_EXTERN unsigned int lookupCacheEnabled; 820GLOBAL_EXTERN unsigned int lookupCacheEnabled;
822GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent 821GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
823 with more secure ntlmssp2 challenge/resp */ 822 with more secure ntlmssp2 challenge/resp */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2644a5d6cc67..df959bae6728 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -142,9 +142,9 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
142 */ 142 */
143 while (server->tcpStatus == CifsNeedReconnect) { 143 while (server->tcpStatus == CifsNeedReconnect) {
144 wait_event_interruptible_timeout(server->response_q, 144 wait_event_interruptible_timeout(server->response_q,
145 (server->tcpStatus == CifsGood), 10 * HZ); 145 (server->tcpStatus != CifsNeedReconnect), 10 * HZ);
146 146
147 /* is TCP session is reestablished now ?*/ 147 /* are we still trying to reconnect? */
148 if (server->tcpStatus != CifsNeedReconnect) 148 if (server->tcpStatus != CifsNeedReconnect)
149 break; 149 break;
150 150
@@ -729,7 +729,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
729 return rc; 729 return rc;
730 730
731 /* set up echo request */ 731 /* set up echo request */
732 smb->hdr.Tid = cpu_to_le16(0xffff); 732 smb->hdr.Tid = 0xffff;
733 smb->hdr.WordCount = 1; 733 smb->hdr.WordCount = 1;
734 put_unaligned_le16(1, &smb->EchoCount); 734 put_unaligned_le16(1, &smb->EchoCount);
735 put_bcc_le(1, &smb->hdr); 735 put_bcc_le(1, &smb->hdr);
@@ -1884,10 +1884,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1884 __constant_cpu_to_le16(CIFS_WRLCK)) 1884 __constant_cpu_to_le16(CIFS_WRLCK))
1885 pLockData->fl_type = F_WRLCK; 1885 pLockData->fl_type = F_WRLCK;
1886 1886
1887 pLockData->fl_start = parm_data->start; 1887 pLockData->fl_start = le64_to_cpu(parm_data->start);
1888 pLockData->fl_end = parm_data->start + 1888 pLockData->fl_end = pLockData->fl_start +
1889 parm_data->length - 1; 1889 le64_to_cpu(parm_data->length) - 1;
1890 pLockData->fl_pid = parm_data->pid; 1890 pLockData->fl_pid = le32_to_cpu(parm_data->pid);
1891 } 1891 }
1892 } 1892 }
1893 1893
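
The POSIX lock fix above is an endianness repair: start, length, and pid arrive little-endian on the wire and must each be converted exactly once at the boundary (the echo fix nearby drops a conversion on 0xffff, which is byte-order invariant anyway). The convention, with an illustrative wire layout:

	struct wire_posix_lock {	/* hypothetical layout */
		__le64 start;
		__le64 length;
		__le32 pid;
	};

	fl->fl_start = le64_to_cpu(parm->start);
	fl->fl_end   = fl->fl_start + le64_to_cpu(parm->length) - 1;
	fl->fl_pid   = le32_to_cpu(parm->pid);
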
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6e2b2addfc78..db9d55b507d0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -199,8 +199,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
199 } 199 }
200 spin_unlock(&GlobalMid_Lock); 200 spin_unlock(&GlobalMid_Lock);
201 201
202 while ((server->tcpStatus != CifsExiting) && 202 while (server->tcpStatus == CifsNeedReconnect) {
203 (server->tcpStatus != CifsGood)) {
204 try_to_freeze(); 203 try_to_freeze();
205 204
206 /* we should try only the port we connected to before */ 205 /* we should try only the port we connected to before */
@@ -212,7 +211,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
212 atomic_inc(&tcpSesReconnectCount); 211 atomic_inc(&tcpSesReconnectCount);
213 spin_lock(&GlobalMid_Lock); 212 spin_lock(&GlobalMid_Lock);
214 if (server->tcpStatus != CifsExiting) 213 if (server->tcpStatus != CifsExiting)
215 server->tcpStatus = CifsGood; 214 server->tcpStatus = CifsNeedNegotiate;
216 spin_unlock(&GlobalMid_Lock); 215 spin_unlock(&GlobalMid_Lock);
217 } 216 }
218 } 217 }
@@ -248,24 +247,24 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
248 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); 247 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
249 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); 248 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
250 249
251 remaining = total_data_size - data_in_this_rsp; 250 if (total_data_size == data_in_this_rsp)
252
253 if (remaining == 0)
254 return 0; 251 return 0;
255 else if (remaining < 0) { 252 else if (total_data_size < data_in_this_rsp) {
256 cFYI(1, "total data %d smaller than data in frame %d", 253 cFYI(1, "total data %d smaller than data in frame %d",
257 total_data_size, data_in_this_rsp); 254 total_data_size, data_in_this_rsp);
258 return -EINVAL; 255 return -EINVAL;
259 } else {
260 cFYI(1, "missing %d bytes from transact2, check next response",
261 remaining);
262 if (total_data_size > maxBufSize) {
263 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
264 total_data_size, maxBufSize);
265 return -EINVAL;
266 }
267 return remaining;
268 } 256 }
257
258 remaining = total_data_size - data_in_this_rsp;
259
260 cFYI(1, "missing %d bytes from transact2, check next response",
261 remaining);
262 if (total_data_size > maxBufSize) {
263 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
264 total_data_size, maxBufSize);
265 return -EINVAL;
266 }
267 return remaining;
269} 268}
270 269
271static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) 270static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
@@ -421,7 +420,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
421 pdu_length = 4; /* enough to get RFC1001 header */ 420 pdu_length = 4; /* enough to get RFC1001 header */
422 421
423incomplete_rcv: 422incomplete_rcv:
424 if (echo_retries > 0 && 423 if (echo_retries > 0 && server->tcpStatus == CifsGood &&
425 time_after(jiffies, server->lstrp + 424 time_after(jiffies, server->lstrp +
426 (echo_retries * SMB_ECHO_INTERVAL))) { 425 (echo_retries * SMB_ECHO_INTERVAL))) {
427 cERROR(1, "Server %s has not responded in %d seconds. " 426 cERROR(1, "Server %s has not responded in %d seconds. "
@@ -881,7 +880,8 @@ cifs_parse_mount_options(char *options, const char *devname,
881 /* null user, ie anonymous, authentication */ 880 /* null user, ie anonymous, authentication */
882 vol->nullauth = 1; 881 vol->nullauth = 1;
883 } 882 }
884 if (strnlen(value, 200) < 200) { 883 if (strnlen(value, MAX_USERNAME_SIZE) <
884 MAX_USERNAME_SIZE) {
885 vol->username = value; 885 vol->username = value;
886 } else { 886 } else {
887 printk(KERN_WARNING "CIFS: username too long\n"); 887 printk(KERN_WARNING "CIFS: username too long\n");
@@ -1472,7 +1472,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1472static bool 1472static bool
1473match_port(struct TCP_Server_Info *server, struct sockaddr *addr) 1473match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1474{ 1474{
1475 unsigned short int port, *sport; 1475 __be16 port, *sport;
1476 1476
1477 switch (addr->sa_family) { 1477 switch (addr->sa_family) {
1478 case AF_INET: 1478 case AF_INET:
@@ -1765,6 +1765,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1765 module_put(THIS_MODULE); 1765 module_put(THIS_MODULE);
1766 goto out_err_crypto_release; 1766 goto out_err_crypto_release;
1767 } 1767 }
1768 tcp_ses->tcpStatus = CifsNeedNegotiate;
1768 1769
1769 /* thread spawned, put it on the list */ 1770 /* thread spawned, put it on the list */
1770 spin_lock(&cifs_tcp_ses_lock); 1771 spin_lock(&cifs_tcp_ses_lock);
@@ -1808,7 +1809,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1808 break; 1809 break;
1809 default: 1810 default:
1810 /* anything else takes username/password */ 1811 /* anything else takes username/password */
1811 if (strncmp(ses->userName, vol->username, 1812 if (ses->user_name == NULL)
1813 continue;
1814 if (strncmp(ses->user_name, vol->username,
1812 MAX_USERNAME_SIZE)) 1815 MAX_USERNAME_SIZE))
1813 continue; 1816 continue;
1814 if (strlen(vol->username) != 0 && 1817 if (strlen(vol->username) != 0 &&
@@ -1851,6 +1854,8 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1851 cifs_put_tcp_session(server); 1854 cifs_put_tcp_session(server);
1852} 1855}
1853 1856
1857static bool warned_on_ntlm; /* globals init to false automatically */
1858
1854static struct cifsSesInfo * 1859static struct cifsSesInfo *
1855cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) 1860cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1856{ 1861{
@@ -1906,9 +1911,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1906 else 1911 else
1907 sprintf(ses->serverName, "%pI4", &addr->sin_addr); 1912 sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1908 1913
1909 if (volume_info->username) 1914 if (volume_info->username) {
1910 strncpy(ses->userName, volume_info->username, 1915 ses->user_name = kstrdup(volume_info->username, GFP_KERNEL);
1911 MAX_USERNAME_SIZE); 1916 if (!ses->user_name)
1917 goto get_ses_fail;
1918 }
1912 1919
1913 /* volume_info->password freed at unmount */ 1920 /* volume_info->password freed at unmount */
1914 if (volume_info->password) { 1921 if (volume_info->password) {
@@ -1923,6 +1930,15 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1923 } 1930 }
1924 ses->cred_uid = volume_info->cred_uid; 1931 ses->cred_uid = volume_info->cred_uid;
1925 ses->linux_uid = volume_info->linux_uid; 1932 ses->linux_uid = volume_info->linux_uid;
1933
1934 /* ntlmv2 is much stronger than ntlm security, and has been broadly
 1935 supported for many years; time to update the default security mechanism */
1936 if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
1937 warned_on_ntlm = true;
1938 cERROR(1, "default security mechanism requested. The default "
1939 "security mechanism will be upgraded from ntlm to "
1940 "ntlmv2 in kernel release 2.6.41");
1941 }
1926 ses->overrideSecFlg = volume_info->secFlg; 1942 ses->overrideSecFlg = volume_info->secFlg;
1927 1943
1928 mutex_lock(&ses->session_mutex); 1944 mutex_lock(&ses->session_mutex);
@@ -2276,7 +2292,7 @@ static int
2276generic_ip_connect(struct TCP_Server_Info *server) 2292generic_ip_connect(struct TCP_Server_Info *server)
2277{ 2293{
2278 int rc = 0; 2294 int rc = 0;
2279 unsigned short int sport; 2295 __be16 sport;
2280 int slen, sfamily; 2296 int slen, sfamily;
2281 struct socket *socket = server->ssocket; 2297 struct socket *socket = server->ssocket;
2282 struct sockaddr *saddr; 2298 struct sockaddr *saddr;
@@ -2361,7 +2377,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
2361static int 2377static int
2362ip_connect(struct TCP_Server_Info *server) 2378ip_connect(struct TCP_Server_Info *server)
2363{ 2379{
2364 unsigned short int *sport; 2380 __be16 *sport;
2365 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; 2381 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2366 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; 2382 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2367 2383
@@ -2826,7 +2842,7 @@ try_mount_again:
2826 2842
2827remote_path_check: 2843remote_path_check:
2828 /* check if a whole path (including prepath) is not remote */ 2844 /* check if a whole path (including prepath) is not remote */
2829 if (!rc && cifs_sb->prepathlen && tcon) { 2845 if (!rc && tcon) {
2830 /* build_path_to_root works only when we have a valid tcon */ 2846 /* build_path_to_root works only when we have a valid tcon */
2831 full_path = cifs_build_path_to_root(cifs_sb, tcon); 2847 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2832 if (full_path == NULL) { 2848 if (full_path == NULL) {
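The username handling above replaces a fixed 200-byte copy with kstrdup(), paired with the kfree() added to sesInfoFree() further down. A minimal sketch of that own-the-string pattern, with hypothetical type and function names:

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

struct session { char *user_name; };

static int session_set_user(struct session *s, const char *name)
{
        s->user_name = kstrdup(name, GFP_KERNEL);
        if (!s->user_name)
                return -ENOMEM; /* caller unwinds through its fail path */
        return 0;
}

static void session_free(struct session *s)
{
        kfree(s->user_name);    /* kfree(NULL) is a harmless no-op */
}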
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c27d236738fc..faf59529e847 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -575,8 +575,10 @@ reopen_error_exit:
575 575
576int cifs_close(struct inode *inode, struct file *file) 576int cifs_close(struct inode *inode, struct file *file)
577{ 577{
578 cifsFileInfo_put(file->private_data); 578 if (file->private_data != NULL) {
579 file->private_data = NULL; 579 cifsFileInfo_put(file->private_data);
580 file->private_data = NULL;
581 }
580 582
581 /* return code from the ->release op is always ignored */ 583 /* return code from the ->release op is always ignored */
582 return 0; 584 return 0;
@@ -970,6 +972,9 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
970 total_written += bytes_written) { 972 total_written += bytes_written) {
971 rc = -EAGAIN; 973 rc = -EAGAIN;
972 while (rc == -EAGAIN) { 974 while (rc == -EAGAIN) {
975 struct kvec iov[2];
976 unsigned int len;
977
973 if (open_file->invalidHandle) { 978 if (open_file->invalidHandle) {
974 /* we could deadlock if we called 979 /* we could deadlock if we called
975 filemap_fdatawait from here so tell 980 filemap_fdatawait from here so tell
@@ -979,31 +984,14 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
979 if (rc != 0) 984 if (rc != 0)
980 break; 985 break;
981 } 986 }
982 if (experimEnabled || (pTcon->ses->server && 987
983 ((pTcon->ses->server->secMode & 988 len = min((size_t)cifs_sb->wsize,
984 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 989 write_size - total_written);
985 == 0))) { 990 /* iov[0] is reserved for smb header */
986 struct kvec iov[2]; 991 iov[1].iov_base = (char *)write_data + total_written;
987 unsigned int len; 992 iov[1].iov_len = len;
988 993 rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len,
989 len = min((size_t)cifs_sb->wsize, 994 *poffset, &bytes_written, iov, 1, 0);
990 write_size - total_written);
991 /* iov[0] is reserved for smb header */
992 iov[1].iov_base = (char *)write_data +
993 total_written;
994 iov[1].iov_len = len;
995 rc = CIFSSMBWrite2(xid, pTcon,
996 open_file->netfid, len,
997 *poffset, &bytes_written,
998 iov, 1, 0);
999 } else
1000 rc = CIFSSMBWrite(xid, pTcon,
1001 open_file->netfid,
1002 min_t(const int, cifs_sb->wsize,
1003 write_size - total_written),
1004 *poffset, &bytes_written,
1005 write_data + total_written,
1006 NULL, 0);
1007 } 995 }
1008 if (rc || (bytes_written == 0)) { 996 if (rc || (bytes_written == 0)) {
1009 if (total_written) 997 if (total_written)
@@ -1240,12 +1228,6 @@ static int cifs_writepages(struct address_space *mapping,
1240 } 1228 }
1241 1229
1242 tcon = tlink_tcon(open_file->tlink); 1230 tcon = tlink_tcon(open_file->tlink);
1243 if (!experimEnabled && tcon->ses->server->secMode &
1244 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1245 cifsFileInfo_put(open_file);
1246 kfree(iov);
1247 return generic_writepages(mapping, wbc);
1248 }
1249 cifsFileInfo_put(open_file); 1231 cifsFileInfo_put(open_file);
1250 1232
1251 xid = GetXid(); 1233 xid = GetXid();
@@ -1980,6 +1962,24 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1980 return total_read; 1962 return total_read;
1981} 1963}
1982 1964
1965/*
1966 * If the page is mmap'ed into a process' page tables, then we need to make
1967 * sure that it doesn't change while being written back.
1968 */
1969static int
1970cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1971{
1972 struct page *page = vmf->page;
1973
1974 lock_page(page);
1975 return VM_FAULT_LOCKED;
1976}
1977
1978static struct vm_operations_struct cifs_file_vm_ops = {
1979 .fault = filemap_fault,
1980 .page_mkwrite = cifs_page_mkwrite,
1981};
1982
1983int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 1983int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
1984{ 1984{
1985 int rc, xid; 1985 int rc, xid;
@@ -1991,6 +1991,8 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
1991 cifs_invalidate_mapping(inode); 1991 cifs_invalidate_mapping(inode);
1992 1992
1993 rc = generic_file_mmap(file, vma); 1993 rc = generic_file_mmap(file, vma);
1994 if (rc == 0)
1995 vma->vm_ops = &cifs_file_vm_ops;
1994 FreeXid(xid); 1996 FreeXid(xid);
1995 return rc; 1997 return rc;
1996} 1998}
@@ -2007,6 +2009,8 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
2007 return rc; 2009 return rc;
2008 } 2010 }
2009 rc = generic_file_mmap(file, vma); 2011 rc = generic_file_mmap(file, vma);
2012 if (rc == 0)
2013 vma->vm_ops = &cifs_file_vm_ops;
2010 FreeXid(xid); 2014 FreeXid(xid);
2011 return rc; 2015 return rc;
2012} 2016}
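Both mmap paths above install the new vm_ops only after generic_file_mmap() succeeds, so a failed mmap never leaves a vma pointing at CIFS methods. The shape of that wiring, sketched with hypothetical names:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static int example_page_mkwrite(struct vm_area_struct *vma,
                                struct vm_fault *vmf)
{
        lock_page(vmf->page);   /* pin the page while it is written back */
        return VM_FAULT_LOCKED;
}

static struct vm_operations_struct example_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = example_page_mkwrite,
};

int example_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        int rc = generic_file_mmap(file, vma);

        if (rc == 0)
                vma->vm_ops = &example_vm_ops;  /* only on success */
        return rc;
}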
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e8804d373404..ce417a9764a3 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -239,7 +239,7 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
239 if (rc != 0) 239 if (rc != 0)
240 return rc; 240 return rc;
241 241
242 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { 242 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
243 CIFSSMBClose(xid, tcon, netfid); 243 CIFSSMBClose(xid, tcon, netfid);
244 /* it's not a symlink */ 244 /* it's not a symlink */
245 return -EINVAL; 245 return -EINVAL;
@@ -316,7 +316,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
316 if (rc != 0) 316 if (rc != 0)
317 goto out; 317 goto out;
318 318
319 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { 319 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
320 CIFSSMBClose(xid, pTcon, netfid); 320 CIFSSMBClose(xid, pTcon, netfid);
321 /* it's not a symlink */ 321 /* it's not a symlink */
322 goto out; 322 goto out;
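Both hunks above fix the same class of bug: an on-disk __le64 field was compared against a host-order constant, which only happens to work on little-endian machines. The rule, sketched with an illustrative struct:

#include <linux/types.h>
#include <asm/byteorder.h>

struct on_disk_info { __le64 end_of_file; };    /* illustrative */

static bool size_matches(const struct on_disk_info *fi, u64 expected)
{
        /* convert the constant side, never the stored field */
        return fi->end_of_file == cpu_to_le64(expected);
}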
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 2a930a752a78..0c684ae4c071 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -100,6 +100,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
100 memset(buf_to_free->password, 0, strlen(buf_to_free->password)); 100 memset(buf_to_free->password, 0, strlen(buf_to_free->password));
101 kfree(buf_to_free->password); 101 kfree(buf_to_free->password);
102 } 102 }
103 kfree(buf_to_free->user_name);
103 kfree(buf_to_free->domainName); 104 kfree(buf_to_free->domainName);
104 kfree(buf_to_free); 105 kfree(buf_to_free);
105} 106}
@@ -520,7 +521,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
520 (struct smb_com_transaction_change_notify_rsp *)buf; 521 (struct smb_com_transaction_change_notify_rsp *)buf;
521 struct file_notify_information *pnotify; 522 struct file_notify_information *pnotify;
522 __u32 data_offset = 0; 523 __u32 data_offset = 0;
523 if (pSMBr->ByteCount > sizeof(struct file_notify_information)) { 524 if (get_bcc_le(buf) > sizeof(struct file_notify_information)) {
524 data_offset = le32_to_cpu(pSMBr->DataOffset); 525 data_offset = le32_to_cpu(pSMBr->DataOffset);
525 526
526 pnotify = (struct file_notify_information *) 527 pnotify = (struct file_notify_information *)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 16765703131b..f6728eb6f4b9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -219,12 +219,12 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
219 bcc_ptr++; 219 bcc_ptr++;
220 } */ 220 } */
221 /* copy user */ 221 /* copy user */
222 if (ses->userName == NULL) { 222 if (ses->user_name == NULL) {
223 /* null user mount */ 223 /* null user mount */
224 *bcc_ptr = 0; 224 *bcc_ptr = 0;
225 *(bcc_ptr+1) = 0; 225 *(bcc_ptr+1) = 0;
226 } else { 226 } else {
227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, 227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
228 MAX_USERNAME_SIZE, nls_cp); 228 MAX_USERNAME_SIZE, nls_cp);
229 } 229 }
230 bcc_ptr += 2 * bytes_ret; 230 bcc_ptr += 2 * bytes_ret;
@@ -244,12 +244,11 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
244 /* copy user */ 244 /* copy user */
245 /* BB what about null user mounts - check that we do this BB */ 245 /* BB what about null user mounts - check that we do this BB */
246 /* copy user */ 246 /* copy user */
247 if (ses->userName == NULL) { 247 if (ses->user_name != NULL)
248 /* BB what about null user mounts - check that we do this BB */ 248 strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
249 } else { 249 /* else null user mount */
250 strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE); 250
251 } 251 bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
252 bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
253 *bcc_ptr = 0; 252 *bcc_ptr = 0;
254 bcc_ptr++; /* account for null termination */ 253 bcc_ptr++; /* account for null termination */
255 254
@@ -405,8 +404,8 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 404 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 405 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); 406 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
408 tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); 407 tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
409 tilen = cpu_to_le16(pblob->TargetInfoArray.Length); 408 tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
410 if (tilen) { 409 if (tilen) {
411 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); 410 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
412 if (!ses->auth_key.response) { 411 if (!ses->auth_key.response) {
@@ -523,14 +522,14 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
523 tmp += len; 522 tmp += len;
524 } 523 }
525 524
526 if (ses->userName == NULL) { 525 if (ses->user_name == NULL) {
527 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 526 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
528 sec_blob->UserName.Length = 0; 527 sec_blob->UserName.Length = 0;
529 sec_blob->UserName.MaximumLength = 0; 528 sec_blob->UserName.MaximumLength = 0;
530 tmp += 2; 529 tmp += 2;
531 } else { 530 } else {
532 int len; 531 int len;
533 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 532 len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
534 MAX_USERNAME_SIZE, nls_cp); 533 MAX_USERNAME_SIZE, nls_cp);
535 len *= 2; /* unicode is 2 bytes each */ 534 len *= 2; /* unicode is 2 bytes each */
536 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 535 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
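The decode_ntlmssp_challenge() fix above corrects both the direction and the width of the converters: reads from a wire-format blob use leXX_to_cpu(), and XX must match the field (the old code ran a 32-bit offset through cpu_to_le16()). A sketch:

#include <linux/types.h>
#include <asm/byteorder.h>

struct wire_hdr { __le32 offset; __le16 length; };      /* illustrative */

static void read_wire_hdr(const struct wire_hdr *w, u32 *off, u16 *len)
{
        *off = le32_to_cpu(w->offset);  /* 32-bit field, 32-bit helper */
        *len = le16_to_cpu(w->length);  /* 16-bit field, 16-bit helper */
}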
diff --git a/fs/dcache.c b/fs/dcache.c
index ad25c4cec7d5..129a35730994 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2131,7 +2131,7 @@ EXPORT_SYMBOL(d_rehash);
2131 */ 2131 */
2132void dentry_update_name_case(struct dentry *dentry, struct qstr *name) 2132void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2133{ 2133{
2134 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 2134 BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
2135 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ 2135 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
2136 2136
2137 spin_lock(&dentry->d_lock); 2137 spin_lock(&dentry->d_lock);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b5c2f3c97d71..68b2e43d7c35 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3291,7 +3291,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
3291 if (ext3_should_journal_data(inode)) 3291 if (ext3_should_journal_data(inode))
3292 ret = 3 * (bpp + indirects) + 2; 3292 ret = 3 * (bpp + indirects) + 2;
3293 else 3293 else
3294 ret = 2 * (bpp + indirects) + 2; 3294 ret = 2 * (bpp + indirects) + indirects + 2;
3295 3295
3296#ifdef CONFIG_QUOTA 3296#ifdef CONFIG_QUOTA
3297 /* We know that structure was already allocated during dquot_initialize so 3297 /* We know that structure was already allocated during dquot_initialize so
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index e25e99bf7ee1..d0f53538a57f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
86 86
87#ifdef CONFIG_QUOTA 87#ifdef CONFIG_QUOTA
88/* Amount of blocks needed for quota update - we know that the structure was 88/* Amount of blocks needed for quota update - we know that the structure was
89 * allocated so we need to update only inode+data */ 89 * allocated so we need to update only data block */
90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) 90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
91/* Amount of blocks needed for quota insert/delete - we do some block writes 91/* Amount of blocks needed for quota insert/delete - we do some block writes
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
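The change above halves EXT4_QUOTA_TRANS_BLOCKS because a routine quota update only dirties the quota data block; the quota inode itself is settled once at initialization. Back-of-envelope credit math, written as a hypothetical helper rather than the real macro:

#include <linux/types.h>

/* One journal credit per block a handle may dirty: fs metadata plus
 * one quota data block per enabled quota type. */
static inline int example_trans_credits(int fs_blocks, bool quota)
{
        return fs_blocks + (quota ? 2 : 0);     /* usr + grp data blocks */
}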
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 4673bc05274f..e9473cbe80df 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
125 * the parent directory's parent as well, and so on recursively, if 125 * the parent directory's parent as well, and so on recursively, if
126 * they are also freshly created. 126 * they are also freshly created.
127 */ 127 */
128static void ext4_sync_parent(struct inode *inode) 128static int ext4_sync_parent(struct inode *inode)
129{ 129{
130 struct writeback_control wbc;
130 struct dentry *dentry = NULL; 131 struct dentry *dentry = NULL;
132 int ret = 0;
131 133
132 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { 134 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
133 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); 135 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
136 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) 138 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
137 break; 139 break;
138 inode = dentry->d_parent->d_inode; 140 inode = dentry->d_parent->d_inode;
139 sync_mapping_buffers(inode->i_mapping); 141 ret = sync_mapping_buffers(inode->i_mapping);
142 if (ret)
143 break;
144 memset(&wbc, 0, sizeof(wbc));
145 wbc.sync_mode = WB_SYNC_ALL;
146 wbc.nr_to_write = 0; /* only write out the inode */
147 ret = sync_inode(inode, &wbc);
148 if (ret)
149 break;
140 } 150 }
151 return ret;
141} 152}
142 153
143/* 154/*
@@ -176,7 +187,7 @@ int ext4_sync_file(struct file *file, int datasync)
176 if (!journal) { 187 if (!journal) {
177 ret = generic_file_fsync(file, datasync); 188 ret = generic_file_fsync(file, datasync);
178 if (!ret && !list_empty(&inode->i_dentry)) 189 if (!ret && !list_empty(&inode->i_dentry))
179 ext4_sync_parent(inode); 190 ret = ext4_sync_parent(inode);
180 goto out; 191 goto out;
181 } 192 }
182 193
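ext4_sync_parent() now pushes the parent inode itself, not just its buffers. The idiom: a zeroed writeback_control with WB_SYNC_ALL and nr_to_write == 0 writes and waits on the inode without touching its data pages. A minimal sketch:

#include <linux/fs.h>
#include <linux/writeback.h>

static int example_sync_inode_only(struct inode *inode)
{
        struct writeback_control wbc = {
                .sync_mode   = WB_SYNC_ALL,     /* wait for completion */
                .nr_to_write = 0,               /* inode only, no pages */
        };

        return sync_inode(inode, &wbc);
}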
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ad8e303c0d29..f2fa5e8a582c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2502,6 +2502,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2502 * for partial write. 2502 * for partial write.
2503 */ 2503 */
2504 set_buffer_new(bh); 2504 set_buffer_new(bh);
2505 set_buffer_mapped(bh);
2505 } 2506 }
2506 return 0; 2507 return 0;
2507} 2508}
@@ -4429,8 +4430,8 @@ void ext4_truncate(struct inode *inode)
4429 Indirect chain[4]; 4430 Indirect chain[4];
4430 Indirect *partial; 4431 Indirect *partial;
4431 __le32 nr = 0; 4432 __le32 nr = 0;
4432 int n; 4433 int n = 0;
4433 ext4_lblk_t last_block; 4434 ext4_lblk_t last_block, max_block;
4434 unsigned blocksize = inode->i_sb->s_blocksize; 4435 unsigned blocksize = inode->i_sb->s_blocksize;
4435 4436
4436 trace_ext4_truncate_enter(inode); 4437 trace_ext4_truncate_enter(inode);
@@ -4455,14 +4456,18 @@ void ext4_truncate(struct inode *inode)
4455 4456
4456 last_block = (inode->i_size + blocksize-1) 4457 last_block = (inode->i_size + blocksize-1)
4457 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4458 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4459 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4460 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4458 4461
4459 if (inode->i_size & (blocksize - 1)) 4462 if (inode->i_size & (blocksize - 1))
4460 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4463 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4461 goto out_stop; 4464 goto out_stop;
4462 4465
4463 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4466 if (last_block != max_block) {
4464 if (n == 0) 4467 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4465 goto out_stop; /* error */ 4468 if (n == 0)
4469 goto out_stop; /* error */
4470 }
4466 4471
4467 /* 4472 /*
4468 * OK. This truncate is going to happen. We add the inode to the 4473 * OK. This truncate is going to happen. We add the inode to the
@@ -4493,7 +4498,13 @@ void ext4_truncate(struct inode *inode)
4493 */ 4498 */
4494 ei->i_disksize = inode->i_size; 4499 ei->i_disksize = inode->i_size;
4495 4500
4496 if (n == 1) { /* direct blocks */ 4501 if (last_block == max_block) {
4502 /*
4503 * It is unnecessary to free any data blocks if last_block is
4504 * equal to the indirect block limit.
4505 */
4506 goto out_unlock;
4507 } else if (n == 1) { /* direct blocks */
4497 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4508 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4498 i_data + EXT4_NDIR_BLOCKS); 4509 i_data + EXT4_NDIR_BLOCKS);
4499 goto do_indirects; 4510 goto do_indirects;
@@ -4553,6 +4564,7 @@ do_indirects:
4553 ; 4564 ;
4554 } 4565 }
4555 4566
4567out_unlock:
4556 up_write(&ei->i_data_sem); 4568 up_write(&ei->i_data_sem);
4557 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4569 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4558 ext4_mark_inode_dirty(handle, inode); 4570 ext4_mark_inode_dirty(handle, inode);
@@ -5398,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5398 /* if nrblocks are contiguous */ 5410 /* if nrblocks are contiguous */
5399 if (chunk) { 5411 if (chunk) {
5400 /* 5412 /*
5401 * With N contiguous data blocks, it need at most 5413 * With N contiguous data blocks, we need at most
5402 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5414 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5403 * 2 dindirect blocks 5415 * 2 dindirect blocks, and 1 tindirect block
5404 * 1 tindirect block
5405 */ 5416 */
5406 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5417 return DIV_ROUND_UP(nrblocks,
5407 return indirects + 3; 5418 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5408 } 5419 }
5409 /* 5420 /*
 5410 * if nrblocks are not contiguous, worse case, each block touch 5421 * if nrblocks are not contiguous, worse case, each block touch
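As a worked example of the new bound (not from the commit): with 4K blocks there are 1024 block addresses per indirect block, so 3000 contiguous blocks cost DIV_ROUND_UP(3000, 1024) + 4 = 7 metadata blocks, that is, the rounded-up indirects plus one boundary indirect, two dindirects, and one tindirect. In helper form:

#include <linux/kernel.h>       /* DIV_ROUND_UP */

static int example_indirect_cost(int nrblocks, int addrs_per_block)
{
        /* worst case: indirects covering nrblocks, +1 boundary indirect,
         * +2 dindirect, +1 tindirect */
        return DIV_ROUND_UP(nrblocks, addrs_per_block) + 4;
}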
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 056474b7b8e0..8553dfb310af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -242,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle)
242 * journal_end calls result in the superblock being marked dirty, so 242 * journal_end calls result in the superblock being marked dirty, so
243 * that sync() will call the filesystem's write_super callback if 243 * that sync() will call the filesystem's write_super callback if
244 * appropriate. 244 * appropriate.
245 *
246 * To avoid j_barrier hold in userspace when a user calls freeze(),
247 * ext4 prevents a new handle from being started by s_frozen, which
248 * is in an upper layer.
245 */ 249 */
246handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 250handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
247{ 251{
248 journal_t *journal; 252 journal_t *journal;
253 handle_t *handle;
249 254
250 if (sb->s_flags & MS_RDONLY) 255 if (sb->s_flags & MS_RDONLY)
251 return ERR_PTR(-EROFS); 256 return ERR_PTR(-EROFS);
252 257
253 vfs_check_frozen(sb, SB_FREEZE_TRANS);
254 /* Special case here: if the journal has aborted behind our
255 * backs (eg. EIO in the commit thread), then we still need to
256 * take the FS itself readonly cleanly. */
257 journal = EXT4_SB(sb)->s_journal; 258 journal = EXT4_SB(sb)->s_journal;
258 if (journal) { 259 handle = ext4_journal_current_handle();
259 if (is_journal_aborted(journal)) { 260
260 ext4_abort(sb, "Detected aborted journal"); 261 /*
261 return ERR_PTR(-EROFS); 262 * If a handle has been started, it should be allowed to
262 } 263 * finish, otherwise deadlock could happen between freeze
 263 return jbd2_journal_start(journal, nblocks); 264 * and others (e.g. truncate) due to the restart of the
 265 * journal handle if the filesystem is frozen and active
266 * handles are not stopped.
267 */
268 if (!handle)
269 vfs_check_frozen(sb, SB_FREEZE_TRANS);
270
271 if (!journal)
272 return ext4_get_nojournal();
273 /*
274 * Special case here: if the journal has aborted behind our
275 * backs (eg. EIO in the commit thread), then we still need to
276 * take the FS itself readonly cleanly.
277 */
278 if (is_journal_aborted(journal)) {
279 ext4_abort(sb, "Detected aborted journal");
280 return ERR_PTR(-EROFS);
264 } 281 }
265 return ext4_get_nojournal(); 282 return jbd2_journal_start(journal, nblocks);
266} 283}
267 284
268/* 285/*
@@ -2975,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb,
2975 mutex_unlock(&ext4_li_info->li_list_mtx); 2992 mutex_unlock(&ext4_li_info->li_list_mtx);
2976 2993
2977 sbi->s_li_request = elr; 2994 sbi->s_li_request = elr;
2995 /*
2996 * set elr to NULL here since it has been inserted to
2997 * the request_list and the removal and free of it is
2998 * handled by ext4_clear_request_list from now on.
2999 */
3000 elr = NULL;
2978 3001
2979 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { 3002 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2980 ret = ext4_run_lazyinit_thread(); 3003 ret = ext4_run_lazyinit_thread();
@@ -3385,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3385 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3408 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3386 spin_lock_init(&sbi->s_next_gen_lock); 3409 spin_lock_init(&sbi->s_next_gen_lock);
3387 3410
3411 init_timer(&sbi->s_err_report);
3412 sbi->s_err_report.function = print_daily_error_info;
3413 sbi->s_err_report.data = (unsigned long) sb;
3414
3388 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3415 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3389 ext4_count_free_blocks(sb)); 3416 ext4_count_free_blocks(sb));
3390 if (!err) { 3417 if (!err) {
@@ -3646,9 +3673,6 @@ no_journal:
3646 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 3673 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3647 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 3674 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3648 3675
3649 init_timer(&sbi->s_err_report);
3650 sbi->s_err_report.function = print_daily_error_info;
3651 sbi->s_err_report.data = (unsigned long) sb;
3652 if (es->s_error_count) 3676 if (es->s_error_count)
3653 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3677 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3654 3678
@@ -3672,6 +3696,7 @@ failed_mount_wq:
3672 sbi->s_journal = NULL; 3696 sbi->s_journal = NULL;
3673 } 3697 }
3674failed_mount3: 3698failed_mount3:
3699 del_timer(&sbi->s_err_report);
3675 if (sbi->s_flex_groups) { 3700 if (sbi->s_flex_groups) {
3676 if (is_vmalloc_addr(sbi->s_flex_groups)) 3701 if (is_vmalloc_addr(sbi->s_flex_groups))
3677 vfree(sbi->s_flex_groups); 3702 vfree(sbi->s_flex_groups);
@@ -4138,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4138/* 4163/*
4139 * LVM calls this function before a (read-only) snapshot is created. This 4164 * LVM calls this function before a (read-only) snapshot is created. This
4140 * gives us a chance to flush the journal completely and mark the fs clean. 4165 * gives us a chance to flush the journal completely and mark the fs clean.
4166 *
 4167 * Note that this function alone cannot bring a filesystem into a clean
 4168 * state, because ext4 prevents a new handle from being started
 4169 * by @sb->s_frozen, which stays in an upper layer. It thus needs help from
4170 * the upper layer.
4141 */ 4171 */
4142static int ext4_freeze(struct super_block *sb) 4172static int ext4_freeze(struct super_block *sb)
4143{ 4173{
@@ -4614,11 +4644,24 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4614 4644
4615static int ext4_quota_off(struct super_block *sb, int type) 4645static int ext4_quota_off(struct super_block *sb, int type)
4616{ 4646{
4647 struct inode *inode = sb_dqopt(sb)->files[type];
4648 handle_t *handle;
4649
4617 /* Force all delayed allocation blocks to be allocated. 4650 /* Force all delayed allocation blocks to be allocated.
4618 * Caller already holds s_umount sem */ 4651 * Caller already holds s_umount sem */
4619 if (test_opt(sb, DELALLOC)) 4652 if (test_opt(sb, DELALLOC))
4620 sync_filesystem(sb); 4653 sync_filesystem(sb);
4621 4654
4655 /* Update modification times of quota files when userspace can
4656 * start looking at them */
4657 handle = ext4_journal_start(inode, 1);
4658 if (IS_ERR(handle))
4659 goto out;
4660 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4661 ext4_mark_inode_dirty(handle, inode);
4662 ext4_journal_stop(handle);
4663
4664out:
4622 return dquot_quota_off(sb, type); 4665 return dquot_quota_off(sb, type);
4623} 4666}
4624 4667
@@ -4714,9 +4757,8 @@ out:
4714 if (inode->i_size < off + len) { 4757 if (inode->i_size < off + len) {
4715 i_size_write(inode, off + len); 4758 i_size_write(inode, off + len);
4716 EXT4_I(inode)->i_disksize = inode->i_size; 4759 EXT4_I(inode)->i_disksize = inode->i_size;
4760 ext4_mark_inode_dirty(handle, inode);
4717 } 4761 }
4718 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4719 ext4_mark_inode_dirty(handle, inode);
4720 mutex_unlock(&inode->i_mutex); 4762 mutex_unlock(&inode->i_mutex);
4721 return len; 4763 return len;
4722} 4764}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index bf93ad2bee07..6b088641f5bf 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -7,6 +7,7 @@
7#include <linux/exportfs.h> 7#include <linux/exportfs.h>
8#include <linux/fs_struct.h> 8#include <linux/fs_struct.h>
9#include <linux/fsnotify.h> 9#include <linux/fsnotify.h>
10#include <linux/personality.h>
10#include <asm/uaccess.h> 11#include <asm/uaccess.h>
11#include "internal.h" 12#include "internal.h"
12 13
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 751d6b255a12..0845f84f2a5f 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -110,14 +110,13 @@ int unregister_filesystem(struct file_system_type * fs)
110 *tmp = fs->next; 110 *tmp = fs->next;
111 fs->next = NULL; 111 fs->next = NULL;
112 write_unlock(&file_systems_lock); 112 write_unlock(&file_systems_lock);
113 synchronize_rcu();
113 return 0; 114 return 0;
114 } 115 }
115 tmp = &(*tmp)->next; 116 tmp = &(*tmp)->next;
116 } 117 }
117 write_unlock(&file_systems_lock); 118 write_unlock(&file_systems_lock);
118 119
119 synchronize_rcu();
120
121 return -EINVAL; 120 return -EINVAL;
122} 121}
123 122
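The unregister_filesystem() fix above moves synchronize_rcu() inside the success path: a grace period is only needed when an entry was actually unlinked, and the failure path can return immediately. The pattern, sketched with a hypothetical list:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

struct node { struct node *next; };
static DEFINE_RWLOCK(example_lock);

static int example_unregister(struct node **head, struct node *victim)
{
        struct node **p;

        write_lock(&example_lock);
        for (p = head; *p; p = &(*p)->next) {
                if (*p == victim) {
                        *p = victim->next;
                        write_unlock(&example_lock);
                        synchronize_rcu();      /* readers may still use it */
                        return 0;
                }
        }
        write_unlock(&example_lock);
        return -EINVAL;         /* nothing removed, no grace period */
}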
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c71995b111bf..0f5c4f9d5d62 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -884,8 +884,8 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
884 } 884 }
885 885
886 brelse(dibh); 886 brelse(dibh);
887 gfs2_trans_end(sdp);
888failed: 887failed:
888 gfs2_trans_end(sdp);
889 if (al) { 889 if (al) {
890 gfs2_inplace_release(ip); 890 gfs2_inplace_release(ip);
891 gfs2_quota_unlock(ip); 891 gfs2_quota_unlock(ip);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5c356d09c321..f789c5732b7c 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1506,7 +1506,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1506 inode = gfs2_inode_lookup(dir->i_sb, 1506 inode = gfs2_inode_lookup(dir->i_sb,
1507 be16_to_cpu(dent->de_type), 1507 be16_to_cpu(dent->de_type),
1508 be64_to_cpu(dent->de_inum.no_addr), 1508 be64_to_cpu(dent->de_inum.no_addr),
1509 be64_to_cpu(dent->de_inum.no_formal_ino)); 1509 be64_to_cpu(dent->de_inum.no_formal_ino), 0);
1510 brelse(bh); 1510 brelse(bh);
1511 return inode; 1511 return inode;
1512 } 1512 }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b2682e073eee..e48310885c48 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -617,18 +617,51 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
617 return generic_file_aio_write(iocb, iov, nr_segs, pos); 617 return generic_file_aio_write(iocb, iov, nr_segs, pos);
618} 618}
619 619
620static void empty_write_end(struct page *page, unsigned from, 620static int empty_write_end(struct page *page, unsigned from,
621 unsigned to) 621 unsigned to, int mode)
622{ 622{
623 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 623 struct inode *inode = page->mapping->host;
624 struct gfs2_inode *ip = GFS2_I(inode);
625 struct buffer_head *bh;
626 unsigned offset, blksize = 1 << inode->i_blkbits;
627 pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
624 628
625 zero_user(page, from, to-from); 629 zero_user(page, from, to-from);
626 mark_page_accessed(page); 630 mark_page_accessed(page);
627 631
628 if (!gfs2_is_writeback(ip)) 632 if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
629 gfs2_page_add_databufs(ip, page, from, to); 633 if (!gfs2_is_writeback(ip))
634 gfs2_page_add_databufs(ip, page, from, to);
635
636 block_commit_write(page, from, to);
637 return 0;
638 }
639
640 offset = 0;
641 bh = page_buffers(page);
642 while (offset < to) {
643 if (offset >= from) {
644 set_buffer_uptodate(bh);
645 mark_buffer_dirty(bh);
646 clear_buffer_new(bh);
647 write_dirty_buffer(bh, WRITE);
648 }
649 offset += blksize;
650 bh = bh->b_this_page;
651 }
630 652
631 block_commit_write(page, from, to); 653 offset = 0;
654 bh = page_buffers(page);
655 while (offset < to) {
656 if (offset >= from) {
657 wait_on_buffer(bh);
658 if (!buffer_uptodate(bh))
659 return -EIO;
660 }
661 offset += blksize;
662 bh = bh->b_this_page;
663 }
664 return 0;
632} 665}
633 666
634static int needs_empty_write(sector_t block, struct inode *inode) 667static int needs_empty_write(sector_t block, struct inode *inode)
@@ -643,7 +676,8 @@ static int needs_empty_write(sector_t block, struct inode *inode)
643 return !buffer_mapped(&bh_map); 676 return !buffer_mapped(&bh_map);
644} 677}
645 678
646static int write_empty_blocks(struct page *page, unsigned from, unsigned to) 679static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
680 int mode)
647{ 681{
648 struct inode *inode = page->mapping->host; 682 struct inode *inode = page->mapping->host;
649 unsigned start, end, next, blksize; 683 unsigned start, end, next, blksize;
@@ -668,7 +702,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
668 gfs2_block_map); 702 gfs2_block_map);
669 if (unlikely(ret)) 703 if (unlikely(ret))
670 return ret; 704 return ret;
671 empty_write_end(page, start, end); 705 ret = empty_write_end(page, start, end, mode);
706 if (unlikely(ret))
707 return ret;
672 end = 0; 708 end = 0;
673 } 709 }
674 start = next; 710 start = next;
@@ -682,7 +718,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
682 ret = __block_write_begin(page, start, end - start, gfs2_block_map); 718 ret = __block_write_begin(page, start, end - start, gfs2_block_map);
683 if (unlikely(ret)) 719 if (unlikely(ret))
684 return ret; 720 return ret;
685 empty_write_end(page, start, end); 721 ret = empty_write_end(page, start, end, mode);
722 if (unlikely(ret))
723 return ret;
686 } 724 }
687 725
688 return 0; 726 return 0;
@@ -731,7 +769,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
731 769
732 if (curr == end) 770 if (curr == end)
733 to = end_offset; 771 to = end_offset;
734 error = write_empty_blocks(page, from, to); 772 error = write_empty_blocks(page, from, to, mode);
735 if (!error && offset + to > inode->i_size && 773 if (!error && offset + to > inode->i_size &&
736 !(mode & FALLOC_FL_KEEP_SIZE)) { 774 !(mode & FALLOC_FL_KEEP_SIZE)) {
737 i_size_write(inode, offset + to); 775 i_size_write(inode, offset + to);
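empty_write_end() above splits buffer writeout into two walks over the page: submit every buffer in range first, then wait and check each one, so the writes proceed in parallel rather than one submit-wait at a time. A condensed sketch of the two-pass pattern (not the gfs2 code verbatim):

#include <linux/buffer_head.h>
#include <linux/errno.h>

static int example_flush_buffers(struct page *page, unsigned from,
                                 unsigned to, unsigned blksize)
{
        struct buffer_head *bh;
        unsigned off;

        /* pass 1: start I/O on every buffer in range */
        for (off = 0, bh = page_buffers(page); off < to;
             off += blksize, bh = bh->b_this_page)
                if (off >= from)
                        write_dirty_buffer(bh, WRITE);

        /* pass 2: wait and verify, only after all I/O is in flight */
        for (off = 0, bh = page_buffers(page); off < to;
             off += blksize, bh = bh->b_this_page) {
                if (off < from)
                        continue;
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        return -EIO;
        }
        return 0;
}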
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 3754e3cbf02b..25eeb2bcee47 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -385,6 +385,10 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
385static void iopen_go_callback(struct gfs2_glock *gl) 385static void iopen_go_callback(struct gfs2_glock *gl)
386{ 386{
387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; 387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
388 struct gfs2_sbd *sdp = gl->gl_sbd;
389
390 if (sdp->sd_vfs->s_flags & MS_RDONLY)
391 return;
388 392
389 if (gl->gl_demote_state == LM_ST_UNLOCKED && 393 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
390 gl->gl_state == LM_ST_SHARED && ip) { 394 gl->gl_state == LM_ST_SHARED && ip) {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 97d54a28776a..9134dcb89479 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -40,37 +40,61 @@ struct gfs2_inum_range_host {
40 u64 ir_length; 40 u64 ir_length;
41}; 41};
42 42
43struct gfs2_skip_data {
44 u64 no_addr;
45 int skipped;
46 int non_block;
47};
48
43static int iget_test(struct inode *inode, void *opaque) 49static int iget_test(struct inode *inode, void *opaque)
44{ 50{
45 struct gfs2_inode *ip = GFS2_I(inode); 51 struct gfs2_inode *ip = GFS2_I(inode);
46 u64 *no_addr = opaque; 52 struct gfs2_skip_data *data = opaque;
47 53
48 if (ip->i_no_addr == *no_addr) 54 if (ip->i_no_addr == data->no_addr) {
55 if (data->non_block &&
56 inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
57 data->skipped = 1;
58 return 0;
59 }
49 return 1; 60 return 1;
50 61 }
51 return 0; 62 return 0;
52} 63}
53 64
54static int iget_set(struct inode *inode, void *opaque) 65static int iget_set(struct inode *inode, void *opaque)
55{ 66{
56 struct gfs2_inode *ip = GFS2_I(inode); 67 struct gfs2_inode *ip = GFS2_I(inode);
57 u64 *no_addr = opaque; 68 struct gfs2_skip_data *data = opaque;
58 69
59 inode->i_ino = (unsigned long)*no_addr; 70 if (data->skipped)
60 ip->i_no_addr = *no_addr; 71 return -ENOENT;
72 inode->i_ino = (unsigned long)(data->no_addr);
73 ip->i_no_addr = data->no_addr;
61 return 0; 74 return 0;
62} 75}
63 76
64struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) 77struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
65{ 78{
66 unsigned long hash = (unsigned long)no_addr; 79 unsigned long hash = (unsigned long)no_addr;
67 return ilookup5(sb, hash, iget_test, &no_addr); 80 struct gfs2_skip_data data;
81
82 data.no_addr = no_addr;
83 data.skipped = 0;
84 data.non_block = 0;
85 return ilookup5(sb, hash, iget_test, &data);
68} 86}
69 87
70static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) 88static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr,
89 int non_block)
71{ 90{
91 struct gfs2_skip_data data;
72 unsigned long hash = (unsigned long)no_addr; 92 unsigned long hash = (unsigned long)no_addr;
73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 93
94 data.no_addr = no_addr;
95 data.skipped = 0;
96 data.non_block = non_block;
97 return iget5_locked(sb, hash, iget_test, iget_set, &data);
74} 98}
75 99
76/** 100/**
@@ -111,19 +135,20 @@ static void gfs2_set_iop(struct inode *inode)
111 * @sb: The super block 135 * @sb: The super block
112 * @no_addr: The inode number 136 * @no_addr: The inode number
113 * @type: The type of the inode 137 * @type: The type of the inode
138 * @non_block: Can we block on inodes that are being freed?
114 * 139 *
115 * Returns: A VFS inode, or an error 140 * Returns: A VFS inode, or an error
116 */ 141 */
117 142
118struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, 143struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
119 u64 no_addr, u64 no_formal_ino) 144 u64 no_addr, u64 no_formal_ino, int non_block)
120{ 145{
121 struct inode *inode; 146 struct inode *inode;
122 struct gfs2_inode *ip; 147 struct gfs2_inode *ip;
123 struct gfs2_glock *io_gl = NULL; 148 struct gfs2_glock *io_gl = NULL;
124 int error; 149 int error;
125 150
126 inode = gfs2_iget(sb, no_addr); 151 inode = gfs2_iget(sb, no_addr, non_block);
127 ip = GFS2_I(inode); 152 ip = GFS2_I(inode);
128 153
129 if (!inode) 154 if (!inode)
@@ -185,11 +210,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
185{ 210{
186 struct super_block *sb = sdp->sd_vfs; 211 struct super_block *sb = sdp->sd_vfs;
187 struct gfs2_holder i_gh; 212 struct gfs2_holder i_gh;
188 struct inode *inode; 213 struct inode *inode = NULL;
189 int error; 214 int error;
190 215
216 /* Must not read in block until block type is verified */
191 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, 217 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
192 LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 218 LM_ST_EXCLUSIVE, GL_SKIP, &i_gh);
193 if (error) 219 if (error)
194 return ERR_PTR(error); 220 return ERR_PTR(error);
195 221
@@ -197,7 +223,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
197 if (error) 223 if (error)
198 goto fail; 224 goto fail;
199 225
200 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); 226 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1);
201 if (IS_ERR(inode)) 227 if (IS_ERR(inode))
202 goto fail; 228 goto fail;
203 229
@@ -843,7 +869,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
843 goto fail_gunlock2; 869 goto fail_gunlock2;
844 870
845 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 871 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
846 inum.no_formal_ino); 872 inum.no_formal_ino, 0);
847 if (IS_ERR(inode)) 873 if (IS_ERR(inode))
848 goto fail_gunlock2; 874 goto fail_gunlock2;
849 875
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 3e00a66e7cbd..099ca305e518 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -97,7 +97,8 @@ err:
97} 97}
98 98
99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
100 u64 no_addr, u64 no_formal_ino); 100 u64 no_addr, u64 no_formal_ino,
101 int non_block);
101extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, 102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
102 u64 *no_formal_ino, 103 u64 *no_formal_ino,
103 unsigned int blktype); 104 unsigned int blktype);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 42ef24355afb..d3c69eb91c74 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -430,7 +430,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
430 struct dentry *dentry; 430 struct dentry *dentry;
431 struct inode *inode; 431 struct inode *inode;
432 432
433 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); 433 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
434 if (IS_ERR(inode)) { 434 if (IS_ERR(inode)) {
435 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); 435 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
436 return PTR_ERR(inode); 436 return PTR_ERR(inode);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index cf930cd9664a..6fcae8469f6d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -945,7 +945,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
945 /* rgblk_search can return a block < goal, so we need to 945 /* rgblk_search can return a block < goal, so we need to
946 keep it marching forward. */ 946 keep it marching forward. */
947 no_addr = block + rgd->rd_data0; 947 no_addr = block + rgd->rd_data0;
948 goal++; 948 goal = max(block + 1, goal + 1);
949 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) 949 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
950 continue; 950 continue;
951 if (no_addr == skip) 951 if (no_addr == skip)
@@ -971,7 +971,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
971 found++; 971 found++;
972 972
973 /* Limit reclaim to sensible number of tasks */ 973 /* Limit reclaim to sensible number of tasks */
974 if (found > 2*NR_CPUS) 974 if (found > NR_CPUS)
975 return; 975 return;
976 } 976 }
977 977
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index a4e23d68a398..b9f28e66dad1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1318,15 +1318,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1318 1318
1319static void gfs2_evict_inode(struct inode *inode) 1319static void gfs2_evict_inode(struct inode *inode)
1320{ 1320{
1321 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 1321 struct super_block *sb = inode->i_sb;
1322 struct gfs2_sbd *sdp = sb->s_fs_info;
1322 struct gfs2_inode *ip = GFS2_I(inode); 1323 struct gfs2_inode *ip = GFS2_I(inode);
1323 struct gfs2_holder gh; 1324 struct gfs2_holder gh;
1324 int error; 1325 int error;
1325 1326
1326 if (inode->i_nlink) 1327 if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
1327 goto out; 1328 goto out;
1328 1329
1329 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1330 /* Must not read inode block until block type has been verified */
1331 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
1330 if (unlikely(error)) { 1332 if (unlikely(error)) {
1331 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 1333 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1332 goto out; 1334 goto out;
@@ -1336,6 +1338,12 @@ static void gfs2_evict_inode(struct inode *inode)
1336 if (error) 1338 if (error)
1337 goto out_truncate; 1339 goto out_truncate;
1338 1340
1341 if (test_bit(GIF_INVALID, &ip->i_flags)) {
1342 error = gfs2_inode_refresh(ip);
1343 if (error)
1344 goto out_truncate;
1345 }
1346
1339 ip->i_iopen_gh.gh_flags |= GL_NOCACHE; 1347 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1340 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1348 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1341 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1349 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 20af62f4304b..6e28000a4b21 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -105,6 +105,8 @@ static int journal_submit_commit_record(journal_t *journal,
105 int ret; 105 int ret;
106 struct timespec now = current_kernel_time(); 106 struct timespec now = current_kernel_time();
107 107
108 *cbh = NULL;
109
108 if (is_journal_aborted(journal)) 110 if (is_journal_aborted(journal))
109 return 0; 111 return 0;
110 112
@@ -806,7 +808,7 @@ wait_for_iobuf:
806 if (err) 808 if (err)
807 __jbd2_journal_abort_hard(journal); 809 __jbd2_journal_abort_hard(journal);
808 } 810 }
809 if (!err && !is_journal_aborted(journal)) 811 if (cbh)
810 err = journal_wait_on_commit_record(journal, cbh); 812 err = journal_wait_on_commit_record(journal, cbh);
811 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
812 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && 814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
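The jbd2 fix above is out-parameter hygiene: journal_submit_commit_record() now NULLs *cbh up front, so the caller simply tests the pointer instead of re-deriving whether the early-return path was taken. Sketched:

#include <linux/buffer_head.h>
#include <linux/types.h>

static int example_submit_record(struct buffer_head **cbh, bool aborted)
{
        *cbh = NULL;            /* valid to test on every return path */

        if (aborted)
                return 0;       /* caller sees *cbh == NULL, skips wait */

        /* ... allocate a buffer, submit it, store it in *cbh ... */
        return 0;
}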
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index aba8ebaec25c..e0ec3db1c395 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2413,10 +2413,12 @@ const char *jbd2_dev_to_name(dev_t device)
2413 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); 2413 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2414 if (!new_dev) 2414 if (!new_dev)
2415 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ 2415 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2416 bd = bdget(device);
2416 spin_lock(&devname_cache_lock); 2417 spin_lock(&devname_cache_lock);
2417 if (devcache[i]) { 2418 if (devcache[i]) {
2418 if (devcache[i]->device == device) { 2419 if (devcache[i]->device == device) {
2419 kfree(new_dev); 2420 kfree(new_dev);
2421 bdput(bd);
2420 ret = devcache[i]->devname; 2422 ret = devcache[i]->devname;
2421 spin_unlock(&devname_cache_lock); 2423 spin_unlock(&devname_cache_lock);
2422 return ret; 2424 return ret;
@@ -2425,7 +2427,6 @@ const char *jbd2_dev_to_name(dev_t device)
2425 } 2427 }
2426 devcache[i] = new_dev; 2428 devcache[i] = new_dev;
2427 devcache[i]->device = device; 2429 devcache[i]->device = device;
2428 bd = bdget(device);
2429 if (bd) { 2430 if (bd) {
2430 bdevname(bd, devcache[i]->devname); 2431 bdevname(bd, devcache[i]->devname);
2431 bdput(bd); 2432 bdput(bd);
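The jbd2_dev_to_name() change exists because bdget() can sleep, and the old code called it under devname_cache_lock, a spinlock. The general rule, sketched with kmalloc() as the sleeping call:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

static DEFINE_SPINLOCK(example_lock);
static void *example_slot;

static int example_install(size_t len)
{
        void *buf = kmalloc(len, GFP_KERNEL);   /* may sleep: unlocked */

        if (!buf)
                return -ENOMEM;

        spin_lock(&example_lock);
        if (example_slot) {             /* raced; entry already present */
                spin_unlock(&example_lock);
                kfree(buf);             /* kfree() never sleeps */
                return 0;
        }
        example_slot = buf;
        spin_unlock(&example_lock);
        return 0;
}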
diff --git a/fs/namei.c b/fs/namei.c
index e6cd6113872c..54fc993e3027 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -697,6 +697,7 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
697 do { 697 do {
698 seq = read_seqcount_begin(&fs->seq); 698 seq = read_seqcount_begin(&fs->seq);
699 nd->root = fs->root; 699 nd->root = fs->root;
700 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
700 } while (read_seqcount_retry(&fs->seq, seq)); 701 } while (read_seqcount_retry(&fs->seq, seq));
701 } 702 }
702} 703}
diff --git a/fs/namespace.c b/fs/namespace.c
index 7dba2ed03429..d99bcf59e4c2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1030,18 +1030,6 @@ const struct seq_operations mounts_op = {
1030 .show = show_vfsmnt 1030 .show = show_vfsmnt
1031}; 1031};
1032 1032
1033static int uuid_is_nil(u8 *uuid)
1034{
1035 int i;
1036 u8 *cp = (u8 *)uuid;
1037
1038 for (i = 0; i < 16; i++) {
1039 if (*cp++)
1040 return 0;
1041 }
1042 return 1;
1043}
1044
1045static int show_mountinfo(struct seq_file *m, void *v) 1033static int show_mountinfo(struct seq_file *m, void *v)
1046{ 1034{
1047 struct proc_mounts *p = m->private; 1035 struct proc_mounts *p = m->private;
@@ -1085,10 +1073,6 @@ static int show_mountinfo(struct seq_file *m, void *v)
1085 if (IS_MNT_UNBINDABLE(mnt)) 1073 if (IS_MNT_UNBINDABLE(mnt))
1086 seq_puts(m, " unbindable"); 1074 seq_puts(m, " unbindable");
1087 1075
1088 if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
1089 /* print the uuid */
1090 seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
1091
1092 /* Filesystem specific data */ 1076 /* Filesystem specific data */
1093 seq_puts(m, " - "); 1077 seq_puts(m, " - ");
1094 show_type(m, sb); 1078 show_type(m, sb);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 9166fcb66da2..89fc160fd5b0 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -148,67 +148,64 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors,
148 return pseudoflavor; 148 return pseudoflavor;
149} 149}
150 150
151static rpc_authflavor_t nfs_negotiate_security(const struct dentry *parent, const struct dentry *dentry) 151static int nfs_negotiate_security(const struct dentry *parent,
152 const struct dentry *dentry,
153 rpc_authflavor_t *flavor)
152{ 154{
153 int status = 0;
154 struct page *page; 155 struct page *page;
155 struct nfs4_secinfo_flavors *flavors; 156 struct nfs4_secinfo_flavors *flavors;
156 int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); 157 int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
157 rpc_authflavor_t flavor = RPC_AUTH_UNIX; 158 int ret = -EPERM;
158 159
159 secinfo = NFS_PROTO(parent->d_inode)->secinfo; 160 secinfo = NFS_PROTO(parent->d_inode)->secinfo;
160 if (secinfo != NULL) { 161 if (secinfo != NULL) {
161 page = alloc_page(GFP_KERNEL); 162 page = alloc_page(GFP_KERNEL);
162 if (!page) { 163 if (!page) {
163 status = -ENOMEM; 164 ret = -ENOMEM;
164 goto out; 165 goto out;
165 } 166 }
166 flavors = page_address(page); 167 flavors = page_address(page);
167 status = secinfo(parent->d_inode, &dentry->d_name, flavors); 168 ret = secinfo(parent->d_inode, &dentry->d_name, flavors);
168 flavor = nfs_find_best_sec(flavors, dentry->d_inode); 169 *flavor = nfs_find_best_sec(flavors, dentry->d_inode);
169 put_page(page); 170 put_page(page);
170 } 171 }
171 172
172 return flavor;
173
174out: 173out:
175 status = -ENOMEM; 174 return ret;
176 return status;
177} 175}
178 176
179static rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent, 177static int nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent,
180 struct dentry *dentry, struct path *path, 178 struct dentry *dentry, struct path *path,
181 struct nfs_fh *fh, struct nfs_fattr *fattr) 179 struct nfs_fh *fh, struct nfs_fattr *fattr,
180 rpc_authflavor_t *flavor)
182{ 181{
183 rpc_authflavor_t flavor;
184 struct rpc_clnt *clone; 182 struct rpc_clnt *clone;
185 struct rpc_auth *auth; 183 struct rpc_auth *auth;
186 int err; 184 int err;
187 185
188 flavor = nfs_negotiate_security(parent, path->dentry); 186 err = nfs_negotiate_security(parent, path->dentry, flavor);
189 if (flavor < 0) 187 if (err < 0)
190 goto out; 188 goto out;
191 clone = rpc_clone_client(server->client); 189 clone = rpc_clone_client(server->client);
192 auth = rpcauth_create(flavor, clone); 190 auth = rpcauth_create(*flavor, clone);
193 if (!auth) { 191 if (!auth) {
194 flavor = -EIO; 192 err = -EIO;
195 goto out_shutdown; 193 goto out_shutdown;
196 } 194 }
197 err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode, 195 err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode,
198 &path->dentry->d_name, 196 &path->dentry->d_name,
199 fh, fattr); 197 fh, fattr);
200 if (err < 0)
201 flavor = err;
202out_shutdown: 198out_shutdown:
203 rpc_shutdown_client(clone); 199 rpc_shutdown_client(clone);
204out: 200out:
205 return flavor; 201 return err;
206} 202}
207#else /* CONFIG_NFS_V4 */ 203#else /* CONFIG_NFS_V4 */
208static inline rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, 204static inline int nfs_lookup_with_sec(struct nfs_server *server,
209 struct dentry *parent, struct dentry *dentry, 205 struct dentry *parent, struct dentry *dentry,
210 struct path *path, struct nfs_fh *fh, 206 struct path *path, struct nfs_fh *fh,
211 struct nfs_fattr *fattr) 207 struct nfs_fattr *fattr,
208 rpc_authflavor_t *flavor)
212{ 209{
213 return -EPERM; 210 return -EPERM;
214} 211}
@@ -234,7 +231,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
234 struct nfs_fh *fh = NULL; 231 struct nfs_fh *fh = NULL;
235 struct nfs_fattr *fattr = NULL; 232 struct nfs_fattr *fattr = NULL;
236 int err; 233 int err;
237 rpc_authflavor_t flavor = 1; 234 rpc_authflavor_t flavor = RPC_AUTH_UNIX;
238 235
239 dprintk("--> nfs_d_automount()\n"); 236 dprintk("--> nfs_d_automount()\n");
240 237
@@ -255,13 +252,8 @@ struct vfsmount *nfs_d_automount(struct path *path)
255 err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, 252 err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode,
256 &path->dentry->d_name, 253 &path->dentry->d_name,
257 fh, fattr); 254 fh, fattr);
258 if (err == -EPERM) { 255 if (err == -EPERM && NFS_PROTO(parent->d_inode)->secinfo != NULL)
259 flavor = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr); 256 err = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr, &flavor);
260 if (flavor < 0)
261 err = flavor;
262 else
263 err = 0;
264 }
265 dput(parent); 257 dput(parent);
266 if (err != 0) { 258 if (err != 0) {
267 mnt = ERR_PTR(err); 259 mnt = ERR_PTR(err);
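The rewrite above fixes a sign bug: rpc_authflavor_t is unsigned, so "flavor < 0" could never fire and errors were silently treated as flavors. Returning int and passing the flavor back through a pointer keeps errno and payload in separate channels. The shape of it, with hypothetical names:

#include <linux/types.h>
#include <linux/errno.h>

typedef unsigned int example_flavor_t;  /* mirrors rpc_authflavor_t */

static int example_negotiate(bool ok, example_flavor_t *out)
{
        if (!ok)
                return -EPERM;  /* errors travel in the int return */

        *out = 1;               /* payload travels through *out */
        return 0;
}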
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dfd1e6d7e6c3..9bf41eab3e46 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2204,8 +2204,6 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
2204 goto out; 2204 goto out;
2205 } 2205 }
2206 ret = nfs4_lookup_root(server, fhandle, info); 2206 ret = nfs4_lookup_root(server, fhandle, info);
2207 if (ret < 0)
2208 ret = -EAGAIN;
2209out: 2207out:
2210 return ret; 2208 return ret;
2211} 2209}
@@ -2226,7 +2224,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
2226 2224
2227 for (i = 0; i < len; i++) { 2225 for (i = 0; i < len; i++) {
2228 status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); 2226 status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
2229 if (status == 0) 2227 if (status != -EPERM)
2230 break; 2228 break;
2231 } 2229 }
2232 if (status == 0) 2230 if (status == 0)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af0c6279a4a7..e4cbc11a74ab 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -542,11 +542,15 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
542 if (!nfs_need_commit(nfsi)) 542 if (!nfs_need_commit(nfsi))
543 return 0; 543 return 0;
544 544
545 spin_lock(&inode->i_lock);
545 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 546 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
546 if (ret > 0) 547 if (ret > 0)
547 nfsi->ncommit -= ret; 548 nfsi->ncommit -= ret;
549 spin_unlock(&inode->i_lock);
550
548 if (nfs_need_commit(NFS_I(inode))) 551 if (nfs_need_commit(NFS_I(inode)))
549 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 552 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
553
550 return ret; 554 return ret;
551} 555}
552#else 556#else
@@ -1483,9 +1487,7 @@ int nfs_commit_inode(struct inode *inode, int how)
1483 res = nfs_commit_set_lock(NFS_I(inode), may_wait); 1487 res = nfs_commit_set_lock(NFS_I(inode), may_wait);
1484 if (res <= 0) 1488 if (res <= 0)
1485 goto out_mark_dirty; 1489 goto out_mark_dirty;
1486 spin_lock(&inode->i_lock);
1487 res = nfs_scan_commit(inode, &head, 0, 0); 1490 res = nfs_scan_commit(inode, &head, 0, 0);
1488 spin_unlock(&inode->i_lock);
1489 if (res) { 1491 if (res) {
1490 int error; 1492 int error;
1491 1493
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 0c6d81670137..7c831a2731fa 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -38,7 +38,6 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 	exp_readlock();
 	nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
 	fh_put(&fh);
-	rqstp->rq_client = NULL;
 	exp_readunlock();
 	/* We return nlm error codes as nlm doesn't know
 	 * about nfsd, but nfsd does know about nlm..
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4b36ec3eb8ea..4cf04e11c66c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -258,6 +258,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 	if (atomic_dec_and_test(&fp->fi_delegees)) {
 		vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
 		fp->fi_lease = NULL;
+		fput(fp->fi_deleg_file);
 		fp->fi_deleg_file = NULL;
 	}
 }
@@ -397,9 +398,12 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp)
 
 static void free_generic_stateid(struct nfs4_stateid *stp)
 {
-	int oflag = nfs4_access_bmap_to_omode(stp);
+	int oflag;
 
-	nfs4_file_put_access(stp->st_file, oflag);
+	if (stp->st_access_bmap) {
+		oflag = nfs4_access_bmap_to_omode(stp);
+		nfs4_file_put_access(stp->st_file, oflag);
+	}
 	put_nfs4_file(stp->st_file);
 	kmem_cache_free(stateid_slab, stp);
 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2e1cebde90df..129f3c9f62d5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1363,7 +1363,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out;
 	if (!(iap->ia_valid & ATTR_MODE))
 		iap->ia_mode = 0;
-	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
 	if (err)
 		goto out;
 
@@ -1385,6 +1385,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (IS_ERR(dchild))
 		goto out_nfserr;
 
+	/* If file doesn't exist, check for permissions to create one */
+	if (!dchild->d_inode) {
+		err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+		if (err)
+			goto out;
+	}
+
 	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
 	if (err)
 		goto out;
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index b10e3540d5b7..ce4f62440425 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -1299,6 +1299,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
 
 	BUG_ON (!data || !frags);
 
+	if (size < 2 * VBLK_SIZE_HEAD) {
+		ldm_error("Value of size is to small.");
+		return false;
+	}
+
 	group = get_unaligned_be32(data + 0x08);
 	rec = get_unaligned_be16(data + 0x0C);
 	num = get_unaligned_be16(data + 0x0E);
@@ -1306,6 +1311,10 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
 		ldm_error ("A VBLK claims to have %d parts.", num);
 		return false;
 	}
+	if (rec >= num) {
+		ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num);
+		return false;
+	}
 
 	list_for_each (item, frags) {
 		f = list_entry (item, struct frag, list);
@@ -1334,10 +1343,9 @@ found:
 
 	f->map |= (1 << rec);
 
-	if (num > 0) {
-		data += VBLK_SIZE_HEAD;
-		size -= VBLK_SIZE_HEAD;
-	}
+	data += VBLK_SIZE_HEAD;
+	size -= VBLK_SIZE_HEAD;
+
 	memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size);
 
 	return true;
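
The two guards added to ldm_frag_add() above reject undersized and out-of-range
VBLK fragments before rec and size feed into the buffer arithmetic. A minimal
standalone sketch of the same validate-before-copy pattern follows; the names,
offsets, and bounds are hypothetical stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FRAG_HEAD_SIZE	0x10	/* hypothetical per-fragment header size */

/* Copy an untrusted on-disk fragment into buf, range-checking every
 * field that is later used as an index or length before any memcpy(). */
static bool frag_copy(const uint8_t *data, size_t size,
		      uint8_t *buf, size_t bufsize)
{
	uint16_t rec, num;

	if (size < 2 * FRAG_HEAD_SIZE) {
		fprintf(stderr, "fragment too small\n");
		return false;
	}
	rec = (uint16_t)((data[0x0C] << 8) | data[0x0D]);
	num = (uint16_t)((data[0x0E] << 8) | data[0x0F]);
	if (num == 0 || rec >= num) {
		fprintf(stderr, "rec %u out of range (num %u)\n", rec, num);
		return false;
	}
	size -= FRAG_HEAD_SIZE;
	if ((size_t)rec * size + size > bufsize) {
		fprintf(stderr, "fragment would overflow buffer\n");
		return false;
	}
	memcpy(buf + (size_t)rec * size, data + FRAG_HEAD_SIZE, size);
	return true;
}

The design point mirrors the patch: every value read from the media is checked
against the destination buffer before it is trusted.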
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd6628d3ba42..dfa532730e55 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3124,11 +3124,16 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-	unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-	struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
+	unsigned int nr;
+	struct task_struct *reaper;
 	struct tgid_iter iter;
 	struct pid_namespace *ns;
 
+	if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
+		goto out_no_task;
+	nr = filp->f_pos - FIRST_PROCESS_ENTRY;
+
+	reaper = get_proc_task(filp->f_path.dentry->d_inode);
 	if (!reaper)
 		goto out_no_task;
 
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a925bf205497..d3c032f5fa0a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -442,7 +442,7 @@ EXPORT_SYMBOL(dquot_acquire);
  */
 int dquot_commit(struct dquot *dquot)
 {
-	int ret = 0, ret2 = 0;
+	int ret = 0;
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 
 	mutex_lock(&dqopt->dqio_mutex);
@@ -454,15 +454,10 @@ int dquot_commit(struct dquot *dquot)
 	spin_unlock(&dq_list_lock);
 	/* Inactive dquot can be only if there was error during read/init
 	 * => we have better not writing it */
-	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
 		ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
-		if (info_dirty(&dqopt->info[dquot->dq_type])) {
-			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
-					dquot->dq_sb, dquot->dq_type);
-		}
-		if (ret >= 0)
-			ret = ret2;
-	}
+	else
+		ret = -EIO;
 out_sem:
 	mutex_unlock(&dqopt->dqio_mutex);
 	return ret;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 9eead2c796b7..fbb0b478a346 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -112,6 +112,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 		SetPageDirty(page);
 
 		unlock_page(page);
+		put_page(page);
 	}
 
 	return 0;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 919f0de29d8f..e6493cac193d 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -23,6 +23,12 @@
 #ifndef __UBIFS_DEBUG_H__
 #define __UBIFS_DEBUG_H__
 
+/* Checking helper functions */
+typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
+				 struct ubifs_zbranch *zbr, void *priv);
+typedef int (*dbg_znode_callback)(struct ubifs_info *c,
+				  struct ubifs_znode *znode, void *priv);
+
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
 /**
@@ -270,11 +276,6 @@ void dbg_dump_tnc(struct ubifs_info *c);
 void dbg_dump_index(struct ubifs_info *c);
 void dbg_dump_lpt_lebs(const struct ubifs_info *c);
 
-/* Checking helper functions */
-typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
-				 struct ubifs_zbranch *zbr, void *priv);
-typedef int (*dbg_znode_callback)(struct ubifs_info *c,
-				  struct ubifs_znode *znode, void *priv);
 int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
 		   dbg_znode_callback znode_cb, void *priv);
 
@@ -295,7 +296,6 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
 int dbg_check_filesystem(struct ubifs_info *c);
 void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
 		    int add_pos);
-int dbg_check_lprops(struct ubifs_info *c);
 int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
 			int row, int col);
 int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
@@ -401,58 +401,94 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 #define DBGKEY(key)  ((char *)(key))
 #define DBGKEY1(key) ((char *)(key))
 
-#define ubifs_debugging_init(c)                0
-#define ubifs_debugging_exit(c)                ({})
-
-#define dbg_ntype(type)                        ""
-#define dbg_cstate(cmt_state)                  ""
-#define dbg_jhead(jhead)                       ""
-#define dbg_get_key_dump(c, key)               ({})
-#define dbg_dump_inode(c, inode)               ({})
-#define dbg_dump_node(c, node)                 ({})
-#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
-#define dbg_dump_budget_req(req)               ({})
-#define dbg_dump_lstats(lst)                   ({})
-#define dbg_dump_budg(c)                       ({})
-#define dbg_dump_lprop(c, lp)                  ({})
-#define dbg_dump_lprops(c)                     ({})
-#define dbg_dump_lpt_info(c)                   ({})
-#define dbg_dump_leb(c, lnum)                  ({})
-#define dbg_dump_znode(c, znode)               ({})
-#define dbg_dump_heap(c, heap, cat)            ({})
-#define dbg_dump_pnode(c, pnode, parent, iip)  ({})
-#define dbg_dump_tnc(c)                        ({})
-#define dbg_dump_index(c)                      ({})
-#define dbg_dump_lpt_lebs(c)                   ({})
-
-#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
-#define dbg_old_index_check_init(c, zroot)         0
-#define dbg_save_space_info(c)                     ({})
-#define dbg_check_space_info(c)                    0
-#define dbg_check_old_index(c, zroot)              0
-#define dbg_check_cats(c)                          0
-#define dbg_check_ltab(c)                          0
-#define dbg_chk_lpt_free_spc(c)                    0
-#define dbg_chk_lpt_sz(c, action, len)             0
-#define dbg_check_synced_i_size(inode)             0
-#define dbg_check_dir_size(c, dir)                 0
-#define dbg_check_tnc(c, x)                        0
-#define dbg_check_idx_size(c, idx_size)            0
-#define dbg_check_filesystem(c)                    0
-#define dbg_check_heap(c, heap, cat, add_pos)      ({})
-#define dbg_check_lprops(c)                        0
-#define dbg_check_lpt_nodes(c, cnode, row, col)    0
-#define dbg_check_inode_size(c, inode, size)       0
-#define dbg_check_data_nodes_order(c, head)        0
-#define dbg_check_nondata_nodes_order(c, head)     0
-#define dbg_force_in_the_gaps_enabled              0
-#define dbg_force_in_the_gaps()                    0
-#define dbg_failure_mode                           0
-
-#define dbg_debugfs_init()                         0
-#define dbg_debugfs_exit()
-#define dbg_debugfs_init_fs(c)                     0
-#define dbg_debugfs_exit_fs(c)                     0
+static inline int ubifs_debugging_init(struct ubifs_info *c)      { return 0; }
+static inline void ubifs_debugging_exit(struct ubifs_info *c)     { return; }
+static inline const char *dbg_ntype(int type)                     { return ""; }
+static inline const char *dbg_cstate(int cmt_state)               { return ""; }
+static inline const char *dbg_jhead(int jhead)                    { return ""; }
+static inline const char *
+dbg_get_key_dump(const struct ubifs_info *c,
+		 const union ubifs_key *key)                      { return ""; }
+static inline void dbg_dump_inode(const struct ubifs_info *c,
+				  const struct inode *inode)      { return; }
+static inline void dbg_dump_node(const struct ubifs_info *c,
+				 const void *node)                { return; }
+static inline void dbg_dump_lpt_node(const struct ubifs_info *c,
+				     void *node, int lnum,
+				     int offs)                    { return; }
+static inline void
+dbg_dump_budget_req(const struct ubifs_budget_req *req)           { return; }
+static inline void
+dbg_dump_lstats(const struct ubifs_lp_stats *lst)                 { return; }
+static inline void dbg_dump_budg(struct ubifs_info *c)            { return; }
+static inline void dbg_dump_lprop(const struct ubifs_info *c,
+				  const struct ubifs_lprops *lp)  { return; }
+static inline void dbg_dump_lprops(struct ubifs_info *c)          { return; }
+static inline void dbg_dump_lpt_info(struct ubifs_info *c)        { return; }
+static inline void dbg_dump_leb(const struct ubifs_info *c,
+				int lnum)                         { return; }
+static inline void
+dbg_dump_znode(const struct ubifs_info *c,
+	       const struct ubifs_znode *znode)                   { return; }
+static inline void dbg_dump_heap(struct ubifs_info *c,
+				 struct ubifs_lpt_heap *heap,
+				 int cat)                         { return; }
+static inline void dbg_dump_pnode(struct ubifs_info *c,
+				  struct ubifs_pnode *pnode,
+				  struct ubifs_nnode *parent,
+				  int iip)                        { return; }
+static inline void dbg_dump_tnc(struct ubifs_info *c)             { return; }
+static inline void dbg_dump_index(struct ubifs_info *c)           { return; }
+static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c)  { return; }
+
+static inline int dbg_walk_index(struct ubifs_info *c,
+				 dbg_leaf_callback leaf_cb,
+				 dbg_znode_callback znode_cb,
+				 void *priv)                      { return 0; }
+static inline void dbg_save_space_info(struct ubifs_info *c)      { return; }
+static inline int dbg_check_space_info(struct ubifs_info *c)      { return 0; }
+static inline int dbg_check_lprops(struct ubifs_info *c)          { return 0; }
+static inline int
+dbg_old_index_check_init(struct ubifs_info *c,
			 struct ubifs_zbranch *zroot)             { return 0; }
+static inline int
+dbg_check_old_index(struct ubifs_info *c,
+		    struct ubifs_zbranch *zroot)                  { return 0; }
+static inline int dbg_check_cats(struct ubifs_info *c)            { return 0; }
+static inline int dbg_check_ltab(struct ubifs_info *c)            { return 0; }
+static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c)      { return 0; }
+static inline int dbg_chk_lpt_sz(struct ubifs_info *c,
				 int action, int len)             { return 0; }
+static inline int dbg_check_synced_i_size(struct inode *inode)    { return 0; }
+static inline int dbg_check_dir_size(struct ubifs_info *c,
				     const struct inode *dir)     { return 0; }
+static inline int dbg_check_tnc(struct ubifs_info *c, int extra)  { return 0; }
+static inline int dbg_check_idx_size(struct ubifs_info *c,
				     long long idx_size)          { return 0; }
+static inline int dbg_check_filesystem(struct ubifs_info *c)      { return 0; }
+static inline void dbg_check_heap(struct ubifs_info *c,
				  struct ubifs_lpt_heap *heap,
				  int cat, int add_pos)           { return; }
+static inline int dbg_check_lpt_nodes(struct ubifs_info *c,
	struct ubifs_cnode *cnode, int row, int col)              { return 0; }
+static inline int dbg_check_inode_size(struct ubifs_info *c,
				       const struct inode *inode,
				       loff_t size)               { return 0; }
+static inline int
+dbg_check_data_nodes_order(struct ubifs_info *c,
			   struct list_head *head)                { return 0; }
+static inline int
+dbg_check_nondata_nodes_order(struct ubifs_info *c,
			      struct list_head *head)             { return 0; }
+
+static inline int dbg_force_in_the_gaps(void)                     { return 0; }
+#define dbg_force_in_the_gaps_enabled 0
+#define dbg_failure_mode              0
+
+static inline int dbg_debugfs_init(void)                          { return 0; }
+static inline void dbg_debugfs_exit(void)                         { return; }
+static inline int dbg_debugfs_init_fs(struct ubifs_info *c)       { return 0; }
+static inline int dbg_debugfs_exit_fs(struct ubifs_info *c)       { return 0; }
 
 #endif /* !CONFIG_UBIFS_FS_DEBUG */
 #endif /* !__UBIFS_DEBUG_H__ */
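
The rewrite above trades the old no-op macros for empty static inline
functions, so builds without CONFIG_UBIFS_FS_DEBUG still type-check every
argument and debug-only call sites cannot silently bit-rot. A small
self-contained illustration of the pattern, using made-up names rather than
the UBIFS API:

#include <stdio.h>

struct ctx { int id; };

#ifdef WITH_DEBUG
void dbg_dump(const struct ctx *c);	/* real implementation elsewhere */
#else
/* Unlike "#define dbg_dump(c) ({})", the empty inline still forces the
 * caller to pass a valid 'const struct ctx *', so the compiler can catch
 * wrong arguments at debug-only call sites in non-debug builds too. */
static inline void dbg_dump(const struct ctx *c) { (void)c; }
#endif

int main(void)
{
	struct ctx c = { 42 };

	dbg_dump(&c);	/* checked by the compiler in both configurations */
	printf("ctx %d\n", c.id);
	return 0;
}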
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 28be1e6a65e8..b286db79c686 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1312,6 +1312,9 @@ int ubifs_fsync(struct file *file, int datasync)
 
 	dbg_gen("syncing inode %lu", inode->i_ino);
 
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return 0;
+
 	/*
 	 * VFS has already synchronized dirty pages for this inode. Synchronize
 	 * the inode unless this is a 'datasync()' call.
diff --git a/fs/xattr.c b/fs/xattr.c
index a19acdb81cd1..f1ef94974dea 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -666,7 +666,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
 		return -EOPNOTSUPP;
-	return handler->set(dentry, name, value, size, 0, handler->flags);
+	return handler->set(dentry, name, value, size, flags, handler->flags);
 }
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 5ea402023ebd..9ef9ed2cfe2e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -293,7 +293,6 @@ xfs_buf_allocate_memory(
 	size_t			nbytes, offset;
 	gfp_t			gfp_mask = xb_to_gfp(flags);
 	unsigned short		page_count, i;
-	pgoff_t			first;
 	xfs_off_t		end;
 	int			error;
 
@@ -333,7 +332,6 @@ use_alloc_page:
 		return error;
 
 	offset = bp->b_offset;
-	first = bp->b_file_offset >> PAGE_SHIFT;
 	bp->b_flags |= _XBF_PAGES;
 
 	for (i = 0; i < bp->b_page_count; i++) {
@@ -657,8 +655,6 @@ xfs_buf_readahead(
 	xfs_off_t		ioff,
 	size_t			isize)
 {
-	struct backing_dev_info *bdi;
-
 	if (bdi_read_congested(target->bt_bdi))
 		return;
 
@@ -919,8 +915,6 @@ xfs_buf_lock(
 
 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 		xfs_log_force(bp->b_target->bt_mount, 0);
-	if (atomic_read(&bp->b_io_remaining))
-		blk_flush_plug(current);
 	down(&bp->b_sema);
 	XB_SET_OWNER(bp);
 
@@ -1309,8 +1303,6 @@ xfs_buf_iowait(
 {
 	trace_xfs_buf_iowait(bp, _RET_IP_);
 
-	if (atomic_read(&bp->b_io_remaining))
-		blk_flush_plug(current);
 	wait_for_completion(&bp->b_iowait);
 
 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1747,8 +1739,8 @@ xfsbufd(
 	do {
 		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
 		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
-		int	count = 0;
 		struct list_head tmp;
+		struct blk_plug plug;
 
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1764,16 +1756,15 @@ xfsbufd(
 
 		xfs_buf_delwri_split(target, &tmp, age);
 		list_sort(NULL, &tmp, xfs_buf_cmp);
+
+		blk_start_plug(&plug);
 		while (!list_empty(&tmp)) {
 			struct xfs_buf *bp;
 			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
 			xfs_bdstrat_cb(bp);
-			count++;
 		}
-		if (count)
-			blk_flush_plug(current);
-
+		blk_finish_plug(&plug);
 	} while (!kthread_should_stop());
 
 	return 0;
@@ -1793,6 +1784,7 @@ xfs_flush_buftarg(
 	int		pincount = 0;
 	LIST_HEAD(tmp_list);
 	LIST_HEAD(wait_list);
+	struct blk_plug plug;
 
 	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1807,6 +1799,8 @@ xfs_flush_buftarg(
 	 * we do that after issuing all the IO.
 	 */
 	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+
+	blk_start_plug(&plug);
 	while (!list_empty(&tmp_list)) {
 		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
 		ASSERT(target == bp->b_target);
@@ -1817,10 +1811,10 @@ xfs_flush_buftarg(
 		}
 		xfs_bdstrat_cb(bp);
 	}
+	blk_finish_plug(&plug);
 
 	if (wait) {
-		/* Expedite and wait for IO to complete. */
-		blk_flush_plug(current);
+		/* Wait for IO to complete. */
 		while (!list_empty(&wait_list)) {
 			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
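
The xfs_buf.c hunks above drop the explicit blk_flush_plug(current) calls and
instead wrap each submission loop in an on-stack plug, letting the block layer
batch and merge the requests until blk_finish_plug(). A reduced kernel-style
sketch of that shape; the demo_* types and the issue hook are placeholders,
not xfs_buf code:

#include <linux/blkdev.h>
#include <linux/list.h>

struct demo_buf {
	struct list_head b_list;
};

/* Stand-in for xfs_bdstrat_cb(): issue the I/O for one buffer. */
static void demo_issue(struct demo_buf *bp)
{
	/* submit_bio() or equivalent would go here */
}

static void demo_submit_batch(struct list_head *batch)
{
	struct blk_plug plug;
	struct demo_buf *bp, *n;

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, batch, b_list) {
		list_del_init(&bp->b_list);
		demo_issue(bp);
	}
	blk_finish_plug(&plug);	/* one flush for the whole batch */
}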
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 508e06fd7d1e..9f76cceb678d 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -28,53 +28,49 @@
 /*
  * XFS logging functions
  */
-static int
+static void
 __xfs_printk(
 	const char		*level,
 	const struct xfs_mount	*mp,
 	struct va_format	*vaf)
 {
-	if (mp && mp->m_fsname)
-		return printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
-	return printk("%sXFS: %pV\n", level, vaf);
+	if (mp && mp->m_fsname) {
+		printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+		return;
+	}
+	printk("%sXFS: %pV\n", level, vaf);
 }
 
-int xfs_printk(
+void xfs_printk(
 	const char		*level,
 	const struct xfs_mount	*mp,
 	const char		*fmt, ...)
 {
 	struct va_format	vaf;
 	va_list			args;
-	int			r;
 
 	va_start(args, fmt);
 
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	r = __xfs_printk(level, mp, &vaf);
+	__xfs_printk(level, mp, &vaf);
 	va_end(args);
-
-	return r;
 }
 
 #define define_xfs_printk_level(func, kern_level)		\
-int func(const struct xfs_mount *mp, const char *fmt, ...)	\
+void func(const struct xfs_mount *mp, const char *fmt, ...)	\
 {								\
 	struct va_format	vaf;				\
 	va_list			args;				\
-	int			r;				\
 								\
 	va_start(args, fmt);					\
 								\
 	vaf.fmt = fmt;						\
 	vaf.va = &args;						\
 								\
-	r = __xfs_printk(kern_level, mp, &vaf);			\
+	__xfs_printk(kern_level, mp, &vaf);			\
 	va_end(args);						\
-								\
-	return r;						\
 }								\
 
 define_xfs_printk_level(xfs_emerg, KERN_EMERG);
@@ -88,7 +84,7 @@ define_xfs_printk_level(xfs_info, KERN_INFO);
 define_xfs_printk_level(xfs_debug, KERN_DEBUG);
 #endif
 
-int
+void
 xfs_alert_tag(
 	const struct xfs_mount	*mp,
 	int			panic_tag,
@@ -97,7 +93,6 @@ xfs_alert_tag(
 	struct va_format	vaf;
 	va_list			args;
 	int			do_panic = 0;
-	int			r;
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
 		xfs_printk(KERN_ALERT, mp,
@@ -110,12 +105,10 @@ xfs_alert_tag(
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	r = __xfs_printk(KERN_ALERT, mp, &vaf);
+	__xfs_printk(KERN_ALERT, mp, &vaf);
 	va_end(args);
 
 	BUG_ON(do_panic);
-
-	return r;
 }
 
 void
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index e77ffa16745b..f1b3fc1b6c4e 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -3,32 +3,34 @@
 
 struct xfs_mount;
 
-extern int xfs_printk(const char *level, const struct xfs_mount *mp,
+extern void xfs_printk(const char *level, const struct xfs_mount *mp,
 			const char *fmt, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern int xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-extern int xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-extern int xfs_alert_tag(const struct xfs_mount *mp, int tag,
+extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
 			const char *fmt, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern int xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-extern int xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-extern int xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-extern int xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-extern int xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 
 #ifdef DEBUG
-extern int xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 #else
-#define xfs_debug(mp, fmt, ...)	(0)
+static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+{
+}
 #endif
 
 extern void assfail(char *expr, char *f, int l);
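
Note that the prototypes keep their format(printf, N, M) attributes even
though the return type is now void; the attribute, not the return value, is
what lets the compiler check the variadic arguments at every call site. A
hedged userspace sketch of the same shape, with invented names rather than
the XFS helpers:

#include <stdarg.h>
#include <stdio.h>

/* Argument 2 is the format string and the variadic list starts at
 * argument 3, exactly like the xfs_printk() declaration above. */
static void log_msg(const char *level, const char *fmt, ...)
	__attribute__ ((format (printf, 2, 3)));

static void log_msg(const char *level, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "%s: ", level);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	log_msg("warn", "found %d orphans", 3);
	/* log_msg("warn", "found %d orphans", "x") would warn at compile time */
	return 0;
}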
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 1ba5c451da36..b38e58d02299 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -816,75 +816,6 @@ xfs_setup_devices(
 	return 0;
 }
 
-/*
- * XFS AIL push thread support
- */
-void
-xfsaild_wakeup(
-	struct xfs_ail		*ailp,
-	xfs_lsn_t		threshold_lsn)
-{
-	/* only ever move the target forwards */
-	if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
-		ailp->xa_target = threshold_lsn;
-		wake_up_process(ailp->xa_task);
-	}
-}
-
-STATIC int
-xfsaild(
-	void	*data)
-{
-	struct xfs_ail	*ailp = data;
-	xfs_lsn_t	last_pushed_lsn = 0;
-	long		tout = 0; /* milliseconds */
-
-	while (!kthread_should_stop()) {
-		/*
-		 * for short sleeps indicating congestion, don't allow us to
-		 * get woken early. Otherwise all we do is bang on the AIL lock
-		 * without making progress.
-		 */
-		if (tout && tout <= 20)
-			__set_current_state(TASK_KILLABLE);
-		else
-			__set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(tout ?
-				 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
-
-		/* swsusp */
-		try_to_freeze();
-
-		ASSERT(ailp->xa_mount->m_log);
-		if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
-			continue;
-
-		tout = xfsaild_push(ailp, &last_pushed_lsn);
-	}
-
-	return 0;
-}	/* xfsaild */
-
-int
-xfsaild_start(
-	struct xfs_ail	*ailp)
-{
-	ailp->xa_target = 0;
-	ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
-				    ailp->xa_mount->m_fsname);
-	if (IS_ERR(ailp->xa_task))
-		return -PTR_ERR(ailp->xa_task);
-	return 0;
-}
-
-void
-xfsaild_stop(
-	struct xfs_ail	*ailp)
-{
-	kthread_stop(ailp->xa_task);
-}
-
-
 /* Catch misguided souls that try to use this interface on XFS */
 STATIC struct inode *
 xfs_fs_alloc_inode(
@@ -1191,22 +1122,12 @@ xfs_fs_sync_fs(
 		return -error;
 
 	if (laptop_mode) {
-		int	prev_sync_seq = mp->m_sync_seq;
-
 		/*
 		 * The disk must be active because we're syncing.
 		 * We schedule xfssyncd now (now that the disk is
 		 * active) instead of later (when it might not be).
 		 */
-		wake_up_process(mp->m_sync_task);
-		/*
-		 * We have to wait for the sync iteration to complete.
-		 * If we don't, the disk activity caused by the sync
-		 * will come after the sync is completed, and that
-		 * triggers another sync from laptop mode.
-		 */
-		wait_event(mp->m_wait_single_sync_task,
-				mp->m_sync_seq != prev_sync_seq);
+		flush_delayed_work_sync(&mp->m_sync_work);
 	}
 
 	return 0;
@@ -1490,9 +1411,6 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
-	INIT_LIST_HEAD(&mp->m_sync_list);
-	spin_lock_init(&mp->m_sync_lock);
-	init_waitqueue_head(&mp->m_wait_single_sync_task);
 
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
@@ -1799,6 +1717,38 @@ xfs_destroy_zones(void)
 }
 
 STATIC int __init
+xfs_init_workqueues(void)
+{
+	/*
+	 * max_active is set to 8 to give enough concurency to allow
+	 * multiple work operations on each CPU to run. This allows multiple
+	 * filesystems to be running sync work concurrently, and scales with
+	 * the number of CPUs in the system.
+	 */
+	xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
+	if (!xfs_syncd_wq)
+		goto out;
+
+	xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
+	if (!xfs_ail_wq)
+		goto out_destroy_syncd;
+
+	return 0;
+
+out_destroy_syncd:
+	destroy_workqueue(xfs_syncd_wq);
+out:
+	return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_workqueues(void)
+{
+	destroy_workqueue(xfs_ail_wq);
+	destroy_workqueue(xfs_syncd_wq);
+}
+
+STATIC int __init
 init_xfs_fs(void)
 {
 	int			error;
@@ -1813,10 +1763,14 @@ init_xfs_fs(void)
 	if (error)
 		goto out;
 
-	error = xfs_mru_cache_init();
+	error = xfs_init_workqueues();
 	if (error)
 		goto out_destroy_zones;
 
+	error = xfs_mru_cache_init();
+	if (error)
+		goto out_destroy_wq;
+
 	error = xfs_filestream_init();
 	if (error)
 		goto out_mru_cache_uninit;
@@ -1833,6 +1787,10 @@ init_xfs_fs(void)
 	if (error)
 		goto out_cleanup_procfs;
 
+	error = xfs_init_workqueues();
+	if (error)
+		goto out_sysctl_unregister;
+
 	vfs_initquota();
 
 	error = register_filesystem(&xfs_fs_type);
@@ -1850,6 +1808,8 @@ init_xfs_fs(void)
 	xfs_filestream_uninit();
  out_mru_cache_uninit:
 	xfs_mru_cache_uninit();
+ out_destroy_wq:
+	xfs_destroy_workqueues();
 out_destroy_zones:
 	xfs_destroy_zones();
 out:
@@ -1866,6 +1826,7 @@ exit_xfs_fs(void)
 	xfs_buf_terminate();
 	xfs_filestream_uninit();
 	xfs_mru_cache_uninit();
+	xfs_destroy_workqueues();
 	xfs_destroy_zones();
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 9cf35a688f53..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -39,6 +40,8 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+struct workqueue_struct *xfs_syncd_wq;	/* sync workqueue */
+
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
@@ -431,62 +434,12 @@ xfs_quiesce_attr(
 	xfs_unmountfs_writesb(mp);
 }
 
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
-	struct xfs_mount *mp,
-	void		*data,
-	void		(*syncer)(struct xfs_mount *, void *),
-	struct completion *completion)
-{
-	struct xfs_sync_work *work;
-
-	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
-	INIT_LIST_HEAD(&work->w_list);
-	work->w_syncer = syncer;
-	work->w_data = data;
-	work->w_mount = mp;
-	work->w_completion = completion;
-	spin_lock(&mp->m_sync_lock);
-	list_add_tail(&work->w_list, &mp->m_sync_list);
-	spin_unlock(&mp->m_sync_lock);
-	wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inodes_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-	iput(inode);
-}
-
-void
-xfs_flush_inodes(
-	xfs_inode_t	*ip)
+static void
+xfs_syncd_queue_sync(
	struct xfs_mount        *mp)
 {
-	struct inode	*inode = VFS_I(ip);
-	DECLARE_COMPLETION_ONSTACK(completion);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
-	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
+	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
 /*
@@ -496,9 +449,10 @@ xfs_flush_inodes(
  */
 STATIC void
 xfs_sync_worker(
-	struct xfs_mount *mp,
-	void		*unused)
+	struct work_struct *work)
 {
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_sync_work);
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -508,73 +462,106 @@ xfs_sync_worker(
 			error = xfs_fs_log_dummy(mp);
 		else
 			xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, 0);
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+		/* start pushing all the metadata that is currently dirty */
+		xfs_ail_push_all(mp->m_ail);
 	}
-	mp->m_sync_seq++;
-	wake_up(&mp->m_wait_single_sync_task);
+
+	/* queue us up again */
+	xfs_syncd_queue_sync(mp);
 }
 
-STATIC int
-xfssyncd(
-	void			*arg)
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+	struct xfs_mount        *mp)
 {
-	struct xfs_mount	*mp = arg;
-	long			timeleft;
-	xfs_sync_work_t		*work, *n;
-	LIST_HEAD		(tmp);
-
-	set_freezable();
-	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
-	for (;;) {
-		if (list_empty(&mp->m_sync_list))
-			timeleft = schedule_timeout_interruptible(timeleft);
-		/* swsusp */
-		try_to_freeze();
-		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
-			break;
 
-		spin_lock(&mp->m_sync_lock);
-		/*
-		 * We can get woken by laptop mode, to do a sync -
-		 * that's the (only!) case where the list would be
-		 * empty with time remaining.
-		 */
-		if (!timeleft || list_empty(&mp->m_sync_list)) {
-			if (!timeleft)
-				timeleft = xfs_syncd_centisecs *
-							msecs_to_jiffies(10);
-			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
-			list_add_tail(&mp->m_sync_work.w_list,
-					&mp->m_sync_list);
-		}
-		list_splice_init(&mp->m_sync_list, &tmp);
-		spin_unlock(&mp->m_sync_lock);
+	/*
+	 * We can have inodes enter reclaim after we've shut down the syncd
	 * workqueue during unmount, so don't allow reclaim work to be queued
	 * during unmount.
	 */
+	if (!(mp->m_super->s_flags & MS_ACTIVE))
+		return;
 
-		list_for_each_entry_safe(work, n, &tmp, w_list) {
-			(*work->w_syncer)(mp, work->w_data);
-			list_del(&work->w_list);
-			if (work == &mp->m_sync_work)
-				continue;
-			if (work->w_completion)
-				complete(work->w_completion);
-			kmem_free(work);
-		}
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
+	rcu_read_unlock();
+}
 
-	return 0;
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_syncd_queue_reclaim(mp);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations. At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room.
+ *
+ * Queue a new data flush if there isn't one already in progress and
+ * wait for completion of the flush. This means that we only ever have one
+ * inode flush in progress no matter how many ENOSPC events are occurring and
+ * so will prevent the system from bogging down due to every concurrent
+ * ENOSPC event scanning all the active inodes in the system for writeback.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	queue_work(xfs_syncd_wq, &mp->m_flush_work);
+	flush_work_sync(&mp->m_flush_work);
+}
+
+STATIC void
+xfs_flush_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(work,
+					struct xfs_mount, m_flush_work);
+
+	xfs_sync_data(mp, SYNC_TRYLOCK);
+	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
 int
 xfs_syncd_init(
 	struct xfs_mount	*mp)
 {
-	mp->m_sync_work.w_syncer = xfs_sync_worker;
-	mp->m_sync_work.w_mount = mp;
-	mp->m_sync_work.w_completion = NULL;
-	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
-	if (IS_ERR(mp->m_sync_task))
-		return -PTR_ERR(mp->m_sync_task);
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
+	xfs_syncd_queue_reclaim(mp);
+
 	return 0;
 }
 
@@ -582,7 +569,9 @@ void
 xfs_syncd_stop(
 	struct xfs_mount	*mp)
 {
-	kthread_stop(mp->m_sync_task);
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 }
 
 void
@@ -601,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
 				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_syncd_queue_reclaim(ip->i_mount);
+
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
 	}
@@ -1017,7 +1010,13 @@ xfs_reclaim_inodes(
 }
 
 /*
- * Shrinker infrastructure.
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doiing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
  */
 static int
 xfs_reclaim_inode_shrink(
@@ -1032,10 +1031,15 @@ xfs_reclaim_inode_shrink(
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
+		/* kick background reclaimer and push the AIL */
+		xfs_syncd_queue_reclaim(mp);
+		xfs_ail_push_all(mp->m_ail);
+
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+					&nr_to_scan);
 		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 32ba6628290c..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
+extern struct workqueue_struct *xfs_syncd_wq;	/* sync workqueue */
+
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
 
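
xfs_sync_worker() and xfs_reclaim_worker() above share one pattern: a delayed
work item performs a bounded pass and re-queues itself, and teardown is a
single cancel_delayed_work_sync(), which both waits for a running pass and
blocks the self-re-arm. A minimal module-shaped sketch of that pattern; the
demo_* names and the 30s period are illustrative assumptions:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_work;

static void demo_worker(struct work_struct *work)
{
	pr_info("periodic pass\n");
	/* re-queue ourselves, as xfs_syncd_queue_sync() does */
	queue_delayed_work(demo_wq, &demo_work, msecs_to_jiffies(30000));
}

static int __init demo_init(void)
{
	demo_wq = alloc_workqueue("demo_syncd", WQ_CPU_INTENSIVE, 8);
	if (!demo_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&demo_work, demo_worker);
	queue_delayed_work(demo_wq, &demo_work, msecs_to_jiffies(30000));
	return 0;
}

static void __exit demo_exit(void)
{
	/* waits for a running pass and prevents further re-arming */
	cancel_delayed_work_sync(&demo_work);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");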
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 254ee062bd7d..69228aa8605a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -461,12 +461,10 @@ xfs_qm_dqflush_all(
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
 	int			recl;
 	struct xfs_dquot	*dqp;
-	int			niters;
 	int			error;
 
 	if (!q)
 		return 0;
-	niters = 0;
 again:
 	mutex_lock(&q->qi_dqlist_lock);
 	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
@@ -1314,14 +1312,9 @@ xfs_qm_dqiter_bufs(
 {
 	xfs_buf_t	*bp;
 	int		error;
-	int		notcommitted;
-	int		incr;
 	int		type;
 
 	ASSERT(blkcnt > 0);
-	notcommitted = 0;
-	incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
-		XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
 	type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
 		(flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
 	error = 0;
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index c9446f1c726d..567b29b9f1b3 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -65,11 +65,6 @@ extern kmem_zone_t	*qm_dqtrxzone;
  * block in the dquot/xqm code.
  */
 #define XFS_DQUOT_CLUSTER_SIZE_FSB	(xfs_filblks_t)1
-/*
- * When doing a quotacheck, we log dquot clusters of this many FSBs at most
- * in a single transaction. We don't want to ask for too huge a log reservation.
- */
-#define XFS_QM_MAX_DQCLUSTER_LOGSZ	3
 
 typedef xfs_dqhash_t	xfs_dqlist_t;
 
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 0d62a07b7fd8..2dadb15d5ca9 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -313,14 +313,12 @@ xfs_qm_scall_quotaon(
 {
 	int		error;
 	uint		qf;
-	uint		accflags;
 	__int64_t	sbflags;
 
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
 	/*
 	 * Switching on quota accounting must be done at mount time.
 	 */
-	accflags = flags & XFS_ALL_QUOTA_ACCT;
 	flags &= ~(XFS_ALL_QUOTA_ACCT);
 
 	sbflags = 0;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4bc3c649aee4..27d64d752eab 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2395,17 +2395,33 @@ xfs_free_extent(
 	memset(&args, 0, sizeof(xfs_alloc_arg_t));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
+
+	/*
+	 * validate that the block number is legal - the enables us to detect
+	 * and handle a silent filesystem corruption rather than crashing.
+	 */
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
-	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
+	if (args.agno >= args.mp->m_sb.sb_agcount)
+		return EFSCORRUPTED;
+
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+	if (args.agbno >= args.mp->m_sb.sb_agblocks)
+		return EFSCORRUPTED;
+
 	args.pag = xfs_perag_get(args.mp, args.agno);
-	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
+	ASSERT(args.pag);
+
+	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	if (error)
 		goto error0;
-#ifdef DEBUG
-	ASSERT(args.agbp != NULL);
-	ASSERT((args.agbno + len) <=
-		be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length));
-#endif
+
+	/* validate the extent size is legal now we have the agf locked */
+	if (args.agbno + len >
+	    be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+		error = EFSCORRUPTED;
+		goto error0;
+	}
+
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 error0:
 	xfs_perag_put(args.pag);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 46cc40131d4a..576fdfe81d60 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -198,6 +198,41 @@ xfs_inode_item_size(
 }
 
 /*
+ * xfs_inode_item_format_extents - convert in-core extents to on-disk form
+ *
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode. In this case, we
+ * need to do this conversion before we write the extents into the log. Because
+ * we don't have the disk inode to write into here, we allocate a buffer and
+ * format the extents into it via xfs_iextents_copy(). We free the buffer in
+ * the unlock routine after the copy for the log has been made.
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only log on-disk extents
+ * here, so always use the physical fork size to determine the size of the
+ * buffer we need to allocate.
+ */
+STATIC void
+xfs_inode_item_format_extents(
+	struct xfs_inode	*ip,
+	struct xfs_log_iovec	*vecp,
+	int			whichfork,
+	int			type)
+{
+	xfs_bmbt_rec_t		*ext_buffer;
+
+	ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
+	if (whichfork == XFS_DATA_FORK)
+		ip->i_itemp->ili_extents_buf = ext_buffer;
+	else
+		ip->i_itemp->ili_aextents_buf = ext_buffer;
+
+	vecp->i_addr = ext_buffer;
+	vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
+	vecp->i_type = type;
+}
+
+/*
  * This is called to fill in the vector of log iovecs for the
  * given inode log item. It fills the first item with an inode
  * log format structure, the second with the on-disk inode structure,
@@ -213,7 +248,6 @@ xfs_inode_item_format(
 	struct xfs_inode	*ip = iip->ili_inode;
 	uint			nvecs;
 	size_t			data_bytes;
-	xfs_bmbt_rec_t		*ext_buffer;
 	xfs_mount_t		*mp;
 
 	vecp->i_addr = &iip->ili_format;
@@ -320,22 +354,8 @@ xfs_inode_item_format(
 		} else
 #endif
 		{
-			/*
-			 * There are delayed allocation extents
-			 * in the inode, or we need to convert
-			 * the extents to on disk format.
-			 * Use xfs_iextents_copy()
-			 * to copy only the real extents into
-			 * a separate buffer. We'll free the
-			 * buffer in the unlock routine.
-			 */
-			ext_buffer = kmem_alloc(ip->i_df.if_bytes,
-				KM_SLEEP);
-			iip->ili_extents_buf = ext_buffer;
-			vecp->i_addr = ext_buffer;
-			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
-				XFS_DATA_FORK);
-			vecp->i_type = XLOG_REG_TYPE_IEXT;
+			xfs_inode_item_format_extents(ip, vecp,
+				XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
 		}
 		ASSERT(vecp->i_len <= ip->i_df.if_bytes);
 		iip->ili_format.ilf_dsize = vecp->i_len;
@@ -445,19 +465,12 @@ xfs_inode_item_format(
 			 */
 			vecp->i_addr = ip->i_afp->if_u1.if_extents;
 			vecp->i_len = ip->i_afp->if_bytes;
+			vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
 #else
 			ASSERT(iip->ili_aextents_buf == NULL);
-			/*
-			 * Need to endian flip before logging
-			 */
-			ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
-				KM_SLEEP);
-			iip->ili_aextents_buf = ext_buffer;
-			vecp->i_addr = ext_buffer;
-			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
-				XFS_ATTR_FORK);
+			xfs_inode_item_format_extents(ip, vecp,
+				XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
 #endif
-			vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
 			iip->ili_format.ilf_asize = vecp->i_len;
 			vecp++;
 			nvecs++;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index dc1882adaf54..751e94fe1f77 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -204,7 +204,6 @@ xfs_bulkstat(
 	xfs_agi_t		*agi;	/* agi header data */
 	xfs_agino_t		agino;	/* inode # in allocation group */
 	xfs_agnumber_t		agno;	/* allocation group number */
-	xfs_daddr_t		bno;	/* inode cluster start daddr */
 	int			chunkidx; /* current index into inode chunk */
 	int			clustidx; /* current index into inode cluster */
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
@@ -463,7 +462,6 @@ xfs_bulkstat(
 					mp->m_sb.sb_inopblog);
 			}
 			ino = XFS_AGINO_TO_INO(mp, agno, agino);
-			bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
 			/*
 			 * Skip if this inode is free.
 			 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 25efa9b8a602..b612ce4520ae 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -761,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
761 break; 761 break;
762 case XLOG_STATE_COVER_NEED: 762 case XLOG_STATE_COVER_NEED:
763 case XLOG_STATE_COVER_NEED2: 763 case XLOG_STATE_COVER_NEED2:
764 if (!xfs_trans_ail_tail(log->l_ailp) && 764 if (!xfs_ail_min_lsn(log->l_ailp) &&
765 xlog_iclogs_empty(log)) { 765 xlog_iclogs_empty(log)) {
766 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 766 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
767 log->l_covered_state = XLOG_STATE_COVER_DONE; 767 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -801,7 +801,7 @@ xlog_assign_tail_lsn(
801 xfs_lsn_t tail_lsn; 801 xfs_lsn_t tail_lsn;
802 struct log *log = mp->m_log; 802 struct log *log = mp->m_log;
803 803
804 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 804 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
805 if (!tail_lsn) 805 if (!tail_lsn)
806 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 806 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
807 807
@@ -1239,7 +1239,7 @@ xlog_grant_push_ail(
1239 * the filesystem is shutting down. 1239 * the filesystem is shutting down.
1240 */ 1240 */
1241 if (!XLOG_FORCED_SHUTDOWN(log)) 1241 if (!XLOG_FORCED_SHUTDOWN(log))
1242 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1242 xfs_ail_push(log->l_ailp, threshold_lsn);
1243} 1243}
1244 1244
1245/* 1245/*
@@ -3407,6 +3407,17 @@ xlog_verify_dest_ptr(
3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); 3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3408} 3408}
3409 3409
3410/*
3411 * Check to make sure the grant write head didn't just overlap the tail. If
3412 * the cycles are the same, we can't be overlapping. Otherwise, make sure that
3413 * the cycles differ by exactly one and check the byte count.
3414 *
3415 * This check is run unlocked, so it can give false positives. Rather than
3416 * assert on failures, use a warn-once flag and a panic tag to allow the admin
3417 * to determine if they want to panic the machine when such an error occurs.
3418 * For debug kernels this will have the same effect as using an assert but,
3419 * unlike an assert, it can be turned off at runtime.
3420 */
3410STATIC void 3421STATIC void
3411xlog_verify_grant_tail( 3422xlog_verify_grant_tail(
3412 struct log *log) 3423 struct log *log)
@@ -3414,17 +3425,22 @@ xlog_verify_grant_tail(
3414 int tail_cycle, tail_blocks; 3425 int tail_cycle, tail_blocks;
3415 int cycle, space; 3426 int cycle, space;
3416 3427
3417 /*
3418 * Check to make sure the grant write head didn't just over lap the
3419 * tail. If the cycles are the same, we can't be overlapping.
3420 * Otherwise, make sure that the cycles differ by exactly one and
3421 * check the byte count.
3422 */
3423 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); 3428 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3424 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3429 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3425 if (tail_cycle != cycle) { 3430 if (tail_cycle != cycle) {
3426 ASSERT(cycle - 1 == tail_cycle); 3431 if (cycle - 1 != tail_cycle &&
3427 ASSERT(space <= BBTOB(tail_blocks)); 3432 !(log->l_flags & XLOG_TAIL_WARN)) {
3433 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3434 "%s: cycle - 1 != tail_cycle", __func__);
3435 log->l_flags |= XLOG_TAIL_WARN;
3436 }
3437
3438 if (space > BBTOB(tail_blocks) &&
3439 !(log->l_flags & XLOG_TAIL_WARN)) {
3440 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3441 "%s: space > BBTOB(tail_blocks)", __func__);
3442 log->l_flags |= XLOG_TAIL_WARN;
3443 }
3428 } 3444 }
3429} 3445}
3430 3446
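
The rewritten check trades ASSERT() for xfs_alert_tag() precisely so the failure mode is configurable. A simplified sketch of how a tagged alert escalates to a BUG under admin control, modelled on the xfs_message.c helpers this merge also touches (approximate, not quoted from the patch):

/*
 * Emit an alert and, if the admin has set the matching bit in the
 * xfs_panic_mask sysctl, turn it into a BUG.
 */
void
xfs_alert_tag(
	const struct xfs_mount	*mp,
	int			panic_tag,
	const char		*fmt, ...)
{
	int	do_panic = 0;

	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
		xfs_alert(mp, "Transforming an alert into a BUG.");
		do_panic = 1;
	}

	/* ... format @fmt and print it at KERN_ALERT here ... */

	BUG_ON(do_panic);		/* escalation is off by default */
}

Setting the XFS_PTAG_LOGRES bit in /proc/sys/fs/xfs/panic_mask therefore makes the warn-once message above fatal, approximating the old ASSERT() behaviour while remaining switchable at runtime.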
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ffae692c9832..5864850e9e34 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -144,6 +144,7 @@ static inline uint xlog_get_client_id(__be32 i)
144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
146 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
147 148
148#ifdef __KERNEL__ 149#ifdef __KERNEL__
149/* 150/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a62e8971539d..19af0ab0d0c6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -203,12 +203,9 @@ typedef struct xfs_mount {
203 struct mutex m_icsb_mutex; /* balancer sync lock */ 203 struct mutex m_icsb_mutex; /* balancer sync lock */
204#endif 204#endif
205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
206 struct task_struct *m_sync_task; /* generalised sync thread */ 206 struct delayed_work m_sync_work; /* background sync work */
207 xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ 207 struct delayed_work m_reclaim_work; /* background inode reclaim */
208 struct list_head m_sync_list; /* sync thread work item list */ 208 struct work_struct m_flush_work; /* background inode flush */
209 spinlock_t m_sync_lock; /* work item list lock */
210 int m_sync_seq; /* sync thread generation no. */
211 wait_queue_head_t m_wait_single_sync_task;
212 __int64_t m_update_flags; /* sb flags we need to update 209 __int64_t m_update_flags; /* sb flags we need to update
213 on the next remount,rw */ 210 on the next remount,rw */
214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
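
The xfssyncd thread machinery in the mount structure is replaced by three work items on a shared workqueue. A minimal sketch of the resulting lifecycle; the worker function names are assumptions here, since the xfs_sync.c hunks that define them are not quoted in this section:

static void
xfs_syncd_init_sketch(
	struct xfs_mount	*mp)
{
	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);

	/* first background sync fires xfs_syncd_centisecs from now */
	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
			   msecs_to_jiffies(xfs_syncd_centisecs * 10));
}

static void
xfs_syncd_stop_sketch(
	struct xfs_mount	*mp)
{
	/* wait out any queued or running instance before teardown */
	cancel_delayed_work_sync(&mp->m_sync_work);
	cancel_delayed_work_sync(&mp->m_reclaim_work);
	cancel_work_sync(&mp->m_flush_work);
}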
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 12aff9584e29..acdb92f14d51 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,74 +28,138 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); 31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 32
36#ifdef DEBUG 33#ifdef DEBUG
37STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); 34/*
38#else 35 * Check that the list is sorted as it should be.
36 */
37STATIC void
38xfs_ail_check(
39 struct xfs_ail *ailp,
40 xfs_log_item_t *lip)
41{
42 xfs_log_item_t *prev_lip;
43
44 if (list_empty(&ailp->xa_ail))
45 return;
46
47 /*
48 * Check the next and previous entries are valid.
49 */
50 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
51 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
52 if (&prev_lip->li_ail != &ailp->xa_ail)
53 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
54
55 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
56 if (&prev_lip->li_ail != &ailp->xa_ail)
57 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
58
59
60#ifdef XFS_TRANS_DEBUG
61 /*
62 * Walk the list checking lsn ordering, and that every entry has the
63 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
64 * when specifically debugging the transaction subsystem.
65 */
66 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
67 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
68 if (&prev_lip->li_ail != &ailp->xa_ail)
69 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
70 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
71 prev_lip = lip;
72 }
73#endif /* XFS_TRANS_DEBUG */
74}
75#else /* !DEBUG */
39#define xfs_ail_check(a,l) 76#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 77#endif /* DEBUG */
41 78
79/*
80 * Return a pointer to the first item in the AIL. If the AIL is empty, then
81 * return NULL.
82 */
83static xfs_log_item_t *
84xfs_ail_min(
85 struct xfs_ail *ailp)
86{
87 if (list_empty(&ailp->xa_ail))
88 return NULL;
89
90 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
91}
92
93/*
94 * Return a pointer to the last item in the AIL. If the AIL is empty, then
95 * return NULL.
96 */
97static xfs_log_item_t *
98xfs_ail_max(
99 struct xfs_ail *ailp)
100{
101 if (list_empty(&ailp->xa_ail))
102 return NULL;
103
104 return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
105}
106
107/*
108 * Return a pointer to the item which follows the given item in the AIL. If
109 * the given item is the last item in the list, then return NULL.
110 */
111static xfs_log_item_t *
112xfs_ail_next(
113 struct xfs_ail *ailp,
114 xfs_log_item_t *lip)
115{
116 if (lip->li_ail.next == &ailp->xa_ail)
117 return NULL;
118
119 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
120}
42 121
43/* 122/*
44 * This is called by the log manager code to determine the LSN 123 * This is called by the log manager code to determine the LSN of the tail of
45 * of the tail of the log. This is exactly the LSN of the first 124 * the log. This is exactly the LSN of the first item in the AIL. If the AIL
46 * item in the AIL. If the AIL is empty, then this function 125 * is empty, then this function returns 0.
47 * returns 0.
48 * 126 *
49 * We need the AIL lock in order to get a coherent read of the 127 * We need the AIL lock in order to get a coherent read of the lsn of the last
50 * lsn of the last item in the AIL. 128 * item in the AIL.
51 */ 129 */
52xfs_lsn_t 130xfs_lsn_t
53xfs_trans_ail_tail( 131xfs_ail_min_lsn(
54 struct xfs_ail *ailp) 132 struct xfs_ail *ailp)
55{ 133{
56 xfs_lsn_t lsn; 134 xfs_lsn_t lsn = 0;
57 xfs_log_item_t *lip; 135 xfs_log_item_t *lip;
58 136
59 spin_lock(&ailp->xa_lock); 137 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(ailp); 138 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 139 if (lip)
62 lsn = (xfs_lsn_t)0;
63 } else {
64 lsn = lip->li_lsn; 140 lsn = lip->li_lsn;
65 }
66 spin_unlock(&ailp->xa_lock); 141 spin_unlock(&ailp->xa_lock);
67 142
68 return lsn; 143 return lsn;
69} 144}
70 145
71/* 146/*
72 * xfs_trans_push_ail 147 * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
73 *
74 * This routine is called to move the tail of the AIL forward. It does this by
75 * trying to flush items in the AIL whose lsns are below the given
76 * threshold_lsn.
77 *
78 * the push is run asynchronously in a separate thread, so we return the tail
79 * of the log right now instead of the tail after the push. This means we will
80 * either continue right away, or we will sleep waiting on the async thread to
81 * do its work.
82 *
83 * We do this unlocked - we only need to know whether there is anything in the
84 * AIL at the time we are called. We don't need to access the contents of
85 * any of the objects, so the lock is not needed.
86 */ 148 */
87void 149static xfs_lsn_t
88xfs_trans_ail_push( 150xfs_ail_max_lsn(
89 struct xfs_ail *ailp, 151 struct xfs_ail *ailp)
90 xfs_lsn_t threshold_lsn)
91{ 152{
92 xfs_log_item_t *lip; 153 xfs_lsn_t lsn = 0;
154 xfs_log_item_t *lip;
93 155
94 lip = xfs_ail_min(ailp); 156 spin_lock(&ailp->xa_lock);
95 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 157 lip = xfs_ail_max(ailp);
96 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) 158 if (lip)
97 xfsaild_wakeup(ailp, threshold_lsn); 159 lsn = lip->li_lsn;
98 } 160 spin_unlock(&ailp->xa_lock);
161
162 return lsn;
99} 163}
100 164
101/* 165/*
@@ -236,16 +300,57 @@ out:
236} 300}
237 301
238/* 302/*
239 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of 303 * Splice the log item list into the AIL at the given LSN.
240 * zero indicates that the caller should sleep until woken.
241 */ 304 */
242long 305static void
243xfsaild_push( 306xfs_ail_splice(
244 struct xfs_ail *ailp, 307 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 308 struct list_head *list,
309 xfs_lsn_t lsn)
246{ 310{
247 long tout = 0; 311 xfs_log_item_t *next_lip;
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 312
313 /* If the list is empty, just insert the item. */
314 if (list_empty(&ailp->xa_ail)) {
315 list_splice(list, &ailp->xa_ail);
316 return;
317 }
318
319 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
320 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
321 break;
322 }
323
324 ASSERT(&next_lip->li_ail == &ailp->xa_ail ||
325 XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0);
326
327 list_splice_init(list, &next_lip->li_ail);
328}
329
330/*
331 * Delete the given item from the AIL.
332 */
333static void
334xfs_ail_delete(
335 struct xfs_ail *ailp,
336 xfs_log_item_t *lip)
337{
338 xfs_ail_check(ailp, lip);
339 list_del(&lip->li_ail);
340 xfs_trans_ail_cursor_clear(ailp, lip);
341}
342
343/*
344 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
345 * to run at a later time if there is more work to do to complete the push.
346 */
347STATIC void
348xfs_ail_worker(
349 struct work_struct *work)
350{
351 struct xfs_ail *ailp = container_of(to_delayed_work(work),
352 struct xfs_ail, xa_work);
353 long tout;
249 xfs_lsn_t target = ailp->xa_target; 354 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 355 xfs_lsn_t lsn;
251 xfs_log_item_t *lip; 356 xfs_log_item_t *lip;
@@ -256,15 +361,15 @@ xfsaild_push(
256 361
257 spin_lock(&ailp->xa_lock); 362 spin_lock(&ailp->xa_lock);
258 xfs_trans_ail_cursor_init(ailp, cur); 363 xfs_trans_ail_cursor_init(ailp, cur);
259 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); 364 lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn);
260 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 365 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
261 /* 366 /*
262 * AIL is empty or our push has reached the end. 367 * AIL is empty or our push has reached the end.
263 */ 368 */
264 xfs_trans_ail_cursor_done(ailp, cur); 369 xfs_trans_ail_cursor_done(ailp, cur);
265 spin_unlock(&ailp->xa_lock); 370 spin_unlock(&ailp->xa_lock);
266 *last_lsn = 0; 371 ailp->xa_last_pushed_lsn = 0;
267 return tout; 372 return;
268 } 373 }
269 374
270 XFS_STATS_INC(xs_push_ail); 375 XFS_STATS_INC(xs_push_ail);
@@ -301,13 +406,13 @@ xfsaild_push(
301 case XFS_ITEM_SUCCESS: 406 case XFS_ITEM_SUCCESS:
302 XFS_STATS_INC(xs_push_ail_success); 407 XFS_STATS_INC(xs_push_ail_success);
303 IOP_PUSH(lip); 408 IOP_PUSH(lip);
304 last_pushed_lsn = lsn; 409 ailp->xa_last_pushed_lsn = lsn;
305 break; 410 break;
306 411
307 case XFS_ITEM_PUSHBUF: 412 case XFS_ITEM_PUSHBUF:
308 XFS_STATS_INC(xs_push_ail_pushbuf); 413 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 414 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 415 ailp->xa_last_pushed_lsn = lsn;
311 push_xfsbufd = 1; 416 push_xfsbufd = 1;
312 break; 417 break;
313 418
@@ -319,7 +424,7 @@ xfsaild_push(
319 424
320 case XFS_ITEM_LOCKED: 425 case XFS_ITEM_LOCKED:
321 XFS_STATS_INC(xs_push_ail_locked); 426 XFS_STATS_INC(xs_push_ail_locked);
322 last_pushed_lsn = lsn; 427 ailp->xa_last_pushed_lsn = lsn;
323 stuck++; 428 stuck++;
324 break; 429 break;
325 430
@@ -374,9 +479,23 @@ xfsaild_push(
374 wake_up_process(mp->m_ddev_targp->bt_task); 479 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 480 }
376 481
482 /* assume we have more work to do in a short while */
483 tout = 10;
377 if (!count) { 484 if (!count) {
378 /* We're past our target or empty, so idle */ 485 /* We're past our target or empty, so idle */
379 last_pushed_lsn = 0; 486 ailp->xa_last_pushed_lsn = 0;
487
488 /*
489 * Check for an updated push target before clearing the
490 * XFS_AIL_PUSHING_BIT. If the target changed, we've got more
491 * work to do. Wait a bit longer before starting that work.
492 */
493 smp_rmb();
494 if (ailp->xa_target == target) {
495 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
496 return;
497 }
498 tout = 50;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 499 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 500 /*
382 * We reached the target so wait a bit longer for I/O to 501 * We reached the target so wait a bit longer for I/O to
@@ -384,7 +503,7 @@ xfsaild_push(
384 * start the next scan from the start of the AIL. 503 * start the next scan from the start of the AIL.
385 */ 504 */
386 tout = 50; 505 tout = 50;
387 last_pushed_lsn = 0; 506 ailp->xa_last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 507 } else if ((stuck * 100) / count > 90) {
389 /* 508 /*
390 * Either there is a lot of contention on the AIL or we 509 * Either there is a lot of contention on the AIL or we
@@ -396,14 +515,61 @@ xfsaild_push(
396 * continuing from where we were. 515 * continuing from where we were.
397 */ 516 */
398 tout = 20; 517 tout = 20;
399 } else {
400 /* more to do, but wait a short while before continuing */
401 tout = 10;
402 } 518 }
403 *last_lsn = last_pushed_lsn; 519
404 return tout; 520 /* There is more to do, requeue us. */
521 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
522 msecs_to_jiffies(tout));
523}
524
525/*
526 * This routine is called to move the tail of the AIL forward. It does this by
527 * trying to flush items in the AIL whose lsns are below the given
528 * threshold_lsn.
529 *
530 * The push is run asynchronously in a workqueue, which means the caller needs
531 * to handle waiting on the async flush for space to become available.
532 * We don't want to interrupt any push that is in progress, hence we only queue
533 * work if we set the pushing bit appropriately.
534 *
535 * We do this unlocked - we only need to know whether there is anything in the
536 * AIL at the time we are called. We don't need to access the contents of
537 * any of the objects, so the lock is not needed.
538 */
539void
540xfs_ail_push(
541 struct xfs_ail *ailp,
542 xfs_lsn_t threshold_lsn)
543{
544 xfs_log_item_t *lip;
545
546 lip = xfs_ail_min(ailp);
547 if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
548 XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
549 return;
550
551 /*
552 * Ensure that the new target is noticed in push code before it clears
553 * the XFS_AIL_PUSHING_BIT.
554 */
555 smp_wmb();
556 ailp->xa_target = threshold_lsn;
557 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
558 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
405} 559}
406 560
561/*
562 * Push out all items in the AIL immediately
563 */
564void
565xfs_ail_push_all(
566 struct xfs_ail *ailp)
567{
568 xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
569
570 if (threshold_lsn)
571 xfs_ail_push(ailp, threshold_lsn);
572}
407 573
408/* 574/*
409 * This is to be called when an item is unlocked that may have 575 * This is to be called when an item is unlocked that may have
@@ -615,7 +781,6 @@ xfs_trans_ail_init(
615 xfs_mount_t *mp) 781 xfs_mount_t *mp)
616{ 782{
617 struct xfs_ail *ailp; 783 struct xfs_ail *ailp;
618 int error;
619 784
620 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 785 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
621 if (!ailp) 786 if (!ailp)
@@ -624,15 +789,9 @@ xfs_trans_ail_init(
624 ailp->xa_mount = mp; 789 ailp->xa_mount = mp;
625 INIT_LIST_HEAD(&ailp->xa_ail); 790 INIT_LIST_HEAD(&ailp->xa_ail);
626 spin_lock_init(&ailp->xa_lock); 791 spin_lock_init(&ailp->xa_lock);
627 error = xfsaild_start(ailp); 792 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
628 if (error)
629 goto out_free_ailp;
630 mp->m_ail = ailp; 793 mp->m_ail = ailp;
631 return 0; 794 return 0;
632
633out_free_ailp:
634 kmem_free(ailp);
635 return error;
636} 795}
637 796
638void 797void
@@ -641,124 +800,6 @@ xfs_trans_ail_destroy(
641{ 800{
642 struct xfs_ail *ailp = mp->m_ail; 801 struct xfs_ail *ailp = mp->m_ail;
643 802
644 xfsaild_stop(ailp); 803 cancel_delayed_work_sync(&ailp->xa_work);
645 kmem_free(ailp); 804 kmem_free(ailp);
646} 805}
647
648/*
649 * splice the log item list into the AIL at the given LSN.
650 */
651STATIC void
652xfs_ail_splice(
653 struct xfs_ail *ailp,
654 struct list_head *list,
655 xfs_lsn_t lsn)
656{
657 xfs_log_item_t *next_lip;
658
659 /*
660 * If the list is empty, just insert the item.
661 */
662 if (list_empty(&ailp->xa_ail)) {
663 list_splice(list, &ailp->xa_ail);
664 return;
665 }
666
667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
669 break;
670 }
671
672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
674
675 list_splice_init(list, &next_lip->li_ail);
676 return;
677}
678
679/*
680 * Delete the given item from the AIL. Return a pointer to the item.
681 */
682STATIC void
683xfs_ail_delete(
684 struct xfs_ail *ailp,
685 xfs_log_item_t *lip)
686{
687 xfs_ail_check(ailp, lip);
688 list_del(&lip->li_ail);
689 xfs_trans_ail_cursor_clear(ailp, lip);
690}
691
692/*
693 * Return a pointer to the first item in the AIL.
694 * If the AIL is empty, then return NULL.
695 */
696STATIC xfs_log_item_t *
697xfs_ail_min(
698 struct xfs_ail *ailp)
699{
700 if (list_empty(&ailp->xa_ail))
701 return NULL;
702
703 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
704}
705
706/*
707 * Return a pointer to the item which follows
708 * the given item in the AIL. If the given item
709 * is the last item in the list, then return NULL.
710 */
711STATIC xfs_log_item_t *
712xfs_ail_next(
713 struct xfs_ail *ailp,
714 xfs_log_item_t *lip)
715{
716 if (lip->li_ail.next == &ailp->xa_ail)
717 return NULL;
718
719 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
720}
721
722#ifdef DEBUG
723/*
724 * Check that the list is sorted as it should be.
725 */
726STATIC void
727xfs_ail_check(
728 struct xfs_ail *ailp,
729 xfs_log_item_t *lip)
730{
731 xfs_log_item_t *prev_lip;
732
733 if (list_empty(&ailp->xa_ail))
734 return;
735
736 /*
737 * Check the next and previous entries are valid.
738 */
739 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
740 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
741 if (&prev_lip->li_ail != &ailp->xa_ail)
742 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
743
744 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
745 if (&prev_lip->li_ail != &ailp->xa_ail)
746 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
747
748
749#ifdef XFS_TRANS_DEBUG
750 /*
751 * Walk the list checking lsn ordering, and that every entry has the
752 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
753 * when specifically debugging the transaction subsystem.
754 */
755 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
756 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
757 if (&prev_lip->li_ail != &ailp->xa_ail)
758 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
759 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
760 prev_lip = lip;
761 }
762#endif /* XFS_TRANS_DEBUG */
763}
764#endif /* DEBUG */
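
xfs_ail_splice() depends on the AIL staying sorted by ascending LSN: walking the list backwards finds the last entry with an LSN at or below the insertion LSN, which is usually near the tail since bulk inserts tend to carry the newest LSNs. For example, splicing items at LSN 120 into a list holding LSNs [100, 110, 130] walks back past 130, stops at 110, and yields [100, 110, 120..., 130]. A standalone sketch of the same technique on plain kernel lists (the item type is illustrative):

#include <linux/list.h>
#include <linux/types.h>

struct item {
	struct list_head	list;
	u64			lsn;	/* ascending sort key */
};

/*
 * Splice @new_items (all carrying @lsn) into the sorted list @head:
 * walk backwards to the last entry with lsn <= @lsn and insert the
 * sublist right after it.
 */
static void
splice_sorted(
	struct list_head	*head,
	struct list_head	*new_items,
	u64			lsn)
{
	struct item		*cur;

	if (list_empty(head)) {
		list_splice(new_items, head);
		return;
	}

	list_for_each_entry_reverse(cur, head, list) {
		if (cur->lsn <= lsn)
			break;
	}

	/*
	 * If no entry qualified, the loop terminated with &cur->list ==
	 * head, so this splices at the front; otherwise it splices
	 * immediately after the entry we broke on.
	 */
	list_splice_init(new_items, &cur->list);
}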
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 35162c238fa3..6b164e9e9a1f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -65,16 +65,22 @@ struct xfs_ail_cursor {
65struct xfs_ail { 65struct xfs_ail {
66 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
67 struct list_head xa_ail; 67 struct list_head xa_ail;
68 uint xa_gen;
69 struct task_struct *xa_task;
70 xfs_lsn_t xa_target; 68 xfs_lsn_t xa_target;
71 struct xfs_ail_cursor xa_cursors; 69 struct xfs_ail_cursor xa_cursors;
72 spinlock_t xa_lock; 70 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
73}; 74};
74 75
76#define XFS_AIL_PUSHING_BIT 0
77
75/* 78/*
76 * From xfs_trans_ail.c 79 * From xfs_trans_ail.c
77 */ 80 */
81
82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
83
78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, 84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
79 struct xfs_log_item **log_items, int nr_items, 85 struct xfs_log_item **log_items, int nr_items,
80 xfs_lsn_t lsn) __releases(ailp->xa_lock); 86 xfs_lsn_t lsn) __releases(ailp->xa_lock);
@@ -98,12 +104,13 @@ xfs_trans_ail_delete(
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1); 104 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99} 105}
100 106
101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 107void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
108void xfs_ail_push_all(struct xfs_ail *);
109xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
110
102void xfs_trans_unlocked_item(struct xfs_ail *, 111void xfs_trans_unlocked_item(struct xfs_ail *,
103 xfs_log_item_t *); 112 xfs_log_item_t *);
104 113
105xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
106
107struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 114struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
108 struct xfs_ail_cursor *cur, 115 struct xfs_ail_cursor *cur,
109 xfs_lsn_t lsn); 116 xfs_lsn_t lsn);
@@ -112,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
112void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, 119void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
113 struct xfs_ail_cursor *cur); 120 struct xfs_ail_cursor *cur);
114 121
115long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
116void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
117int xfsaild_start(struct xfs_ail *);
118void xfsaild_stop(struct xfs_ail *);
119
120#if BITS_PER_LONG != 64 122#if BITS_PER_LONG != 64
121static inline void 123static inline void
122xfs_trans_ail_copy_lsn( 124xfs_trans_ail_copy_lsn(
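
XFS_AIL_PUSHING_BIT is what keeps xfs_ail_push() and xfs_ail_worker() from losing a target update between "worker decides to idle" and "pusher declines to queue": the pusher publishes the target with smp_wmb() and only queues work when it wins test_and_set_bit(), while the worker re-checks the target after smp_rmb() before clearing the bit. A condensed sketch of the handoff, reusing names from the hunks above (xfs_ail_worker_idle is a hypothetical wrapper for exposition; in the patch this logic is inline in xfs_ail_worker()):

/* Producer: publish the new target and queue the worker if idle. */
void
xfs_ail_push(
	struct xfs_ail	*ailp,
	xfs_lsn_t	threshold_lsn)
{
	smp_wmb();			/* pairs with smp_rmb() below */
	ailp->xa_target = threshold_lsn;
	if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
		queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
}

/* Consumer: called when the worker has nothing left to push. */
static void
xfs_ail_worker_idle(
	struct xfs_ail	*ailp,
	xfs_lsn_t	target)
{
	smp_rmb();
	if (ailp->xa_target == target) {
		/* truly idle; the next xfs_ail_push() requeues us */
		clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
		return;
	}
	/* the target moved meanwhile: requeue to pick up the work */
	queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
			   msecs_to_jiffies(50));
}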