42 files changed, 863 insertions, 342 deletions
diff --git a/fs/bio.c b/fs/bio.c
index 062299acbccd..72ab251cdb9c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -302,9 +302,10 @@ void bio_init(struct bio *bio)
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
        struct bio *bio = NULL;
+        void *p;
        if (bs) {
-                void *p = mempool_alloc(bs->bio_pool, gfp_mask);
+                p = mempool_alloc(bs->bio_pool, gfp_mask);
                if (p)
                        bio = p + bs->front_pad;
@@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
                        }
                        if (unlikely(!bvl)) {
                                if (bs)
-                                        mempool_free(bio, bs->bio_pool);
+                                        mempool_free(p, bs->bio_pool);
                                else
                                        kfree(bio);
                                bio = NULL;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35443cc4b9a9..42491d728e99 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int level, int slot);
-inline void btrfs_init_path(struct btrfs_path *p)
-{
-        memset(p, 0, sizeof(*p));
-}
 struct btrfs_path *btrfs_alloc_path(void)
 {
        struct btrfs_path *path;
-        path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
+        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-        if (path) {
+        if (path)
-                btrfs_init_path(path);
                path->reada = 1;
-        }
        return path;
 }
@@ -69,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 /*
 * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy: when lockdep is enabled,
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path.  You can safely use NULL
+ * for held.
 */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+                                        struct extent_buffer *held)
 {
        int i;
-        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        /* lockdep really cares that we take all of these spinlocks
+         * in the right order.  If any of the locks in the path are not
+         * currently blocking, it is going to complain.  So, make really
+         * really sure by forcing the path to blocking before we clear
+         * the path blocking.
+         */
+        if (held)
+                btrfs_set_lock_blocking(held);
+        btrfs_set_path_blocking(p);
+#endif
+        for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
                if (p->nodes[i] && p->locks[i])
                        btrfs_clear_lock_blocking(p->nodes[i]);
        }
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        if (held)
+                btrfs_clear_lock_blocking(held);
+#endif
 }
 /* this also releases the path */
@@ -286,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                                                  trans->transid, level, &ins);
                BUG_ON(ret);
                cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-                                            buf->len);
+                                            buf->len, level);
        } else {
                cow = btrfs_alloc_free_block(trans, root, buf->len,
                                             parent_start,
@@ -917,9 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                /* promote the child to a root */
                child = read_node_slot(root, mid, 0);
+                BUG_ON(!child);
                btrfs_tree_lock(child);
                btrfs_set_lock_blocking(child);
-                BUG_ON(!child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
                BUG_ON(ret);
@@ -1566,7 +1583,7 @@ cow_done:
                if (!p->skip_locking)
                        p->locks[level] = 1;
-                btrfs_clear_path_blocking(p);
+                btrfs_clear_path_blocking(p, NULL);
                /*
                 * we have a lock on b and as long as we aren't changing
@@ -1605,7 +1622,7 @@ cow_done:
                                btrfs_set_path_blocking(p);
                                sret = split_node(trans, root, p, level);
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                BUG_ON(sret > 0);
                                if (sret) {
@@ -1625,7 +1642,7 @@ cow_done:
                                btrfs_set_path_blocking(p);
                                sret = balance_level(trans, root, p, level);
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                if (sret) {
                                        ret = sret;
@@ -1688,13 +1705,13 @@ cow_done:
                        if (!p->skip_locking) {
                                int lret;
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                lret = btrfs_try_spin_lock(b);
                                if (!lret) {
                                        btrfs_set_path_blocking(p);
                                        btrfs_tree_lock(b);
-                                        btrfs_clear_path_blocking(p);
+                                        btrfs_clear_path_blocking(p, b);
                                }
                        }
                } else {
@@ -1706,7 +1723,7 @@ cow_done:
                                btrfs_set_path_blocking(p);
                                sret = split_leaf(trans, root, key,
                                                      p, ins_len, ret == 0);
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                BUG_ON(sret > 0);
                                if (sret) {
@@ -3926,7 +3943,6 @@ find_next_key:
                                btrfs_release_path(root, path);
                                goto again;
                        } else {
-                                btrfs_clear_path_blocking(path);
                                goto out;
                        }
                }
@@ -3946,7 +3962,7 @@ find_next_key:
                path->locks[level - 1] = 1;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1);
-                btrfs_clear_path_blocking(path);
+                btrfs_clear_path_blocking(path, NULL);
        }
 out:
        if (ret == 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 531db112c8bd..766b31ae3186 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
-#ifdef CONFIG_LOCKDEP
+#define BTRFS_MAX_LEVEL 8
-# define BTRFS_MAX_LEVEL 7
-#else
-# define BTRFS_MAX_LEVEL 8
-#endif
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -1715,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                             u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
-                                            u64 bytenr, u32 blocksize);
+                                            u64 bytenr, u32 blocksize,
+                                            int level);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
                       u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1834,9 +1831,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
-void btrfs_init_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5aebddd71193..adda739a0215 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -75,6 +75,40 @@ struct async_submit_bio {
        struct btrfs_work work;
 };
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
+ * code needs to be updated as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+        /* leaf */
+        "btrfs-extent-00",
+        "btrfs-extent-01",
+        "btrfs-extent-02",
+        "btrfs-extent-03",
+        "btrfs-extent-04",
+        "btrfs-extent-05",
+        "btrfs-extent-06",
+        "btrfs-extent-07",
+        /* highest possible level */
+        "btrfs-extent-08",
+};
+#endif
 /*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
@@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
        return ret;
 }
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+        lockdep_set_class_and_name(&eb->lock,
+                           &btrfs_eb_class[level],
+                           btrfs_eb_name[level]);
+}
+#endif
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state)
 {
@@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        }
        found_level = btrfs_header_level(eb);
+        btrfs_set_buffer_lockdep_class(eb, found_level);
        ret = csum_tree_block(root, eb, 1);
        if (ret)
                ret = -EIO;
@@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
        dev_root->track_dirty = 1;
        if (ret)
                goto fail_extent_root;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 494a56eb2986..95029db227be 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+                                                 int level)
+{
+}
+#endif
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7527523c2d2d..0a5d796c9f7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1323,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root)
 {
-        finish_current_insert(trans, root->fs_info->extent_root, 1);
+        u64 start;
-        del_pending_extents(trans, root->fs_info->extent_root, 1);
+        u64 end;
+        int ret;
+        while(1) {
+                finish_current_insert(trans, root->fs_info->extent_root, 1);
+                del_pending_extents(trans, root->fs_info->extent_root, 1);
+                /* is there more work to do? */
+                ret = find_first_extent_bit(&root->fs_info->pending_del,
+                                            0, &start, &end, EXTENT_WRITEBACK);
+                if (!ret)
+                        continue;
+                ret = find_first_extent_bit(&root->fs_info->extent_ins,
+                                            0, &start, &end, EXTENT_WRITEBACK);
+                if (!ret)
+                        continue;
+                break;
+        }
        return 0;
 }
@@ -2211,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
        u64 end;
        u64 priv;
        u64 search = 0;
-        u64 skipped = 0;
        struct btrfs_fs_info *info = extent_root->fs_info;
        struct btrfs_path *path;
        struct pending_extent_op *extent_op, *tmp;
        struct list_head insert_list, update_list;
        int ret;
-        int num_inserts = 0, max_inserts;
+        int num_inserts = 0, max_inserts, restart = 0;
        path = btrfs_alloc_path();
        INIT_LIST_HEAD(&insert_list);
@@ -2233,19 +2249,19 @@ again:
                ret = find_first_extent_bit(&info->extent_ins, search, &start,
                                            &end, EXTENT_WRITEBACK);
                if (ret) {
-                        if (skipped && all && !num_inserts &&
+                        if (restart && !num_inserts &&
                            list_empty(&update_list)) {
-                                skipped = 0;
+                                restart = 0;
                                search = 0;
                                continue;
                        }
-                        mutex_unlock(&info->extent_ins_mutex);
                        break;
                }
                ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
                if (!ret) {
-                        skipped = 1;
+                        if (all)
+                                restart = 1;
                        search = end + 1;
                        if (need_resched()) {
                                mutex_unlock(&info->extent_ins_mutex);
@@ -2264,7 +2280,7 @@ again:
                        list_add_tail(&extent_op->list, &insert_list);
                        search = end + 1;
                        if (num_inserts == max_inserts) {
-                                mutex_unlock(&info->extent_ins_mutex);
+                                restart = 1;
                                break;
                        }
                } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2280,7 +2296,6 @@ again:
         * somebody marked this thing for deletion then just unlock it and be
         * done, the free_extents will handle it
         */
-        mutex_lock(&info->extent_ins_mutex);
        list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
                clear_extent_bits(&info->extent_ins, extent_op->bytenr,
                                  extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2302,6 +2317,10 @@ again:
        if (!list_empty(&update_list)) {
                ret = update_backrefs(trans, extent_root, path, &update_list);
                BUG_ON(ret);
+                /* we may have COW'ed new blocks, so let's start over */
+                if (all)
+                        restart = 1;
        }
        /*
@@ -2309,9 +2328,9 @@ again:
         * need to make sure everything is cleaned then reset everything and
         * go back to the beginning
         */
-        if (!num_inserts && all && skipped) {
+        if (!num_inserts && restart) {
                search = 0;
-                skipped = 0;
+                restart = 0;
                INIT_LIST_HEAD(&update_list);
                INIT_LIST_HEAD(&insert_list);
                goto again;
@@ -2368,27 +2387,19 @@ again:
        BUG_ON(ret);
        /*
-         * if we broke out of the loop in order to insert stuff because we hit
+         * if restart is set for whatever reason we need to go back and start
-         * the maximum number of inserts at a time we can handle, then loop
+         * searching through the pending list again.
-         * back and pick up where we left off
+         *
+         * We just inserted some extents, which could have resulted in new
+         * blocks being allocated, which would result in new blocks needing
+         * updates, so if all is set we _must_ restart to get the updated
+         * blocks.
         */
-        if (num_inserts == max_inserts) {
+        if (restart || all) {
-                INIT_LIST_HEAD(&insert_list);
-                INIT_LIST_HEAD(&update_list);
-                num_inserts = 0;
-                goto again;
-        }
-        /*
-         * again, if we need to make absolutely sure there are no more pending
-         * extent operations left and we know that we skipped some, go back to
-         * the beginning and do it all again
-         */
-        if (all && skipped) {
                INIT_LIST_HEAD(&insert_list);
                INIT_LIST_HEAD(&update_list);
                search = 0;
-                skipped = 0;
+                restart = 0;
                num_inserts = 0;
                goto again;
        }
@@ -2709,6 +2720,8 @@ again:
                goto again;
        }
+        if (!err)
+                finish_current_insert(trans, extent_root, 0);
        return err;
 }
@@ -2859,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        if (data & BTRFS_BLOCK_GROUP_METADATA) {
                last_ptr = &root->fs_info->last_alloc;
-                empty_cluster = 64 * 1024;
+                if (!btrfs_test_opt(root, SSD))
+                        empty_cluster = 64 * 1024;
        }
        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3402,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
-                                            u64 bytenr, u32 blocksize)
+                                            u64 bytenr, u32 blocksize,
+                                            int level)
 {
        struct extent_buffer *buf;
@@ -3410,6 +3425,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        if (!buf)
                return ERR_PTR(-ENOMEM);
        btrfs_set_header_generation(buf, trans->transid);
+        btrfs_set_buffer_lockdep_class(buf, level);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
@@ -3453,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                return ERR_PTR(ret);
        }
-        buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+                                    blocksize, level);
        return buf;
 }
@@ -5641,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
                        prev_block = block_start;
                }
+                mutex_lock(&extent_root->fs_info->trans_mutex);
                btrfs_record_root_in_trans(found_root);
+                mutex_unlock(&extent_root->fs_info->trans_mutex);
                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
                        /*
                         * try to update data extent references while
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 37d43b516b79..ebe6b29e6069 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
        node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
        if (node) {
-                struct extent_state *found;
-                found = rb_entry(node, struct extent_state, rb_node);
                free_extent_state(prealloc);
                return -EEXIST;
        }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e8023efaff7..872f104576e5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1222,7 +1222,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
        /*
         * ok we haven't committed the transaction yet, lets do a commit
         */
-        if (file->private_data)
+        if (file && file->private_data)
                btrfs_ioctl_trans_end(file);
        trans = btrfs_start_transaction(root, 1);
@@ -1231,7 +1231,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
                goto out;
        }
-        ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+        ret = btrfs_log_dentry_safe(trans, root, dentry);
        if (ret < 0)
                goto out;
@@ -1245,7 +1245,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
-        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+        mutex_unlock(&dentry->d_inode->i_mutex);
        if (ret > 0) {
                ret = btrfs_commit_transaction(trans, root);
@@ -1253,7 +1253,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
                btrfs_sync_log(trans, root);
                ret = btrfs_end_transaction(trans, root);
        }
-        mutex_lock(&file->f_dentry->d_inode->i_mutex);
+        mutex_lock(&dentry->d_inode->i_mutex);
 out:
        return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
        search_key.type = 0;
        search_key.offset = 0;
-        btrfs_init_path(path);
        start_found = 0;
        ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
        if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f0706210a47..3cee77ae03c8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2531,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        key.offset = (u64)-1;
        key.type = (u8)-1;
-        btrfs_init_path(path);
 search_again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
@@ -4263,7 +4261,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
        if (PageWriteback(page) || PageDirty(page))
                return 0;
-        return __btrfs_releasepage(page, gfp_flags);
+        return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9ebe9385129b..85506c4a3af7 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,21 +25,10 @@
 #include "extent_io.h"
 #include "locking.h"
-/*
- * btrfs_header_level() isn't free, so don't call it when lockdep isn't
- * on
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void spin_nested(struct extent_buffer *eb)
-{
-        spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
-}
-#else
 static inline void spin_nested(struct extent_buffer *eb)
 {
        spin_lock(&eb->lock);
 }
-#endif
 /*
 * Setting a lock to blocking will drop the spinlock and set the
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2cbc38..19a4daf03ccb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        btrfs_start_delalloc_inodes(root);
        btrfs_wait_ordered_extents(root, 0);
-        btrfs_clean_old_snapshots(root);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
        sb->s_dirt = 0;
@@ -511,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        struct btrfs_root *root = btrfs_sb(sb);
        int ret;
+        ret = btrfs_parse_options(root, data);
+        if (ret)
+                return -EINVAL;
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 919172de5c9a..4112d53d4f4d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                num_bytes -= btrfs_root_used(&dirty->root->root_item);
                bytes_used = btrfs_root_used(&root->root_item);
                if (num_bytes) {
+                        mutex_lock(&root->fs_info->trans_mutex);
                        btrfs_record_root_in_trans(root);
+                        mutex_unlock(&root->fs_info->trans_mutex);
                        btrfs_set_root_used(&root->root_item,
                                            bytes_used - num_bytes);
                }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 20794290256b..9c462fbd60fa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2832,7 +2832,9 @@ again:
                BUG_ON(!wc.replay_dest);
                wc.replay_dest->log_root = log;
+                mutex_lock(&fs_info->trans_mutex);
                btrfs_record_root_in_trans(wc.replay_dest);
+                mutex_unlock(&fs_info->trans_mutex);
                ret = walk_log_tree(trans, log, &wc);
                BUG_ON(ret);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bcd14ebccae1..1316139bf9e8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                free_extent_map(em);
        }
-        map = kzalloc(sizeof(*map), GFP_NOFS);
-        if (!map)
-                return -ENOMEM;
        em = alloc_extent_map(GFP_NOFS);
        if (!em)
                return -ENOMEM;
@@ -3106,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
        if (!sb)
                return -ENOMEM;
        btrfs_set_buffer_uptodate(sb);
+        btrfs_set_buffer_lockdep_class(sb, 0);
        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/buffer.c b/fs/buffer.c
index 665d446b25bc..9f697419ed8e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page,
                        __inc_zone_page_state(page, NR_FILE_DIRTY);
                        __inc_bdi_stat(mapping->backing_dev_info,
                                        BDI_RECLAIMABLE);
+                        task_dirty_inc(current);
                        task_io_account_write(PAGE_CACHE_SIZE);
                }
                radix_tree_tag_set(&mapping->page_tree,
@@ -3108,7 +3109,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);
                bh->b_end_io = end_buffer_write_sync;
-                ret = submit_bh(WRITE_SYNC, bh);
+                ret = submit_bh(WRITE, bh);
                wait_on_buffer(bh);
                if (buffer_eopnotsupp(bh)) {
                        clear_buffer_eopnotsupp(bh);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 73ac7ebd1dfc..851388fafc73 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
+Version 1.57
+------------
+Improve support for multiple security contexts to the same server. We
+used to use the same "vcnumber" for all connections, which could cause
+the server to treat subsequent connections, especially those that
+are authenticated as guest, as reconnections, invalidating the earlier
+user's smb session.  This fix allows cifs to mount multiple times to the
+same server with different userids without risking invalidating earlier
+established security contexts.
 Version 1.56
 ------------
 Add "forcemandatorylock" mount option to allow user to use mandatory
@@ -7,7 +17,10 @@ specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
 Samba servers (worked to Windows).  Fix rmdir so that pending search
 (readdir) requests do not get invalid results which include the now
-removed directory.
+removed directory.  Fix oops in cifs_dfs_ref.c when using DFS and the
+prefixpath is not reachable.  Add better file create support for servers
+which support the CIFS POSIX protocol extensions (this adds support for new
+flags on create, and improves semantics for write of locked ranges).
 Version 1.55
 ------------
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7ac481841f87..2b1d28a9ee28 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.56"
+#define CIFS_VERSION   "1.57"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94c1ca0ec953..e004f6db5fc8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -164,9 +164,12 @@ struct TCP_Server_Info {
        /* multiplexed reads or writes */
        unsigned int maxBuf;    /* maxBuf specifies the maximum */
        /* message size the server can send or receive for non-raw SMBs */
-        unsigned int maxRw;     /* maxRw specifies the maximum */
+        unsigned int max_rw;    /* max_rw specifies the maximum */
        /* message size the server can send or receive for */
        /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
+        unsigned int max_vcs;   /* maximum number of smb sessions, at least
+                                   those that can be specified uniquely with
+                                   vcnumbers */
        char sessid[4];         /* unique token id for this session */
        /* (returned on Negotiate */
        int capabilities; /* allow selective disabling of caps by smb sess */
@@ -210,6 +213,7 @@ struct cifsSesInfo {
        unsigned overrideSecFlg;  /* if non-zero override global sec flags */
        __u16 ipc_tid;          /* special tid for connection to IPC share */
        __u16 flags;
+        __u16 vcnum;
        char *serverOS;         /* name of operating system underlying server */
        char *serverNOS;        /* name of network operating system of server */
        char *serverDomain;     /* security realm of server */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 382ba6298809..083dfc57c7a3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -42,6 +42,7 @@ extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
 #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
 extern char *build_path_from_dentry(struct dentry *);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
@@ -91,6 +92,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern void posix_fill_in_inode(struct inode *tmp_inode,
+                                FILE_UNIX_BASIC_INFO *pData, int isNewInode);
+extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
 extern int cifs_get_inode_info(struct inode **pinode,
                        const unsigned char *search_path,
                        FILE_ALL_INFO *pfile_info,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 552642a507c4..939e2f76b959 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
                server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
                                (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+                server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
                GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
                /* even though we do not use raw we might as well set this
                accurately, in case we ever find a need for it */
                if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-                        server->maxRw = 0xFF00;
+                        server->max_rw = 0xFF00;
                        server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
                } else {
-                        server->maxRw = 0;/* we do not need to use raw anyway */
+                        server->max_rw = 0;/* do not need to use raw anyway */
                        server->capabilities = CAP_MPX_MODE;
                }
                tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
@@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        /* probably no need to store and check maxvcs */
        server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
                        (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
-        server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
+        server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
        cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
        GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
        server->capabilities = le32_to_cpu(pSMBr->Capabilities);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2209be943051..da0f4ffa0613 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,7 +23,6 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
-#include <linux/ipv6.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
@@ -35,6 +34,7 @@
 #include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -1379,8 +1379,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
                     server->addr.sockAddr.sin_addr.s_addr))
                        continue;
                else if (addr->ss_family == AF_INET6 &&
-                         memcmp(&server->addr.sockAddr6.sin6_addr,
+                         !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-                                &addr6->sin6_addr, sizeof(addr6->sin6_addr)))
+                                          &addr6->sin6_addr))
                        continue;
                ++server->srv_count;
@@ -2180,6 +2180,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
                           "mount option supported"));
 }
+/*
+ * Probe whether full_path can be queried on the share behind tcon.
+ *
+ * The server inode number of the path is requested first.  Any result of
+ * that call other than -EOPNOTSUPP (success included) is returned as is.
+ * Only when it reports -EOPNOTSUPP is the path queried a second time with
+ * CIFSSMBQPathInfo, using a temporary FILE_ALL_INFO buffer.
+ *
+ * Returns the rc of whichever query ran last, or -ENOMEM if the temporary
+ * buffer could not be allocated.
+ */
+static int
+is_path_accessible(int xid, struct cifsTconInfo *tcon,
+                   struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+        int rc;
+        __u64 inode_num;
+        FILE_ALL_INFO *pfile_info;
+        rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
+                                   cifs_sb->local_nls,
+                                   cifs_sb->mnt_cifs_flags &
+                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+        /* the answer is final unless the inode number query is unsupported */
+        if (rc != -EOPNOTSUPP)
+                return rc;
+        pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+        if (pfile_info == NULL)
+                return -ENOMEM;
+        rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info,
+                              0 /* not legacy */, cifs_sb->local_nls,
+                              cifs_sb->mnt_cifs_flags &
+                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+        /* the returned info is never read here; only rc matters */
+        kfree(pfile_info);
+        return rc;
+}
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
           char *mount_data, const char *devname)
@@ -2190,6 +2217,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        struct cifsSesInfo *pSesInfo = NULL;
        struct cifsTconInfo *tcon = NULL;
        struct TCP_Server_Info *srvTcp = NULL;
+        char   *full_path;
        xid = GetXid();
@@ -2426,6 +2454,23 @@ mount_fail_check:
                cifs_sb->rsize = min(cifs_sb->rsize,
                               (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+        if (!rc && cifs_sb->prepathlen) {
+                /* build_path_to_root works only when we have a valid tcon */
+                full_path = cifs_build_path_to_root(cifs_sb);
+                if (full_path == NULL) {
+                        rc = -ENOMEM;
+                        goto mount_fail_check;
+                }
+                rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
+                if (rc) {
+                        cERROR(1, ("Path %s in not accessible: %d",
+                                                full_path, rc));
+                        kfree(full_path);
+                        goto mount_fail_check;
+                }
+                kfree(full_path);
+        }
        /* volume_info->password is freed above when existing session found
        (in which case it is not needed anymore) but when new sesion is created
        the password ptr is put in the new session structure (in which case the
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 964aad03c5ad..89fb72832652 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
 *
 *   vfs operations that deal with dentries
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Copyright (C) International Business Machines  Corp., 2002,2009
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -129,6 +129,78 @@ cifs_bp_rename_retry:
        return full_path;
 }
+/*
+ * Open (and, depending on oflags, create) full_path with the CIFS POSIX
+ * create call.
+ *
+ * @full_path: path handed to CIFSPOSIXCreate
+ * @pinode:    may be NULL; otherwise, when the server returned file info,
+ *             receives a new inode filled in from that info
+ * @sb:        superblock used to find the cifs_sb and to allocate the inode
+ * @mode:      mode handed to CIFSPOSIXCreate
+ * @oflags:    FMODE_xxx and O_xxx flags, translated to SMB_O_xxx flags below
+ * @poplock:   may be NULL; in: oplock value handed to CIFSPOSIXCreate,
+ *             out: the value CIFSPOSIXCreate left behind (success only)
+ * @pnetfid:   handed to CIFSPOSIXCreate (presumably receives the file
+ *             handle - confirm against CIFSPOSIXCreate)
+ * @xid:       handed to CIFSPOSIXCreate
+ *
+ * Returns the rc of CIFSPOSIXCreate, or -ENOMEM if the response buffer
+ * could not be allocated.  rc can be 0 with *pinode left untouched or NULL
+ * (no file info returned, or inode allocation failed); the caller is then
+ * expected to query the inode info itself.
+ */
+static int cifs_posix_open(char *full_path, struct inode **pinode,
+                    struct super_block *sb, int mode, int oflags,
+                    int *poplock, __u16 *pnetfid, int xid)
+{
+        int rc;
+        /* seed from the caller so the create call never sees stack garbage */
+        __u32 oplock = poplock ? *poplock : 0;
+        FILE_UNIX_BASIC_INFO *presp_data;
+        __u32 posix_flags = 0;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+        cFYI(1, ("posix open %s", full_path));
+        presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+        if (presp_data == NULL)
+                return -ENOMEM;
+/* So far cifs posix extensions can only map the following flags.
+   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
+   so far we do not seem to need them, and we can treat them as local only */
+        if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
+                (FMODE_READ | FMODE_WRITE))
+                posix_flags = SMB_O_RDWR;
+        else if (oflags & FMODE_READ)
+                posix_flags = SMB_O_RDONLY;
+        else if (oflags & FMODE_WRITE)
+                posix_flags = SMB_O_WRONLY;
+        if (oflags & O_CREAT)
+                posix_flags |= SMB_O_CREAT;
+        if (oflags & O_EXCL)
+                posix_flags |= SMB_O_EXCL;
+        if (oflags & O_TRUNC)
+                posix_flags |= SMB_O_TRUNC;
+        if (oflags & O_APPEND)
+                posix_flags |= SMB_O_APPEND;
+        if (oflags & O_SYNC)
+                posix_flags |= SMB_O_SYNC;
+        if (oflags & O_DIRECTORY)
+                posix_flags |= SMB_O_DIRECTORY;
+        if (oflags & O_NOFOLLOW)
+                posix_flags |= SMB_O_NOFOLLOW;
+        if (oflags & O_DIRECT)
+                posix_flags |= SMB_O_DIRECT;
+        rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
+                        pnetfid, presp_data, &oplock, full_path,
+                        cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc)
+                goto posix_open_ret;
+        /* hand the resulting oplock back; on failure the caller's value is
+           left alone so a retry with the older open call starts clean */
+        if (poplock)
+                *poplock = oplock;
+        if (presp_data->Type == cpu_to_le32(-1))
+                goto posix_open_ret; /* open ok, caller does qpathinfo */
+        /* get new inode and set it up */
+        if (!pinode)
+                goto posix_open_ret; /* caller does not need info */
+        *pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+        /* We do not need to close the file if new_inode fails since
+           the caller will retry qpathinfo as long as inode is null */
+        if (*pinode == NULL)
+                goto posix_open_ret;
+        posix_fill_in_inode(*pinode, presp_data, 1);
+posix_open_ret:
+        kfree(presp_data);
+        return rc;
+}
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
                              struct dentry *direntry,
                              struct inode *newinode)
@@ -150,7 +222,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        int xid;
        int create_options = CREATE_NOT_DIR;
        int oplock = 0;
-        /* BB below access is too much for the mknod to request */
+        int oflags;
+        /*
+         * BB below access is probably too much for mknod to request
+         *    but we have to do query and setpathinfo so requesting
+         *    less could fail (unless we want to request getatr and setatr
+         *    permissions only).  At least for POSIX we do not have to
+         *    request so much.
+         */
        int desiredAccess = GENERIC_READ | GENERIC_WRITE;
        __u16 fileHandle;
        struct cifs_sb_info *cifs_sb;
@@ -174,13 +253,43 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        }
        mode &= ~current->fs->umask;
+        if (oplockEnabled)
+                oplock = REQ_OPLOCK;
-        if (nd && (nd->flags & LOOKUP_OPEN)) {
+        if (nd && (nd->flags & LOOKUP_OPEN))
-                int oflags = nd->intent.open.flags;
+                oflags = nd->intent.open.flags;
+        else
+                oflags = FMODE_READ;
+        if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+            (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+                        le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+                rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+                                     mode, oflags, &oplock, &fileHandle, xid);
+                /* EIO could indicate that (posix open) operation is not
+                   supported, despite what server claimed in capability
+                   negotiation.  EREMOTE indicates DFS junction, which is not
+                   handled in posix open */
+                if ((rc == 0) && (newinode == NULL))
+                        goto cifs_create_get_file_info; /* query inode info */
+                else if (rc == 0) /* success, no need to query */
+                        goto cifs_create_set_dentry;
+                else if ((rc != -EIO) && (rc != -EREMOTE) &&
+                         (rc != -EOPNOTSUPP)) /* path not found or net err */
+                        goto cifs_create_out;
+                /* else fallthrough to retry, using older open call, this is
+                   case where server does not support this SMB level, and
+                   falsely claims capability (also get here for DFS case
+                   which should be rare for path not covered on files) */
+        }
+        if (nd && (nd->flags & LOOKUP_OPEN)) {
+                /* if the file is going to stay open, then we
+                   need to set the desired access properly */
                desiredAccess = 0;
                if (oflags & FMODE_READ)
-                        desiredAccess |= GENERIC_READ;
+                        desiredAccess |= GENERIC_READ; /* is this too little? */
                if (oflags & FMODE_WRITE) {
                        desiredAccess |= GENERIC_WRITE;
                        if (!(oflags & FMODE_READ))
@@ -199,8 +308,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        /* BB add processing to set equivalent of mode - e.g. via CreateX with
           ACLs */
-        if (oplockEnabled)
-                oplock = REQ_OPLOCK;
        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
        if (buf == NULL) {
@@ -233,116 +340,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        }
        if (rc) {
                cFYI(1, ("cifs_create returned 0x%x", rc));
-        } else {
+                goto cifs_create_out;
-                /* If Open reported that we actually created a file
+        }
-                then we now have to set the mode if possible */
-                if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+        /* If Open reported that we actually created a file
-                        struct cifs_unix_set_info_args args = {
+           then we now have to set the mode if possible */
+        if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+                struct cifs_unix_set_info_args args = {
                                .mode   = mode,
                                .ctime  = NO_CHANGE_64,
                                .atime  = NO_CHANGE_64,
                                .mtime  = NO_CHANGE_64,
                                .device = 0,
-                        };
+                };
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-                                args.uid = (__u64) current_fsuid();
+                        args.uid = (__u64) current_fsuid();
-                                if (inode->i_mode & S_ISGID)
+                        if (inode->i_mode & S_ISGID)
-                                        args.gid = (__u64) inode->i_gid;
+                                args.gid = (__u64) inode->i_gid;
-                                else
+                        else
-                                        args.gid = (__u64) current_fsgid();
+                                args.gid = (__u64) current_fsgid();
-                        } else {
-                                args.uid = NO_CHANGE_64;
-                                args.gid = NO_CHANGE_64;
-                        }
-                        CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
-                                cifs_sb->local_nls,
-                                cifs_sb->mnt_cifs_flags &
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                } else {
-                        /* BB implement mode setting via Windows security
+                        args.uid = NO_CHANGE_64;
-                           descriptors e.g. */
+                        args.gid = NO_CHANGE_64;
-                        /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
-                        /* Could set r/o dos attribute if mode & 0222 == 0 */
                }
+                CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+                        cifs_sb->local_nls,
+                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        } else {
+                /* BB implement mode setting via Windows security
+                   descriptors e.g. */
+                /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
-                /* server might mask mode so we have to query for it */
+                /* Could set r/o dos attribute if mode & 0222 == 0 */
-                if (tcon->unix_ext)
+        }
-                        rc = cifs_get_inode_info_unix(&newinode, full_path,
-                                                 inode->i_sb, xid);
+cifs_create_get_file_info:
-                else {
+        /* server might mask mode so we have to query for it */
-                        rc = cifs_get_inode_info(&newinode, full_path,
+        if (tcon->unix_ext)
-                                                 buf, inode->i_sb, xid,
+                rc = cifs_get_inode_info_unix(&newinode, full_path,
-                                                 &fileHandle);
+                                              inode->i_sb, xid);
-                        if (newinode) {
+        else {
-                                if (cifs_sb->mnt_cifs_flags &
+                rc = cifs_get_inode_info(&newinode, full_path, buf,
-                                    CIFS_MOUNT_DYNPERM)
+                                         inode->i_sb, xid, &fileHandle);
-                                        newinode->i_mode = mode;
+                if (newinode) {
-                                if ((oplock & CIFS_CREATE_ACTION) &&
+                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
-                                    (cifs_sb->mnt_cifs_flags &
+                                newinode->i_mode = mode;
-                                     CIFS_MOUNT_SET_UID)) {
+                        if ((oplock & CIFS_CREATE_ACTION) &&
-                                        newinode->i_uid = current_fsuid();
+                            (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
-                                        if (inode->i_mode & S_ISGID)
+                                newinode->i_uid = current_fsuid();
-                                                newinode->i_gid =
+                                if (inode->i_mode & S_ISGID)
-                                                        inode->i_gid;
+                                        newinode->i_gid = inode->i_gid;
-                                        else
+                                else
-                                                newinode->i_gid =
+                                        newinode->i_gid = current_fsgid();
-                                                        current_fsgid();
-                                }
                        }
                }
+        }
-                if (rc != 0) {
+cifs_create_set_dentry:
-                        cFYI(1, ("Create worked, get_inode_info failed rc = %d",
+        if (rc == 0)
-                                 rc));
+                setup_cifs_dentry(tcon, direntry, newinode);
-                } else
+        else
-                        setup_cifs_dentry(tcon, direntry, newinode);
+                cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
-                if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
+        /* nfsd case - nfs srv does not set nd */
-                        (!(nd->flags & LOOKUP_OPEN))) {
+        if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
-                        /* mknod case - do not leave file open */
+                /* mknod case - do not leave file open */
-                        CIFSSMBClose(xid, tcon, fileHandle);
+                CIFSSMBClose(xid, tcon, fileHandle);
-                } else if (newinode) {
+        } else if (newinode) {
-                        struct cifsFileInfo *pCifsFile =
+                struct cifsFileInfo *pCifsFile =
-                           kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+                        kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-                        if (pCifsFile == NULL)
+                if (pCifsFile == NULL)
-                                goto cifs_create_out;
+                        goto cifs_create_out;
-                        pCifsFile->netfid = fileHandle;
+                pCifsFile->netfid = fileHandle;
-                        pCifsFile->pid = current->tgid;
+                pCifsFile->pid = current->tgid;
-                        pCifsFile->pInode = newinode;
+                pCifsFile->pInode = newinode;
-                        pCifsFile->invalidHandle = false;
+                pCifsFile->invalidHandle = false;
-                        pCifsFile->closePend     = false;
+                pCifsFile->closePend     = false;
-                        init_MUTEX(&pCifsFile->fh_sem);
+                init_MUTEX(&pCifsFile->fh_sem);
-                        mutex_init(&pCifsFile->lock_mutex);
+                mutex_init(&pCifsFile->lock_mutex);
-                        INIT_LIST_HEAD(&pCifsFile->llist);
+                INIT_LIST_HEAD(&pCifsFile->llist);
-                        atomic_set(&pCifsFile->wrtPending, 0);
+                atomic_set(&pCifsFile->wrtPending, 0);
-                        /* set the following in open now
+                /* set the following in open now
                                pCifsFile->pfile = file; */
-                        write_lock(&GlobalSMBSeslock);
+                write_lock(&GlobalSMBSeslock);
-                        list_add(&pCifsFile->tlist, &tcon->openFileList);
+                list_add(&pCifsFile->tlist, &tcon->openFileList);
-                        pCifsInode = CIFS_I(newinode);
+                pCifsInode = CIFS_I(newinode);
-                        if (pCifsInode) {
+                if (pCifsInode) {
-                                /* if readable file instance put first in list*/
+                        /* if readable file instance put first in list*/
-                                if (write_only) {
+                        if (write_only) {
-                                        list_add_tail(&pCifsFile->flist,
+                                list_add_tail(&pCifsFile->flist,
-                                                &pCifsInode->openFileList);
+                                              &pCifsInode->openFileList);
-                                } else {
+                        } else {
-                                        list_add(&pCifsFile->flist,
+                                list_add(&pCifsFile->flist,
-                                                &pCifsInode->openFileList);
+                                         &pCifsInode->openFileList);
-                                }
-                                if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-                                        pCifsInode->clientCanCacheAll = true;
-                                        pCifsInode->clientCanCacheRead = true;
-                                        cFYI(1, ("Exclusive Oplock inode %p",
-                                                newinode));
-                                } else if ((oplock & 0xF) == OPLOCK_READ)
-                                        pCifsInode->clientCanCacheRead = true;
                        }
-                        write_unlock(&GlobalSMBSeslock);
+                        if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+                                pCifsInode->clientCanCacheAll = true;
+                                pCifsInode->clientCanCacheRead = true;
+                                cFYI(1, ("Exclusive Oplock inode %p",
+                                        newinode));
+                        } else if ((oplock & 0xF) == OPLOCK_READ)
+                                pCifsInode->clientCanCacheRead = true;
                }
+                write_unlock(&GlobalSMBSeslock);
        }
 cifs_create_out:
        kfree(buf);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bcf7b5184664..4690a360c855 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
        pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
 }
+/**
+ * cifs_new_inode - create new inode, initialize, and hash it
+ * @sb: pointer to superblock
+ * @inum: if valid pointer and serverino is enabled, replace i_ino with val
+ *
+ * Create a new inode, initialize it for CIFS and hash it. Returns the new
+ * inode or NULL if one couldn't be allocated.
+ *
+ * If the share isn't mounted with "serverino" or inum is a NULL pointer then
+ * we'll just use the inode number assigned by new_inode(). Note that this can
+ * mean i_ino collisions since the i_ino assigned by new_inode is not
+ * guaranteed to be unique.
+ */
+struct inode *
+cifs_new_inode(struct super_block *sb, __u64 *inum)
+{
+        struct inode *inode;
+        inode = new_inode(sb);
+        if (inode == NULL)
+                return NULL;
+        /*
+         * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
+         *     stop passing inum as ptr. Are there sanity checks we can use to
+         *     ensure that the server is really filling in that field? Also,
+         *     if serverino is disabled, perhaps we should be using iunique()?
+         */
+        if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
+                inode->i_ino = (unsigned long) *inum;
+        /*
+         * must set this here instead of cifs_alloc_inode since VFS will
+         * clobber i_flags
+         */
+        if (sb->s_flags & MS_NOATIME)
+                inode->i_flags |= S_NOATIME | S_NOCMTIME;
+        /* hash only after i_ino has its final value */
+        insert_inode_hash(inode);
+        return inode;
+}
 int cifs_get_inode_info_unix(struct inode **pinode,
        const unsigned char *full_path, struct super_block *sb, int xid)
 {
@@ -233,22 +276,11 @@ int cifs_get_inode_info_unix(struct inode **pinode,
        /* get new inode */
        if (*pinode == NULL) {
-                *pinode = new_inode(sb);
+                *pinode = cifs_new_inode(sb, &find_data.UniqueId);
                if (*pinode == NULL) {
                        rc = -ENOMEM;
                        goto cgiiu_exit;
                }
-                /* Is an i_ino of zero legal? */
-                /* note ino incremented to unique num in new_inode */
-                /* Are there sanity checks we can use to ensure that
-                   the server is really filling in that field? */
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-                        (*pinode)->i_ino = (unsigned long)find_data.UniqueId;
-                if (sb->s_flags & MS_NOATIME)
-                        (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-                insert_inode_hash(*pinode);
        }
        inode = *pinode;
@@ -465,11 +497,9 @@ int cifs_get_inode_info(struct inode **pinode,
        /* get new inode */
        if (*pinode == NULL) {
-                *pinode = new_inode(sb);
+                __u64 inode_num;
-                if (*pinode == NULL) {
+                __u64 *pinum = &inode_num;
-                        rc = -ENOMEM;
-                        goto cgii_exit;
-                }
                /* Is an i_ino of zero legal? Can we use that to check
                   if the server supports returning inode numbers?  Are
                   there other sanity checks we can use to ensure that
@@ -486,22 +516,26 @@ int cifs_get_inode_info(struct inode **pinode,
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
                        int rc1 = 0;
-                        __u64 inode_num;
                        rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
-                                        full_path, &inode_num,
+                                        full_path, pinum,
                                        cifs_sb->local_nls,
                                        cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                        if (rc1) {
                                cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+                                pinum = NULL;
                                /* BB EOPNOSUPP disable SERVER_INUM? */
-                        } else /* do we need cast or hash to ino? */
+                        }
-                                (*pinode)->i_ino = inode_num;
+                } else {
-                } /* else ino incremented to unique num in new_inode*/
+                        pinum = NULL;
-                if (sb->s_flags & MS_NOATIME)
+                }
-                        (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-                insert_inode_hash(*pinode);
+                *pinode = cifs_new_inode(sb, pinum);
+                if (*pinode == NULL) {
+                        rc = -ENOMEM;
+                        goto cgii_exit;
+                }
        }
        inode = *pinode;
        cifsInfo = CIFS_I(inode);
@@ -621,7 +655,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
        .lookup = cifs_lookup,
 };
-static char *build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 {
        int pplen = cifs_sb->prepathlen;
        int dfsplen;
@@ -678,7 +712,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
                return inode;
        cifs_sb = CIFS_SB(inode->i_sb);
-        full_path = build_path_to_root(cifs_sb);
+        full_path = cifs_build_path_to_root(cifs_sb);
        if (full_path == NULL)
                return ERR_PTR(-ENOMEM);
@@ -1017,7 +1051,7 @@ out_reval:
        return rc;
 }
-static void posix_fill_in_inode(struct inode *tmp_inode,
+void posix_fill_in_inode(struct inode *tmp_inode,
        FILE_UNIX_BASIC_INFO *pData, int isNewInode)
 {
        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
@@ -1114,24 +1148,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
                        else
                                direntry->d_op = &cifs_dentry_ops;
-                        newinode = new_inode(inode->i_sb);
+                        newinode = cifs_new_inode(inode->i_sb,
+                                                  &pInfo->UniqueId);
                        if (newinode == NULL) {
                                kfree(pInfo);
                                goto mkdir_get_info;
                        }
-                        /* Is an i_ino of zero legal? */
-                        /* Are there sanity checks we can use to ensure that
-                           the server is really filling in that field? */
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-                                newinode->i_ino =
-                                        (unsigned long)pInfo->UniqueId;
-                        } /* note ino incremented to unique num in new_inode */
-                        if (inode->i_sb->s_flags & MS_NOATIME)
-                                newinode->i_flags |= S_NOATIME | S_NOCMTIME;
                        newinode->i_nlink = 2;
-                        insert_inode_hash(newinode);
                        d_instantiate(direntry, newinode);
                        /* we already checked in POSIXCreate whether
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9f51f9bf0292..c2c01ff4c32c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 }
 #endif /* DEBUG2 */
-/* Returns one if new inode created (which therefore needs to be hashed) */
+/* Returns 1 if new inode created, 2 if both dentry and inode were */
 /* Might check in the future if inode number changed so we can rehash inode */
-static int construct_dentry(struct qstr *qstring, struct file *file,
+static int
-        struct inode **ptmp_inode, struct dentry **pnew_dentry)
+construct_dentry(struct qstr *qstring, struct file *file,
+                 struct inode **ptmp_inode, struct dentry **pnew_dentry,
+                 __u64 *inum)
 {
-        struct dentry *tmp_dentry;
+        struct dentry *tmp_dentry = NULL;
-        struct cifs_sb_info *cifs_sb;
+        struct super_block *sb = file->f_path.dentry->d_sb;
-        struct cifsTconInfo *pTcon;
        int rc = 0;
        cFYI(1, ("For %s", qstring->name));
-        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
        qstring->hash = full_name_hash(qstring->name, qstring->len);
        tmp_dentry = d_lookup(file->f_path.dentry, qstring);
        if (tmp_dentry) {
+                /* BB: overwrite old name? i.e. tmp_dentry->d_name and
+                 * tmp_dentry->d_name.len??
+                 */
                cFYI(0, ("existing dentry with inode 0x%p",
                         tmp_dentry->d_inode));
                *ptmp_inode = tmp_dentry->d_inode;
-/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
                if (*ptmp_inode == NULL) {
-                        *ptmp_inode = new_inode(file->f_path.dentry->d_sb);
+                        *ptmp_inode = cifs_new_inode(sb, inum);
                        if (*ptmp_inode == NULL)
                                return rc;
                        rc = 1;
                }
-                if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-                        (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
        } else {
                tmp_dentry = d_alloc(file->f_path.dentry, qstring);
                if (tmp_dentry == NULL) {
@@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
                        return rc;
                }
-                *ptmp_inode = new_inode(file->f_path.dentry->d_sb);
+                if (CIFS_SB(sb)->tcon->nocase)
-                if (pTcon->nocase)
                        tmp_dentry->d_op = &cifs_ci_dentry_ops;
                else
                        tmp_dentry->d_op = &cifs_dentry_ops;
+                *ptmp_inode = cifs_new_inode(sb, inum);
                if (*ptmp_inode == NULL)
                        return rc;
-                if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-                        (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
                rc = 2;
        }
@@ -822,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 /* inode num, inode type and filename returned */
 static int cifs_get_name_from_search_buf(struct qstr *pqst,
        char *current_entry, __u16 level, unsigned int unicode,
-        struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum)
+        struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum)
 {
        int rc = 0;
        unsigned int len = 0;
@@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
                        len = strnlen(filename, PATH_MAX);
                }
-                /* BB fixme - hash low and high 32 bits if not 64 bit arch BB */
+                *pinum = pFindData->UniqueId;
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-                        *pinum = pFindData->UniqueId;
        } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
                FILE_DIRECTORY_INFO *pFindData =
                        (FILE_DIRECTORY_INFO *)current_entry;
@@ -907,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
        struct qstr qstring;
        struct cifsFileInfo *pCifsF;
        unsigned int obj_type;
-        ino_t  inum;
+        __u64  inum;
        struct cifs_sb_info *cifs_sb;
        struct inode *tmp_inode;
        struct dentry *tmp_dentry;
@@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
        if (rc)
                return rc;
-        rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry);
+        /* only these two infolevels return valid inode numbers */
+        if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
+            pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+                rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+                                        &inum);
+        else
+                rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+                                        NULL);
        if ((tmp_inode == NULL) || (tmp_dentry == NULL))
                return -ENOMEM;
-        if (rc) {
-                /* inode created, we need to hash it with right inode number */
-                if (inum != 0) {
-                        /* BB fixme - hash the 2 32 quantities bits together if
-                         *  necessary BB */
-                        tmp_inode->i_ino = inum;
-                }
-                insert_inode_hash(tmp_inode);
-        }
        /* we pass in rc below, indicating whether it is a new inode,
           so we can figure out whether to invalidate the inode cached
           data if the file has changed */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f22de7b79a9..5c68b4282be9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -34,15 +34,99 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
                         unsigned char *p24);
+/* Checks if this is the first smb session to be reconnected after
+   the socket has been reestablished (so we know whether to use vc 0).
+   Walks every entry on ses->server->smb_ses_list and returns false as
+   soon as one entry has need_reconnect clear; returns true when all
+   entries still need reconnecting (an empty list also yields true).
+   Called while holding the cifs_tcp_ses_lock, so do not block */
+static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
+{
+        struct list_head *tmp;
+        struct cifsSesInfo *tmp_ses;
+        list_for_each(tmp, &ses->server->smb_ses_list) {
+                tmp_ses = list_entry(tmp, struct cifsSesInfo,
+                                     smb_ses_list);
+                if (tmp_ses->need_reconnect == false)
+                        return false;
+        }
+        /* could not find a session that was already connected,
+           this must be the first one we are reconnecting
+           (every session on the list had need_reconnect set) */
+        return true;
+}
+/*
+ *      vc number 0 is treated specially by some servers, and should be the
+ *      first one we request.  After that we can use vc numbers up to maxvcs,
+ *      one for each smb session (some Windows versions set maxvcs incorrectly
+ *      so maxvc=1 can be ignored).  If we have too many vcs, we can reuse
+ *      any vc but zero (some servers reset the connection on vcnum zero)
+ *
+ *      Returns the chosen vc number in little-endian (wire) order, ready to
+ *      be stored in the session setup request.  Takes cifs_tcp_ses_lock for
+ *      writing while the server's session list is scanned.
+ */
+static __le16 get_next_vcnum(struct cifsSesInfo *ses)
+{
+        __u16 vcnum = 0;
+        struct list_head *tmp;
+        struct cifsSesInfo *tmp_ses;
+        __u16 max_vcs = ses->server->max_vcs;
+        __u16 i;
+        int free_vc_found = 0;
+        /* Quoting the MS-SMB specification: "Windows-based SMB servers set this
+        field to one but do not enforce this limit, which allows an SMB client
+        to establish more virtual circuits than allowed by this value ... but
+        other server implementations can enforce this limit." */
+        if (max_vcs < 2)
+                max_vcs = 0xFFFF;
+        write_lock(&cifs_tcp_ses_lock);
+        /* NOTE(review): this early exit skips the ses->vcnum assignment
+           below, so ses->vcnum keeps whatever value it had before -
+           confirm that is intended */
+        if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
+                        goto get_vc_num_exit;  /* vcnum will be zero */
+        for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
+                if (i == 0) /* this is the only connection, use vc 0 */
+                        break;
+                free_vc_found = 1;
+                list_for_each(tmp, &ses->server->smb_ses_list) {
+                        tmp_ses = list_entry(tmp, struct cifsSesInfo,
+                                             smb_ses_list);
+                        if (tmp_ses->vcnum == i) {
+                                free_vc_found = 0;
+                                break; /* found duplicate, try next vcnum */
+                        }
+                }
+                if (free_vc_found)
+                        break; /* we found a vcnumber that will work - use it */
+        }
+        if (i == 0)
+                vcnum = 0; /* for most common case, ie if one smb session, use
+                              vc zero (the loop stopped on its first pass
+                              without scanning the session list) */
+        else if (free_vc_found == 0)
+                vcnum = 1;  /* we can not reuse vc=0 safely, since some servers
+                                reset all uids on that, but 1 is ok. */
+        else
+                vcnum = i;
+        ses->vcnum = vcnum;
+get_vc_num_exit:
+        write_unlock(&cifs_tcp_ses_lock);
+        /* vcnum is in cpu order; convert to the little-endian wire format
+           that the __le16 return type promises */
+        return cpu_to_le16(vcnum);
+}
 static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 {
        __u32 capabilities = 0;
        /* init fields common to all four types of SessSetup */
-        /* note that header is initialized to zero in header_assemble */
+        /* Note that offsets for first seven fields in req struct are same  */
+        /*      in CIFS Specs so does not matter which of 3 forms of struct */
+        /*      that we use in next few lines                               */
+        /* Note that header is initialized to zero in header_assemble */
        pSMB->req.AndXCommand = 0xFF;
        pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
        pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+        pSMB->req.VcNumber = get_next_vcnum(ses);
        /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
        if (ses->capabilities & CAP_UNIX)
                capabilities |= CAP_UNIX;
-        /* BB check whether to init vcnum BB */
        return capabilities;
 }
@@ -228,7 +311,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
        kfree(ses->serverOS);
        /* UTF-8 string will not grow more than four times as big as UCS-16 */
-        ses->serverOS = kzalloc(4 * len, GFP_KERNEL);
+        ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
        if (ses->serverOS != NULL)
                cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
        data += 2 * (len + 1);
@@ -241,7 +324,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
                return rc;
        kfree(ses->serverNOS);
-        ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */
+        ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
        if (ses->serverNOS != NULL) {
                cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
                                   nls_cp);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 763fe69ef351..4ea8a98b23df 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1943,6 +1943,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
 /* Big K */
 COMPATIBLE_IOCTL(PIO_FONT)
 COMPATIBLE_IOCTL(GIO_FONT)
+COMPATIBLE_IOCTL(PIO_CMAP)
+COMPATIBLE_IOCTL(GIO_CMAP)
 ULONG_IOCTL(KDSIGACCEPT)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eba1c25..b0c87dce66a3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
 {
        unsigned len = le16_to_cpu(dlen);
-        if (len == EXT4_MAX_REC_LEN)
+        if (len == EXT4_MAX_REC_LEN || len == 0)
                return 1 << 16;
        return len;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20be1329..cbd2ca99d113 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
-        return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+        return jbd2_journal_begin_ordered_truncate(
-                                                   new_size);
+                                        EXT4_SB(inode->i_sb)->s_journal,
+                                        &EXT4_I(inode)->jinode,
+                                        new_size);
 }
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -2437,6 +2439,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        int no_nrwrite_index_update;
        int pages_written = 0;
        long pages_skipped;
+        int range_cyclic, cycled = 1, io_done = 0;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2488,9 +2491,15 @@ static int ext4_da_writepages(struct address_space *mapping,
        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;
-        if (wbc->range_cyclic)
+        range_cyclic = wbc->range_cyclic;
+        if (wbc->range_cyclic) {
                index = mapping->writeback_index;
-        else
+                if (index)
+                        cycled = 0;
+                wbc->range_start = index << PAGE_CACHE_SHIFT;
+                wbc->range_end  = LLONG_MAX;
+                wbc->range_cyclic = 0;
+        } else
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
        mpd.wbc = wbc;
@@ -2504,6 +2513,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        wbc->no_nrwrite_index_update = 1;
        pages_skipped = wbc->pages_skipped;
+retry:
        while (!ret && wbc->nr_to_write > 0) {
                /*
@@ -2546,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping,
                        pages_written += mpd.pages_written;
                        wbc->pages_skipped = pages_skipped;
                        ret = 0;
+                        io_done = 1;
                } else if (wbc->nr_to_write)
                        /*
                         * There is no more writeout needed
@@ -2554,6 +2565,13 @@ static int ext4_da_writepages(struct address_space *mapping,
                         */
                        break;
        }
+        if (!io_done && !cycled) {
+                cycled = 1;
+                index = 0;
+                wbc->range_start = index << PAGE_CACHE_SHIFT;
+                wbc->range_end  = mapping->writeback_index - 1;
+                goto retry;
+        }
        if (pages_skipped != wbc->pages_skipped)
                printk(KERN_EMERG "This should not happen leaving %s "
                                "with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2579,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        /* Update index */
        index += pages_written;
+        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f6cbed..4415beeb0b62 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        pa->pa_free = pa->pa_len;
        atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
+        INIT_LIST_HEAD(&pa->pa_inode_list);
+        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
        pa->pa_linear = 0;
@@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
+        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
        pa->pa_linear = 1;
@@ -4476,23 +4479,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                        pa->pa_free -= ac->ac_b_ex.fe_len;
                        pa->pa_len -= ac->ac_b_ex.fe_len;
                        spin_unlock(&pa->pa_lock);
-                        /*
-                         * We want to add the pa to the right bucket.
-                         * Remove it from the list and while adding
-                         * make sure the list to which we are adding
-                         * doesn't grow big.
-                         */
-                        if (likely(pa->pa_free)) {
-                                spin_lock(pa->pa_obj_lock);
-                                list_del_rcu(&pa->pa_inode_list);
-                                spin_unlock(pa->pa_obj_lock);
-                                ext4_mb_add_n_trim(ac);
-                        }
                }
-                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
        if (ac->alloc_semp)
                up_read(ac->alloc_semp);
+        if (pa) {
+                /*
+                 * We want to add the pa to the right bucket.
+                 * Remove it from the list and while adding
+                 * make sure the list to which we are adding
+                 * doesn't grow big.  We need to release
+                 * alloc_semp before calling ext4_mb_add_n_trim()
+                 */
+                if (pa->pa_linear && likely(pa->pa_free)) {
+                        spin_lock(pa->pa_obj_lock);
+                        list_del_rcu(&pa->pa_inode_list);
+                        spin_unlock(pa->pa_obj_lock);
+                        ext4_mb_add_n_trim(ac);
+                }
+                ext4_mb_put_pa(ac, ac->ac_sb, pa);
+        }
        if (ac->ac_bitmap_page)
                page_cache_release(ac->ac_bitmap_page);
        if (ac->ac_buddy_page)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca25e35..fe64d9f79852 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
                                        + 1);
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
-                goto err_out;
+                return retval;
        }
        tmp_inode = ext4_new_inode(handle,
                                inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
        if (IS_ERR(tmp_inode)) {
                retval = -ENOMEM;
                ext4_journal_stop(handle);
-                tmp_inode = NULL;
+                return retval;
-                goto err_out;
        }
        i_size_write(tmp_inode, i_size_read(inode));
        /*
@@ -618,8 +617,7 @@ err_out:
        ext4_journal_stop(handle);
-        if (tmp_inode)
+        iput(tmp_inode);
-                iput(tmp_inode);
        return retval;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045e..a5732c58f676 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb)
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
        int ret = 0;
+        tid_t target;
        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
        sb->s_dirt = 0;
        if (EXT4_SB(sb)->s_journal) {
-                if (wait)
+                if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
-                        ret = ext4_force_commit(sb);
+                                              &target)) {
-                else
+                        if (wait)
-                        jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+                                jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
+                                                     target);
+                }
        } else {
                ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
        }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb343008eded..58144102bf25 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
 */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 /*
 * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
 */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
        if (journal->j_running_transaction) {
                tid_t tid = journal->j_running_transaction->t_tid;
-                ret = __jbd2_log_start_commit(journal, tid);
+                __jbd2_log_start_commit(journal, tid);
-                if (ret && ptid)
+                /* There's a running transaction and we've just made sure
+                 * its commit has been scheduled. */
+                if (ptid)
                        *ptid = tid;
-        } else if (journal->j_committing_transaction && ptid) {
+                ret = 1;
+        } else if (journal->j_committing_transaction) {
                /*
                 * If ext3_write_super() recently started a commit, then we
                 * have to wait for completion of that transaction
                 */
-                *ptid = journal->j_committing_transaction->t_tid;
+                if (ptid)
+                        *ptid = journal->j_committing_transaction->t_tid;
                ret = 1;
        }
        spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7d..28ce21d8598e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
 }
 /*
- * This function must be called when inode is journaled in ordered mode
+ * File truncate and transaction commit interact with each other in a
- * before truncation happens. It starts writeout of truncated part in
+ * non-trivial way.  If a transaction writing data block A is
- * case it is in the committing transaction so that we stand to ordered
+ * committing, we cannot discard the data by truncate until we have
- * mode consistency guarantees.
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
 */
-int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+                                        struct jbd2_inode *jinode,
                                        loff_t new_size)
 {
-        journal_t *journal;
+        transaction_t *inode_trans, *commit_trans;
-        transaction_t *commit_trans;
        int ret = 0;
-        if (!inode->i_transaction && !inode->i_next_transaction)
+        /* This is a quick check to avoid locking if not necessary */
+        if (!jinode->i_transaction)
                goto out;
-        journal = inode->i_transaction->t_journal;
+        /* Locks are here just to force reading of recent values, it is
+         * enough that the transaction was not committing before we started
+         * a transaction adding the inode to orphan list */
        spin_lock(&journal->j_state_lock);
        commit_trans = journal->j_committing_transaction;
        spin_unlock(&journal->j_state_lock);
-        if (inode->i_transaction == commit_trans) {
+        spin_lock(&journal->j_list_lock);
-                ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+        inode_trans = jinode->i_transaction;
+        spin_unlock(&journal->j_list_lock);
+        if (inode_trans == commit_trans) {
+                ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
                        new_size, LLONG_MAX);
                if (ret)
                        jbd2_journal_abort(journal, ret);
diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4bfd18..06f8e63f6cb1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt)
         */
        for_each_possible_cpu(cpu) {
                struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-                if (cpu_writer->mnt != mnt)
-                        continue;
                spin_lock(&cpu_writer->lock);
+                if (cpu_writer->mnt != mnt) {
+                        spin_unlock(&cpu_writer->lock);
+                        continue;
+                }
                atomic_add(cpu_writer->count, &mnt->__mnt_writers);
                cpu_writer->count = 0;
                /*
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..331f2e88e284 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
        int ret;
        do {
-                if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+                if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
                        return -ENOSPC;
                ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
        } while (ret == -EAGAIN);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307c..172850a9a12a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
 {
-        return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
+        return jbd2_journal_begin_ordered_truncate(
-                                                   new_size);
+                                OCFS2_SB(inode->i_sb)->journal->j_journal,
+                                &OCFS2_I(inode)->ip_jinode,
+                                new_size);
 }
 #endif /* OCFS2_JOURNAL_H */
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3e76bb9b3ad6..d8bb5c671f42 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
                        }
                }
                unlock_new_inode(inode);
-        } else
+        } else {
               module_put(de->owner);
+               de_put(de);
+        }
        return inode;
 out_ino:
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5267098532bf..a1a4cfe19210 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op)
         */
        file->f_version = 0;
-        /* SEQ files support lseek, but not pread/pwrite */
+        /*
-        file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
+         * seq_files support lseek() and pread().  They do not implement
+         * write() at all, but we clear FMODE_PWRITE here for historical
+         * reasons.
+         *
+         * If a client of seq_files a) implements file.write() and b) wishes to
+         * support pwrite() then that client will need to implement its own
+         * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+         */
+        file->f_mode &= ~FMODE_PWRITE;
        return 0;
 }
 EXPORT_SYMBOL(seq_open);
@@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
        int err = 0;
        mutex_lock(&m->lock);
+        /* Don't assume *ppos is where we left it */
+        if (unlikely(*ppos != m->read_pos)) {
+                m->read_pos = *ppos;
+                while ((err = traverse(m, *ppos)) == -EAGAIN)
+                        ;
+                if (err) {
+                        /* With prejudice... */
+                        m->read_pos = 0;
+                        m->version = 0;
+                        m->index = 0;
+                        m->count = 0;
+                        goto Done;
+                }
+        }
        /*
         * seq_file->op->..m_start/m_stop/m_next may do special actions
         * or optimisations based on the file->f_version, so we want to
@@ -230,8 +254,10 @@ Fill:
 Done:
        if (!copied)
                copied = err;
-        else
+        else {
                *ppos += copied;
+                m->read_pos += copied;
+        }
        file->f_version = m->version;
        mutex_unlock(&m->lock);
        return copied;
@@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
                        if (offset < 0)
                                break;
                        retval = offset;
-                        if (offset != file->f_pos) {
+                        if (offset != m->read_pos) {
                                while ((retval=traverse(m, offset)) == -EAGAIN)
                                        ;
                                if (retval) {
                                        /* with extreme prejudice... */
                                        file->f_pos = 0;
+                                        m->read_pos = 0;
                                        m->version = 0;
                                        m->index = 0;
                                        m->count = 0;
                                } else {
+                                        m->read_pos = offset;
                                        retval = file->f_pos = offset;
                                }
                        }
diff --git a/fs/super.c b/fs/super.c
index 61dce001dd57..8349ed6b1412 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
                 * lock ordering than usbfs:
                 */
                lockdep_set_class(&s->s_lock, &type->s_lock_key);
-                down_write(&s->s_umount);
+                /*
+                 * sget() can have s_umount recursion.
+                 *
+                 * When it cannot find a suitable sb, it allocates a new
+                 * one (this one), and tries again to find a suitable old
+                 * one.
+                 *
+                 * In case that succeeds, it will acquire the s_umount
+                 * lock of the old one. Since these are clearly distinct
+                 * locks, and this object isn't exposed yet, there's no
+                 * risk of deadlocks.
+                 *
+                 * Annotate this by putting this lock in a different
+                 * subclass.
+                 */
+                down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
                s->s_count = S_BIAS;
                atomic_set(&s->s_active, 1);
                mutex_init(&s->s_vfs_rename_mutex);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f5..b042bd7034b1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
        BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
-        if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
+        if ((flags & ~TFD_CREATE_FLAGS) ||
-                return -EINVAL;
+            (clockid != CLOCK_MONOTONIC &&
-        if (clockid != CLOCK_MONOTONIC &&
+             clockid != CLOCK_REALTIME))
-            clockid != CLOCK_REALTIME)
                return -EINVAL;
        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
        hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
        ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
-                               flags & (O_CLOEXEC | O_NONBLOCK));
+                               flags & TFD_SHARED_FCNTL_FLAGS);
        if (ufd < 0)
                kfree(ctx);
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
        if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
                return -EFAULT;
-        if (!timespec_valid(&ktmr.it_value) ||
+        if ((flags & ~TFD_SETTIME_FLAGS) ||
+            !timespec_valid(&ktmr.it_value) ||
            !timespec_valid(&ktmr.it_interval))
                return -EINVAL;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d71dc44e21ed..cb329edc925b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -166,6 +166,75 @@ test_page_region(
 }
 /*
+ *      Mapping of multi-page buffers into contiguous virtual space
+ */
+typedef struct a_list {
+        void            *vm_addr;       /* address later handed to vunmap() */
+        struct a_list   *next;          /* next queued entry, NULL at the tail */
+} a_list_t;
+static a_list_t         *as_free_head;  /* head of the queued-address list */
+static int              as_list_len;    /* entries currently on as_free_head */
+static DEFINE_SPINLOCK(as_lock);        /* held while updating the two above */
+/*
+ *      Try to batch vunmaps because they are costly.
+ *
+ *      Rather than unmapping @addr right away, record it in a freshly
+ *      allocated a_list_t and push that onto the as_free_head list so
+ *      that purge_addresses() can vunmap() it later.  If the list entry
+ *      cannot be allocated there is nothing to queue the address on, so
+ *      it is unmapped immediately instead.  With CONFIG_XEN the address
+ *      is always unmapped immediately and nothing is ever queued.
+ */
+STATIC void
+free_address(
+        void            *addr)
+{
+        a_list_t        *aentry;
+#ifdef CONFIG_XEN
+        /*
+         * Xen needs to be able to make sure it can get an exclusive
+         * RO mapping of pages it wants to turn into a pagetable.  If
+         * a newly allocated page is also still being vmap()ed by xfs,
+         * it will cause pagetable construction to fail.  This is a
+         * quick workaround to always eagerly unmap pages so that Xen
+         * is happy.
+         */
+        vunmap(addr);
+        return;         /* everything below is skipped when CONFIG_XEN is set */
+#endif
+        aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
+        if (likely(aentry)) {
+                spin_lock(&as_lock);
+                aentry->next = as_free_head;    /* push on the front of the list */
+                aentry->vm_addr = addr;
+                as_free_head = aentry;
+                as_list_len++;
+                spin_unlock(&as_lock);
+        } else {
+                vunmap(addr);   /* no entry to queue on: unmap now */
+        }
+}
+STATIC void
+purge_addresses(void)   /* vunmap() every address queued by free_address() */
+{
+        a_list_t        *aentry, *old;
+        if (as_free_head == NULL)       /* read without taking as_lock */
+                return;
+        spin_lock(&as_lock);
+        aentry = as_free_head;          /* detach the whole list under the lock */
+        as_free_head = NULL;
+        as_list_len = 0;
+        spin_unlock(&as_lock);
+        while ((old = aentry) != NULL) {        /* walk it with the lock dropped */
+                vunmap(aentry->vm_addr);
+                aentry = aentry->next;  /* step past the entry before freeing it */
+                kfree(old);
+        }
+}
+/*
 *      Internal xfs_buf_t object manipulation
 */
@@ -264,7 +333,7 @@ xfs_buf_free(
                uint            i;
                if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
+                        free_address(bp->b_addr - bp->b_offset);
                for (i = 0; i < bp->b_page_count; i++) {
                        struct page     *page = bp->b_pages[i];
@@ -386,8 +455,10 @@ _xfs_buf_map_pages(
                bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
                bp->b_flags |= XBF_MAPPED;
        } else if (flags & XBF_MAPPED) {
-               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+                if (as_list_len > 64)
-                                       -1, PAGE_KERNEL);
+                        purge_addresses();
+                bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
+                                        VM_MAP, PAGE_KERNEL);
                if (unlikely(bp->b_addr == NULL))
                        return -ENOMEM;
                bp->b_addr += bp->b_offset;
@@ -1672,6 +1743,8 @@ xfsbufd(
                        count++;
                }
+                if (as_list_len > 0)
+                        purge_addresses();
                if (count)
                        blk_run_address_space(target->bt_mapping);