95 files changed, 1682 insertions, 620 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 38bc735c67a..dc20db34867 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,10 +69,12 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)       += reiserfs/
 obj-$(CONFIG_EXT3_FS)           += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS)           += ext4/ # Before ext2 so root fs can be ext4
+obj-$(CONFIG_EXT2_FS)           += ext2/
+# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
+# unless explicitly requested by rootfstype
+obj-$(CONFIG_EXT4_FS)           += ext4/
 obj-$(CONFIG_JBD)               += jbd/
 obj-$(CONFIG_JBD2)              += jbd2/
-obj-$(CONFIG_EXT2_FS)           += ext2/
 obj-$(CONFIG_CRAMFS)            += cramfs/
 obj-$(CONFIG_SQUASHFS)          += squashfs/
 obj-y                           += ramfs/
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 549b0144da1..fe2b1aa2464 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split);
 * bio_integrity_clone - Callback for cloning bios with integrity metadata
 * @bio:        New bio
 * @bio_src:    Original bio
+ * @gfp_mask:   Memory allocation mask
 * @bs:         bio_set to allocate bip from
 *
 * Description: Called to allocate a bip when cloning a bio
 */
 int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
-                        struct bio_set *bs)
+                        gfp_t gfp_mask, struct bio_set *bs)
 {
        struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
        struct bio_integrity_payload *bip;
        BUG_ON(bip_src == NULL);
-        bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+        bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
        if (bip == NULL)
                return -EIO;
diff --git a/fs/bio.c b/fs/bio.c
index 062299acbcc..d4f06327c81 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -302,9 +302,10 @@ void bio_init(struct bio *bio)
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
        struct bio *bio = NULL;
+        void *uninitialized_var(p);
        if (bs) {
-                void *p = mempool_alloc(bs->bio_pool, gfp_mask);
+                p = mempool_alloc(bs->bio_pool, gfp_mask);
                if (p)
                        bio = p + bs->front_pad;
@@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
                        }
                        if (unlikely(!bvl)) {
                                if (bs)
-                                        mempool_free(bio, bs->bio_pool);
+                                        mempool_free(p, bs->bio_pool);
                                else
                                        kfree(bio);
                                bio = NULL;
@@ -462,10 +463,12 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
        if (bio_integrity(bio)) {
                int ret;
-                ret = bio_integrity_clone(b, bio, fs_bio_set);
+                ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
-                if (ret < 0)
+                if (ret < 0) {
+                        bio_put(b);
                        return NULL;
+                }
        }
        return b;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693b75a..72677ce2b74 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,9 @@ struct btrfs_inode {
         */
        struct list_head delalloc_inodes;
+        /* the space_info for where this inode's data allocations are done */
+        struct btrfs_space_info *space_info;
        /* full 64 bit generation number, struct vfs_inode doesn't have a big
         * enough field for this.
         */
@@ -94,6 +97,11 @@ struct btrfs_inode {
         */
        u64 delalloc_bytes;
+        /* total number of bytes that may be used for this inode for
+         * delalloc
+         */
+        u64 reserved_bytes;
        /*
         * the size of the file stored in the metadata on disk.  data=ordered
         * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35443cc4b9a..37f31b5529a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int level, int slot);
-inline void btrfs_init_path(struct btrfs_path *p)
-{
-        memset(p, 0, sizeof(*p));
-}
+/*
+ * allocate a btrfs_path from btrfs_path_cachep.  The path comes back zeroed
+ * (kmem_cache_zalloc) with reada set to 1.  Returns NULL if the allocation
+ * fails.
+ */
 struct btrfs_path *btrfs_alloc_path(void)
 {
        struct btrfs_path *path;
-        path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
+        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-        if (path) {
+        if (path)
-                btrfs_init_path(path);
                path->reada = 1;
-        }
        return path;
 }
@@ -69,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 /*
 * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path.  You can safely use NULL
+ * for held
 */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+                                        struct extent_buffer *held)
 {
        int i;
-        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        /* lockdep really cares that we take all of these spinlocks
+         * in the right order.  If any of the locks in the path are not
+         * currently blocking, it is going to complain.  So, make really
+         * really sure by forcing the path to blocking before we clear
+         * the path blocking.
+         */
+        if (held)
+                btrfs_set_lock_blocking(held);
+        btrfs_set_path_blocking(p);
+#endif
+        /* walk from the highest level down to level 0, skipping slots
+         * that hold no node or no lock
+         */
+        for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
                if (p->nodes[i] && p->locks[i])
                        btrfs_clear_lock_blocking(p->nodes[i]);
        }
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        /* held was only switched to blocking for lockdep's sake above,
+         * so switch it back now that the path has been handled
+         */
+        if (held)
+                btrfs_clear_lock_blocking(held);
+#endif
 }
 /* this also releases the path */
@@ -260,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        if (*cow_ret == buf)
                unlock_orig = 1;
-        WARN_ON(!btrfs_tree_locked(buf));
+        btrfs_assert_tree_locked(buf);
        if (parent)
                parent_start = parent->start;
@@ -286,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                                                  trans->transid, level, &ins);
                BUG_ON(ret);
                cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-                                            buf->len);
+                                            buf->len, level);
        } else {
                cow = btrfs_alloc_free_block(trans, root, buf->len,
                                             parent_start,
@@ -917,9 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                /* promote the child to a root */
                child = read_node_slot(root, mid, 0);
+                BUG_ON(!child);
                btrfs_tree_lock(child);
                btrfs_set_lock_blocking(child);
-                BUG_ON(!child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
                BUG_ON(ret);
@@ -1566,7 +1583,7 @@ cow_done:
                if (!p->skip_locking)
                        p->locks[level] = 1;
-                btrfs_clear_path_blocking(p);
+                btrfs_clear_path_blocking(p, NULL);
                /*
                 * we have a lock on b and as long as we aren't changing
@@ -1605,7 +1622,7 @@ cow_done:
                                btrfs_set_path_blocking(p);
                                sret = split_node(trans, root, p, level);
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                BUG_ON(sret > 0);
                                if (sret) {
@@ -1625,7 +1642,7 @@ cow_done:
                                btrfs_set_path_blocking(p);
                                sret = balance_level(trans, root, p, level);
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                if (sret) {
                                        ret = sret;
@@ -1688,13 +1705,13 @@ cow_done:
                        if (!p->skip_locking) {
                                int lret;
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                lret = btrfs_try_spin_lock(b);
                                if (!lret) {
                                        btrfs_set_path_blocking(p);
                                        btrfs_tree_lock(b);
-                                        btrfs_clear_path_blocking(p);
+                                        btrfs_clear_path_blocking(p, b);
                                }
                        }
                } else {
@@ -1706,7 +1723,7 @@ cow_done:
                                btrfs_set_path_blocking(p);
                                sret = split_leaf(trans, root, key,
                                                      p, ins_len, ret == 0);
-                                btrfs_clear_path_blocking(p);
+                                btrfs_clear_path_blocking(p, NULL);
                                BUG_ON(sret > 0);
                                if (sret) {
@@ -2348,7 +2365,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        if (slot >= btrfs_header_nritems(upper) - 1)
                return 1;
-        WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+        btrfs_assert_tree_locked(path->nodes[1]);
        right = read_node_slot(root, upper, slot + 1);
        btrfs_tree_lock(right);
@@ -2545,7 +2562,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        if (right_nritems == 0)
                return 1;
-        WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+        btrfs_assert_tree_locked(path->nodes[1]);
        left = read_node_slot(root, path->nodes[1], slot - 1);
        btrfs_tree_lock(left);
@@ -3926,7 +3943,6 @@ find_next_key:
                                btrfs_release_path(root, path);
                                goto again;
                        } else {
-                                btrfs_clear_path_blocking(path);
                                goto out;
                        }
                }
@@ -3946,7 +3962,7 @@ find_next_key:
                path->locks[level - 1] = 1;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1);
-                btrfs_clear_path_blocking(path);
+                btrfs_clear_path_blocking(path, NULL);
        }
 out:
        if (ret == 0)
@@ -4085,7 +4101,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                next = read_node_slot(root, c, slot);
                if (!path->skip_locking) {
-                        WARN_ON(!btrfs_tree_locked(c));
+                        btrfs_assert_tree_locked(c);
                        btrfs_tree_lock(next);
                        btrfs_set_lock_blocking(next);
                }
@@ -4110,7 +4126,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                        reada_for_search(root, path, level, slot, 0);
                next = read_node_slot(root, next, 0);
                if (!path->skip_locking) {
-                        WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+                        btrfs_assert_tree_locked(path->nodes[level]);
                        btrfs_tree_lock(next);
                        btrfs_set_lock_blocking(next);
                }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 531db112c8b..82491ba8fa4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
-#ifdef CONFIG_LOCKDEP
+#define BTRFS_MAX_LEVEL 8
-# define BTRFS_MAX_LEVEL 7
-#else
-# define BTRFS_MAX_LEVEL 8
-#endif
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -600,13 +596,27 @@ struct btrfs_block_group_item {
 struct btrfs_space_info {
        u64 flags;
-        u64 total_bytes;
-        u64 bytes_used;
+        u64 total_bytes;        /* total bytes in the space */
-        u64 bytes_pinned;
+        u64 bytes_used;         /* total bytes used on disk */
-        u64 bytes_reserved;
+        u64 bytes_pinned;       /* total bytes pinned, will be freed when the
-        u64 bytes_readonly;
+                                   transaction finishes */
-        int full;
+        u64 bytes_reserved;     /* total bytes the allocator has reserved for
-        int force_alloc;
+                                   current allocations */
+        u64 bytes_readonly;     /* total bytes that are read only */
+        /* delalloc accounting */
+        u64 bytes_delalloc;     /* number of bytes reserved for allocation,
+                                   this space is not necessarily reserved yet
+                                   by the allocator */
+        u64 bytes_may_use;      /* number of bytes that may be used for
+                                   delalloc */
+        int full;               /* indicates that we cannot allocate any more
+                                   chunks for this space */
+        int force_alloc;        /* set if we need to force a chunk alloc for
+                                   this space */
        struct list_head list;
        /* for block groups in our same type */
@@ -1715,7 +1725,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                             u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
-                                            u64 bytenr, u32 blocksize);
+                                            u64 bytenr, u32 blocksize,
+                                            int level);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
                       u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1785,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+/* free space checks and delalloc space accounting */
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
+int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+                                u64 bytes);
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+                                    struct inode *inode, u64 bytes);
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+                                 u64 bytes);
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+                              u64 bytes);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
                        struct btrfs_path *path, u64 min_objectid,
@@ -1834,9 +1855,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
-void btrfs_init_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2032,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
                              struct file_ra_state *ra, struct file *file,
                              pgoff_t offset, pgoff_t last_index);
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-                           int for_del);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5aebddd7119..3e18175248e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -75,6 +75,40 @@ struct async_submit_bio {
        struct btrfs_work work;
 };
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook as soon as the level has
+ * been read from the block header, which is before the csum of the block
+ * is verified and before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
+ * code needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/* the name table below is written out by hand for levels 0 through 8, so
+ * refuse to build if BTRFS_MAX_LEVEL is anything else
+ */
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+/* one lockdep class key and one class name per tree level, both indexed by
+ * the level stored in the block; valid indexes are 0..BTRFS_MAX_LEVEL
+ */
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+        /* leaf */
+        "btrfs-extent-00",
+        "btrfs-extent-01",
+        "btrfs-extent-02",
+        "btrfs-extent-03",
+        "btrfs-extent-04",
+        "btrfs-extent-05",
+        "btrfs-extent-06",
+        "btrfs-extent-07",
+        /* highest possible level */
+        "btrfs-extent-08",
+};
+#endif
 /*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
@@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
        return ret;
 }
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * put eb->lock into the lockdep class (and name) that belongs to the given
+ * tree level.
+ *
+ * level is used as an index into btrfs_eb_class and btrfs_eb_name, which
+ * both have BTRFS_MAX_LEVEL + 1 entries.  The read end_io hook passes the
+ * level straight from the block header before the block has been csum
+ * checked, so a damaged block can carry any value here.  Levels outside of
+ * the tables are ignored and the lock keeps whatever class it already had.
+ */
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+        if (level < 0 || level > BTRFS_MAX_LEVEL)
+                return;
+        lockdep_set_class_and_name(&eb->lock,
+                           &btrfs_eb_class[level],
+                           btrfs_eb_name[level]);
+}
+#endif
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state)
 {
@@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        }
        found_level = btrfs_header_level(eb);
+        btrfs_set_buffer_lockdep_class(eb, found_level);
        ret = csum_tree_block(root, eb, 1);
        if (ret)
                ret = -EIO;
@@ -812,7 +857,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        struct inode *btree_inode = root->fs_info->btree_inode;
        if (btrfs_header_generation(buf) ==
            root->fs_info->running_transaction->transid) {
-                WARN_ON(!btrfs_tree_locked(buf));
+                btrfs_assert_tree_locked(buf);
                /* ugh, clear_extent_buffer_dirty can be expensive */
                btrfs_set_lock_blocking(buf);
@@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
        dev_root->track_dirty = 1;
        if (ret)
                goto fail_extent_root;
@@ -2317,7 +2361,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
        btrfs_set_lock_blocking(buf);
-        WARN_ON(!btrfs_tree_locked(buf));
+        btrfs_assert_tree_locked(buf);
        if (transid != root->fs_info->generation) {
                printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
                       "found %llu running %llu\n",
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 494a56eb298..95029db227b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
+/*
+ * set the lockdep class of an extent buffer lock based on its tree level.
+ * Without CONFIG_DEBUG_LOCK_ALLOC this is an empty inline, so callers do
+ * not need their own ifdefs.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+                                                 int level)
+{
+}
+#endif
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7527523c2d2..9abf81f71c4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free);
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *extent_root, u64 alloc_bytes,
+                          u64 flags, int force);
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
        return (cache->flags & bits) == bits;
@@ -1323,8 +1327,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+/*
+ * run finish_current_insert and del_pending_extents on the extent root, and
+ * keep repeating both for as long as the pending_del or extent_ins trees
+ * still have a range with EXTENT_WRITEBACK set.  start and end only receive
+ * the range that was found and are not used otherwise.  Always returns 0.
+ *
+ * NOTE(review): this relies on find_first_extent_bit returning 0 when it
+ * found a matching range - confirm against its definition.
+ */
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root)
 {
-        finish_current_insert(trans, root->fs_info->extent_root, 1);
+        u64 start;
-        del_pending_extents(trans, root->fs_info->extent_root, 1);
+        u64 end;
+        int ret;
+        while(1) {
+                finish_current_insert(trans, root->fs_info->extent_root, 1);
+                del_pending_extents(trans, root->fs_info->extent_root, 1);
+                /* is there more work to do? */
+                ret = find_first_extent_bit(&root->fs_info->pending_del,
+                                            0, &start, &end, EXTENT_WRITEBACK);
+                if (!ret)
+                        continue;
+                ret = find_first_extent_bit(&root->fs_info->extent_ins,
+                                            0, &start, &end, EXTENT_WRITEBACK);
+                if (!ret)
+                        continue;
+                /* neither tree has anything left, we are done */
+                break;
+        }
        return 0;
 }
@@ -1892,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_pinned = 0;
        found->bytes_reserved = 0;
        found->bytes_readonly = 0;
+        found->bytes_delalloc = 0;
        found->full = 0;
        found->force_alloc = 0;
        *space_info = found;
@@ -1955,6 +1977,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        return flags;
 }
+/*
+ * build the block group flags to allocate with.  A non-zero data asks for
+ * the data profile; otherwise the chunk root gets the system profile and
+ * every other root gets the metadata profile.  In each case only the
+ * profile bits that are also in the matching avail_*_alloc_bits mask are
+ * kept, and the result is passed through btrfs_reduce_alloc_profile.
+ */
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        u64 flags;
+        if (data)
+                flags = BTRFS_BLOCK_GROUP_DATA |
+                        (fs_info->avail_data_alloc_bits &
+                         fs_info->data_alloc_profile);
+        else if (root == fs_info->chunk_root)
+                flags = BTRFS_BLOCK_GROUP_SYSTEM |
+                        (fs_info->avail_system_alloc_bits &
+                         fs_info->system_alloc_profile);
+        else
+                flags = BTRFS_BLOCK_GROUP_METADATA |
+                        (fs_info->avail_metadata_alloc_bits &
+                         fs_info->metadata_alloc_profile);
+        return btrfs_reduce_alloc_profile(root, flags);
+}
+/*
+ * look up the space_info that matches the data allocation profile of this
+ * root and remember it in the inode, so data space accounting for the
+ * inode can find it later.
+ */
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+{
+        struct btrfs_space_info *sinfo;
+        u64 data_profile;
+        data_profile = btrfs_get_alloc_profile(root, 1);
+        sinfo = __find_space_info(root->fs_info, data_profile);
+        BTRFS_I(inode)->space_info = sinfo;
+}
+/*
+ * make sure the metadata space is not getting too full.
+ *
+ * Usage is counted as bytes_used + bytes_reserved + bytes_pinned +
+ * bytes_readonly.  While the space_info is not marked full the limit is
+ * 80% of total_bytes; going over it asks for a new chunk and redoes the
+ * check.  Once the space is full the limit is 95%, so at least 5% has to
+ * stay free: the transaction is committed once to try to get back under
+ * the limit, and if that does not help -ENOSPC is returned.
+ *
+ * Returns 0 if there is enough room, -ENOMEM if no transaction handle
+ * could be obtained, the error from btrfs_commit_transaction, or -ENOSPC.
+ */
+int btrfs_check_metadata_free_space(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_space_info *meta_sinfo;
+        u64 alloc_target, thresh;
+        int committed = 0, ret;
+        /* get the space info for where the metadata will live */
+        alloc_target = btrfs_get_alloc_profile(root, 0);
+        meta_sinfo = __find_space_info(info, alloc_target);
+again:
+        spin_lock(&meta_sinfo->lock);
+        /* 80% while more chunks can still be added, 95% once full */
+        if (!meta_sinfo->full)
+                thresh = meta_sinfo->total_bytes * 80;
+        else
+                thresh = meta_sinfo->total_bytes * 95;
+        do_div(thresh, 100);
+        if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+            meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+                struct btrfs_trans_handle *trans;
+                if (!meta_sinfo->full) {
+                        meta_sinfo->force_alloc = 1;
+                        /* drop the lock before starting a transaction */
+                        spin_unlock(&meta_sinfo->lock);
+                        trans = btrfs_start_transaction(root, 1);
+                        if (!trans)
+                                return -ENOMEM;
+                        /*
+                         * NOTE(review): ret is not checked here, unlike in
+                         * btrfs_check_data_free_space - confirm that a
+                         * failed chunk allocation cannot make this loop
+                         * forever
+                         */
+                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                             2 * 1024 * 1024, alloc_target, 0);
+                        btrfs_end_transaction(trans, root);
+                        goto again;
+                }
+                spin_unlock(&meta_sinfo->lock);
+                /* commit at most once, then check again */
+                if (!committed) {
+                        committed = 1;
+                        trans = btrfs_join_transaction(root, 1);
+                        if (!trans)
+                                return -ENOMEM;
+                        ret = btrfs_commit_transaction(trans, root);
+                        if (ret)
+                                return ret;
+                        goto again;
+                }
+                return -ENOSPC;
+        }
+        spin_unlock(&meta_sinfo->lock);
+        return 0;
+}
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ *
+ * bytes is rounded up to a multiple of root->sectorsize.  If the inode's
+ * space_info does not have that much room left, a new chunk is requested
+ * while the space is not full; once it is full the transaction is committed
+ * once and the check is retried before giving up.
+ *
+ * On success the rounded size is added to the space_info's bytes_may_use and
+ * to the inode's reserved_bytes, and the result of
+ * btrfs_check_metadata_free_space is returned.  Otherwise this returns
+ * -ENOMEM if no transaction handle could be obtained, the error from
+ * do_chunk_alloc or btrfs_commit_transaction, or -ENOSPC.
+ *
+ * NOTE(review): when the metadata check fails the data reservation made here
+ * is still in place; presumably the caller drops it with
+ * btrfs_free_reserved_data_space - confirm against the callers.
+ */
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+                                u64 bytes)
+{
+        struct btrfs_space_info *data_sinfo;
+        int ret = 0, committed = 0;
+        /* make sure bytes are sectorsize aligned */
+        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+        data_sinfo = BTRFS_I(inode)->space_info;
+again:
+        /* make sure we have enough space to handle the data first */
+        spin_lock(&data_sinfo->lock);
+        if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+            data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+            data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+            data_sinfo->bytes_may_use < bytes) {
+                struct btrfs_trans_handle *trans;
+                /*
+                 * if we don't have enough free bytes in this space then we need
+                 * to alloc a new chunk.
+                 */
+                if (!data_sinfo->full) {
+                        u64 alloc_target;
+                        data_sinfo->force_alloc = 1;
+                        spin_unlock(&data_sinfo->lock);
+                        alloc_target = btrfs_get_alloc_profile(root, 1);
+                        trans = btrfs_start_transaction(root, 1);
+                        if (!trans)
+                                return -ENOMEM;
+                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                             bytes + 2 * 1024 * 1024,
+                                             alloc_target, 0);
+                        btrfs_end_transaction(trans, root);
+                        if (ret)
+                                return ret;
+                        goto again;
+                }
+                spin_unlock(&data_sinfo->lock);
+                /* commit the current transaction and try again */
+                if (!committed) {
+                        committed = 1;
+                        trans = btrfs_join_transaction(root, 1);
+                        if (!trans)
+                                return -ENOMEM;
+                        ret = btrfs_commit_transaction(trans, root);
+                        if (ret)
+                                return ret;
+                        goto again;
+                }
+                /*
+                 * the lock was dropped above, so the counters printed here
+                 * are only a snapshot
+                 */
+                printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+                       ", %llu bytes_used, %llu bytes_reserved, "
+                       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+                       ", %llu total\n", bytes, data_sinfo->bytes_delalloc,
+                       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
+                       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
+                       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+                return -ENOSPC;
+        }
+        /* record the reservation while still holding the lock */
+        data_sinfo->bytes_may_use += bytes;
+        BTRFS_I(inode)->reserved_bytes += bytes;
+        spin_unlock(&data_sinfo->lock);
+        return btrfs_check_metadata_free_space(root);
+}
+/*
+ * undo a reservation made by btrfs_check_data_free_space when something
+ * failed afterwards.  bytes is rounded up to a multiple of the sectorsize
+ * and then taken back out of the space_info's bytes_may_use and the inode's
+ * reserved_bytes, under the space_info lock.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+                                    struct inode *inode, u64 bytes)
+{
+        struct btrfs_inode *binode = BTRFS_I(inode);
+        struct btrfs_space_info *sinfo = binode->space_info;
+        u64 sector_mask = (u64)root->sectorsize - 1;
+        u64 aligned = (bytes + sector_mask) & ~sector_mask;
+        spin_lock(&sinfo->lock);
+        sinfo->bytes_may_use -= aligned;
+        binode->reserved_bytes -= aligned;
+        spin_unlock(&sinfo->lock);
+}
+/*
+ * called when a delalloc extent is added to the inode's io_tree.  The bytes
+ * move into the space_info's bytes_delalloc counter and are taken out of
+ * what was set aside in bytes_may_use / the inode's reserved_bytes.
+ */
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+                                  u64 bytes)
+{
+        struct btrfs_inode *binode = BTRFS_I(inode);
+        /* the space info for where this inode stores its data */
+        struct btrfs_space_info *sinfo = binode->space_info;
+        u64 from_reserve = bytes;
+        spin_lock(&sinfo->lock);
+        sinfo->bytes_delalloc += bytes;
+        /*
+         * a delalloc extent can show up without btrfs_check_data_free_space
+         * having been called for it first (a weird writepage condition).
+         * Never take more out of the reservation than the inode has left,
+         * so the counters cannot go below zero.
+         */
+        if (unlikely(from_reserve > binode->reserved_bytes))
+                from_reserve = binode->reserved_bytes;
+        sinfo->bytes_may_use -= from_reserve;
+        binode->reserved_bytes -= from_reserve;
+        spin_unlock(&sinfo->lock);
+}
+/*
+ * called when a delalloc extent is cleared from the inode's io_tree; the
+ * bytes come back out of the space_info's bytes_delalloc counter.
+ */
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+                              u64 bytes)
+{
+        struct btrfs_space_info *sinfo = BTRFS_I(inode)->space_info;
+        spin_lock(&sinfo->lock);
+        sinfo->bytes_delalloc -= bytes;
+        spin_unlock(&sinfo->lock);
+}
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force)
@@ -2211,13 +2460,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
        u64 end;
        u64 priv;
        u64 search = 0;
-        u64 skipped = 0;
        struct btrfs_fs_info *info = extent_root->fs_info;
        struct btrfs_path *path;
        struct pending_extent_op *extent_op, *tmp;
        struct list_head insert_list, update_list;
        int ret;
-        int num_inserts = 0, max_inserts;
+        int num_inserts = 0, max_inserts, restart = 0;
        path = btrfs_alloc_path();
        INIT_LIST_HEAD(&insert_list);
@@ -2233,19 +2481,19 @@ again:
                ret = find_first_extent_bit(&info->extent_ins, search, &start,
                                            &end, EXTENT_WRITEBACK);
                if (ret) {
-                        if (skipped && all && !num_inserts &&
+                        if (restart && !num_inserts &&
                            list_empty(&update_list)) {
-                                skipped = 0;
+                                restart = 0;
                                search = 0;
                                continue;
                        }
-                        mutex_unlock(&info->extent_ins_mutex);
                        break;
                }
                ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
                if (!ret) {
-                        skipped = 1;
+                        if (all)
+                                restart = 1;
                        search = end + 1;
                        if (need_resched()) {
                                mutex_unlock(&info->extent_ins_mutex);
@@ -2264,7 +2512,7 @@ again:
                        list_add_tail(&extent_op->list, &insert_list);
                        search = end + 1;
                        if (num_inserts == max_inserts) {
-                                mutex_unlock(&info->extent_ins_mutex);
+                                restart = 1;
                                break;
                        }
                } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2280,7 +2528,6 @@ again:
         * somebody marked this thing for deletion then just unlock it and be
         * done, the free_extents will handle it
         */
-        mutex_lock(&info->extent_ins_mutex);
        list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
                clear_extent_bits(&info->extent_ins, extent_op->bytenr,
                                  extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2302,6 +2549,10 @@ again:
        if (!list_empty(&update_list)) {
                ret = update_backrefs(trans, extent_root, path, &update_list);
                BUG_ON(ret);
+                /* we may have COW'ed new blocks, so lets start over */
+                if (all)
+                        restart = 1;
        }
        /*
@@ -2309,9 +2560,9 @@ again:
         * need to make sure everything is cleaned then reset everything and
         * go back to the beginning
         */
-        if (!num_inserts && all && skipped) {
+        if (!num_inserts && restart) {
                search = 0;
-                skipped = 0;
+                restart = 0;
                INIT_LIST_HEAD(&update_list);
                INIT_LIST_HEAD(&insert_list);
                goto again;
@@ -2368,27 +2619,19 @@ again:
        BUG_ON(ret);
        /*
-         * if we broke out of the loop in order to insert stuff because we hit
+         * if restart is set for whatever reason we need to go back and start
-         * the maximum number of inserts at a time we can handle, then loop
+         * searching through the pending list again.
-         * back and pick up where we left off
+         *
+         * We just inserted some extents, which could have resulted in new
+         * blocks being allocated, which would result in new blocks needing
+         * updates, so if all is set we _must_ restart to get the updated
+         * blocks.
         */
-        if (num_inserts == max_inserts) {
+        if (restart || all) {
-                INIT_LIST_HEAD(&insert_list);
-                INIT_LIST_HEAD(&update_list);
-                num_inserts = 0;
-                goto again;
-        }
-        /*
-         * again, if we need to make absolutely sure there are no more pending
-         * extent operations left and we know that we skipped some, go back to
-         * the beginning and do it all again
-         */
-        if (all && skipped) {
                INIT_LIST_HEAD(&insert_list);
                INIT_LIST_HEAD(&update_list);
                search = 0;
-                skipped = 0;
+                restart = 0;
                num_inserts = 0;
                goto again;
        }
@@ -2709,6 +2952,8 @@ again:
                goto again;
        }
+        if (!err)
+                finish_current_insert(trans, extent_root, 0);
        return err;
 }
@@ -2859,7 +3104,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        if (data & BTRFS_BLOCK_GROUP_METADATA) {
                last_ptr = &root->fs_info->last_alloc;
-                empty_cluster = 64 * 1024;
+                if (!btrfs_test_opt(root, SSD))
+                        empty_cluster = 64 * 1024;
        }
        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3091,6 +3337,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
               (unsigned long long)(info->total_bytes - info->bytes_used -
                                    info->bytes_pinned - info->bytes_reserved),
               (info->full) ? "" : "not ");
+        printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+               " may_use=%llu, used=%llu\n", info->total_bytes,
+               info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
+               info->bytes_used);
        down_read(&info->groups_sem);
        list_for_each_entry(cache, &info->block_groups, list) {
@@ -3117,24 +3367,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
        u64 search_start = 0;
-        u64 alloc_profile;
        struct btrfs_fs_info *info = root->fs_info;
-        if (data) {
+        data = btrfs_get_alloc_profile(root, data);
-                alloc_profile = info->avail_data_alloc_bits &
-                        info->data_alloc_profile;
-                data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-        } else if (root == root->fs_info->chunk_root) {
-                alloc_profile = info->avail_system_alloc_bits &
-                        info->system_alloc_profile;
-                data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-        } else {
-                alloc_profile = info->avail_metadata_alloc_bits &
-                        info->metadata_alloc_profile;
-                data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-        }
 again:
-        data = btrfs_reduce_alloc_profile(root, data);
        /*
         * the only place that sets empty_size is btrfs_realloc_node, which
         * is not called recursively on allocations
@@ -3402,7 +3638,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
-                                            u64 bytenr, u32 blocksize)
+                                            u64 bytenr, u32 blocksize,
+                                            int level)
 {
        struct extent_buffer *buf;
@@ -3410,6 +3647,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        if (!buf)
                return ERR_PTR(-ENOMEM);
        btrfs_set_header_generation(buf, trans->transid);
+        btrfs_set_buffer_lockdep_class(buf, level);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
@@ -3453,7 +3691,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                return ERR_PTR(ret);
        }
-        buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+                                    blocksize, level);
        return buf;
 }
@@ -4179,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
-        BUG_ON(!btrfs_tree_locked(parent));
+        btrfs_assert_tree_locked(parent);
        parent_level = btrfs_header_level(parent);
        extent_buffer_get(parent);
        path->nodes[parent_level] = parent;
        path->slots[parent_level] = btrfs_header_nritems(parent);
-        BUG_ON(!btrfs_tree_locked(node));
+        btrfs_assert_tree_locked(node);
        level = btrfs_header_level(node);
        extent_buffer_get(node);
        path->nodes[level] = node;
@@ -5641,7 +5880,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
                        prev_block = block_start;
                }
+                mutex_lock(&extent_root->fs_info->trans_mutex);
                btrfs_record_root_in_trans(found_root);
+                mutex_unlock(&extent_root->fs_info->trans_mutex);
                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
                        /*
                         * try to update data extent references while
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 37d43b516b7..ebe6b29e606 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
        node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
        if (node) {
-                struct extent_state *found;
-                found = rb_entry(node, struct extent_state, rb_node);
                free_extent_state(prealloc);
                return -EEXIST;
        }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e8023efaff..dc78954861b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1091,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                WARN_ON(num_pages > nrptrs);
                memset(pages, 0, sizeof(struct page *) * nrptrs);
-                ret = btrfs_check_free_space(root, write_bytes, 0);
+                ret = btrfs_check_data_free_space(root, inode, write_bytes);
                if (ret)
                        goto out;
                ret = prepare_pages(root, file, pages, num_pages,
                                    pos, first_index, last_index,
                                    write_bytes);
-                if (ret)
+                if (ret) {
+                        btrfs_free_reserved_data_space(root, inode,
+                                                       write_bytes);
                        goto out;
+                }
                ret = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, buf);
                if (ret) {
+                        btrfs_free_reserved_data_space(root, inode,
+                                                       write_bytes);
                        btrfs_drop_pages(pages, num_pages);
                        goto out;
                }
@@ -1111,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                ret = dirty_and_release_pages(NULL, root, file, pages,
                                              num_pages, pos, write_bytes);
                btrfs_drop_pages(pages, num_pages);
-                if (ret)
+                if (ret) {
+                        btrfs_free_reserved_data_space(root, inode,
+                                                       write_bytes);
                        goto out;
+                }
                if (will_write) {
                        btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1136,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
        }
 out:
        mutex_unlock(&inode->i_mutex);
+        if (ret)
+                err = ret;
 out_nolock:
        kfree(pages);
@@ -1222,7 +1232,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
        /*
         * ok we haven't committed the transaction yet, lets do a commit
         */
-        if (file->private_data)
+        if (file && file->private_data)
                btrfs_ioctl_trans_end(file);
        trans = btrfs_start_transaction(root, 1);
@@ -1231,7 +1241,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
                goto out;
        }
-        ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+        ret = btrfs_log_dentry_safe(trans, root, dentry);
        if (ret < 0)
                goto out;
@@ -1245,7 +1255,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
-        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+        mutex_unlock(&dentry->d_inode->i_mutex);
        if (ret > 0) {
                ret = btrfs_commit_transaction(trans, root);
@@ -1253,7 +1263,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
                btrfs_sync_log(trans, root);
                ret = btrfs_end_transaction(trans, root);
        }
-        mutex_lock(&file->f_dentry->d_inode->i_mutex);
+        mutex_lock(&dentry->d_inode->i_mutex);
 out:
        return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb4..cc7334d833c 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
        search_key.type = 0;
        search_key.offset = 0;
-        btrfs_init_path(path);
        start_found = 0;
        ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
        if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f0706210a4..7d4f948bc22 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -102,34 +102,6 @@ static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
 }
 /*
- * a very lame attempt at stopping writes when the FS is 85% full.  There
- * are countless ways this is incorrect, but it is better than nothing.
- */
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-                           int for_del)
-{
-        u64 total;
-        u64 used;
-        u64 thresh;
-        int ret = 0;
-        spin_lock(&root->fs_info->delalloc_lock);
-        total = btrfs_super_total_bytes(&root->fs_info->super_copy);
-        used = btrfs_super_bytes_used(&root->fs_info->super_copy);
-        if (for_del)
-                thresh = total * 90;
-        else
-                thresh = total * 85;
-        do_div(thresh, 100);
-        if (used + root->fs_info->delalloc_bytes + num_required > thresh)
-                ret = -ENOSPC;
-        spin_unlock(&root->fs_info->delalloc_lock);
-        return ret;
-}
-/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
@@ -1190,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
         */
        if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
+                btrfs_delalloc_reserve_space(root, inode, end - start + 1);
                spin_lock(&root->fs_info->delalloc_lock);
                BTRFS_I(inode)->delalloc_bytes += end - start + 1;
                root->fs_info->delalloc_bytes += end - start + 1;
@@ -1223,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
                               (unsigned long long)end - start + 1,
                               (unsigned long long)
                               root->fs_info->delalloc_bytes);
+                        btrfs_delalloc_free_space(root, inode, (u64)-1);
                        root->fs_info->delalloc_bytes = 0;
                        BTRFS_I(inode)->delalloc_bytes = 0;
                } else {
+                        btrfs_delalloc_free_space(root, inode,
+                                                  end - start + 1);
                        root->fs_info->delalloc_bytes -= end - start + 1;
                        BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
                }
@@ -2245,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        root = BTRFS_I(dir)->root;
-        ret = btrfs_check_free_space(root, 1, 1);
-        if (ret)
-                goto fail;
        trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, dir);
@@ -2261,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-fail:
        btrfs_btree_balance_dirty(root, nr);
        return ret;
 }
@@ -2284,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
                return -ENOTEMPTY;
        }
-        ret = btrfs_check_free_space(root, 1, 1);
-        if (ret)
-                goto fail;
        trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, dir);
@@ -2304,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 fail_trans:
        nr = trans->blocks_used;
        ret = btrfs_end_transaction_throttle(trans, root);
-fail:
        btrfs_btree_balance_dirty(root, nr);
        if (ret && !err)
@@ -2531,8 +2497,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        key.offset = (u64)-1;
        key.type = (u8)-1;
-        btrfs_init_path(path);
 search_again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
@@ -2820,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
        if (size <= hole_start)
                return 0;
-        err = btrfs_check_free_space(root, 1, 0);
+        err = btrfs_check_metadata_free_space(root);
        if (err)
                return err;
@@ -3016,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode)
        bi->last_trans = 0;
        bi->logged_trans = 0;
        bi->delalloc_bytes = 0;
+        bi->reserved_bytes = 0;
        bi->disk_i_size = 0;
        bi->flags = 0;
        bi->index_cnt = (u64)-1;
@@ -3037,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
        inode->i_ino = args->ino;
        init_btrfs_i(inode);
        BTRFS_I(inode)->root = args->root;
+        btrfs_set_inode_space_info(args->root, inode);
        return 0;
 }
@@ -3457,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        BTRFS_I(inode)->index_cnt = 2;
        BTRFS_I(inode)->root = root;
        BTRFS_I(inode)->generation = trans->transid;
+        btrfs_set_inode_space_info(root, inode);
        if (mode & S_IFDIR)
                owner = 0;
@@ -3604,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (!new_valid_dev(rdev))
                return -EINVAL;
-        err = btrfs_check_free_space(root, 1, 0);
+        err = btrfs_check_metadata_free_space(root);
        if (err)
                goto fail;
@@ -3667,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        u64 objectid;
        u64 index = 0;
-        err = btrfs_check_free_space(root, 1, 0);
+        err = btrfs_check_metadata_free_space(root);
        if (err)
                goto fail;
        trans = btrfs_start_transaction(root, 1);
@@ -3735,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                return -ENOENT;
        btrfs_inc_nlink(inode);
-        err = btrfs_check_free_space(root, 1, 0);
+        err = btrfs_check_metadata_free_space(root);
        if (err)
                goto fail;
        err = btrfs_set_inode_index(dir, &index);
@@ -3781,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        u64 index = 0;
        unsigned long nr = 1;
-        err = btrfs_check_free_space(root, 1, 0);
+        err = btrfs_check_metadata_free_space(root);
        if (err)
                goto out_unlock;
@@ -4263,7 +4230,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
        if (PageWriteback(page) || PageDirty(page))
                return 0;
-        return __btrfs_releasepage(page, gfp_flags);
+        return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4338,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        u64 page_start;
        u64 page_end;
-        ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+        ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
        if (ret)
                goto out;
@@ -4351,6 +4318,7 @@ again:
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
+                btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
                /* page got truncated out from underneath us */
                goto out_unlock;
        }
@@ -4633,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                return -EXDEV;
-        ret = btrfs_check_free_space(root, 1, 0);
+        ret = btrfs_check_metadata_free_space(root);
        if (ret)
                goto out_unlock;
@@ -4751,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
                return -ENAMETOOLONG;
-        err = btrfs_check_free_space(root, 1, 0);
+        err = btrfs_check_metadata_free_space(root);
        if (err)
                goto out_fail;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 988fdc8b49e..bca729fc80c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        u64 index = 0;
        unsigned long nr = 1;
-        ret = btrfs_check_free_space(root, 1, 0);
+        ret = btrfs_check_metadata_free_space(root);
        if (ret)
                goto fail_commit;
@@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        if (!root->ref_cows)
                return -EINVAL;
-        ret = btrfs_check_free_space(root, 1, 0);
+        ret = btrfs_check_metadata_free_space(root);
        if (ret)
                goto fail_unlock;
@@ -374,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
        unsigned long i;
        int ret;
-        ret = btrfs_check_free_space(root, inode->i_size, 0);
+        ret = btrfs_check_data_free_space(root, inode, inode->i_size);
        if (ret)
                return -ENOSPC;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9ebe9385129..47b0a88c12a 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,21 +25,10 @@
 #include "extent_io.h"
 #include "locking.h"
-/*
- * btrfs_header_level() isn't free, so don't call it when lockdep isn't
- * on
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void spin_nested(struct extent_buffer *eb)
-{
-        spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
-}
-#else
 static inline void spin_nested(struct extent_buffer *eb)
 {
        spin_lock(&eb->lock);
 }
-#endif
 /*
 * Setting a lock to blocking will drop the spinlock and set the
@@ -231,8 +220,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
        return 0;
 }
-int btrfs_tree_locked(struct extent_buffer *eb)
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
 {
-        return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+        if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-                        spin_is_locked(&eb->lock);
+                assert_spin_locked(&eb->lock);
 }
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6bb0afbff92..6c4ce457168 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,11 +21,11 @@
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_lock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 void btrfs_set_lock_blocking(struct extent_buffer *eb);
 void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2cbc3..19a4daf03cc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        btrfs_start_delalloc_inodes(root);
        btrfs_wait_ordered_extents(root, 0);
-        btrfs_clean_old_snapshots(root);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
        sb->s_dirt = 0;
@@ -511,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        struct btrfs_root *root = btrfs_sb(sb);
        int ret;
+        ret = btrfs_parse_options(root, data);
+        if (ret)
+                return -EINVAL;
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 919172de5c9..4112d53d4f4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                num_bytes -= btrfs_root_used(&dirty->root->root_item);
                bytes_used = btrfs_root_used(&root->root_item);
                if (num_bytes) {
+                        mutex_lock(&root->fs_info->trans_mutex);
                        btrfs_record_root_in_trans(root);
+                        mutex_unlock(&root->fs_info->trans_mutex);
                        btrfs_set_root_used(&root->root_item,
                                            bytes_used - num_bytes);
                }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 20794290256..9c462fbd60f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2832,7 +2832,9 @@ again:
                BUG_ON(!wc.replay_dest);
                wc.replay_dest->log_root = log;
+                mutex_lock(&fs_info->trans_mutex);
                btrfs_record_root_in_trans(wc.replay_dest);
+                mutex_unlock(&fs_info->trans_mutex);
                ret = walk_log_tree(trans, log, &wc);
                BUG_ON(ret);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bcd14ebccae..1316139bf9e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                free_extent_map(em);
        }
-        map = kzalloc(sizeof(*map), GFP_NOFS);
-        if (!map)
-                return -ENOMEM;
        em = alloc_extent_map(GFP_NOFS);
        if (!em)
                return -ENOMEM;
@@ -3106,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
        if (!sb)
                return -ENOMEM;
        btrfs_set_buffer_uptodate(sb);
+        btrfs_set_buffer_lockdep_class(sb, 0);
        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/buffer.c b/fs/buffer.c
index 665d446b25b..9f697419ed8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page,
                        __inc_zone_page_state(page, NR_FILE_DIRTY);
                        __inc_bdi_stat(mapping->backing_dev_info,
                                        BDI_RECLAIMABLE);
+                        task_dirty_inc(current);
                        task_io_account_write(PAGE_CACHE_SIZE);
                }
                radix_tree_tag_set(&mapping->page_tree,
@@ -3108,7 +3109,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);
                bh->b_end_io = end_buffer_write_sync;
-                ret = submit_bh(WRITE_SYNC, bh);
+                ret = submit_bh(WRITE, bh);
                wait_on_buffer(bh);
                if (buffer_eopnotsupp(bh)) {
                        clear_buffer_eopnotsupp(bh);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 73ac7ebd1df..851388fafc7 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
+Version 1.57
+------------
+Improve support for multiple security contexts to the same server. We
+used to use the same "vcnumber" for all connections which could cause
+the server to treat subsequent connections, especially those that
+are authenticated as guest, as reconnections, invalidating the earlier
+user's smb session.  This fix allows cifs to mount multiple times to the
+same server with different userids without risking invalidating earlier
+established security contexts.
 Version 1.56
 ------------
 Add "forcemandatorylock" mount option to allow user to use mandatory
@@ -7,7 +17,10 @@ specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
 Samba servers (worked to Windows).  Fix rmdir so that pending search
 (readdir) requests do not get invalid results which include the now
-removed directory.
+removed directory.  Fix oops in cifs_dfs_ref.c when the prefixpath is not
+reachable while using DFS.  Add better file create support for servers which
+support the CIFS POSIX protocol extensions (this adds support for new flags
+on create, and improves semantics for write of locked ranges).
 Version 1.55
 ------------
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7ac481841f8..2b1d28a9ee2 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.56"
+#define CIFS_VERSION   "1.57"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94c1ca0ec95..e004f6db5fc 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -164,9 +164,12 @@ struct TCP_Server_Info {
        /* multiplexed reads or writes */
        unsigned int maxBuf;    /* maxBuf specifies the maximum */
        /* message size the server can send or receive for non-raw SMBs */
-        unsigned int maxRw;     /* maxRw specifies the maximum */
+        unsigned int max_rw;    /* maxRw specifies the maximum */
        /* message size the server can send or receive for */
        /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
+        unsigned int max_vcs;   /* maximum number of smb sessions, at least
+                                   those that can be specified uniquely with
+                                   vcnumbers */
        char sessid[4];         /* unique token id for this session */
        /* (returned on Negotiate */
        int capabilities; /* allow selective disabling of caps by smb sess */
@@ -210,6 +213,7 @@ struct cifsSesInfo {
        unsigned overrideSecFlg;  /* if non-zero override global sec flags */
        __u16 ipc_tid;          /* special tid for connection to IPC share */
        __u16 flags;
+        __u16 vcnum;
        char *serverOS;         /* name of operating system underlying server */
        char *serverNOS;        /* name of network operating system of server */
        char *serverDomain;     /* security realm of server */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 382ba629880..083dfc57c7a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -42,6 +42,7 @@ extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
 #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
 extern char *build_path_from_dentry(struct dentry *);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
@@ -91,6 +92,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern void posix_fill_in_inode(struct inode *tmp_inode,
+                                FILE_UNIX_BASIC_INFO *pData, int isNewInode);
+extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
 extern int cifs_get_inode_info(struct inode **pinode,
                        const unsigned char *search_path,
                        FILE_ALL_INFO *pfile_info,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 552642a507c..939e2f76b95 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
                server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
                                (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+                server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
                GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
                /* even though we do not use raw we might as well set this
                accurately, in case we ever find a need for it */
                if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-                        server->maxRw = 0xFF00;
+                        server->max_rw = 0xFF00;
                        server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
                } else {
-                        server->maxRw = 0;/* we do not need to use raw anyway */
+                        server->max_rw = 0;/* do not need to use raw anyway */
                        server->capabilities = CAP_MPX_MODE;
                }
                tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
@@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        /* probably no need to store and check maxvcs */
        server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
                        (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
-        server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
+        server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
        cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
        GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
        server->capabilities = le32_to_cpu(pSMBr->Capabilities);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2209be94305..da0f4ffa061 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,7 +23,6 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
-#include <linux/ipv6.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
@@ -35,6 +34,7 @@
 #include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -1379,8 +1379,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
                     server->addr.sockAddr.sin_addr.s_addr))
                        continue;
                else if (addr->ss_family == AF_INET6 &&
-                         memcmp(&server->addr.sockAddr6.sin6_addr,
+                         !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-                                &addr6->sin6_addr, sizeof(addr6->sin6_addr)))
+                                          &addr6->sin6_addr))
                        continue;
                ++server->srv_count;
@@ -2180,6 +2180,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
                           "mount option supported"));
 }
+/*
+ * Check whether full_path can be queried on the share behind tcon.
+ *
+ * The server inode number is requested first; only when that request
+ * reports -EOPNOTSUPP is a QPathInfo issued instead.  The status of the
+ * last query that ran is returned, or -ENOMEM if the scratch buffer for
+ * the QPathInfo response cannot be allocated.
+ */
+static int
+is_path_accessible(int xid, struct cifsTconInfo *tcon,
+                   struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+        int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+        FILE_ALL_INFO *info_buf;
+        __u64 srv_ino;  /* value is discarded, only the status matters */
+        int status;
+        status = CIFSGetSrvInodeNumber(xid, tcon, full_path, &srv_ino,
+                                       cifs_sb->local_nls, remap);
+        if (status != -EOPNOTSUPP)
+                return status;
+        /* inode number query not supported, fall back to QPathInfo */
+        info_buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+        if (!info_buf)
+                return -ENOMEM;
+        status = CIFSSMBQPathInfo(xid, tcon, full_path, info_buf,
+                                  0 /* not legacy */, cifs_sb->local_nls,
+                                  remap);
+        kfree(info_buf);
+        return status;
+}
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
           char *mount_data, const char *devname)
@@ -2190,6 +2217,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        struct cifsSesInfo *pSesInfo = NULL;
        struct cifsTconInfo *tcon = NULL;
        struct TCP_Server_Info *srvTcp = NULL;
+        char   *full_path;
        xid = GetXid();
@@ -2426,6 +2454,23 @@ mount_fail_check:
                cifs_sb->rsize = min(cifs_sb->rsize,
                               (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+        if (!rc && cifs_sb->prepathlen) {
+                /* build_path_to_root works only when we have a valid tcon */
+                full_path = cifs_build_path_to_root(cifs_sb);
+                if (full_path == NULL) {
+                        rc = -ENOMEM;
+                        goto mount_fail_check;
+                }
+                rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
+                if (rc) {
+                        cERROR(1, ("Path %s in not accessible: %d",
+                                                full_path, rc));
+                        kfree(full_path);
+                        goto mount_fail_check;
+                }
+                kfree(full_path);
+        }
        /* volume_info->password is freed above when existing session found
        (in which case it is not needed anymore) but when new sesion is created
        the password ptr is put in the new session structure (in which case the
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 964aad03c5a..89fb7283265 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
 *
 *   vfs operations that deal with dentries
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Copyright (C) International Business Machines  Corp., 2002,2009
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -129,6 +129,78 @@ cifs_bp_rename_retry:
        return full_path;
 }
+/*
+ * Open (and optionally create) full_path using the CIFS POSIX create call.
+ *
+ * full_path: path handed to CIFSPOSIXCreate
+ * pinode:    if non-NULL and the server returned file info, receives a new
+ *            inode filled in from the response (NULL if cifs_new_inode
+ *            fails); not written when the open fails or when the server
+ *            returned no file info
+ * sb:        superblock of the mount the file lives on
+ * mode:      create mode passed through to CIFSPOSIXCreate
+ * oflags:    open flags; only the bits tested below are mapped
+ * poplock:   in: oplock request flags, out: oplock value left by
+ *            CIFSPOSIXCreate (updated only when the open succeeds, so a
+ *            caller falling back to another open call keeps its request)
+ * pnetfid:   passed to CIFSPOSIXCreate, presumably to receive the file
+ *            handle - confirm against CIFSPOSIXCreate
+ * xid:       transaction id passed through to CIFSPOSIXCreate
+ *
+ * Returns the result code of CIFSPOSIXCreate, or -ENOMEM if the response
+ * buffer cannot be allocated.  A return of 0 with *pinode still NULL tells
+ * the caller to query the inode info itself.
+ */
+static int cifs_posix_open(char *full_path, struct inode **pinode,
+                    struct super_block *sb, int mode, int oflags,
+                    int *poplock, __u16 *pnetfid, int xid)
+{
+        int rc;
+        __u32 oplock = 0;
+        FILE_UNIX_BASIC_INFO *presp_data;
+        __u32 posix_flags = 0;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+        cFYI(1, ("posix open %s", full_path));
+        presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+        if (presp_data == NULL)
+                return -ENOMEM;
+        /* hand the caller's oplock request to the server instead of
+           whatever happened to be on the stack */
+        if (poplock)
+                oplock = *poplock;
+/* So far cifs posix extensions can only map the following flags.
+   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
+   so far we do not seem to need them, and we can treat them as local only */
+        if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
+                (FMODE_READ | FMODE_WRITE))
+                posix_flags = SMB_O_RDWR;
+        else if (oflags & FMODE_READ)
+                posix_flags = SMB_O_RDONLY;
+        else if (oflags & FMODE_WRITE)
+                posix_flags = SMB_O_WRONLY;
+        if (oflags & O_CREAT)
+                posix_flags |= SMB_O_CREAT;
+        if (oflags & O_EXCL)
+                posix_flags |= SMB_O_EXCL;
+        if (oflags & O_TRUNC)
+                posix_flags |= SMB_O_TRUNC;
+        if (oflags & O_APPEND)
+                posix_flags |= SMB_O_APPEND;
+        if (oflags & O_SYNC)
+                posix_flags |= SMB_O_SYNC;
+        if (oflags & O_DIRECTORY)
+                posix_flags |= SMB_O_DIRECTORY;
+        if (oflags & O_NOFOLLOW)
+                posix_flags |= SMB_O_NOFOLLOW;
+        if (oflags & O_DIRECT)
+                posix_flags |= SMB_O_DIRECT;
+        rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
+                        pnetfid, presp_data, &oplock, full_path,
+                        cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc)
+                goto posix_open_ret;
+        /* report the oplock state back so the caller can set up caching */
+        if (poplock)
+                *poplock = oplock;
+        if (presp_data->Type == cpu_to_le32(-1))
+                goto posix_open_ret; /* open ok, caller does qpathinfo */
+        /* get new inode and set it up */
+        if (!pinode)
+                goto posix_open_ret; /* caller does not need info */
+        *pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+        /* We do not need to close the file if new_inode fails since
+           the caller will retry qpathinfo as long as inode is null */
+        if (*pinode == NULL)
+                goto posix_open_ret;
+        posix_fill_in_inode(*pinode, presp_data, 1);
+posix_open_ret:
+        kfree(presp_data);
+        return rc;
+}
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
                              struct dentry *direntry,
                              struct inode *newinode)
@@ -150,7 +222,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        int xid;
        int create_options = CREATE_NOT_DIR;
        int oplock = 0;
-        /* BB below access is too much for the mknod to request */
+        int oflags;
+        /*
+         * BB below access is probably too much for mknod to request
+         *    but we have to do query and setpathinfo so requesting
+         *    less could fail (unless we want to request getatr and setatr
+         *    permissions (only).  At least for POSIX we do not have to
+         *    request so much.
+         */
        int desiredAccess = GENERIC_READ | GENERIC_WRITE;
        __u16 fileHandle;
        struct cifs_sb_info *cifs_sb;
@@ -174,13 +253,43 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        }
        mode &= ~current->fs->umask;
+        if (oplockEnabled)
+                oplock = REQ_OPLOCK;
-        if (nd && (nd->flags & LOOKUP_OPEN)) {
+        if (nd && (nd->flags & LOOKUP_OPEN))
-                int oflags = nd->intent.open.flags;
+                oflags = nd->intent.open.flags;
+        else
+                oflags = FMODE_READ;
+        if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+            (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+                        le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+                rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+                                     mode, oflags, &oplock, &fileHandle, xid);
+                /* EIO could indicate that (posix open) operation is not
+                   supported, despite what server claimed in capability
+                   negotation.  EREMOTE indicates DFS junction, which is not
+                   handled in posix open */
+                if ((rc == 0) && (newinode == NULL))
+                        goto cifs_create_get_file_info; /* query inode info */
+                else if (rc == 0) /* success, no need to query */
+                        goto cifs_create_set_dentry;
+                else if ((rc != -EIO) && (rc != -EREMOTE) &&
+                         (rc != -EOPNOTSUPP)) /* path not found or net err */
+                        goto cifs_create_out;
+                /* else fallthrough to retry, using older open call, this is
+                   case where server does not support this SMB level, and
+                   falsely claims capability (also get here for DFS case
+                   which should be rare for path not covered on files) */
+        }
+        if (nd && (nd->flags & LOOKUP_OPEN)) {
+                /* if the file is going to stay open, then we
+                   need to set the desired access properly */
                desiredAccess = 0;
                if (oflags & FMODE_READ)
-                        desiredAccess |= GENERIC_READ;
+                        desiredAccess |= GENERIC_READ; /* is this too little? */
                if (oflags & FMODE_WRITE) {
                        desiredAccess |= GENERIC_WRITE;
                        if (!(oflags & FMODE_READ))
@@ -199,8 +308,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        /* BB add processing to set equivalent of mode - e.g. via CreateX with
           ACLs */
-        if (oplockEnabled)
-                oplock = REQ_OPLOCK;
        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
        if (buf == NULL) {
@@ -233,116 +340,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        }
        if (rc) {
                cFYI(1, ("cifs_create returned 0x%x", rc));
-        } else {
+                goto cifs_create_out;
-                /* If Open reported that we actually created a file
+        }
-                then we now have to set the mode if possible */
-                if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+        /* If Open reported that we actually created a file
-                        struct cifs_unix_set_info_args args = {
+           then we now have to set the mode if possible */
+        if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+                struct cifs_unix_set_info_args args = {
                                .mode   = mode,
                                .ctime  = NO_CHANGE_64,
                                .atime  = NO_CHANGE_64,
                                .mtime  = NO_CHANGE_64,
                                .device = 0,
-                        };
+                };
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-                                args.uid = (__u64) current_fsuid();
+                        args.uid = (__u64) current_fsuid();
-                                if (inode->i_mode & S_ISGID)
+                        if (inode->i_mode & S_ISGID)
-                                        args.gid = (__u64) inode->i_gid;
+                                args.gid = (__u64) inode->i_gid;
-                                else
+                        else
-                                        args.gid = (__u64) current_fsgid();
+                                args.gid = (__u64) current_fsgid();
-                        } else {
-                                args.uid = NO_CHANGE_64;
-                                args.gid = NO_CHANGE_64;
-                        }
-                        CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
-                                cifs_sb->local_nls,
-                                cifs_sb->mnt_cifs_flags &
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                } else {
-                        /* BB implement mode setting via Windows security
+                        args.uid = NO_CHANGE_64;
-                           descriptors e.g. */
+                        args.gid = NO_CHANGE_64;
-                        /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
-                        /* Could set r/o dos attribute if mode & 0222 == 0 */
                }
+                CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+                        cifs_sb->local_nls,
+                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        } else {
+                /* BB implement mode setting via Windows security
+                   descriptors e.g. */
+                /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
-                /* server might mask mode so we have to query for it */
+                /* Could set r/o dos attribute if mode & 0222 == 0 */
-                if (tcon->unix_ext)
+        }
-                        rc = cifs_get_inode_info_unix(&newinode, full_path,
-                                                 inode->i_sb, xid);
+cifs_create_get_file_info:
-                else {
+        /* server might mask mode so we have to query for it */
-                        rc = cifs_get_inode_info(&newinode, full_path,
+        if (tcon->unix_ext)
-                                                 buf, inode->i_sb, xid,
+                rc = cifs_get_inode_info_unix(&newinode, full_path,
-                                                 &fileHandle);
+                                              inode->i_sb, xid);
-                        if (newinode) {
+        else {
-                                if (cifs_sb->mnt_cifs_flags &
+                rc = cifs_get_inode_info(&newinode, full_path, buf,
-                                    CIFS_MOUNT_DYNPERM)
+                                         inode->i_sb, xid, &fileHandle);
-                                        newinode->i_mode = mode;
+                if (newinode) {
-                                if ((oplock & CIFS_CREATE_ACTION) &&
+                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
-                                    (cifs_sb->mnt_cifs_flags &
+                                newinode->i_mode = mode;
-                                     CIFS_MOUNT_SET_UID)) {
+                        if ((oplock & CIFS_CREATE_ACTION) &&
-                                        newinode->i_uid = current_fsuid();
+                            (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
-                                        if (inode->i_mode & S_ISGID)
+                                newinode->i_uid = current_fsuid();
-                                                newinode->i_gid =
+                                if (inode->i_mode & S_ISGID)
-                                                        inode->i_gid;
+                                        newinode->i_gid = inode->i_gid;
-                                        else
+                                else
-                                                newinode->i_gid =
+                                        newinode->i_gid = current_fsgid();
-                                                        current_fsgid();
-                                }
                        }
                }
+        }
-                if (rc != 0) {
+cifs_create_set_dentry:
-                        cFYI(1, ("Create worked, get_inode_info failed rc = %d",
+        if (rc == 0)
-                                 rc));
+                setup_cifs_dentry(tcon, direntry, newinode);
-                } else
+        else
-                        setup_cifs_dentry(tcon, direntry, newinode);
+                cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
-                if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
+        /* nfsd case - nfs srv does not set nd */
-                        (!(nd->flags & LOOKUP_OPEN))) {
+        if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
-                        /* mknod case - do not leave file open */
+                /* mknod case - do not leave file open */
-                        CIFSSMBClose(xid, tcon, fileHandle);
+                CIFSSMBClose(xid, tcon, fileHandle);
-                } else if (newinode) {
+        } else if (newinode) {
-                        struct cifsFileInfo *pCifsFile =
+                struct cifsFileInfo *pCifsFile =
-                           kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+                        kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-                        if (pCifsFile == NULL)
+                if (pCifsFile == NULL)
-                                goto cifs_create_out;
+                        goto cifs_create_out;
-                        pCifsFile->netfid = fileHandle;
+                pCifsFile->netfid = fileHandle;
-                        pCifsFile->pid = current->tgid;
+                pCifsFile->pid = current->tgid;
-                        pCifsFile->pInode = newinode;
+                pCifsFile->pInode = newinode;
-                        pCifsFile->invalidHandle = false;
+                pCifsFile->invalidHandle = false;
-                        pCifsFile->closePend     = false;
+                pCifsFile->closePend     = false;
-                        init_MUTEX(&pCifsFile->fh_sem);
+                init_MUTEX(&pCifsFile->fh_sem);
-                        mutex_init(&pCifsFile->lock_mutex);
+                mutex_init(&pCifsFile->lock_mutex);
-                        INIT_LIST_HEAD(&pCifsFile->llist);
+                INIT_LIST_HEAD(&pCifsFile->llist);
-                        atomic_set(&pCifsFile->wrtPending, 0);
+                atomic_set(&pCifsFile->wrtPending, 0);
-                        /* set the following in open now
+                /* set the following in open now
                                pCifsFile->pfile = file; */
-                        write_lock(&GlobalSMBSeslock);
+                write_lock(&GlobalSMBSeslock);
-                        list_add(&pCifsFile->tlist, &tcon->openFileList);
+                list_add(&pCifsFile->tlist, &tcon->openFileList);
-                        pCifsInode = CIFS_I(newinode);
+                pCifsInode = CIFS_I(newinode);
-                        if (pCifsInode) {
+                if (pCifsInode) {
-                                /* if readable file instance put first in list*/
+                        /* if readable file instance put first in list*/
-                                if (write_only) {
+                        if (write_only) {
-                                        list_add_tail(&pCifsFile->flist,
+                                list_add_tail(&pCifsFile->flist,
-                                                &pCifsInode->openFileList);
+                                              &pCifsInode->openFileList);
-                                } else {
+                        } else {
-                                        list_add(&pCifsFile->flist,
+                                list_add(&pCifsFile->flist,
-                                                &pCifsInode->openFileList);
+                                         &pCifsInode->openFileList);
-                                }
-                                if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-                                        pCifsInode->clientCanCacheAll = true;
-                                        pCifsInode->clientCanCacheRead = true;
-                                        cFYI(1, ("Exclusive Oplock inode %p",
-                                                newinode));
-                                } else if ((oplock & 0xF) == OPLOCK_READ)
-                                        pCifsInode->clientCanCacheRead = true;
                        }
-                        write_unlock(&GlobalSMBSeslock);
+                        if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+                                pCifsInode->clientCanCacheAll = true;
+                                pCifsInode->clientCanCacheRead = true;
+                                cFYI(1, ("Exclusive Oplock inode %p",
+                                        newinode));
+                        } else if ((oplock & 0xF) == OPLOCK_READ)
+                                pCifsInode->clientCanCacheRead = true;
                }
+                write_unlock(&GlobalSMBSeslock);
        }
 cifs_create_out:
        kfree(buf);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bcf7b518466..4690a360c85 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
        pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
 }
+/**
+ * cifs_new_inode - allocate a new inode, set it up for CIFS and hash it
+ * @sb: superblock the inode is allocated on
+ * @inum: optional server inode number; used as i_ino only when the pointer
+ *        is non-NULL and the mount has CIFS_MOUNT_SERVER_INUM set
+ *
+ * Returns the new, hashed inode, or NULL if new_inode() could not allocate
+ * one.
+ *
+ * When @inum is NULL or the share is not mounted with "serverino", the
+ * inode number assigned by new_inode() is kept.  That number is not
+ * guaranteed to be unique, so i_ino collisions are possible in that case.
+ */
+struct inode *
+cifs_new_inode(struct super_block *sb, __u64 *inum)
+{
+        struct inode *new_ino = new_inode(sb);
+        if (!new_ino)
+                return NULL;
+        /*
+         * BB: Open questions - is an i_ino of 0 legal?  It is assumed to be
+         *     here; if not, callers must stop passing inum as a pointer.
+         *     Can we sanity check that the server really filled the field
+         *     in?  Should iunique() be used when serverino is disabled?
+         */
+        if (inum != NULL) {
+                struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+                        new_ino->i_ino = (unsigned long) *inum;
+        }
+        /*
+         * The noatime flags have to be applied here rather than in
+         * cifs_alloc_inode because the VFS will clobber i_flags.
+         */
+        if (sb->s_flags & MS_NOATIME)
+                new_ino->i_flags |= S_NOATIME | S_NOCMTIME;
+        insert_inode_hash(new_ino);
+        return new_ino;
+}
 int cifs_get_inode_info_unix(struct inode **pinode,
        const unsigned char *full_path, struct super_block *sb, int xid)
 {
@@ -233,22 +276,11 @@ int cifs_get_inode_info_unix(struct inode **pinode,
        /* get new inode */
        if (*pinode == NULL) {
-                *pinode = new_inode(sb);
+                *pinode = cifs_new_inode(sb, &find_data.UniqueId);
                if (*pinode == NULL) {
                        rc = -ENOMEM;
                        goto cgiiu_exit;
                }
-                /* Is an i_ino of zero legal? */
-                /* note ino incremented to unique num in new_inode */
-                /* Are there sanity checks we can use to ensure that
-                   the server is really filling in that field? */
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-                        (*pinode)->i_ino = (unsigned long)find_data.UniqueId;
-                if (sb->s_flags & MS_NOATIME)
-                        (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-                insert_inode_hash(*pinode);
        }
        inode = *pinode;
@@ -465,11 +497,9 @@ int cifs_get_inode_info(struct inode **pinode,
        /* get new inode */
        if (*pinode == NULL) {
-                *pinode = new_inode(sb);
+                __u64 inode_num;
-                if (*pinode == NULL) {
+                __u64 *pinum = &inode_num;
-                        rc = -ENOMEM;
-                        goto cgii_exit;
-                }
                /* Is an i_ino of zero legal? Can we use that to check
                   if the server supports returning inode numbers?  Are
                   there other sanity checks we can use to ensure that
@@ -486,22 +516,26 @@ int cifs_get_inode_info(struct inode **pinode,
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
                        int rc1 = 0;
-                        __u64 inode_num;
                        rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
-                                        full_path, &inode_num,
+                                        full_path, pinum,
                                        cifs_sb->local_nls,
                                        cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                        if (rc1) {
                                cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+                                pinum = NULL;
                                /* BB EOPNOSUPP disable SERVER_INUM? */
-                        } else /* do we need cast or hash to ino? */
+                        }
-                                (*pinode)->i_ino = inode_num;
+                } else {
-                } /* else ino incremented to unique num in new_inode*/
+                        pinum = NULL;
-                if (sb->s_flags & MS_NOATIME)
+                }
-                        (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-                insert_inode_hash(*pinode);
+                *pinode = cifs_new_inode(sb, pinum);
+                if (*pinode == NULL) {
+                        rc = -ENOMEM;
+                        goto cgii_exit;
+                }
        }
        inode = *pinode;
        cifsInfo = CIFS_I(inode);
@@ -621,7 +655,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
        .lookup = cifs_lookup,
 };
-static char *build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 {
        int pplen = cifs_sb->prepathlen;
        int dfsplen;
@@ -678,7 +712,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
                return inode;
        cifs_sb = CIFS_SB(inode->i_sb);
-        full_path = build_path_to_root(cifs_sb);
+        full_path = cifs_build_path_to_root(cifs_sb);
        if (full_path == NULL)
                return ERR_PTR(-ENOMEM);
@@ -1017,7 +1051,7 @@ out_reval:
        return rc;
 }
-static void posix_fill_in_inode(struct inode *tmp_inode,
+void posix_fill_in_inode(struct inode *tmp_inode,
        FILE_UNIX_BASIC_INFO *pData, int isNewInode)
 {
        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
@@ -1114,24 +1148,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
                        else
                                direntry->d_op = &cifs_dentry_ops;
-                        newinode = new_inode(inode->i_sb);
+                        newinode = cifs_new_inode(inode->i_sb,
+                                                  &pInfo->UniqueId);
                        if (newinode == NULL) {
                                kfree(pInfo);
                                goto mkdir_get_info;
                        }
-                        /* Is an i_ino of zero legal? */
-                        /* Are there sanity checks we can use to ensure that
-                           the server is really filling in that field? */
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-                                newinode->i_ino =
-                                        (unsigned long)pInfo->UniqueId;
-                        } /* note ino incremented to unique num in new_inode */
-                        if (inode->i_sb->s_flags & MS_NOATIME)
-                                newinode->i_flags |= S_NOATIME | S_NOCMTIME;
                        newinode->i_nlink = 2;
-                        insert_inode_hash(newinode);
                        d_instantiate(direntry, newinode);
                        /* we already checked in POSIXCreate whether
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9f51f9bf029..c2c01ff4c32 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 }
 #endif /* DEBUG2 */
-/* Returns one if new inode created (which therefore needs to be hashed) */
+/* Returns 1 if new inode created, 2 if both dentry and inode were */
 /* Might check in the future if inode number changed so we can rehash inode */
-static int construct_dentry(struct qstr *qstring, struct file *file,
+static int
-        struct inode **ptmp_inode, struct dentry **pnew_dentry)
+construct_dentry(struct qstr *qstring, struct file *file,
+                 struct inode **ptmp_inode, struct dentry **pnew_dentry,
+                 __u64 *inum)
 {
-        struct dentry *tmp_dentry;
+        struct dentry *tmp_dentry = NULL;
-        struct cifs_sb_info *cifs_sb;
+        struct super_block *sb = file->f_path.dentry->d_sb;
-        struct cifsTconInfo *pTcon;
        int rc = 0;
        cFYI(1, ("For %s", qstring->name));
-        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
        qstring->hash = full_name_hash(qstring->name, qstring->len);
        tmp_dentry = d_lookup(file->f_path.dentry, qstring);
        if (tmp_dentry) {
+                /* BB: overwrite old name? i.e. tmp_dentry->d_name and
+                 * tmp_dentry->d_name.len??
+                 */
                cFYI(0, ("existing dentry with inode 0x%p",
                         tmp_dentry->d_inode));
                *ptmp_inode = tmp_dentry->d_inode;
-/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
                if (*ptmp_inode == NULL) {
-                        *ptmp_inode = new_inode(file->f_path.dentry->d_sb);
+                        *ptmp_inode = cifs_new_inode(sb, inum);
                        if (*ptmp_inode == NULL)
                                return rc;
                        rc = 1;
                }
-                if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-                        (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
        } else {
                tmp_dentry = d_alloc(file->f_path.dentry, qstring);
                if (tmp_dentry == NULL) {
@@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
                        return rc;
                }
-                *ptmp_inode = new_inode(file->f_path.dentry->d_sb);
+                if (CIFS_SB(sb)->tcon->nocase)
-                if (pTcon->nocase)
                        tmp_dentry->d_op = &cifs_ci_dentry_ops;
                else
                        tmp_dentry->d_op = &cifs_dentry_ops;
+                *ptmp_inode = cifs_new_inode(sb, inum);
                if (*ptmp_inode == NULL)
                        return rc;
-                if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-                        (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
                rc = 2;
        }
@@ -822,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 /* inode num, inode type and filename returned */
 static int cifs_get_name_from_search_buf(struct qstr *pqst,
        char *current_entry, __u16 level, unsigned int unicode,
-        struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum)
+        struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum)
 {
        int rc = 0;
        unsigned int len = 0;
@@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
                        len = strnlen(filename, PATH_MAX);
                }
-                /* BB fixme - hash low and high 32 bits if not 64 bit arch BB */
+                *pinum = pFindData->UniqueId;
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-                        *pinum = pFindData->UniqueId;
        } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
                FILE_DIRECTORY_INFO *pFindData =
                        (FILE_DIRECTORY_INFO *)current_entry;
@@ -907,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
        struct qstr qstring;
        struct cifsFileInfo *pCifsF;
        unsigned int obj_type;
-        ino_t  inum;
+        __u64  inum;
        struct cifs_sb_info *cifs_sb;
        struct inode *tmp_inode;
        struct dentry *tmp_dentry;
@@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
        if (rc)
                return rc;
-        rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry);
+        /* only these two infolevels return valid inode numbers */
+        if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
+            pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+                rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+                                        &inum);
+        else
+                rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+                                        NULL);
        if ((tmp_inode == NULL) || (tmp_dentry == NULL))
                return -ENOMEM;
-        if (rc) {
-                /* inode created, we need to hash it with right inode number */
-                if (inum != 0) {
-                        /* BB fixme - hash the 2 32 quantities bits together if
-                         *  necessary BB */
-                        tmp_inode->i_ino = inum;
-                }
-                insert_inode_hash(tmp_inode);
-        }
        /* we pass in rc below, indicating whether it is a new inode,
           so we can figure out whether to invalidate the inode cached
           data if the file has changed */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f22de7b79a..5c68b4282be 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -34,15 +34,99 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
                         unsigned char *p24);
+/*
+ * Tell whether ses is the first smb session being reconnected after the
+ * socket was reestablished (this decides whether vc 0 gets used).
+ * The caller holds cifs_tcp_ses_lock, so nothing here may block.
+ */
+static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
+{
+        struct cifsSesInfo *cur;
+        list_for_each_entry(cur, &ses->server->smb_ses_list, smb_ses_list) {
+                /* some session is already connected, so we are not first */
+                if (!cur->need_reconnect)
+                        return false;
+        }
+        /* every session on this server is still waiting to reconnect,
+           so this one must be the first */
+        return true;
+}
+/*
+ *      vc number 0 is treated specially by some servers, and should be the
+ *      first one we request.  After that we can use vcnumbers up to maxvcs,
+ *      one for each smb session (some Windows versions set maxvcs incorrectly
+ *      so maxvc=1 can be ignored).  If we have too many vcs, we can reuse
+ *      any vc but zero (some servers reset the connection on vcnum zero).
+ *      Returns the chosen vc number converted to little-endian wire order.
+ */
+static __le16 get_next_vcnum(struct cifsSesInfo *ses)
+{
+        __u16 vcnum = 0;
+        struct list_head *tmp;
+        struct cifsSesInfo *tmp_ses;
+        __u16 max_vcs = ses->server->max_vcs;
+        __u16 i;
+        int free_vc_found = 0;
+        /* Quoting the MS-SMB specification: "Windows-based SMB servers set this
+        field to one but do not enforce this limit, which allows an SMB client
+        to establish more virtual circuits than allowed by this value ... but
+        other server implementations can enforce this limit." */
+        if (max_vcs < 2)
+                max_vcs = 0xFFFF;
+        write_lock(&cifs_tcp_ses_lock);
+        if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
+                        goto get_vc_num_exit;  /* vcnum will be zero */
+        for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
+                if (i == 0) /* this is the only connection, use vc 0 */
+                        break;
+                free_vc_found = 1;
+                list_for_each(tmp, &ses->server->smb_ses_list) {
+                        tmp_ses = list_entry(tmp, struct cifsSesInfo,
+                                             smb_ses_list);
+                        if (tmp_ses->vcnum == i) {
+                                free_vc_found = 0;
+                                break; /* found duplicate, try next vcnum */
+                        }
+                }
+                if (free_vc_found)
+                        break; /* we found a vcnumber that will work - use it */
+        }
+        if (i == 0)
+                vcnum = 0; /* for most common case, ie if one smb session, use
+                              vc zero (some clients only ever send zero).  The
+                              no-free-vcnum case is handled just below */
+        else if (free_vc_found == 0)
+                vcnum = 1;  /* we can not reuse vc=0 safely, since some servers
+                                reset all uids on that, but 1 is ok. */
+        else
+                vcnum = i;
+        ses->vcnum = vcnum;
+get_vc_num_exit:
+        write_unlock(&cifs_tcp_ses_lock);
+        return cpu_to_le16(vcnum); /* host order -> little-endian wire order */
+}
 static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 {
        __u32 capabilities = 0;
        /* init fields common to all four types of SessSetup */
-        /* note that header is initialized to zero in header_assemble */
+        /* Note that offsets for first seven fields in req struct are same  */
+        /*      in CIFS Specs so does not matter which of 3 forms of struct */
+        /*      that we use in next few lines                               */
+        /* Note that header is initialized to zero in header_assemble */
        pSMB->req.AndXCommand = 0xFF;
        pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
        pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+        pSMB->req.VcNumber = get_next_vcnum(ses);
        /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
        if (ses->capabilities & CAP_UNIX)
                capabilities |= CAP_UNIX;
-        /* BB check whether to init vcnum BB */
        return capabilities;
 }
@@ -228,7 +311,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
        kfree(ses->serverOS);
        /* UTF-8 string will not grow more than four times as big as UCS-16 */
-        ses->serverOS = kzalloc(4 * len, GFP_KERNEL);
+        ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
        if (ses->serverOS != NULL)
                cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
        data += 2 * (len + 1);
@@ -241,7 +324,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
                return rc;
        kfree(ses->serverNOS);
-        ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */
+        ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
        if (ses->serverNOS != NULL) {
                cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
                                   nls_cp);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9c6d815dd19..45e59d3c7f1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1913,6 +1913,9 @@ COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
+/* 'X' - originally XFS but some now in the VFS */
+COMPATIBLE_IOCTL(FIFREEZE)
+COMPATIBLE_IOCTL(FITHAW)
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
 COMPATIBLE_IOCTL(GET_ARRAY_INFO)
@@ -1938,6 +1941,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
 /* Big K */
 COMPATIBLE_IOCTL(PIO_FONT)
 COMPATIBLE_IOCTL(GIO_FONT)
+COMPATIBLE_IOCTL(PIO_CMAP)
+COMPATIBLE_IOCTL(GIO_CMAP)
 ULONG_IOCTL(KDSIGACCEPT)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
diff --git a/fs/dcache.c b/fs/dcache.c
index 937df0fb0da..07e2d4a44bd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
        iput(inode);
        return res;
 }
-EXPORT_SYMBOL_GPL(d_obtain_alias);
+EXPORT_SYMBOL(d_obtain_alias);
 /**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5f3231b9633..bff4052b05e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb)
        fsi->ptmx_dentry = dentry;
        rc = 0;
-        printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
-                        inode->i_ino);
 out:
        mutex_unlock(&root->d_inode->i_mutex);
        return rc;
@@ -369,8 +366,6 @@ static int new_pts_mount(struct file_system_type *fs_type, int flags,
        struct pts_fs_info *fsi;
        struct pts_mount_opts *opts;
-        printk(KERN_NOTICE "devpts: newinstance mount\n");
        err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
        if (err)
                return err;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f6caeb1d110..bdca1f4b3a3 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
        list_for_each_entry(global_auth_tok,
                            &mount_crypt_stat->global_auth_tok_list,
                            mount_crypt_stat_list) {
+                if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK)
+                        continue;
                rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
                if (rc) {
                        printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c11fc95714a..eb2267eca1f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -328,6 +328,7 @@ struct ecryptfs_dentry_info {
 */
 struct ecryptfs_global_auth_tok {
 #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001
+#define ECRYPTFS_AUTH_TOK_FNEK    0x00000002
        u32 flags;
        struct list_head mount_crypt_stat_list;
        struct key *global_auth_tok_key;
@@ -696,7 +697,7 @@ ecryptfs_write_header_metadata(char *virt,
 int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig);
 int
 ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
-                           char *sig);
+                           char *sig, u32 global_auth_tok_flags);
 int ecryptfs_get_global_auth_tok_for_sig(
        struct ecryptfs_global_auth_tok **global_auth_tok,
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ff539420cc6..e4a6223c314 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache;
 int
 ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
-                             char *sig)
+                             char *sig, u32 global_auth_tok_flags)
 {
        struct ecryptfs_global_auth_tok *new_auth_tok;
        int rc = 0;
@@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
                goto out;
        }
        memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
+        new_auth_tok->flags = global_auth_tok_flags;
        new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
        mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
        list_add(&new_auth_tok->mount_crypt_stat_list,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 789cf2e1be1..aed56c25539 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
                case ecryptfs_opt_ecryptfs_sig:
                        sig_src = args[0].from;
                        rc = ecryptfs_add_global_auth_tok(mount_crypt_stat,
-                                                          sig_src);
+                                                          sig_src, 0);
                        if (rc) {
                                printk(KERN_ERR "Error attempting to register "
                                       "global sig; rc = [%d]\n", rc);
@@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
                                ECRYPTFS_SIG_SIZE_HEX] = '\0';
                        rc = ecryptfs_add_global_auth_tok(
                                mount_crypt_stat,
-                                mount_crypt_stat->global_default_fnek_sig);
+                                mount_crypt_stat->global_default_fnek_sig,
+                                ECRYPTFS_AUTH_TOK_FNEK);
                        if (rc) {
                                printk(KERN_ERR "Error attempting to register "
                                       "global fnek sig [%s]; rc = [%d]\n",
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdeaa2e6..7c6e3606f0e 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
        es = sbi->s_es;
        if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
            (old_mount_opt & EXT2_MOUNT_XIP)) &&
-            invalidate_inodes(sb))
+            invalidate_inodes(sb)) {
-                ext2_warning(sb, __func__, "busy inodes while remounting "\
+                ext2_warning(sb, __func__, "refusing change of xip flag "
-                             "xip remain in cache (no functional problem)");
+                             "with busy inodes while remounting");
+                sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
+                sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
+        }
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
        if (*flags & MS_RDONLY) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b70d90e08a3..4a970411a45 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb)
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
-        sb->s_dirt = 0;
+        tid_t target;   /* presumably set by journal_start_commit - confirm */
-        if (wait)
-                ext3_force_commit(sb);
-        else
-                journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
+        sb->s_dirt = 0;
+        if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
+                if (wait)       /* block on target only if the caller asked */
+                        log_wait_commit(EXT3_SB(sb)->s_journal, target);
+        }
        return 0;
 }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9a50b8052dc..de9459b4cb9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-        if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
+        if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+            (*retries)++ > 3 ||
+            !EXT4_SB(sb)->s_journal)
                return 0;
        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eba1c2..b0c87dce66a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
 {
        unsigned len = le16_to_cpu(dlen);
-        if (len == EXT4_MAX_REC_LEN)
+        if (len == EXT4_MAX_REC_LEN || len == 0)
                return 1 << 16;
        return len;
 }
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2eab196875..e0aa4fe4f59 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
        struct ext4_extent_idx *ix;
        struct ext4_extent *ex;
        ext4_fsblk_t block;
-        int depth, ee_len;
+        int depth;      /* Note, NOT eh_depth; depth from top of tree */
+        int ee_len;
        BUG_ON(path == NULL);
        depth = path->p_depth;
@@ -1179,7 +1180,8 @@ got_index:
                if (bh == NULL)
                        return -EIO;
                eh = ext_block_hdr(bh);
-                if (ext4_ext_check_header(inode, eh, depth)) {
+                /* subtract from p_depth to get proper eh_depth */
+                if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
                        put_bh(bh);
                        return -EIO;
                }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4fb86a0061d..2d2b3585ee9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
-        int fatal = 0, err, count;
+        int fatal = 0, err, count, cleared;
        ext4_group_t flex_group;
        if (atomic_read(&inode->i_count) > 1) {
@@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                goto error_return;
        /* Ok, now we can actually update the inode bitmaps.. */
-        if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+        spin_lock(sb_bgl_lock(sbi, block_group));
-                                        bit, bitmap_bh->b_data))
+        cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+        spin_unlock(sb_bgl_lock(sbi, block_group));
+        if (!cleared)
                ext4_error(sb, "ext4_free_inode",
                           "bit already cleared for inode %lu", ino);
        else {
@@ -696,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        struct inode *ret;
        ext4_group_t i;
        int free = 0;
+        static int once = 1;
        ext4_group_t flex_group;
        /* Cannot create files in a deleted directory */
@@ -715,6 +718,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        if (sbi->s_log_groups_per_flex) {
                ret2 = find_group_flex(sb, dir, &group);
+                if (ret2 == -1) {
+                        ret2 = find_group_other(sb, dir, &group);
+                        /* report a successful fallback the first time only */
+                        if (ret2 == 0 && once) {
+                                once = 0;
+                                printk(KERN_NOTICE "ext4: find_group_flex "
+                                       "failed, fallback succeeded dir %lu\n",
+                                       dir->i_ino);
+                        }
+                }
                goto got_group;
        }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20be132..c7fed5b1874 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
-        return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+        return jbd2_journal_begin_ordered_truncate(
-                                                   new_size);
+                                        EXT4_SB(inode->i_sb)->s_journal,
+                                        &EXT4_I(inode)->jinode,
+                                        new_size); /* jbd2 result returned */
 }
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -1366,6 +1368,10 @@ retry:
                goto out;
        }
+        /* We cannot recurse into the filesystem as the transaction is already
+         * started */
+        flags |= AOP_FLAG_NOFS;
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
                ext4_journal_stop(handle);
@@ -1375,7 +1381,7 @@ retry:
        *pagep = page;
        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-                                                        ext4_get_block);
+                                ext4_get_block);
        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
@@ -2437,6 +2443,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        int no_nrwrite_index_update;
        int pages_written = 0;
        long pages_skipped;
+        int range_cyclic, cycled = 1, io_done = 0;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2488,9 +2495,15 @@ static int ext4_da_writepages(struct address_space *mapping,
        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;
-        if (wbc->range_cyclic)
+        range_cyclic = wbc->range_cyclic;
+        if (wbc->range_cyclic) {
                index = mapping->writeback_index;
-        else
+                if (index)
+                        cycled = 0;
+                wbc->range_start = index << PAGE_CACHE_SHIFT;
+                wbc->range_end  = LLONG_MAX;
+                wbc->range_cyclic = 0;
+        } else
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
        mpd.wbc = wbc;
@@ -2504,6 +2517,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        wbc->no_nrwrite_index_update = 1;
        pages_skipped = wbc->pages_skipped;
+retry:
        while (!ret && wbc->nr_to_write > 0) {
                /*
@@ -2530,7 +2544,7 @@ static int ext4_da_writepages(struct address_space *mapping,
                ext4_journal_stop(handle);
-                if (mpd.retval == -ENOSPC) {
+                if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
                        /* commit the transaction which would
                         * free blocks released in the transaction
                         * and try again
@@ -2546,6 +2560,7 @@ static int ext4_da_writepages(struct address_space *mapping,
                        pages_written += mpd.pages_written;
                        wbc->pages_skipped = pages_skipped;
                        ret = 0;
+                        io_done = 1;
                } else if (wbc->nr_to_write)
                        /*
                         * There is no more writeout needed
@@ -2554,6 +2569,13 @@ static int ext4_da_writepages(struct address_space *mapping,
                         */
                        break;
        }
+        if (!io_done && !cycled) {
+                cycled = 1;
+                index = 0;
+                wbc->range_start = index << PAGE_CACHE_SHIFT;
+                wbc->range_end  = mapping->writeback_index - 1;
+                goto retry;
+        }
        if (pages_skipped != wbc->pages_skipped)
                printk(KERN_EMERG "This should not happen leaving %s "
                                "with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2583,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        /* Update index */
        index += pages_written;
+        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
@@ -2648,6 +2671,9 @@ retry:
                ret = PTR_ERR(handle);
                goto out;
        }
+        /* We cannot recurse into the filesystem as the transaction is already
+         * started */
+        flags |= AOP_FLAG_NOFS;
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f6cbe..9f61e62f435 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
        struct ext4_free_extent *gex = &ac->ac_g_ex;
        BUG_ON(ex->fe_len <= 0);
-        BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+        BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
        BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
@@ -3292,7 +3292,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        }
        BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
                        start > ac->ac_o_ex.fe_logical);
-        BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
        /* now prepare goal request */
@@ -3589,6 +3589,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                        struct super_block *sb, struct ext4_prealloc_space *pa)
 {
        ext4_group_t grp;
+        ext4_fsblk_t grp_blk;
        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
                return;
@@ -3603,8 +3604,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
        pa->pa_deleted = 1;
        spin_unlock(&pa->pa_lock);
-        /* -1 is to protect from crossing allocation group */
+        grp_blk = pa->pa_pstart;
-        ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
+        /* If linear, pa_pstart may be in the next group when pa is used up */
+        if (pa->pa_linear)
+                grp_blk--;
+        ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
        /*
         * possible race:
@@ -3693,6 +3698,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        pa->pa_free = pa->pa_len;
        atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
+        INIT_LIST_HEAD(&pa->pa_inode_list);
+        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
        pa->pa_linear = 0;
@@ -3755,6 +3762,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
+        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
        pa->pa_linear = 1;
@@ -4476,23 +4484,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                        pa->pa_free -= ac->ac_b_ex.fe_len;
                        pa->pa_len -= ac->ac_b_ex.fe_len;
                        spin_unlock(&pa->pa_lock);
-                        /*
-                         * We want to add the pa to the right bucket.
-                         * Remove it from the list and while adding
-                         * make sure the list to which we are adding
-                         * doesn't grow big.
-                         */
-                        if (likely(pa->pa_free)) {
-                                spin_lock(pa->pa_obj_lock);
-                                list_del_rcu(&pa->pa_inode_list);
-                                spin_unlock(pa->pa_obj_lock);
-                                ext4_mb_add_n_trim(ac);
-                        }
                }
-                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
        if (ac->alloc_semp)
                up_read(ac->alloc_semp);
+        if (pa) {
+                /*
+                 * We want to add the pa to the right bucket.
+                 * Remove it from the list and while adding
+                 * make sure the list to which we are adding
+                 * doesn't grow big.  We need to release
+                 * alloc_semp before calling ext4_mb_add_n_trim()
+                 */
+                if (pa->pa_linear && likely(pa->pa_free)) {
+                        spin_lock(pa->pa_obj_lock);
+                        list_del_rcu(&pa->pa_inode_list);
+                        spin_unlock(pa->pa_obj_lock);
+                        ext4_mb_add_n_trim(ac);
+                }
+                ext4_mb_put_pa(ac, ac->ac_sb, pa);
+        }
        if (ac->ac_bitmap_page)
                page_cache_release(ac->ac_bitmap_page);
        if (ac->ac_buddy_page)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca25e3..fe64d9f7985 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
                                        + 1);
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
-                goto err_out;
+                return retval;
        }
        tmp_inode = ext4_new_inode(handle,
                                inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
        if (IS_ERR(tmp_inode)) {
                retval = -ENOMEM;
                ext4_journal_stop(handle);
-                tmp_inode = NULL;
+                return retval;
-                goto err_out;
        }
        i_size_write(tmp_inode, i_size_read(inode));
        /*
@@ -618,8 +617,7 @@ err_out:
        ext4_journal_stop(handle);
-        if (tmp_inode)
+        iput(tmp_inode);
-                iput(tmp_inode);
        return retval;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045..39d1993cfa1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb)
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
        int ret = 0;
+        tid_t target;
        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
        sb->s_dirt = 0;
        if (EXT4_SB(sb)->s_journal) {
-                if (wait)
+                if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
-                        ret = ext4_force_commit(sb);
+                                              &target)) {
-                else
+                        if (wait)
-                        jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+                                jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
+                                                     target);
+                }
        } else {
                ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
        }
@@ -3088,7 +3091,6 @@ static int ext4_freeze(struct super_block *sb)
                /* Journal blocked and flushed, clear needs_recovery flag. */
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-                ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
                error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
                if (error)
                        goto out;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6b74d09adbe..de0004fe6e0 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
        sector_t blocknr;
        /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
-        mutex_lock(&mapping->host->i_mutex);
+        down_read(&mapping->host->i_alloc_sem);
        blocknr = generic_block_bmap(mapping, block, fat_get_block);
-        mutex_unlock(&mapping->host->i_mutex);
+        up_read(&mapping->host->i_alloc_sem);
        return blocknr;
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62fd17..e3fe9918faa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
        int ret;
        BUG_ON(inode->i_state & I_SYNC);
+        WARN_ON(inode->i_state & I_NEW);
        /* Set I_SYNC, reset I_DIRTY */
        dirty = inode->i_state & I_DIRTY;
@@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
        }
        spin_lock(&inode_lock);
+        WARN_ON(inode->i_state & I_NEW);
        inode->i_state &= ~I_SYNC;
        if (!(inode->i_state & I_FREEING)) {
                if (!(inode->i_state & I_DIRTY) &&
@@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
                        break;
                }
+                if (inode->i_state & I_NEW) {
+                        requeue_io(inode);
+                        continue;
+                }
                if (wbc->nonblocking && bdi_write_congested(bdi)) {
                        wbc->encountered_congestion = 1;
                        if (!sb_is_blkdev_sb(sb))
@@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
                list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                        struct address_space *mapping;
-                        if (inode->i_state & (I_FREEING|I_WILL_FREE))
+                        if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
                                continue;
                        mapping = inode->i_mapping;
                        if (mapping->nrpages == 0)
diff --git a/fs/inode.c b/fs/inode.c
index 913ab2d9a5d..826fb0b9d1c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -359,6 +359,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
                invalidate_inode_buffers(inode);
                if (!atomic_read(&inode->i_count)) {
                        list_move(&inode->i_list, dispose);
+                        WARN_ON(inode->i_state & I_NEW);
                        inode->i_state |= I_FREEING;
                        count++;
                        continue;
@@ -460,6 +461,7 @@ static void prune_icache(int nr_to_scan)
                                continue;
                }
                list_move(&inode->i_list, &freeable);
+                WARN_ON(inode->i_state & I_NEW);
                inode->i_state |= I_FREEING;
                nr_pruned++;
        }
@@ -656,6 +658,7 @@ void unlock_new_inode(struct inode *inode)
         * just created it (so there can be no old holders
         * that haven't tested I_LOCK).
         */
+        WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
        inode->i_state &= ~(I_LOCK|I_NEW);
        wake_up_inode(inode);
 }
@@ -1145,6 +1148,7 @@ void generic_delete_inode(struct inode *inode)
        list_del_init(&inode->i_list);
        list_del_init(&inode->i_sb_list);
+        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        inodes_stat.nr_inodes--;
        spin_unlock(&inode_lock);
@@ -1186,16 +1190,19 @@ static void generic_forget_inode(struct inode *inode)
                        spin_unlock(&inode_lock);
                        return;
                }
+                WARN_ON(inode->i_state & I_NEW);
                inode->i_state |= I_WILL_FREE;
                spin_unlock(&inode_lock);
                write_inode_now(inode, 1);
                spin_lock(&inode_lock);
+                WARN_ON(inode->i_state & I_NEW);
                inode->i_state &= ~I_WILL_FREE;
                inodes_stat.nr_unused--;
                hlist_del_init(&inode->i_hash);
        }
        list_del_init(&inode->i_list);
        list_del_init(&inode->i_sb_list);
+        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        inodes_stat.nr_inodes--;
        spin_unlock(&inode_lock);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52d7dc..e79c07812af 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal)
 }
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
 */
 int __log_start_commit(journal_t *journal, tid_t target)
 {
@@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal)
 /*
 * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
 */
 int journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
        if (journal->j_running_transaction) {
                tid_t tid = journal->j_running_transaction->t_tid;
-                ret = __log_start_commit(journal, tid);
+                __log_start_commit(journal, tid);
-                if (ret && ptid)
+                /* There's a running transaction and we've just made sure
+                 * its commit has been scheduled. */
+                if (ptid)
                        *ptid = tid;
-        } else if (journal->j_committing_transaction && ptid) {
+                ret = 1;
+        } else if (journal->j_committing_transaction) {
                /*
                 * If ext3_write_super() recently started a commit, then we
                 * have to wait for completion of that transaction
                 */
-                *ptid = journal->j_committing_transaction->t_tid;
+                if (ptid)
+                        *ptid = journal->j_committing_transaction->t_tid;
                ret = 1;
        }
        spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb343008ede..58144102bf2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
 */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 /*
 * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
 */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
        if (journal->j_running_transaction) {
                tid_t tid = journal->j_running_transaction->t_tid;
-                ret = __jbd2_log_start_commit(journal, tid);
+                __jbd2_log_start_commit(journal, tid);
-                if (ret && ptid)
+                /* There's a running transaction and we've just made sure
+                 * its commit has been scheduled. */
+                if (ptid)
                        *ptid = tid;
-        } else if (journal->j_committing_transaction && ptid) {
+                ret = 1;
+        } else if (journal->j_committing_transaction) {
                /*
                 * If ext3_write_super() recently started a commit, then we
                 * have to wait for completion of that transaction
                 */
-                *ptid = journal->j_committing_transaction->t_tid;
+                if (ptid)
+                        *ptid = journal->j_committing_transaction->t_tid;
                ret = 1;
        }
        spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7..28ce21d8598 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
 }
 /*
- * This function must be called when inode is journaled in ordered mode
+ * File truncate and transaction commit interact with each other in a
- * before truncation happens. It starts writeout of truncated part in
+ * non-trivial way.  If a transaction writing data block A is
- * case it is in the committing transaction so that we stand to ordered
+ * committing, we cannot discard the data by truncate until we have
- * mode consistency guarantees.
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
 */
-int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+                                        struct jbd2_inode *jinode,
                                        loff_t new_size)
 {
-        journal_t *journal;
+        transaction_t *inode_trans, *commit_trans;
-        transaction_t *commit_trans;
        int ret = 0;
-        if (!inode->i_transaction && !inode->i_next_transaction)
+        /* This is a quick check to avoid locking if not necessary */
+        if (!jinode->i_transaction)
                goto out;
-        journal = inode->i_transaction->t_journal;
+        /* Locks are here just to force reading of recent values, it is
+         * enough that the transaction was not committing before we started
+         * a transaction adding the inode to orphan list */
        spin_lock(&journal->j_state_lock);
        commit_trans = journal->j_committing_transaction;
        spin_unlock(&journal->j_state_lock);
-        if (inode->i_transaction == commit_trans) {
+        spin_lock(&journal->j_list_lock);
-                ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+        inode_trans = jinode->i_transaction;
+        spin_unlock(&journal->j_list_lock);
+        if (inode_trans == commit_trans) {
+                ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
                        new_size, LLONG_MAX);
                if (ret)
                        jbd2_journal_abort(journal, ret);
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3cceef4ad2b..e9580104b6b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c)
                        spin_unlock(&c->erase_completion_lock);
                        
-                /* This thread is purely an optimisation. But if it runs when
+                /* Problem - immediately after bootup, the GCD spends a lot
-                   other things could be running, it actually makes things a
+                 * of time in places like jffs2_kill_fragtree(); so much so
-                   lot worse. Use yield() and put it at the back of the runqueue
+                 * that userspace processes (like gdm and X) are starved
-                   every time. Especially during boot, pulling an inode in
+                 * despite plenty of cond_resched()s and renicing.  Yield()
-                   with read_inode() is much preferable to having the GC thread
+                 * doesn't help, either (presumably because userspace and GCD
-                   get there first. */
+                 * are generally competing for a higher latency resource -
-                yield();
+                 * disk).
+                 * This forces the GCD to slow the hell down.   Pulling an
+                 * inode in with read_inode() is much preferable to having
+                 * the GC thread get there first. */
+                schedule_timeout_interruptible(msecs_to_jiffies(50));
                /* Put_super will send a SIGKILL and then wait on the sem.
                 */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 6ca08ad887c..1fc1e92356e 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
                                struct jffs2_tmp_dnode_info *tn)
 {
        uint32_t fn_end = tn->fn->ofs + tn->fn->size;
-        struct jffs2_tmp_dnode_info *this;
+        struct jffs2_tmp_dnode_info *this, *ptn;
        dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
@@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
        if (this) {
                /* If the node is coincident with another at a lower address,
                   back up until the other node is found. It may be relevant */
-                while (this->overlapped)
+                while (this->overlapped) {
-                        this = tn_prev(this);
+                        ptn = tn_prev(this);
+                        if (!ptn) {
-                /* First node should never be marked overlapped */
+                                /*
-                BUG_ON(!this);
+                                 * We killed a node which set the overlapped
+                                 * flags during the scan. Fix it up.
+                                 */
+                                this->overlapped = 0;
+                                break;
+                        }
+                        this = ptn;
+                }
                dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
        }
@@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
                        }
                        if (!this->overlapped)
                                break;
-                        this = tn_prev(this);
+                        ptn = tn_prev(this);
+                        if (!ptn) {
+                                /*
+                                 * We killed a node which set the overlapped
+                                 * flags during the scan. Fix it up.
+                                 */
+                                this->overlapped = 0;
+                                break;
+                        }
+                        this = ptn;
                }
        }
@@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
                eat_last(&rii->tn_root, &last->rb);
                ver_insert(&ver_root, last);
-                if (unlikely(last->overlapped))
+                if (unlikely(last->overlapped)) {
-                        continue;
+                        if (pen)
+                                continue;
+                        /*
+                         * We killed a node which set the overlapped
+                         * flags during the scan. Fix it up.
+                         */
+                        last->overlapped = 0;
+                }
                /* Now we have a bunch of nodes in reverse version
                   order, in the tree at ver_root. Most of the time,
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d35..aedc47a264c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
        return 0;
 }
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
+                                                 struct in6_addr *addr_mapped)
+{
+        const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
+        switch (sap->sa_family) {
+        case AF_INET6:  /* already IPv6: return a pointer into sap itself */
+                return &((const struct sockaddr_in6 *)sap)->sin6_addr;
+        case AF_INET:   /* write the v4-mapped form into the caller's buffer */
+                ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
+                return addr_mapped;
+        }
+        return NULL;    /* any other address family is not handled */
+}
+/*
+ * If lockd is using a PF_INET6 listener, all incoming requests appear
+ * to come from AF_INET6 remotes.  The address of AF_INET remotes are
+ * mapped to AF_INET6 automatically by the network layer.  In case the
+ * user passed an AF_INET server address at mount time, ensure both
+ * addresses are AF_INET6 before comparing them.
+ *
+ * Returns the result of ipv6_addr_equal() on the two (possibly
+ * v4-mapped) addresses, or 0 if nlmclnt_map_v4addr() could not map
+ * either of them.
+ */
+static int nlmclnt_cmp_addr(const struct nlm_host *host,
+                            const struct sockaddr *sap)
+{
+        const struct in6_addr *addr1;
+        const struct in6_addr *addr2;
+        struct in6_addr addr1_mapped;
+        struct in6_addr addr2_mapped;
+        addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
+        if (likely(addr1 != NULL)) {
+                addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
+                if (likely(addr2 != NULL))
+                        return ipv6_addr_equal(addr1, addr2);
+        }
+        return 0;
+}
+#else   /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
+static int nlmclnt_cmp_addr(const struct nlm_host *host,
+                            const struct sockaddr *sap)
+{
+        return nlm_cmp_addr(nlm_addr(host), sap);       /* no v4 mapping in this configuration: compare as given */
+}
+#endif  /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
 /*
 * The server lockd has called us back to tell us the lock was granted
 */
@@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
                 */
                if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
                        continue;
-                if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
+                if (!nlmclnt_cmp_addr(block->b_host, addr))
                        continue;
                if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
                        continue;
diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4bfd1..06f8e63f6cb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt)
         */
        for_each_possible_cpu(cpu) {
                struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-                if (cpu_writer->mnt != mnt)
-                        continue;
                spin_lock(&cpu_writer->lock);
+                if (cpu_writer->mnt != mnt) {
+                        spin_unlock(&cpu_writer->lock);
+                        continue;
+                }
                atomic_add(cpu_writer->count, &mnt->__mnt_writers);
                cpu_writer->count = 0;
                /*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9b728f3565a..574158ae239 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
        }
        return 0;
 }
+/*
+ * Test if two ip6 socket addresses refer to the same socket by
+ * comparing relevant fields. The padding bytes specifically, are not
+ * compared. sin6_flowinfo is not compared because it only affects QoS
+ * and sin6_scope_id is only compared if the address is "link local"
+ * because "link local" addresses need only be unique to a specific
+ * link. Conversely, ordinary unicast addresses might have different
+ * sin6_scope_id.
+ *
+ * The caller should ensure both socket addresses are AF_INET6.
+ *
+ * Returns non-zero if the two socket addresses match, zero otherwise.
+ */
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+                                const struct sockaddr *sa2)
+{
+        const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
+        const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+        /* The IPv6 addresses of the two sockets must be identical. */
+        if (!ipv6_addr_equal(&saddr1->sin6_addr,
+                             &saddr2->sin6_addr))
+                return 0;
+        /* Past this point both addresses are equal, so checking saddr1's
+         * scope is sufficient. */
+        if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+            saddr1->sin6_scope_id != saddr2->sin6_scope_id)
+                return 0;
+        return saddr1->sin6_port == saddr2->sin6_port;
+}
 #else
 static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
                                 const struct sockaddr_in *sa2)
@@ -270,9 +296,52 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
        return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
                        (const struct sockaddr_in *)sa2);
 }
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
+                                const struct sockaddr * sa2)
+{
+        return 0;       /* stub for the #else configuration: never reports a match */
+}
 #endif
 /*
+ * Test if two ip4 socket addresses refer to the same socket, by
+ * comparing relevant fields. The padding bytes specifically, are
+ * not compared.
+ *
+ * The caller should ensure both socket addresses are AF_INET.
+ *
+ * Returns 1 if both the address and the port are equal, 0 otherwise.
+ */
+static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
+                                const struct sockaddr *sa2)
+{
+        const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
+        const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+        if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+                return 0;
+        return saddr1->sin_port == saddr2->sin_port;
+}
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields.
+ *
+ * Returns 0 if the address families differ or are neither AF_INET
+ * nor AF_INET6; otherwise returns the result of the per-family
+ * comparison helper.
+ */
+static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+                            const struct sockaddr *sa2)
+{
+        if (sa1->sa_family != sa2->sa_family)
+                return 0;
+        switch (sa1->sa_family) {
+        case AF_INET:
+                return nfs_sockaddr_cmp_ip4(sa1, sa2);
+        case AF_INET6:
+                return nfs_sockaddr_cmp_ip6(sa1, sa2);
+        }
+        return 0;
+}
+/*
 * Find a client by IP address and protocol version
 * - returns NULL if no such client
 */
@@ -344,8 +413,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
 static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
 {
        struct nfs_client *clp;
+        const struct sockaddr *sap = data->addr;
        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+                const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
                /* Don't match clients that failed to initialise properly */
                if (clp->cl_cons_state < 0)
                        continue;
@@ -358,7 +429,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
                        continue;
                /* Match the full socket address */
-                if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)
+                if (!nfs_sockaddr_cmp(sap, clap))
                        continue;
                atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e35c8199f82..672368f865c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
        cache.cred = cred;
        cache.jiffies = jiffies;
        status = NFS_PROTO(inode)->access(inode, &cache);
-        if (status != 0)
+        if (status != 0) {
+                if (status == -ESTALE) {
+                        nfs_zap_caches(inode);
+                        if (!S_ISDIR(inode->i_mode))
+                                set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+                }
                return status;
+        }
        nfs_access_add_cache(inode, &cache);
 out:
        if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index cef62557c87..6bbf0e6daad 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_fattr fattr;
-        struct page *pages[NFSACL_MAXPAGES] = { };
+        struct page *pages[NFSACL_MAXPAGES];
        struct nfs3_setaclargs args = {
                .inode = inode,
                .mask = NFS_ACL,
@@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
                .rpc_argp       = &args,
                .rpc_resp       = &fattr,
        };
-        int status, count;
+        int status;
        status = -EOPNOTSUPP;
        if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
        if (S_ISDIR(inode->i_mode)) {
                args.mask |= NFS_DFACL;
                args.acl_default = dfacl;
+                args.len = nfsacl_size(acl, dfacl);
+        } else
+                args.len = nfsacl_size(acl, NULL);
+        if (args.len > NFS_ACL_INLINE_BUFSIZE) {
+                unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
+                status = -ENOMEM;
+                do {
+                        args.pages[args.npages] = alloc_page(GFP_KERNEL);
+                        if (args.pages[args.npages] == NULL)
+                                goto out_freepages;
+                        args.npages++;
+                } while (args.npages < npages);
        }
        dprintk("NFS call setacl\n");
@@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
        nfs_zap_acl_cache(inode);
        dprintk("NFS reply setacl: %d\n", status);
-        /* pages may have been allocated at the xdr layer. */
-        for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
-                __free_page(args.pages[count]);
        switch (status) {
                case 0:
                        status = nfs_refresh_inode(inode, &fattr);
@@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
                case -ENOTSUPP:
                        status = -EOPNOTSUPP;
        }
+out_freepages:
+        while (args.npages != 0) {
+                args.npages--;
+                __free_page(args.pages[args.npages]);
+        }
 out:
        return status;
 }
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 11cdddec143..6cdeacffde4 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -82,8 +82,10 @@
 #define NFS3_commitres_sz       (1+NFS3_wcc_data_sz+2)
 #define ACL3_getaclargs_sz      (NFS3_fh_sz+1)
-#define ACL3_setaclargs_sz      (NFS3_fh_sz+1+2*(2+5*3))
+#define ACL3_setaclargs_sz      (NFS3_fh_sz+1+ \
-#define ACL3_getaclres_sz       (1+NFS3_post_op_attr_sz+1+2*(2+5*3))
+                                XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+#define ACL3_getaclres_sz       (1+NFS3_post_op_attr_sz+1+ \
+                                XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
 #define ACL3_setaclres_sz       (1+NFS3_post_op_attr_sz)
 /*
@@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
                   struct nfs3_setaclargs *args)
 {
        struct xdr_buf *buf = &req->rq_snd_buf;
-        unsigned int base, len_in_head, len = nfsacl_size(
+        unsigned int base;
-                (args->mask & NFS_ACL)   ? args->acl_access  : NULL,
+        int err;
-                (args->mask & NFS_DFACL) ? args->acl_default : NULL);
-        int count, err;
        p = xdr_encode_fhandle(p, NFS_FH(args->inode));
        *p++ = htonl(args->mask);
-        base = (char *)p - (char *)buf->head->iov_base;
+        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        /* put as much of the acls into head as possible. */
+        base = req->rq_slen;
-        len_in_head = min_t(unsigned int, buf->head->iov_len - base, len);
-        len -= len_in_head;
+        if (args->npages != 0)
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2));
+                xdr_encode_pages(buf, args->pages, 0, args->len);
+        else
-        for (count = 0; (count << PAGE_SHIFT) < len; count++) {
+                req->rq_slen += args->len;
-                args->pages[count] = alloc_page(GFP_KERNEL);
-                if (!args->pages[count]) {
-                        while (count)
-                                __free_page(args->pages[--count]);
-                        return -ENOMEM;
-                }
-        }
-        xdr_encode_pages(buf, args->pages, 0, len);
        err = nfsacl_encode(buf, base, args->inode,
                            (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 30befc39b3c..2a2a0a7143a 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -21,7 +21,9 @@
 #define NFSDBG_FACILITY         NFSDBG_VFS
 /*
- * Check if fs_root is valid
+ * Convert the NFSv4 pathname components into a standard posix path.
+ *
+ * Note that the resulting string will be placed at the end of the buffer
 */
 static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
                                         char *buffer, ssize_t buflen)
@@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 {
        struct vfsmount *mnt = ERR_PTR(-ENOENT);
        char *mnt_path;
-        int page2len;
+        unsigned int maxbuflen;
        unsigned int s;
        mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
        if (IS_ERR(mnt_path))
                return mnt;
        mountdata->mnt_path = mnt_path;
-        page2 += strlen(mnt_path) + 1;
+        maxbuflen = mnt_path - 1 - page2;
-        page2len = PAGE_SIZE - strlen(mnt_path) - 1;
        for (s = 0; s < location->nservers; s++) {
                const struct nfs4_string *buf = &location->servers[s];
                struct sockaddr_storage addr;
-                if (buf->len <= 0 || buf->len >= PAGE_SIZE)
+                if (buf->len <= 0 || buf->len >= maxbuflen)
                        continue;
                mountdata->addr = (struct sockaddr *)&addr;
@@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
                        continue;
                nfs_set_port(mountdata->addr, NFS_PORT);
-                strncpy(page2, buf->data, page2len);
+                memcpy(page2, buf->data, buf->len);
-                page2[page2len] = '\0';
+                page2[buf->len] = '\0';
                mountdata->hostname = page2;
                snprintf(page, PAGE_SIZE, "%s:%s",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f65953be39c..9250067943d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
        [OP_LOOKUPP]            = (nfsd4_enc)nfsd4_encode_noop,
        [OP_NVERIFY]            = (nfsd4_enc)nfsd4_encode_noop,
        [OP_OPEN]               = (nfsd4_enc)nfsd4_encode_open,
+        [OP_OPENATTR]           = (nfsd4_enc)nfsd4_encode_noop,
        [OP_OPEN_CONFIRM]       = (nfsd4_enc)nfsd4_encode_open_confirm,
        [OP_OPEN_DOWNGRADE]     = (nfsd4_enc)nfsd4_encode_open_downgrade,
        [OP_PUTFH]              = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d..331f2e88e28 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
        int ret;
        do {
-                if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+                if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
                        return -ENOSPC;
                ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
        } while (ret == -EAGAIN);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 60fe74035db..19e3a96aa02 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
        BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
        mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
-                        (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+                        (OCFS2_I(inode)->ip_clusters !=
+                         le32_to_cpu(rec->e_cpos)),
                        "Device %s, asking for sparse allocation: inode %llu, "
                        "cpos %u, clusters %u\n",
                        osb->dev_str,
@@ -4796,6 +4797,29 @@ out:
        return ret;
 }
+static int ocfs2_replace_extent_rec(struct inode *inode,
+                                    handle_t *handle,
+                                    struct ocfs2_path *path,
+                                    struct ocfs2_extent_list *el,
+                                    int split_index,
+                                    struct ocfs2_extent_rec *split_rec)
+{
+        int ret;
+        ret = ocfs2_path_bh_journal_access(handle, inode, path,
+                                           path_num_items(path) - 1);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        el->l_recs[split_index] = *split_rec;
+        ocfs2_journal_dirty(handle, path_leaf_bh(path));
+out:
+        return ret;
+}
 /*
 * Mark part or all of the extent record at split_index in the leaf
 * pointed to by path as written. This removes the unwritten
@@ -4885,7 +4909,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
        if (ctxt.c_contig_type == CONTIG_NONE) {
                if (ctxt.c_split_covers_rec)
-                        el->l_recs[split_index] = *split_rec;
+                        ret = ocfs2_replace_extent_rec(inode, handle,
+                                                       path, el,
+                                                       split_index, split_rec);
                else
                        ret = ocfs2_split_and_insert(inode, handle, path, et,
                                                     &last_eb_bh, split_index,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a067a6cffb0..8e1709a679b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
        size = i_size_read(inode);
        if (size > PAGE_CACHE_SIZE ||
-            size > ocfs2_max_inline_data(inode->i_sb)) {
+            size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
                ocfs2_error(inode->i_sb,
                            "Inode %llu has with inline data has bad size: %Lu",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
        int ret, written = 0;
        loff_t end = pos + len;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
+        struct ocfs2_dinode *di = NULL;
        mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
             (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
@@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
        /*
         * Check whether the write can fit.
         */
-        if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
+        di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+        if (mmap_page ||
+            end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
                return 0;
 do_inline_write:
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 54e182a27ca..0a281394785 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
                if (!mle) {
                        if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
                            res->owner != assert->node_idx) {
-                                mlog(ML_ERROR, "assert_master from "
+                                mlog(ML_ERROR, "DIE! Mastery assert from %u, "
-                                          "%u, but current owner is "
+                                     "but current owner is %u! (%.*s)\n",
-                                          "%u! (%.*s)\n",
+                                     assert->node_idx, res->owner, namelen,
-                                       assert->node_idx, res->owner,
+                                     name);
-                                       namelen, name);
+                                __dlm_print_one_lock_resource(res);
-                                goto kill;
+                                BUG();
                        }
                } else if (mle->type != DLM_MLE_MIGRATION) {
                        if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d1295203029..4060bb328bc 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
                spin_lock(&res->spinlock);
                /* This ensures that clear refmap is sent after the set */
-                __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+                __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
-                                                  DLM_LOCK_RES_MIGRATING));
                spin_unlock(&res->spinlock);
                /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 86ca085ef32..fcf879ed693 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
        else
                BUG_ON(res->owner == dlm->node_num);
-        spin_lock(&dlm->spinlock);
+        spin_lock(&dlm->ast_lock);
        /* We want to be sure that we're not freeing a lock
         * that still has AST's pending... */
        in_use = !list_empty(&lock->ast_list);
-        spin_unlock(&dlm->spinlock);
+        spin_unlock(&dlm->ast_lock);
        if (in_use) {
               mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
                    "while waiting for an ast!", res->lockname.len,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 206a2370876..7219a86d34c 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
                                        struct ocfs2_lock_res *lockres);
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
                                                int convert);
-#define ocfs2_log_dlm_error(_func, _err, _lockres) do {                 \
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {                                 \
-        mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
+        if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY)                               \
-             _err, _func, _lockres->l_name);                            \
+                mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n",        \
+                     _err, _func, _lockres->l_name);                                    \
+        else                                                                            \
+                mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n",  \
+                     _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name,  \
+                     (unsigned int)ocfs2_get_dentry_lock_ino(_lockres));                \
 } while (0)
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307..172850a9a12 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
 {
-        return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
+        return jbd2_journal_begin_ordered_truncate(
-                                                   new_size);
+                                OCFS2_SB(inode->i_sb)->journal->j_journal,
+                                &OCFS2_I(inode)->ip_jinode,
+                                new_size);
 }
 #endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 084aba86c3b..4b11762f249 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
-                fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb));
+                fe->id2.i_data.id_count = cpu_to_le16(
+                                ocfs2_max_inline_data_with_xattr(osb->sb, fe));
        } else {
                fel = &fe->id2.i_list;
                fel->l_tree_depth = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 077384135f4..946d3c34b90 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -341,6 +341,9 @@ struct ocfs2_super
        struct ocfs2_node_map           osb_recovering_orphan_dirs;
        unsigned int                    *osb_orphan_wipes;
        wait_queue_head_t               osb_wipe_event;
+        /* used to protect metaecc calculation check of xattr. */
+        spinlock_t osb_xattr_lock;
 };
 #define OCFS2_SB(sb)        ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c7ae45aaa36..2332ef740f4 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
                 offsetof(struct ocfs2_dinode, id2.i_symlink);
 }
-static inline int ocfs2_max_inline_data(struct super_block *sb)
-{
-        return sb->s_blocksize -
-                offsetof(struct ocfs2_dinode, id2.i_data.id_data);
-}
 static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
                                                   struct ocfs2_dinode *di)
 {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b1cb38fbe80..7ac83a81ee5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb,
        unlock_buffer(*bh);
        ll_rw_block(READ, 1, bh);
        wait_on_buffer(*bh);
+        if (!buffer_uptodate(*bh)) {
+                mlog_errno(-EIO);
+                brelse(*bh);
+                *bh = NULL;
+                return -EIO;
+        }
        return 0;
 }
@@ -1747,6 +1754,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
        INIT_LIST_HEAD(&osb->blocked_lock_list);
        osb->blocked_lock_count = 0;
        spin_lock_init(&osb->osb_lock);
+        spin_lock_init(&osb->osb_xattr_lock);
        ocfs2_init_inode_steal_slot(osb);
        atomic_set(&osb->alloc_stats.moves, 0);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 915039fffe6..2563df89fc2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt {
 #define OCFS2_XATTR_ROOT_SIZE   (sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE 80
+#define OCFS2_XATTR_HEADER_GAP  4
 #define OCFS2_XATTR_FREE_IN_IBODY       (OCFS2_MIN_XATTR_INLINE_SIZE \
                                         - sizeof(struct ocfs2_xattr_header) \
-                                         - sizeof(__u32))
+                                         - OCFS2_XATTR_HEADER_GAP)
 #define OCFS2_XATTR_FREE_IN_BLOCK(ptr)  ((ptr)->i_sb->s_blocksize \
                                         - sizeof(struct ocfs2_xattr_block) \
                                         - sizeof(struct ocfs2_xattr_header) \
-                                         - sizeof(__u32))
+                                         - OCFS2_XATTR_HEADER_GAP)
 static struct ocfs2_xattr_def_value_root def_xv = {
        .xv.xr_list.l_count = cpu_to_le16(1),
@@ -274,10 +275,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
                               bucket->bu_blocks, bucket->bu_bhs, 0,
                               NULL);
        if (!rc) {
+                spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
                rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
                                                 bucket->bu_bhs,
                                                 bucket->bu_blocks,
                                                 &bucket_xh(bucket)->xh_check);
+                spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
                if (rc)
                        mlog_errno(rc);
        }
@@ -310,9 +313,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
 {
        int i;
+        spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
        ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
                                   bucket->bu_bhs, bucket->bu_blocks,
                                   &bucket_xh(bucket)->xh_check);
+        spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
        for (i = 0; i < bucket->bu_blocks; i++)
                ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
@@ -542,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir,
         * when blocksize = 512, may reserve one more cluser for
         * xattr bucket, otherwise reserve one metadata block
         * for them is ok.
+         * If this is a new directory with inline data,
+         * we choose to reserve the entire inline area for
+         * directory contents and force an external xattr block.
         */
        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+            (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
            (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
                if (ret) {
@@ -1507,7 +1516,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                last += 1;
        }
-        free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+        free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
        if (free < 0)
                return -EIO;
@@ -2190,7 +2199,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
                last += 1;
        }
-        free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+        free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
        if (free < 0)
                return 0;
@@ -2592,8 +2601,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
        if (!ret) {
                /* Update inode ctime. */
-                ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
+                ret = ocfs2_journal_access_di(ctxt->handle, inode,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                              xis->inode_bh,
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -4785,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
                                                char *val,
                                                int value_len)
 {
-        int offset;
+        int ret, offset, block_off;
        struct ocfs2_xattr_value_root *xv;
        struct ocfs2_xattr_entry *xe = xs->here;
+        struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
+        void *base;
        BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
-        offset = le16_to_cpu(xe->xe_name_offset) +
+        ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
-                 OCFS2_XATTR_SIZE(xe->xe_name_len);
+                                                xe - xh->xh_entries,
+                                                &block_off,
+                                                &offset);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
-        xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
+        base = bucket_block(xs->bucket, block_off);
+        xv = (struct ocfs2_xattr_value_root *)(base + offset +
+                 OCFS2_XATTR_SIZE(xe->xe_name_len));
-        return __ocfs2_xattr_set_value_outside(inode, handle,
+        ret = __ocfs2_xattr_set_value_outside(inode, handle,
-                                               xv, val, value_len);
+                                              xv, val, value_len);
+        if (ret)
+                mlog_errno(ret);
+out:
+        return ret;
 }
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -5060,8 +5084,8 @@ try_again:
        xh_free_start = le16_to_cpu(xh->xh_free_start);
        header_size = sizeof(struct ocfs2_xattr_header) +
                        count * sizeof(struct ocfs2_xattr_entry);
-        max_free = OCFS2_XATTR_BUCKET_SIZE -
+        max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
-                le16_to_cpu(xh->xh_name_value_len) - header_size;
+                le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
        mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
                        "of %u which exceed block size\n",
@@ -5094,7 +5118,7 @@ try_again:
                        need = 0;
        }
-        free = xh_free_start - header_size;
+        free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
        /*
         * We need to make sure the new name/value pair
         * can exist in the same block.
@@ -5127,7 +5151,8 @@ try_again:
                        }
                        xh_free_start = le16_to_cpu(xh->xh_free_start);
-                        free = xh_free_start - header_size;
+                        free = xh_free_start - header_size
+                                - OCFS2_XATTR_HEADER_GAP;
                        if (xh_free_start % blocksize < need)
                                free -= xh_free_start % blocksize;
diff --git a/fs/pipe.c b/fs/pipe.c
index 3a48ba5179d..14f502b89cf 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -699,12 +699,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
        int retval;
        mutex_lock(&inode->i_mutex);
        retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
+        if (retval >= 0) {
-        if (retval >= 0)
                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
+                if (retval < 0) /* this can happen only if on == T */
+                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
+        }
        mutex_unlock(&inode->i_mutex);
        if (retval < 0)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0c9de19a163..beaa0ce3b82 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3066,7 +3066,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
        int retval = -ENOENT;
        ino_t ino;
        int tid;
-        unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
        struct pid_namespace *ns;
        task = get_proc_task(inode);
@@ -3083,18 +3082,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
                goto out_no_task;
        retval = 0;
-        switch (pos) {
+        switch ((unsigned long)filp->f_pos) {
        case 0:
                ino = inode->i_ino;
-                if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
+                if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
                        goto out;
-                pos++;
+                filp->f_pos++;
                /* fall through */
        case 1:
                ino = parent_ino(dentry);
-                if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
+                if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
                        goto out;
-                pos++;
+                filp->f_pos++;
                /* fall through */
        }
@@ -3104,9 +3103,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
        ns = filp->f_dentry->d_sb->s_fs_info;
        tid = (int)filp->f_version;
        filp->f_version = 0;
-        for (task = first_tid(leader, tid, pos - 2, ns);
+        for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
             task;
-             task = next_tid(task), pos++) {
+             task = next_tid(task), filp->f_pos++) {
                tid = task_pid_nr_ns(task, ns);
                if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
                        /* returning this tgid failed, save it as the first
@@ -3117,7 +3116,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
                }
        }
 out:
-        filp->f_pos = pos;
        put_task_struct(leader);
 out_no_task:
        return retval;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3e76bb9b3ad..d8bb5c671f4 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
                        }
                }
                unlock_new_inode(inode);
-        } else
+        } else {
               module_put(de->owner);
+               de_put(de);
+        }
        return inode;
 out_ino:
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 767d95a6d1b..e9983837d08 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = {
 #define KPF_RECLAIM    9
 #define KPF_BUDDY     10
-#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
+#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
 static ssize_t kpageflags_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
@@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
                else
                        kflags = ppage->flags;
-                uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
+                uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
                        kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
                        kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
                        kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index b9b567a2837..5d7c7ececa6 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -114,6 +114,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
                if (!pagevec_add(&lru_pvec, page))
                        __pagevec_lru_add_file(&lru_pvec);
+                /* prevent the page from being discarded on memory pressure */
+                SetPageDirty(page);
                unlock_page(page);
        }
@@ -126,6 +129,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
        return -EFBIG;
 add_error:
+        pagevec_lru_add_file(&lru_pvec);
        page_cache_release(pages + loop);
        for (loop++; loop < npages; loop++)
                __free_page(pages + loop);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5267098532b..a1a4cfe1921 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op)
         */
        file->f_version = 0;
-        /* SEQ files support lseek, but not pread/pwrite */
+        /*
-        file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
+         * seq_files support lseek() and pread().  They do not implement
+         * write() at all, but we clear FMODE_PWRITE here for historical
+         * reasons.
+         *
+         * If a client of seq_files a) implements file.write() and b) wishes to
+         * support pwrite() then that client will need to implement its own
+         * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+         */
+        file->f_mode &= ~FMODE_PWRITE;
        return 0;
 }
 EXPORT_SYMBOL(seq_open);
@@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
        int err = 0;
        mutex_lock(&m->lock);
+        /* Don't assume *ppos is where we left it */
+        if (unlikely(*ppos != m->read_pos)) {
+                m->read_pos = *ppos;
+                while ((err = traverse(m, *ppos)) == -EAGAIN)
+                        ;
+                if (err) {
+                        /* With prejudice... */
+                        m->read_pos = 0;
+                        m->version = 0;
+                        m->index = 0;
+                        m->count = 0;
+                        goto Done;
+                }
+        }
        /*
         * seq_file->op->..m_start/m_stop/m_next may do special actions
         * or optimisations based on the file->f_version, so we want to
@@ -230,8 +254,10 @@ Fill:
 Done:
        if (!copied)
                copied = err;
-        else
+        else {
                *ppos += copied;
+                m->read_pos += copied;
+        }
        file->f_version = m->version;
        mutex_unlock(&m->lock);
        return copied;
@@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
                        if (offset < 0)
                                break;
                        retval = offset;
-                        if (offset != file->f_pos) {
+                        if (offset != m->read_pos) {
                                while ((retval=traverse(m, offset)) == -EAGAIN)
                                        ;
                                if (retval) {
                                        /* with extreme prejudice... */
                                        file->f_pos = 0;
+                                        m->read_pos = 0;
                                        m->version = 0;
                                        m->index = 0;
                                        m->count = 0;
                                } else {
+                                        m->read_pos = offset;
                                        retval = file->f_pos = offset;
                                }
                        }
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index c837dfc2b3c..2a796031034 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb,
 * generated a larger block - this does occasionally happen with zlib).
 */
 int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
-                        int length, u64 *next_index, int srclength)
+                        int length, u64 *next_index, int srclength, int pages)
 {
        struct squashfs_sb_info *msblk = sb->s_fs_info;
        struct buffer_head **bh;
@@ -184,7 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
                                offset = 0;
                        }
-                        if (msblk->stream.avail_out == 0) {
+                        if (msblk->stream.avail_out == 0 && page < pages) {
                                msblk->stream.next_out = buffer[page++];
                                msblk->stream.avail_out = PAGE_CACHE_SIZE;
                        }
@@ -201,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
                                zlib_init = 1;
                        }
-                        zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
+                        zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
                        if (msblk->stream.avail_in == 0 && k < b)
                                put_bh(bh[k++]);
                } while (zlib_err == Z_OK);
                if (zlib_err != Z_STREAM_END) {
-                        ERROR("zlib_inflate returned unexpected result"
+                        ERROR("zlib_inflate error, data probably corrupt\n");
-                                " 0x%x, srclength %d, avail_in %d,"
-                                " avail_out %d\n", zlib_err, srclength,
-                                msblk->stream.avail_in,
-                                msblk->stream.avail_out);
                        goto release_mutex;
                }
                zlib_err = zlib_inflateEnd(&msblk->stream);
                if (zlib_err != Z_OK) {
-                        ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
+                        ERROR("zlib_inflate error, data probably corrupt\n");
-                                " srclength %d\n", zlib_err, srclength);
                        goto release_mutex;
                }
                length = msblk->stream.total_out;
@@ -268,7 +263,8 @@ block_release:
                put_bh(bh[k]);
 read_failure:
-        ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+        ERROR("squashfs_read_data failed to read block 0x%llx\n",
+                                        (unsigned long long) index);
        kfree(bh);
        return -EIO;
 }
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f29eda16d25..1c4739e33af 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
                        entry->length = squashfs_read_data(sb, entry->data,
                                block, length, &entry->next_index,
-                                cache->block_size);
+                                cache->block_size, cache->pages);
                        spin_lock(&cache->lock);
@@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
        for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
                data[i] = buffer;
        res = squashfs_read_data(sb, data, block, length |
-                SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
+                SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
        kfree(data);
        return res;
 }
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 7a63398bb85..9101dbde39e 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
        type = le16_to_cpu(sqshb_ino->inode_type);
        switch (type) {
        case SQUASHFS_REG_TYPE: {
-                unsigned int frag_offset, frag_size, frag;
+                unsigned int frag_offset, frag;
+                int frag_size;
                u64 frag_blk;
                struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
@@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
                break;
        }
        case SQUASHFS_LREG_TYPE: {
-                unsigned int frag_offset, frag_size, frag;
+                unsigned int frag_offset, frag;
+                int frag_size;
                u64 frag_blk;
                struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6b2515d027d..0e9feb6adf7 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
 /* block.c */
 extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
-                                int);
+                                int, int);
 /* cache.c */
 extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 071df5b5b49..681ec0d8379 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void)
                return err;
        }
-        printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
+        printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
                "Phillip Lougher\n");
        return 0;
diff --git a/fs/super.c b/fs/super.c
index 61dce001dd5..6ce501447ad 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
                 * lock ordering than usbfs:
                 */
                lockdep_set_class(&s->s_lock, &type->s_lock_key);
-                down_write(&s->s_umount);
+                /*
+                 * sget() can have s_umount recursion.
+                 *
+                 * When it cannot find a suitable sb, it allocates a new
+                 * one (this one), and tries again to find a suitable old
+                 * one.
+                 *
+                 * In case that succeeds, it will acquire the s_umount
+                 * lock of the old one. Since these are clearly distinct
+                 * locks, and this object isn't exposed yet, there's no
+                 * risk of deadlocks.
+                 *
+                 * Annotate this by putting this lock in a different
+                 * subclass.
+                 */
+                down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
                s->s_count = S_BIAS;
                atomic_set(&s->s_active, 1);
                mutex_init(&s->s_vfs_rename_mutex);
@@ -356,8 +371,10 @@ retry:
                                continue;
                        if (!grab_super(old))
                                goto retry;
-                        if (s)
+                        if (s) {
+                                up_write(&s->s_umount);
                                destroy_super(s);
+                        }
                        return old;
                }
        }
@@ -372,6 +389,7 @@ retry:
        err = set(s, data);
        if (err) {
                spin_unlock(&sb_lock);
+                up_write(&s->s_umount);
                destroy_super(s);
                return ERR_PTR(err);
        }
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f..b042bd7034b 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
        BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
-        if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
+        if ((flags & ~TFD_CREATE_FLAGS) ||
-                return -EINVAL;
+            (clockid != CLOCK_MONOTONIC &&
-        if (clockid != CLOCK_MONOTONIC &&
+             clockid != CLOCK_REALTIME))
-            clockid != CLOCK_REALTIME)
                return -EINVAL;
        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
        hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
        ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
-                               flags & (O_CLOEXEC | O_NONBLOCK));
+                               flags & TFD_SHARED_FCNTL_FLAGS);
        if (ufd < 0)
                kfree(ctx);
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
        if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
                return -EFAULT;
-        if (!timespec_valid(&ktmr.it_value) ||
+        if ((flags & ~TFD_SETTIME_FLAGS) ||
+            !timespec_valid(&ktmr.it_value) ||
            !timespec_valid(&ktmr.it_interval))
                return -EINVAL;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d71dc44e21e..aa1016bb913 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -34,6 +34,12 @@
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
 STATIC int xfsbufd_wakeup(int, gfp_t);
@@ -166,6 +172,75 @@ test_page_region(
 }
 /*
+ *      Mapping of multi-page buffers into contiguous virtual space
+ */
+typedef struct a_list {
+        void            *vm_addr;
+        struct a_list   *next;
+} a_list_t;
+static a_list_t         *as_free_head;
+static int              as_list_len;
+static DEFINE_SPINLOCK(as_lock);
+/*
+ *      Try to batch vunmaps because they are costly.
+ */
+STATIC void
+free_address(
+        void            *addr)
+{
+        a_list_t        *aentry;
+#ifdef CONFIG_XEN
+        /*
+         * Xen needs to be able to make sure it can get an exclusive
+         * RO mapping of pages it wants to turn into a pagetable.  If
+         * a newly allocated page is also still being vmap()ed by xfs,
+         * it will cause pagetable construction to fail.  This is a
+         * quick workaround to always eagerly unmap pages so that Xen
+         * is happy.
+         */
+        vunmap(addr);
+        return;
+#endif
+        aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
+        if (likely(aentry)) {
+                spin_lock(&as_lock);
+                aentry->next = as_free_head;
+                aentry->vm_addr = addr;
+                as_free_head = aentry;
+                as_list_len++;
+                spin_unlock(&as_lock);
+        } else {
+                vunmap(addr);
+        }
+}
+STATIC void
+purge_addresses(void)
+{
+        a_list_t        *aentry, *old;
+        if (as_free_head == NULL)
+                return;
+        spin_lock(&as_lock);
+        aentry = as_free_head;
+        as_free_head = NULL;
+        as_list_len = 0;
+        spin_unlock(&as_lock);
+        while ((old = aentry) != NULL) {
+                vunmap(aentry->vm_addr);
+                aentry = aentry->next;
+                kfree(old);
+        }
+}
+/*
 *      Internal xfs_buf_t object manipulation
 */
@@ -264,7 +339,7 @@ xfs_buf_free(
                uint            i;
                if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
+                        free_address(bp->b_addr - bp->b_offset);
                for (i = 0; i < bp->b_page_count; i++) {
                        struct page     *page = bp->b_pages[i];
@@ -386,8 +461,10 @@ _xfs_buf_map_pages(
                bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
                bp->b_flags |= XBF_MAPPED;
        } else if (flags & XBF_MAPPED) {
-               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+                if (as_list_len > 64)
-                                       -1, PAGE_KERNEL);
+                        purge_addresses();
+                bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
+                                        VM_MAP, PAGE_KERNEL);
                if (unlikely(bp->b_addr == NULL))
                        return -ENOMEM;
                bp->b_addr += bp->b_offset;
@@ -1364,10 +1441,12 @@ xfs_unregister_buftarg(
 void
 xfs_free_buftarg(
-        xfs_buftarg_t           *btp)
+        struct xfs_mount        *mp,
+        struct xfs_buftarg      *btp)
 {
        xfs_flush_buftarg(btp, 1);
-        xfs_blkdev_issue_flush(btp);
+        if (mp->m_flags & XFS_MOUNT_BARRIER)
+                xfs_blkdev_issue_flush(btp);
        xfs_free_bufhash(btp);
        iput(btp->bt_mapping->host);
@@ -1672,6 +1751,8 @@ xfsbufd(
                        count++;
                }
+                if (as_list_len > 0)
+                        purge_addresses();
                if (count)
                        blk_run_address_space(target->bt_mapping);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 288ae7c4c80..9b4d666ad31 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
 *      Handling of buftargs.
 */
 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
-extern void xfs_free_buftarg(xfs_buftarg_t *);
+extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c71e226da7f..32ae5028e96 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -734,15 +734,15 @@ xfs_close_devices(
 {
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
                struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-                xfs_free_buftarg(mp->m_logdev_targp);
+                xfs_free_buftarg(mp, mp->m_logdev_targp);
                xfs_blkdev_put(logdev);
        }
        if (mp->m_rtdev_targp) {
                struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-                xfs_free_buftarg(mp->m_rtdev_targp);
+                xfs_free_buftarg(mp, mp->m_rtdev_targp);
                xfs_blkdev_put(rtdev);
        }
-        xfs_free_buftarg(mp->m_ddev_targp);
+        xfs_free_buftarg(mp, mp->m_ddev_targp);
 }
 /*
@@ -811,9 +811,9 @@ xfs_open_devices(
 out_free_rtdev_targ:
        if (mp->m_rtdev_targp)
-                xfs_free_buftarg(mp->m_rtdev_targp);
+                xfs_free_buftarg(mp, mp->m_rtdev_targp);
 out_free_ddev_targ:
-        xfs_free_buftarg(mp->m_ddev_targp);
+        xfs_free_buftarg(mp, mp->m_ddev_targp);
 out_close_rtdev:
        if (rtdev)
                xfs_blkdev_put(rtdev);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e2fb6210d4c..478e587087f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -246,9 +246,6 @@ xfs_iget_cache_miss(
                goto out_destroy;
        }
-        if (lock_flags)
-                xfs_ilock(ip, lock_flags);
        /*
         * Preload the radix tree so we can insert safely under the
         * write spinlock. Note that we cannot sleep inside the preload
@@ -256,7 +253,16 @@ xfs_iget_cache_miss(
         */
        if (radix_tree_preload(GFP_KERNEL)) {
                error = EAGAIN;
-                goto out_unlock;
+                goto out_destroy;
+        }
+        /*
+         * Because the inode hasn't been added to the radix-tree yet it can't
+         * be found by another thread, so we can do the non-sleeping lock here.
+         */
+        if (lock_flags) {
+                if (!xfs_ilock_nowait(ip, lock_flags))
+                        BUG();
        }
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
@@ -284,7 +290,6 @@ xfs_iget_cache_miss(
 out_preload_end:
        write_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
-out_unlock:
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
 out_destroy:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b1047de2fff..61af610d79b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1455,10 +1455,19 @@ xlog_recover_add_to_trans(
        item = item->ri_prev;
        if (item->ri_total == 0) {              /* first region to be added */
-                item->ri_total  = in_f->ilf_size;
+                if (in_f->ilf_size == 0 ||
-                ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
+                    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
-                item->ri_buf = kmem_zalloc((item->ri_total *
+                        xlog_warn(
-                                            sizeof(xfs_log_iovec_t)), KM_SLEEP);
+        "XFS: bad number of regions (%d) in inode log format",
+                                  in_f->ilf_size);
+                        ASSERT(0);
+                        return XFS_ERROR(EIO);
+                }
+                item->ri_total = in_f->ilf_size;
+                item->ri_buf =
+                        kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+                                    KM_SLEEP);
        }
        ASSERT(item->ri_total > item->ri_cnt);
        /* Description region is ri_buf[0] */