55 files changed, 991 insertions, 596 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index de34bfad9ec3..5d505aaa72fb 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -178,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
        if (value) {
                acl = posix_acl_from_xattr(value, size);
-                if (acl == NULL) {
+                if (acl) {
-                        value = NULL;
+                        ret = posix_acl_valid(acl);
-                        size = 0;
+                        if (ret)
+                                goto out;
                } else if (IS_ERR(acl)) {
                        return PTR_ERR(acl);
                }
        }
        ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
+out:
        posix_acl_release(acl);
        return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3458b5725540..8f4b81de3ae2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -718,7 +718,7 @@ struct btrfs_space_info {
        u64 total_bytes;        /* total bytes in the space,
                                   this doesn't take mirrors into account */
        u64 bytes_used;         /* total bytes used,
-                                   this does't take mirrors into account */
+                                   this doesn't take mirrors into account */
        u64 bytes_pinned;       /* total bytes pinned, will be freed when the
                                   transaction finishes */
        u64 bytes_reserved;     /* total bytes the allocator has reserved for
@@ -740,8 +740,10 @@ struct btrfs_space_info {
         */
        unsigned long reservation_progress;
-        int full;               /* indicates that we cannot allocate any more
+        int full:1;             /* indicates that we cannot allocate any more
                                   chunks for this space */
+        int chunk_alloc:1;      /* set if we are allocating a chunk */
        int force_alloc;        /* set if we need to force a chunk alloc for
                                   this space */
@@ -2576,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
+void btrfs_drop_pages(struct page **pages, size_t num_pages);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+                      struct page **pages, size_t num_pages,
+                      loff_t pos, size_t write_bytes,
+                      struct extent_state **cached);
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8f1d44ba332f..228cf36ece83 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2824,6 +2824,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
        spin_lock(&delayed_refs->lock);
        if (delayed_refs->num_entries == 0) {
+                spin_unlock(&delayed_refs->lock);
                printk(KERN_INFO "delayed_refs has NO entry\n");
                return ret;
        }
@@ -3057,7 +3058,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
                btrfs_destroy_pinned_extent(root,
                                            root->fs_info->pinned_extents);
-                t->use_count = 0;
+                atomic_set(&t->use_count, 0);
                list_del_init(&t->list);
                memset(t, 0, sizeof(*t));
                kmem_cache_free(btrfs_transaction_cachep, t);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f619c3cb13b7..cd52f7f556ef 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,25 @@
 #include "locking.h"
 #include "free-space-cache.h"
+/* control flags for do_chunk_alloc's force field
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
+ * if we really need one.
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one
+ * if we have very few chunks already allocated.  This is
+ * used as part of the clustering code to help make sure
+ * we have a good pool of storage to cluster in, without
+ * filling the FS with empty chunks
+ *
+ */
+enum {
+        CHUNK_ALLOC_NO_FORCE = 0,
+        CHUNK_ALLOC_FORCE = 1,
+        CHUNK_ALLOC_LIMITED = 2,
+};
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
@@ -3019,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
-        found->force_alloc = 0;
+        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
+        found->chunk_alloc = 0;
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
        atomic_set(&found->caching_threads, 0);
@@ -3150,7 +3170,7 @@ again:
                if (!data_sinfo->full && alloc_chunk) {
                        u64 alloc_target;
-                        data_sinfo->force_alloc = 1;
+                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
                        spin_unlock(&data_sinfo->lock);
 alloc:
                        alloc_target = btrfs_get_alloc_profile(root, 1);
@@ -3160,7 +3180,8 @@ alloc:
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                             bytes + 2 * 1024 * 1024,
-                                             alloc_target, 0);
+                                             alloc_target,
+                                             CHUNK_ALLOC_NO_FORCE);
                        btrfs_end_transaction(trans, root);
                        if (ret < 0) {
                                if (ret != -ENOSPC)
@@ -3239,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
-                        found->force_alloc = 1;
+                        found->force_alloc = CHUNK_ALLOC_FORCE;
        }
        rcu_read_unlock();
 }
 static int should_alloc_chunk(struct btrfs_root *root,
-                              struct btrfs_space_info *sinfo, u64 alloc_bytes)
+                              struct btrfs_space_info *sinfo, u64 alloc_bytes,
+                              int force)
 {
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
        u64 thresh;
-        if (sinfo->bytes_used + sinfo->bytes_reserved +
+        if (force == CHUNK_ALLOC_FORCE)
-            alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+                return 1;
+        /*
+         * in limited mode, we want to have some free space up to
+         * about 1% of the FS size.
+         */
+        if (force == CHUNK_ALLOC_LIMITED) {
+                thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+                thresh = max_t(u64, 64 * 1024 * 1024,
+                               div_factor_fine(thresh, 1));
+                if (num_bytes - num_allocated < thresh)
+                        return 1;
+        }
+        /*
+         * we have two similar checks here, one based on percentage
+         * and one based on a hard number of 256MB.  The idea
+         * is that if we have a good amount of free
+         * room, don't allocate a chunk.  A good amount is
+         * less than 80% utilized of the chunks we have allocated,
+         * or more than 256MB free
+         */
+        if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
                return 0;
-        if (sinfo->bytes_used + sinfo->bytes_reserved +
+        if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
-            alloc_bytes < div_factor(num_bytes, 8))
                return 0;
        thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+        /* 256MB or 5% of the FS */
        thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
        if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
                return 0;
        return 1;
 }
@@ -3273,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 {
        struct btrfs_space_info *space_info;
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
+        int wait_for_alloc = 0;
        int ret = 0;
-        mutex_lock(&fs_info->chunk_mutex);
        flags = btrfs_reduce_alloc_profile(extent_root, flags);
        space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3287,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        }
        BUG_ON(!space_info);
+again:
        spin_lock(&space_info->lock);
        if (space_info->force_alloc)
-                force = 1;
+                force = space_info->force_alloc;
        if (space_info->full) {
                spin_unlock(&space_info->lock);
-                goto out;
+                return 0;
        }
-        if (!force && !should_alloc_chunk(extent_root, space_info,
+        if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
-                                          alloc_bytes)) {
                spin_unlock(&space_info->lock);
-                goto out;
+                return 0;
+        } else if (space_info->chunk_alloc) {
+                wait_for_alloc = 1;
+        } else {
+                space_info->chunk_alloc = 1;
        }
        spin_unlock(&space_info->lock);
+        mutex_lock(&fs_info->chunk_mutex);
+        /*
+         * The chunk_mutex is held throughout the entirety of a chunk
+         * allocation, so once we've acquired the chunk_mutex we know that the
+         * other guy is done and we need to recheck and see if we should
+         * allocate.
+         */
+        if (wait_for_alloc) {
+                mutex_unlock(&fs_info->chunk_mutex);
+                wait_for_alloc = 0;
+                goto again;
+        }
        /*
         * If we have mixed data/metadata chunks we want to make sure we keep
         * allocating mixed chunks instead of individual chunks.
@@ -3327,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                space_info->full = 1;
        else
                ret = 1;
-        space_info->force_alloc = 0;
+        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
-out:
        mutex_unlock(&extent_root->fs_info->chunk_mutex);
        return ret;
 }
@@ -5303,11 +5368,13 @@ loop:
                if (allowed_chunk_alloc) {
                        ret = do_chunk_alloc(trans, root, num_bytes +
-                                             2 * 1024 * 1024, data, 1);
+                                             2 * 1024 * 1024, data,
+                                             CHUNK_ALLOC_LIMITED);
                        allowed_chunk_alloc = 0;
                        done_chunk_alloc = 1;
-                } else if (!done_chunk_alloc) {
+                } else if (!done_chunk_alloc &&
-                        space_info->force_alloc = 1;
+                           space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
+                        space_info->force_alloc = CHUNK_ALLOC_LIMITED;
                }
                if (loop < LOOP_NO_EMPTY_SIZE) {
@@ -5393,7 +5460,8 @@ again:
         */
        if (empty_size || root->ref_cows)
                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                     num_bytes + 2 * 1024 * 1024, data, 0);
+                                     num_bytes + 2 * 1024 * 1024, data,
+                                     CHUNK_ALLOC_NO_FORCE);
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5405,7 +5473,7 @@ again:
                num_bytes = num_bytes & ~(root->sectorsize - 1);
                num_bytes = max(num_bytes, min_alloc_size);
                do_chunk_alloc(trans, root->fs_info->extent_root,
-                               num_bytes, data, 1);
+                               num_bytes, data, CHUNK_ALLOC_FORCE);
                goto again;
        }
        if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
@@ -7991,6 +8059,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
                                u64 group_start = group->key.objectid;
                                new_extents = kmalloc(sizeof(*new_extents),
                                                      GFP_NOFS);
+                                if (!new_extents) {
+                                        ret = -ENOMEM;
+                                        goto out;
+                                }
                                nr_extents = 1;
                                ret = get_new_locations(reloc_inode,
                                                        extent_key,
@@ -8109,13 +8181,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        alloc_flags = update_block_group_flags(root, cache->flags);
        if (alloc_flags != cache->flags)
-                do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+                do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+                               CHUNK_ALLOC_FORCE);
        ret = set_block_group_ro(cache);
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-        ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+        ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
        ret = set_block_group_ro(cache);
@@ -8128,7 +8202,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 type)
 {
        u64 alloc_flags = get_alloc_profile(root, type);
-        return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+        return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+                              CHUNK_ALLOC_FORCE);
 }
 /*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 20ddb28602a8..ba41da59e31b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state,
        }
 }
+static void uncache_state(struct extent_state **cached_ptr)
+{
+        if (cached_ptr && (*cached_ptr)) {
+                struct extent_state *state = *cached_ptr;
+                *cached_ptr = NULL;
+                free_extent_state(state);
+        }
+}
 /*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 }
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-                        gfp_t mask)
+                        struct extent_state **cached_state, gfp_t mask)
 {
-        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
-                              NULL, mask);
+                              NULL, cached_state, mask);
 }
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
                                mask);
 }
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
-                  gfp_t mask)
 {
        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
                                mask);
@@ -1735,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
        do {
                struct page *page = bvec->bv_page;
+                struct extent_state *cached = NULL;
+                struct extent_state *state;
                tree = &BTRFS_I(page->mapping->host)->io_tree;
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1749,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                if (++bvec <= bvec_end)
                        prefetchw(&bvec->bv_page->flags);
+                spin_lock(&tree->lock);
+                state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
+                if (state && state->start == start) {
+                        /*
+                         * take a reference on the state, unlock will drop
+                         * the ref
+                         */
+                        cache_state(state, &cached);
+                }
+                spin_unlock(&tree->lock);
                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                        ret = tree->ops->readpage_end_io_hook(page, start, end,
-                                                              NULL);
+                                                              state);
                        if (ret)
                                uptodate = 0;
                }
@@ -1764,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
                                if (err)
                                        uptodate = 0;
+                                uncache_state(&cached);
                                continue;
                        }
                }
                if (uptodate) {
-                        set_extent_uptodate(tree, start, end,
+                        set_extent_uptodate(tree, start, end, &cached,
                                            GFP_ATOMIC);
                }
-                unlock_extent(tree, start, end, GFP_ATOMIC);
+                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
                if (whole_page) {
                        if (uptodate) {
@@ -1811,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
        do {
                struct page *page = bvec->bv_page;
+                struct extent_state *cached = NULL;
                tree = &BTRFS_I(page->mapping->host)->io_tree;
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1821,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
                        prefetchw(&bvec->bv_page->flags);
                if (uptodate) {
-                        set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+                        set_extent_uptodate(tree, start, end, &cached,
+                                            GFP_ATOMIC);
                } else {
                        ClearPageUptodate(page);
                        SetPageError(page);
                }
-                unlock_extent(tree, start, end, GFP_ATOMIC);
+                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
        } while (bvec >= bio->bi_io_vec);
@@ -2016,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        while (cur <= end) {
                if (cur >= last_byte) {
                        char *userpage;
+                        struct extent_state *cached = NULL;
                        iosize = PAGE_CACHE_SIZE - page_offset;
                        userpage = kmap_atomic(page, KM_USER0);
                        memset(userpage + page_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
-                                            GFP_NOFS);
+                                            &cached, GFP_NOFS);
-                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                        unlock_extent_cached(tree, cur, cur + iosize - 1,
+                                             &cached, GFP_NOFS);
                        break;
                }
                em = get_extent(inode, page, page_offset, cur,
@@ -2063,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                /* we've found a hole, just zero and go on */
                if (block_start == EXTENT_MAP_HOLE) {
                        char *userpage;
+                        struct extent_state *cached = NULL;
                        userpage = kmap_atomic(page, KM_USER0);
                        memset(userpage + page_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
-                                            GFP_NOFS);
+                                            &cached, GFP_NOFS);
-                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                        unlock_extent_cached(tree, cur, cur + iosize - 1,
+                                             &cached, GFP_NOFS);
                        cur = cur + iosize;
                        page_offset += iosize;
                        continue;
@@ -2650,7 +2681,7 @@ int extent_readpages(struct extent_io_tree *tree,
                prefetchw(&page->flags);
                list_del(&page->lru);
                if (!add_to_page_cache_lru(page, mapping,
-                                        page->index, GFP_KERNEL)) {
+                                        page->index, GFP_NOFS)) {
                        __extent_read_full_page(tree, page, get_extent,
                                                &bio, 0, &bio_flags);
                }
@@ -2789,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
                        iocount++;
                        block_start = block_start + iosize;
                } else {
-                        set_extent_uptodate(tree, block_start, cur_end,
+                        struct extent_state *cached = NULL;
+                        set_extent_uptodate(tree, block_start, cur_end, &cached,
                                            GFP_NOFS);
-                        unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+                        unlock_extent_cached(tree, block_start, cur_end,
+                                             &cached, GFP_NOFS);
                        block_start = cur_end + 1;
                }
                page_offset = block_start & (PAGE_CACHE_SIZE - 1);
@@ -3457,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
        num_pages = num_extent_pages(eb->start, eb->len);
        set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                            GFP_NOFS);
+                            NULL, GFP_NOFS);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3885,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
        kunmap_atomic(dst_kaddr, KM_USER0);
 }
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
+{
+        unsigned long distance = (src > dst) ? src - dst : dst - src;
+        return distance < len;
+}
 static void copy_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
@@ -3892,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
        char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
        char *src_kaddr;
-        if (dst_page != src_page)
+        if (dst_page != src_page) {
                src_kaddr = kmap_atomic(src_page, KM_USER1);
-        else
+        } else {
                src_kaddr = dst_kaddr;
+                BUG_ON(areas_overlap(src_off, dst_off, len));
+        }
        memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
        kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3970,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                       "len %lu len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }
-        if (dst_offset < src_offset) {
+        if (!areas_overlap(src_offset, dst_offset, len)) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f62c5442835d..af2d7179c372 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -208,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   int bits, int exclusive_bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-                        gfp_t mask);
+                        struct extent_state **cached_state, gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e621ea54a3fd..75899a01dded 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -104,7 +104,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 /*
 * unlocks pages after btrfs_file_write is done with them
 */
-static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
        size_t i;
        for (i = 0; i < num_pages; i++) {
@@ -127,16 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
-static noinline int dirty_and_release_pages(struct btrfs_root *root,
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
-                                            struct file *file,
+                      struct page **pages, size_t num_pages,
-                                            struct page **pages,
+                      loff_t pos, size_t write_bytes,
-                                            size_t num_pages,
+                      struct extent_state **cached)
-                                            loff_t pos,
-                                            size_t write_bytes)
 {
        int err = 0;
        int i;
-        struct inode *inode = fdentry(file)->d_inode;
        u64 num_bytes;
        u64 start_pos;
        u64 end_of_last_block;
@@ -149,7 +146,7 @@ static noinline int dirty_and_release_pages(struct btrfs_root *root,
        end_of_last_block = start_pos + num_bytes - 1;
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-                                        NULL);
+                                        cached);
        if (err)
                return err;
@@ -992,9 +989,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
                if (copied > 0) {
-                        ret = dirty_and_release_pages(root, file, pages,
+                        ret = btrfs_dirty_pages(root, inode, pages,
-                                                      dirty_pages, pos,
+                                                dirty_pages, pos, copied,
-                                                      copied);
+                                                NULL);
                        if (ret) {
                                btrfs_delalloc_release_space(inode,
                                        dirty_pages << PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f561c953205b..63731a1fb0a1 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -508,6 +508,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        struct inode *inode;
        struct rb_node *node;
        struct list_head *pos, *n;
+        struct page **pages;
        struct page *page;
        struct extent_state *cached_state = NULL;
        struct btrfs_free_cluster *cluster = NULL;
@@ -517,13 +518,13 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        u64 start, end, len;
        u64 bytes = 0;
        u32 *crc, *checksums;
-        pgoff_t index = 0, last_index = 0;
        unsigned long first_page_offset;
-        int num_checksums;
+        int index = 0, num_pages = 0;
        int entries = 0;
        int bitmaps = 0;
        int ret = 0;
        bool next_page = false;
+        bool out_of_space = false;
        root = root->fs_info->tree_root;
@@ -551,24 +552,31 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                return 0;
        }
-        last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+        num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+                PAGE_CACHE_SHIFT;
        filemap_write_and_wait(inode->i_mapping);
        btrfs_wait_ordered_range(inode, inode->i_size &
                                 ~(root->sectorsize - 1), (u64)-1);
        /* We need a checksum per page. */
-        num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+        crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
-        crc = checksums  = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
        if (!crc) {
                iput(inode);
                return 0;
        }
+        pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
+        if (!pages) {
+                kfree(crc);
+                iput(inode);
+                return 0;
+        }
        /* Since the first page has all of our checksums and our generation we
         * need to calculate the offset into the page that we can start writing
         * our entries.
         */
-        first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+        first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
        /* Get the cluster for this block_group if it exists */
        if (!list_empty(&block_group->cluster_list))
@@ -590,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
         * after find_get_page at this point.  Just putting this here so people
         * know and don't freak out.
         */
-        while (index <= last_index) {
+        while (index < num_pages) {
                page = grab_cache_page(inode->i_mapping, index);
                if (!page) {
-                        pgoff_t i = 0;
+                        int i;
-                        while (i < index) {
+                        for (i = 0; i < num_pages; i++) {
-                                page = find_get_page(inode->i_mapping, i);
+                                unlock_page(pages[i]);
-                                unlock_page(page);
+                                page_cache_release(pages[i]);
-                                page_cache_release(page);
-                                page_cache_release(page);
-                                i++;
                        }
                        goto out_free;
                }
+                pages[index] = page;
                index++;
        }
@@ -631,7 +637,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                        offset = start_offset;
                }
-                page = find_get_page(inode->i_mapping, index);
+                if (index >= num_pages) {
+                        out_of_space = true;
+                        break;
+                }
+                page = pages[index];
                addr = kmap(page);
                entry = addr + start_offset;
@@ -708,23 +719,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                bytes += PAGE_CACHE_SIZE;
-                ClearPageChecked(page);
-                set_page_extent_mapped(page);
-                SetPageUptodate(page);
-                set_page_dirty(page);
-                /*
-                 * We need to release our reference we got for grab_cache_page,
-                 * except for the first page which will hold our checksums, we
-                 * do that below.
-                 */
-                if (index != 0) {
-                        unlock_page(page);
-                        page_cache_release(page);
-                }
-                page_cache_release(page);
                index++;
        } while (node || next_page);
@@ -734,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                struct btrfs_free_space *entry =
                        list_entry(pos, struct btrfs_free_space, list);
-                page = find_get_page(inode->i_mapping, index);
+                if (index >= num_pages) {
+                        out_of_space = true;
+                        break;
+                }
+                page = pages[index];
                addr = kmap(page);
                memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
@@ -745,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                crc++;
                bytes += PAGE_CACHE_SIZE;
-                ClearPageChecked(page);
-                set_page_extent_mapped(page);
-                SetPageUptodate(page);
-                set_page_dirty(page);
-                unlock_page(page);
-                page_cache_release(page);
-                page_cache_release(page);
                list_del_init(&entry->list);
                index++;
        }
+        if (out_of_space) {
+                btrfs_drop_pages(pages, num_pages);
+                unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+                                     i_size_read(inode) - 1, &cached_state,
+                                     GFP_NOFS);
+                ret = 0;
+                goto out_free;
+        }
        /* Zero out the rest of the pages just to make sure */
-        while (index <= last_index) {
+        while (index < num_pages) {
                void *addr;
-                page = find_get_page(inode->i_mapping, index);
+                page = pages[index];
                addr = kmap(page);
                memset(addr, 0, PAGE_CACHE_SIZE);
                kunmap(page);
-                ClearPageChecked(page);
-                set_page_extent_mapped(page);
-                SetPageUptodate(page);
-                set_page_dirty(page);
-                unlock_page(page);
-                page_cache_release(page);
-                page_cache_release(page);
                bytes += PAGE_CACHE_SIZE;
                index++;
        }
-        btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
        /* Write the checksums and trans id to the first page */
        {
                void *addr;
                u64 *gen;
-                page = find_get_page(inode->i_mapping, 0);
+                page = pages[0];
                addr = kmap(page);
-                memcpy(addr, checksums, sizeof(u32) * num_checksums);
+                memcpy(addr, checksums, sizeof(u32) * num_pages);
-                gen = addr + (sizeof(u32) * num_checksums);
+                gen = addr + (sizeof(u32) * num_pages);
                *gen = trans->transid;
                kunmap(page);
-                ClearPageChecked(page);
-                set_page_extent_mapped(page);
-                SetPageUptodate(page);
-                set_page_dirty(page);
-                unlock_page(page);
-                page_cache_release(page);
-                page_cache_release(page);
        }
-        BTRFS_I(inode)->generation = trans->transid;
+        ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
+                                            bytes, &cached_state);
+        btrfs_drop_pages(pages, num_pages);
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
                             i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+        if (ret) {
+                ret = 0;
+                goto out_free;
+        }
+        BTRFS_I(inode)->generation = trans->transid;
        filemap_write_and_wait(inode->i_mapping);
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -853,6 +845,7 @@ out_free:
                BTRFS_I(inode)->generation = 0;
        }
        kfree(checksums);
+        kfree(pages);
        btrfs_update_inode(trans, root, inode);
        iput(inode);
        return ret;
@@ -1775,10 +1768,13 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
        while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
                info = rb_entry(node, struct btrfs_free_space, offset_index);
-                unlink_free_space(block_group, info);
+                if (!info->bitmap) {
-                if (info->bitmap)
+                        unlink_free_space(block_group, info);
-                        kfree(info->bitmap);
+                        kmem_cache_free(btrfs_free_space_cachep, info);
-                kmem_cache_free(btrfs_free_space_cachep, info);
+                } else {
+                        free_bitmap(block_group, info);
+                }
                if (need_resched()) {
                        spin_unlock(&block_group->tree_lock);
                        cond_resched();
@@ -2308,7 +2304,7 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
                        start = entry->offset;
                        bytes = min(entry->bytes, end - start);
                        unlink_free_space(block_group, entry);
-                        kfree(entry);
+                        kmem_cache_free(btrfs_free_space_cachep, entry);
                }
                spin_unlock(&block_group->tree_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5cc64ab9c485..7cd8ab0ef04d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -954,6 +954,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+                BUG_ON(!async_cow);
                async_cow->inode = inode;
                async_cow->root = root;
                async_cow->locked_page = locked_page;
@@ -1770,9 +1771,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                          &ordered_extent->list);
-        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+        ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-        ret = btrfs_update_inode(trans, root, inode);
+        if (!ret) {
-        BUG_ON(ret);
+                ret = btrfs_update_inode(trans, root, inode);
+                BUG_ON(ret);
+        }
+        ret = 0;
 out:
        if (nolock) {
                if (trans)
@@ -2590,6 +2594,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct btrfs_inode_item *item,
                            struct inode *inode)
 {
+        if (!leaf->map_token)
+                map_private_extent_buffer(leaf, (unsigned long)item,
+                                          sizeof(struct btrfs_inode_item),
+                                          &leaf->map_token, &leaf->kaddr,
+                                          &leaf->map_start, &leaf->map_len,
+                                          KM_USER1);
        btrfs_set_inode_uid(leaf, item, inode->i_uid);
        btrfs_set_inode_gid(leaf, item, inode->i_gid);
        btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2618,6 +2629,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
        btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
        btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+        if (leaf->map_token) {
+                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                leaf->map_token = NULL;
+        }
 }
 /*
@@ -4207,10 +4223,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
        struct btrfs_key found_key;
        struct btrfs_path *path;
        int ret;
-        u32 nritems;
        struct extent_buffer *leaf;
        int slot;
-        int advance;
        unsigned char d_type;
        int over = 0;
        u32 di_cur;
@@ -4253,27 +4267,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
-        advance = 0;
        while (1) {
                leaf = path->nodes[0];
-                nritems = btrfs_header_nritems(leaf);
                slot = path->slots[0];
-                if (advance || slot >= nritems) {
+                if (slot >= btrfs_header_nritems(leaf)) {
-                        if (slot >= nritems - 1) {
+                        ret = btrfs_next_leaf(root, path);
-                                ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
-                                if (ret)
+                                goto err;
-                                        break;
+                        else if (ret > 0)
-                                leaf = path->nodes[0];
+                                break;
-                                nritems = btrfs_header_nritems(leaf);
+                        continue;
-                                slot = path->slots[0];
-                        } else {
-                                slot++;
-                                path->slots[0]++;
-                        }
                }
-                advance = 1;
                item = btrfs_item_nr(leaf, slot);
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -4282,7 +4288,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
                if (btrfs_key_type(&found_key) != key_type)
                        break;
                if (found_key.offset < filp->f_pos)
-                        continue;
+                        goto next;
                filp->f_pos = found_key.offset;
@@ -4335,6 +4341,8 @@ skip:
                        di_cur += di_len;
                        di = (struct btrfs_dir_item *)((char *)di + di_len);
                }
+next:
+                path->slots[0]++;
        }
        /* Reached end of directory/root. Bump pos past the last item. */
@@ -4527,14 +4535,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        BUG_ON(!path);
        inode = new_inode(root->fs_info->sb);
-        if (!inode)
+        if (!inode) {
+                btrfs_free_path(path);
                return ERR_PTR(-ENOMEM);
+        }
        if (dir) {
                trace_btrfs_inode_request(dir);
                ret = btrfs_set_inode_index(dir, index);
                if (ret) {
+                        btrfs_free_path(path);
                        iput(inode);
                        return ERR_PTR(ret);
                }
@@ -4721,9 +4732,10 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, dir->i_ino, objectid,
                                BTRFS_I(dir)->block_group, mode, &index);
-        err = PTR_ERR(inode);
+        if (IS_ERR(inode)) {
-        if (IS_ERR(inode))
+                err = PTR_ERR(inode);
                goto out_unlock;
+        }
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err) {
@@ -4782,9 +4794,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, dir->i_ino, objectid,
                                BTRFS_I(dir)->block_group, mode, &index);
-        err = PTR_ERR(inode);
+        if (IS_ERR(inode)) {
-        if (IS_ERR(inode))
+                err = PTR_ERR(inode);
                goto out_unlock;
+        }
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err) {
@@ -4834,9 +4847,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        if (inode->i_nlink == ~0U)
                return -EMLINK;
-        btrfs_inc_nlink(inode);
-        inode->i_ctime = CURRENT_TIME;
        err = btrfs_set_inode_index(dir, &index);
        if (err)
                goto fail;
@@ -4852,6 +4862,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                goto fail;
        }
+        btrfs_inc_nlink(inode);
+        inode->i_ctime = CURRENT_TIME;
        btrfs_set_trans_block_group(trans, dir);
        ihold(inode);
@@ -4989,6 +5002,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
        inline_size = btrfs_file_extent_inline_item_len(leaf,
                                        btrfs_item_nr(leaf, path->slots[0]));
        tmp = kmalloc(inline_size, GFP_NOFS);
+        if (!tmp)
+                return -ENOMEM;
        ptr = btrfs_file_extent_inline_start(item);
        read_extent_buffer(leaf, tmp, ptr, inline_size);
@@ -5221,7 +5236,7 @@ again:
                        btrfs_mark_buffer_dirty(leaf);
                }
                set_extent_uptodate(io_tree, em->start,
-                                    extent_map_end(em) - 1, GFP_NOFS);
+                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
                goto insert;
        } else {
                printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5428,17 +5443,30 @@ out:
 }
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+                                                  struct extent_map *em,
                                                  u64 start, u64 len)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
-        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct btrfs_key ins;
        u64 alloc_hint;
        int ret;
+        bool insert = false;
-        btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+        /*
+         * If the extent map we looked up is a hole covering exactly the range
+         * we want, there is no reason to allocate a new one; we reuse it.
+         * Otherwise we need to free this one and drop the cache for our
+         * range.
+         */
+        if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
+            em->len != len) {
+                free_extent_map(em);
+                em = NULL;
+                insert = true;
+                btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+        }
        trans = btrfs_join_transaction(root, 0);
        if (IS_ERR(trans))
@@ -5454,10 +5482,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                goto out;
        }
-        em = alloc_extent_map(GFP_NOFS);
        if (!em) {
-                em = ERR_PTR(-ENOMEM);
+                em = alloc_extent_map(GFP_NOFS);
-                goto out;
+                if (!em) {
+                        em = ERR_PTR(-ENOMEM);
+                        goto out;
+                }
        }
        em->start = start;
@@ -5467,9 +5497,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        em->block_start = ins.objectid;
        em->block_len = ins.offset;
        em->bdev = root->fs_info->fs_devices->latest_bdev;
+        /*
+         * We need to do this because if we're using the original em we searched
+         * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
+         */
+        em->flags = 0;
        set_bit(EXTENT_FLAG_PINNED, &em->flags);
-        while (1) {
+        while (insert) {
                write_lock(&em_tree->lock);
                ret = add_extent_mapping(em_tree, em);
                write_unlock(&em_tree->lock);
@@ -5687,8 +5723,7 @@ must_cow:
         * it above
         */
        len = bh_result->b_size;
-        free_extent_map(em);
+        em = btrfs_new_extent_direct(inode, em, start, len);
-        em = btrfs_new_extent_direct(inode, start, len);
        if (IS_ERR(em))
                return PTR_ERR(em);
        len = min(len, em->len - (start - em->start));
@@ -5851,8 +5886,10 @@ again:
        }
        add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
-        btrfs_ordered_update_i_size(inode, 0, ordered);
+        ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-        btrfs_update_inode(trans, root, inode);
+        if (!ret)
+                btrfs_update_inode(trans, root, inode);
+        ret = 0;
 out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
                             ordered->file_offset + ordered->len - 1,
@@ -5938,7 +5975,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                                         int rw, u64 file_offset, int skip_sum,
-                                         u32 *csums)
+                                         u32 *csums, int async_submit)
 {
        int write = rw & REQ_WRITE;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5949,13 +5986,24 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
        if (ret)
                goto err;
-        if (write && !skip_sum) {
+        if (skip_sum)
+                goto map;
+        if (write && async_submit) {
                ret = btrfs_wq_submit_bio(root->fs_info,
                                   inode, rw, bio, 0, 0,
                                   file_offset,
                                   __btrfs_submit_bio_start_direct_io,
                                   __btrfs_submit_bio_done);
                goto err;
+        } else if (write) {
+                /*
+                 * If we aren't doing async submit, calculate the csum of the
+                 * bio now.
+                 */
+                ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
+                if (ret)
+                        goto err;
        } else if (!skip_sum) {
                ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
                                          file_offset, csums);
@@ -5963,7 +6011,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                        goto err;
        }
-        ret = btrfs_map_bio(root, rw, bio, 0, 1);
+map:
+        ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
 err:
        bio_put(bio);
        return ret;
@@ -5985,23 +6034,30 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        int nr_pages = 0;
        u32 *csums = dip->csums;
        int ret = 0;
+        int async_submit = 0;
        int write = rw & REQ_WRITE;
-        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
-        if (!bio)
-                return -ENOMEM;
-        bio->bi_private = dip;
-        bio->bi_end_io = btrfs_end_dio_bio;
-        atomic_inc(&dip->pending_bios);
        map_length = orig_bio->bi_size;
        ret = btrfs_map_block(map_tree, READ, start_sector << 9,
                              &map_length, NULL, 0);
        if (ret) {
-                bio_put(bio);
+                bio_put(orig_bio);
                return -EIO;
        }
+        if (map_length >= orig_bio->bi_size) {
+                bio = orig_bio;
+                goto submit;
+        }
+        async_submit = 1;
+        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+        if (!bio)
+                return -ENOMEM;
+        bio->bi_private = dip;
+        bio->bi_end_io = btrfs_end_dio_bio;
+        atomic_inc(&dip->pending_bios);
        while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
                if (unlikely(map_length < submit_len + bvec->bv_len ||
                    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -6015,7 +6071,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                        atomic_inc(&dip->pending_bios);
                        ret = __btrfs_submit_dio_bio(bio, inode, rw,
                                                     file_offset, skip_sum,
-                                                     csums);
+                                                     csums, async_submit);
                        if (ret) {
                                bio_put(bio);
                                atomic_dec(&dip->pending_bios);
@@ -6052,8 +6108,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                }
        }
+submit:
        ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
-                                     csums);
+                                     csums, async_submit);
        if (!ret)
                return 0;
@@ -6148,6 +6205,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
                        unsigned long nr_segs)
 {
        int seg;
+        int i;
        size_t size;
        unsigned long addr;
        unsigned blocksize_mask = root->sectorsize - 1;
@@ -6162,8 +6220,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
                addr = (unsigned long)iov[seg].iov_base;
                size = iov[seg].iov_len;
                end += size;
-                if ((addr & blocksize_mask) || (size & blocksize_mask)) 
+                if ((addr & blocksize_mask) || (size & blocksize_mask))
                        goto out;
+                /* If this is a write we don't need to check anymore */
+                if (rw & WRITE)
+                        continue;
+                /*
+                 * Check to make sure we don't have duplicate iov_base's in this
+                 * iovec; if we do, return EINVAL, because otherwise we'll get
+                 * csum errors when reading back.
+                 */
+                for (i = seg + 1; i < nr_segs; i++) {
+                        if (iov[seg].iov_base == iov[i].iov_base)
+                                goto out;
+                }
        }
        retval = 0;
 out:
@@ -7206,9 +7278,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                                dentry->d_name.len, dir->i_ino, objectid,
                                BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
                                &index);
-        err = PTR_ERR(inode);
+        if (IS_ERR(inode)) {
-        if (IS_ERR(inode))
+                err = PTR_ERR(inode);
                goto out_unlock;
+        }
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cfc264fefdb0..ffb48d6c5433 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2287,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
        struct btrfs_ioctl_space_info space;
        struct btrfs_ioctl_space_info *dest;
        struct btrfs_ioctl_space_info *dest_orig;
-        struct btrfs_ioctl_space_info *user_dest;
+        struct btrfs_ioctl_space_info __user *user_dest;
        struct btrfs_space_info *info;
        u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
                       BTRFS_BLOCK_GROUP_SYSTEM,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 58e7de9cc90c..0ac712efcdf2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -159,7 +159,7 @@ enum {
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-        Opt_enospc_debug, Opt_err,
+        Opt_enospc_debug, Opt_subvolrootid, Opt_err,
 };
 static match_table_t tokens = {
@@ -189,6 +189,7 @@ static match_table_t tokens = {
        {Opt_clear_cache, "clear_cache"},
        {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
        {Opt_enospc_debug, "enospc_debug"},
+        {Opt_subvolrootid, "subvolrootid=%d"},
        {Opt_err, NULL},
 };
@@ -232,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        break;
                case Opt_subvol:
                case Opt_subvolid:
+                case Opt_subvolrootid:
                case Opt_device:
                        /*
                         * These are parsed by btrfs_parse_early_options
@@ -388,7 +390,7 @@ out:
 */
 static int btrfs_parse_early_options(const char *options, fmode_t flags,
                void *holder, char **subvol_name, u64 *subvol_objectid,
-                struct btrfs_fs_devices **fs_devices)
+                u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
 {
        substring_t args[MAX_OPT_ARGS];
        char *opts, *orig, *p;
@@ -429,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
                                        *subvol_objectid = intarg;
                        }
                        break;
+                case Opt_subvolrootid:
+                        intarg = 0;
+                        error = match_int(&args[0], &intarg);
+                        if (!error) {
+                                /* we want the original fs_tree */
+                                if (!intarg)
+                                        *subvol_rootid =
+                                                BTRFS_FS_TREE_OBJECTID;
+                                else
+                                        *subvol_rootid = intarg;
+                        }
+                        break;
                case Opt_device:
                        error = btrfs_scan_one_device(match_strdup(&args[0]),
                                        flags, holder, fs_devices);
@@ -736,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        fmode_t mode = FMODE_READ;
        char *subvol_name = NULL;
        u64 subvol_objectid = 0;
+        u64 subvol_rootid = 0;
        int error = 0;
        if (!(flags & MS_RDONLY))
@@ -743,7 +758,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        error = btrfs_parse_early_options(data, mode, fs_type,
                                          &subvol_name, &subvol_objectid,
-                                          &fs_devices);
+                                          &subvol_rootid, &fs_devices);
        if (error)
                return ERR_PTR(error);
@@ -807,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
                s->s_flags |= MS_ACTIVE;
        }
-        root = get_default_root(s, subvol_objectid);
-        if (IS_ERR(root)) {
-                error = PTR_ERR(root);
-                deactivate_locked_super(s);
-                goto error_free_subvol_name;
-        }
        /* if they gave us a subvolume name bind mount into that */
        if (strcmp(subvol_name, ".")) {
                struct dentry *new_root;
+                root = get_default_root(s, subvol_rootid);
+                if (IS_ERR(root)) {
+                        error = PTR_ERR(root);
+                        deactivate_locked_super(s);
+                        goto error_free_subvol_name;
+                }
                mutex_lock(&root->d_inode->i_mutex);
                new_root = lookup_one_len(subvol_name, root,
                                      strlen(subvol_name));
@@ -836,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
                }
                dput(root);
                root = new_root;
+        } else {
+                root = get_default_root(s, subvol_objectid);
+                if (IS_ERR(root)) {
+                        error = PTR_ERR(root);
+                        deactivate_locked_super(s);
+                        goto error_free_subvol_name;
+                }
        }
        kfree(subvol_name);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b158da7e0bb..c571734d5e5a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,10 +32,8 @@
 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
-        WARN_ON(transaction->use_count == 0);
+        WARN_ON(atomic_read(&transaction->use_count) == 0);
-        transaction->use_count--;
+        if (atomic_dec_and_test(&transaction->use_count)) {
-        if (transaction->use_count == 0) {
-                list_del_init(&transaction->list);
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@ -60,14 +58,14 @@ static noinline int join_transaction(struct btrfs_root *root)
                if (!cur_trans)
                        return -ENOMEM;
                root->fs_info->generation++;
-                cur_trans->num_writers = 1;
+                atomic_set(&cur_trans->num_writers, 1);
                cur_trans->num_joined = 0;
                cur_trans->transid = root->fs_info->generation;
                init_waitqueue_head(&cur_trans->writer_wait);
                init_waitqueue_head(&cur_trans->commit_wait);
                cur_trans->in_commit = 0;
                cur_trans->blocked = 0;
-                cur_trans->use_count = 1;
+                atomic_set(&cur_trans->use_count, 1);
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();
@@ -88,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root)
                root->fs_info->running_transaction = cur_trans;
                spin_unlock(&root->fs_info->new_trans_lock);
        } else {
-                cur_trans->num_writers++;
+                atomic_inc(&cur_trans->num_writers);
                cur_trans->num_joined++;
        }
@@ -145,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root)
        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
-                cur_trans->use_count++;
+                atomic_inc(&cur_trans->use_count);
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
@@ -181,6 +179,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
+        int retries = 0;
        int ret;
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -204,7 +203,7 @@ again:
        }
        cur_trans = root->fs_info->running_transaction;
-        cur_trans->use_count++;
+        atomic_inc(&cur_trans->use_count);
        if (type != TRANS_JOIN_NOLOCK)
                mutex_unlock(&root->fs_info->trans_mutex);
@@ -224,10 +223,18 @@ again:
        if (num_items > 0) {
                ret = btrfs_trans_reserve_metadata(h, root, num_items);
-                if (ret == -EAGAIN) {
+                if (ret == -EAGAIN && !retries) {
+                        retries++;
                        btrfs_commit_transaction(h, root);
                        goto again;
+                } else if (ret == -EAGAIN) {
+                        /*
+                         * We have already retried and got EAGAIN, so really we
+                         * don't have space, so set ret to -ENOSPC.
+                         */
+                        ret = -ENOSPC;
                }
                if (ret < 0) {
                        btrfs_end_transaction(h, root);
                        return ERR_PTR(ret);
@@ -327,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
                        goto out_unlock;  /* nothing committing|committed */
        }
-        cur_trans->use_count++;
+        atomic_inc(&cur_trans->use_count);
        mutex_unlock(&root->fs_info->trans_mutex);
        wait_for_commit(root, cur_trans);
@@ -457,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                        wake_up_process(info->transaction_kthread);
        }
-        if (lock)
-                mutex_lock(&info->trans_mutex);
        WARN_ON(cur_trans != info->running_transaction);
-        WARN_ON(cur_trans->num_writers < 1);
+        WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
-        cur_trans->num_writers--;
+        atomic_dec(&cur_trans->num_writers);
        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
        put_transaction(cur_trans);
-        if (lock)
-                mutex_unlock(&info->trans_mutex);
        if (current->journal_info == trans)
                current->journal_info = NULL;
@@ -1178,7 +1181,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        /* take transaction reference */
        mutex_lock(&root->fs_info->trans_mutex);
        cur_trans = trans->transaction;
-        cur_trans->use_count++;
+        atomic_inc(&cur_trans->use_count);
        mutex_unlock(&root->fs_info->trans_mutex);
        btrfs_end_transaction(trans, root);
@@ -1237,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        mutex_lock(&root->fs_info->trans_mutex);
        if (cur_trans->in_commit) {
-                cur_trans->use_count++;
+                atomic_inc(&cur_trans->use_count);
                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);
@@ -1259,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
-                        prev_trans->use_count++;
+                        atomic_inc(&prev_trans->use_count);
                        mutex_unlock(&root->fs_info->trans_mutex);
                        wait_for_commit(root, prev_trans);
@@ -1300,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                                TASK_UNINTERRUPTIBLE);
                smp_mb();
-                if (cur_trans->num_writers > 1)
+                if (atomic_read(&cur_trans->num_writers) > 1)
                        schedule_timeout(MAX_SCHEDULE_TIMEOUT);
                else if (should_grow)
                        schedule_timeout(1);
                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
-        } while (cur_trans->num_writers > 1 ||
+        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));
        ret = create_pending_snapshots(trans, root->fs_info);
@@ -1394,6 +1397,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        wake_up(&cur_trans->commit_wait);
+        list_del_init(&cur_trans->list);
        put_transaction(cur_trans);
        put_transaction(cur_trans);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 229a594cacd5..e441acc6c584 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,11 @@ struct btrfs_transaction {
         * total writers in this transaction, it must be zero before the
         * transaction can end
         */
-        unsigned long num_writers;
+        atomic_t num_writers;
        unsigned long num_joined;
        int in_commit;
-        int use_count;
+        atomic_t use_count;
        int commit_done;
        int blocked;
        struct list_head list;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c50271ad3157..f997ec0c1ba4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2209,8 +2209,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        log = root->log_root;
        path = btrfs_alloc_path();
-        if (!path)
+        if (!path) {
-                return -ENOMEM;
+                err = -ENOMEM;
+                goto out_unlock;
+        }
        di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
                                   name, name_len, -1);
@@ -2271,6 +2273,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        }
 fail:
        btrfs_free_path(path);
+out_unlock:
        mutex_unlock(&BTRFS_I(dir)->log_mutex);
        if (ret == -ENOSPC) {
                root->fs_info->last_trans_log_full_commit = trans->transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 309a57b9fc85..c7367ae5a3e6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -155,6 +155,15 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
+        struct blk_plug plug;
+        /*
+         * this function runs all the bios we've collected for
+         * a particular device.  We don't want to wander off to
+         * another device without first sending all of these down.
+         * So, setup a plug here and finish it off before we return
+         */
+        blk_start_plug(&plug);
        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
@@ -294,6 +303,7 @@ loop_lock:
        spin_unlock(&device->io_lock);
 done:
+        blk_finish_plug(&plug);
        return 0;
 }
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5303b871b13..cfd660550ded 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_dir_item *di;
-        int ret = 0, slot, advance;
+        int ret = 0, slot;
        size_t total_size = 0, size_left = size;
        unsigned long name_ptr;
        size_t name_len;
-        u32 nritems;
        /*
         * ok we want all objects associated with this id.
@@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
-        advance = 0;
        while (1) {
                leaf = path->nodes[0];
-                nritems = btrfs_header_nritems(leaf);
                slot = path->slots[0];
                /* this is where we start walking through the path */
-                if (advance || slot >= nritems) {
+                if (slot >= btrfs_header_nritems(leaf)) {
                        /*
                         * if we've reached the last slot in this leaf we need
                         * to go to the next leaf and reset everything
                         */
-                        if (slot >= nritems-1) {
+                        ret = btrfs_next_leaf(root, path);
-                                ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
-                                if (ret)
+                                goto err;
-                                        break;
+                        else if (ret > 0)
-                                leaf = path->nodes[0];
+                                break;
-                                nritems = btrfs_header_nritems(leaf);
+                        continue;
-                                slot = path->slots[0];
-                        } else {
-                                /*
-                                 * just walking through the slots on this leaf
-                                 */
-                                slot++;
-                                path->slots[0]++;
-                        }
                }
-                advance = 1;
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -250,7 +239,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
                /* we are just looking for how big our buffer needs to be */
                if (!size)
-                        continue;
+                        goto next;
                if (!buffer || (name_len + 1) > size_left) {
                        ret = -ERANGE;
@@ -263,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
                size_left -= name_len + 1;
                buffer += name_len + 1;
+next:
+                path->slots[0]++;
        }
        ret = total_size;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index db9d55b507d0..4bc862a80efa 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -807,8 +807,7 @@ static int
 cifs_parse_mount_options(char *options, const char *devname,
                         struct smb_vol *vol)
 {
-        char *value;
+        char *value, *data, *end;
-        char *data;
        unsigned int  temp_len, i, j;
        char separator[2];
        short int override_uid = -1;
@@ -851,6 +850,7 @@ cifs_parse_mount_options(char *options, const char *devname,
        if (!options)
                return 1;
+        end = options + strlen(options);
        if (strncmp(options, "sep=", 4) == 0) {
                if (options[4] != 0) {
                        separator[0] = options[4];
@@ -916,6 +916,7 @@ cifs_parse_mount_options(char *options, const char *devname,
                        the only illegal character in a password is null */
                        if ((value[temp_len] == 0) &&
+                            (value + temp_len < end) &&
                            (value[temp_len+1] == separator[0])) {
                                /* reinsert comma */
                                value[temp_len] = separator[0];
diff --git a/fs/dcache.c b/fs/dcache.c
index 129a35730994..22a0ef41bad1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -99,12 +99,9 @@ static struct kmem_cache *dentry_cache __read_mostly;
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
-struct dcache_hash_bucket {
+static struct hlist_bl_head *dentry_hashtable __read_mostly;
-        struct hlist_bl_head head;
-};
-static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
-static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
+static inline struct hlist_bl_head *d_hash(struct dentry *parent,
                                        unsigned long hash)
 {
        hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
@@ -112,16 +109,6 @@ static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
        return dentry_hashtable + (hash & D_HASHMASK);
 }
-static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
-{
-        bit_spin_lock(0, (unsigned long *)&b->head.first);
-}
-static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
-{
-        __bit_spin_unlock(0, (unsigned long *)&b->head.first);
-}
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
@@ -167,8 +154,8 @@ static void d_free(struct dentry *dentry)
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);
-        /* if dentry was never inserted into hash, immediate free is OK */
+        /* if dentry was never visible to RCU, immediate free is OK */
-        if (hlist_bl_unhashed(&dentry->d_hash))
+        if (!(dentry->d_flags & DCACHE_RCUACCESS))
                __d_free(&dentry->d_u.d_rcu);
        else
                call_rcu(&dentry->d_u.d_rcu, __d_free);
@@ -330,28 +317,19 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 */
 void __d_drop(struct dentry *dentry)
 {
-        if (!(dentry->d_flags & DCACHE_UNHASHED)) {
+        if (!d_unhashed(dentry)) {
-                if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) {
+                struct hlist_bl_head *b;
-                        bit_spin_lock(0,
+                if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
-                                (unsigned long *)&dentry->d_sb->s_anon.first);
+                        b = &dentry->d_sb->s_anon;
-                        dentry->d_flags |= DCACHE_UNHASHED;
+                else
-                        hlist_bl_del_init(&dentry->d_hash);
-                        __bit_spin_unlock(0,
-                                (unsigned long *)&dentry->d_sb->s_anon.first);
-                } else {
-                        struct dcache_hash_bucket *b;
                        b = d_hash(dentry->d_parent, dentry->d_name.hash);
-                        spin_lock_bucket(b);
-                        /*
+                hlist_bl_lock(b);
-                         * We may not actually need to put DCACHE_UNHASHED
+                __hlist_bl_del(&dentry->d_hash);
-                         * manipulations under the hash lock, but follow
+                dentry->d_hash.pprev = NULL;
-                         * the principle of least surprise.
+                hlist_bl_unlock(b);
-                         */
-                        dentry->d_flags |= DCACHE_UNHASHED;
+                dentry_rcuwalk_barrier(dentry);
-                        hlist_bl_del_rcu(&dentry->d_hash);
-                        spin_unlock_bucket(b);
-                        dentry_rcuwalk_barrier(dentry);
-                }
        }
 }
 EXPORT_SYMBOL(__d_drop);
@@ -1304,7 +1282,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        dname[name->len] = 0;
        dentry->d_count = 1;
-        dentry->d_flags = DCACHE_UNHASHED;
+        dentry->d_flags = 0;
        spin_lock_init(&dentry->d_lock);
        seqcount_init(&dentry->d_seq);
        dentry->d_inode = NULL;
@@ -1606,10 +1584,9 @@ struct dentry *d_obtain_alias(struct inode *inode)
        tmp->d_inode = inode;
        tmp->d_flags |= DCACHE_DISCONNECTED;
        list_add(&tmp->d_alias, &inode->i_dentry);
-        bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
+        hlist_bl_lock(&tmp->d_sb->s_anon);
-        tmp->d_flags &= ~DCACHE_UNHASHED;
        hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
-        __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
+        hlist_bl_unlock(&tmp->d_sb->s_anon);
        spin_unlock(&tmp->d_lock);
        spin_unlock(&inode->i_lock);
        security_d_instantiate(tmp, inode);
@@ -1789,7 +1766,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
        unsigned int len = name->len;
        unsigned int hash = name->hash;
        const unsigned char *str = name->name;
-        struct dcache_hash_bucket *b = d_hash(parent, hash);
+        struct hlist_bl_head *b = d_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *dentry;
@@ -1813,7 +1790,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
-        hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
+        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                struct inode *i;
                const char *tname;
                int tlen;
@@ -1908,7 +1885,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
        unsigned int len = name->len;
        unsigned int hash = name->hash;
        const unsigned char *str = name->name;
-        struct dcache_hash_bucket *b = d_hash(parent, hash);
+        struct hlist_bl_head *b = d_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *found = NULL;
        struct dentry *dentry;
@@ -1935,7 +1912,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
         */
        rcu_read_lock();
        
-        hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
+        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                const char *tname;
                int tlen;
@@ -2086,13 +2063,13 @@ again:
 }
 EXPORT_SYMBOL(d_delete);
-static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
+static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
 {
        BUG_ON(!d_unhashed(entry));
-        spin_lock_bucket(b);
+        hlist_bl_lock(b);
-        entry->d_flags &= ~DCACHE_UNHASHED;
+        entry->d_flags |= DCACHE_RCUACCESS;
-        hlist_bl_add_head_rcu(&entry->d_hash, &b->head);
+        hlist_bl_add_head_rcu(&entry->d_hash, b);
-        spin_unlock_bucket(b);
+        hlist_bl_unlock(b);
 }
 static void _d_rehash(struct dentry * entry)
@@ -3025,7 +3002,7 @@ static void __init dcache_init_early(void)
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
-                                        sizeof(struct dcache_hash_bucket),
+                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        HASH_EARLY,
@@ -3034,7 +3011,7 @@ static void __init dcache_init_early(void)
                                        0);
        for (loop = 0; loop < (1 << d_hash_shift); loop++)
-                INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
+                INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 static void __init dcache_init(void)
@@ -3057,7 +3034,7 @@ static void __init dcache_init(void)
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
-                                        sizeof(struct dcache_hash_bucket),
+                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        0,
@@ -3066,7 +3043,7 @@ static void __init dcache_init(void)
                                        0);
        for (loop = 0; loop < (1 << d_hash_shift); loop++)
-                INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
+                INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 /* SLAB cache for __getname() consumers */
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index d2a70a4561f9..b8d5c8091024 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1452,6 +1452,25 @@ static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
        crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 }
+void ecryptfs_i_size_init(const char *page_virt, struct inode *inode)
+{
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+        struct ecryptfs_crypt_stat *crypt_stat;
+        u64 file_size;
+        crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+        mount_crypt_stat =
+                &ecryptfs_superblock_to_private(inode->i_sb)->mount_crypt_stat;
+        if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
+                file_size = i_size_read(ecryptfs_inode_to_lower(inode));
+                if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+                        file_size += crypt_stat->metadata_size;
+        } else
+                file_size = get_unaligned_be64(page_virt);
+        i_size_write(inode, (loff_t)file_size);
+        crypt_stat->flags |= ECRYPTFS_I_SIZE_INITIALIZED;
+}
 /**
 * ecryptfs_read_headers_virt
 * @page_virt: The virtual address into which to read the headers
@@ -1482,6 +1501,8 @@ static int ecryptfs_read_headers_virt(char *page_virt,
                rc = -EINVAL;
                goto out;
        }
+        if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
+                ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
        offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
        rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
                                    &bytes_read);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bd3cafd0949d..e70282775e2c 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat {
 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800
 #define ECRYPTFS_ENCFN_USE_FEK        0x00001000
 #define ECRYPTFS_UNLINK_SIGS          0x00002000
+#define ECRYPTFS_I_SIZE_INITIALIZED   0x00004000
        u32 flags;
        unsigned int file_version;
        size_t iv_bytes;
@@ -295,6 +296,8 @@ struct ecryptfs_crypt_stat {
 struct ecryptfs_inode_info {
        struct inode vfs_inode;
        struct inode *wii_inode;
+        struct mutex lower_file_mutex;
+        atomic_t lower_file_count;
        struct file *lower_file;
        struct ecryptfs_crypt_stat crypt_stat;
 };
@@ -626,6 +629,7 @@ struct ecryptfs_open_req {
 int ecryptfs_interpose(struct dentry *hidden_dentry,
                       struct dentry *this_dentry, struct super_block *sb,
                       u32 flags);
+void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                                        struct dentry *lower_dentry,
                                        struct inode *ecryptfs_dir_inode);
@@ -757,7 +761,8 @@ int ecryptfs_privileged_open(struct file **lower_file,
                             struct dentry *lower_dentry,
                             struct vfsmount *lower_mnt,
                             const struct cred *cred);
-int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
+int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry);
+void ecryptfs_put_lower_file(struct inode *inode);
 int
 ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
                             size_t *packet_size,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index cedc913d11ba..566e5472f78c 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,10 +191,10 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                                      | ECRYPTFS_ENCRYPTED);
        }
        mutex_unlock(&crypt_stat->cs_mutex);
-        rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+        rc = ecryptfs_get_lower_file(ecryptfs_dentry);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to initialize "
-                        "the persistent file for the dentry with name "
+                        "the lower file for the dentry with name "
                        "[%s]; rc = [%d]\n", __func__,
                        ecryptfs_dentry->d_name.name, rc);
                goto out_free;
@@ -202,9 +202,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
        if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
            == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
                rc = -EPERM;
-                printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
+                printk(KERN_WARNING "%s: Lower file is RO; eCryptfs "
                       "file must hence be opened RO\n", __func__);
-                goto out_free;
+                goto out_put;
        }
        ecryptfs_set_file_lower(
                file, ecryptfs_inode_to_private(inode)->lower_file);
@@ -232,10 +232,11 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                                       "Plaintext passthrough mode is not "
                                       "enabled; returning -EIO\n");
                                mutex_unlock(&crypt_stat->cs_mutex);
-                                goto out_free;
+                                goto out_put;
                        }
                        rc = 0;
-                        crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
+                        crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
+                                               | ECRYPTFS_ENCRYPTED);
                        mutex_unlock(&crypt_stat->cs_mutex);
                        goto out;
                }
@@ -245,6 +246,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                        "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
                        (unsigned long long)i_size_read(inode));
        goto out;
+out_put:
+        ecryptfs_put_lower_file(inode);
 out_free:
        kmem_cache_free(ecryptfs_file_info_cache,
                        ecryptfs_file_to_private(file));
@@ -254,17 +257,13 @@ out:
 static int ecryptfs_flush(struct file *file, fl_owner_t td)
 {
-        int rc = 0;
+        return file->f_mode & FMODE_WRITE
-        struct file *lower_file = NULL;
+               ? filemap_write_and_wait(file->f_mapping) : 0;
-        lower_file = ecryptfs_file_to_lower(file);
-        if (lower_file->f_op && lower_file->f_op->flush)
-                rc = lower_file->f_op->flush(lower_file, td);
-        return rc;
 }
 static int ecryptfs_release(struct inode *inode, struct file *file)
 {
+        ecryptfs_put_lower_file(inode);
        kmem_cache_free(ecryptfs_file_info_cache,
                        ecryptfs_file_to_private(file));
        return 0;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index f99051b7adab..4d4cc6a90cd5 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -168,19 +168,18 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
                                "context; rc = [%d]\n", rc);
                goto out;
        }
-        rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+        rc = ecryptfs_get_lower_file(ecryptfs_dentry);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to initialize "
-                        "the persistent file for the dentry with name "
+                        "the lower file for the dentry with name "
                        "[%s]; rc = [%d]\n", __func__,
                        ecryptfs_dentry->d_name.name, rc);
                goto out;
        }
        rc = ecryptfs_write_metadata(ecryptfs_dentry);
-        if (rc) {
+        if (rc)
                printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
-                goto out;
+        ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
-        }
 out:
        return rc;
 }
@@ -226,11 +225,9 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
        struct dentry *lower_dir_dentry;
        struct vfsmount *lower_mnt;
        struct inode *lower_inode;
-        struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
        struct ecryptfs_crypt_stat *crypt_stat;
        char *page_virt = NULL;
-        u64 file_size;
+        int put_lower = 0, rc = 0;
-        int rc = 0;
        lower_dir_dentry = lower_dentry->d_parent;
        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
@@ -277,14 +274,15 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                rc = -ENOMEM;
                goto out;
        }
-        rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+        rc = ecryptfs_get_lower_file(ecryptfs_dentry);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to initialize "
-                        "the persistent file for the dentry with name "
+                        "the lower file for the dentry with name "
                        "[%s]; rc = [%d]\n", __func__,
                        ecryptfs_dentry->d_name.name, rc);
                goto out_free_kmem;
        }
+        put_lower = 1;
        crypt_stat = &ecryptfs_inode_to_private(
                                        ecryptfs_dentry->d_inode)->crypt_stat;
        /* TODO: lock for crypt_stat comparison */
@@ -302,18 +300,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                }
                crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
        }
-        mount_crypt_stat = &ecryptfs_superblock_to_private(
+        ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
-                ecryptfs_dentry->d_sb)->mount_crypt_stat;
-        if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
-                if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-                        file_size = (crypt_stat->metadata_size
-                                     + i_size_read(lower_dentry->d_inode));
-                else
-                        file_size = i_size_read(lower_dentry->d_inode);
-        } else {
-                file_size = get_unaligned_be64(page_virt);
-        }
-        i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
 out_free_kmem:
        kmem_cache_free(ecryptfs_header_cache_2, page_virt);
        goto out;
@@ -322,6 +309,8 @@ out_put:
        mntput(lower_mnt);
        d_drop(ecryptfs_dentry);
 out:
+        if (put_lower)
+                ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
        return rc;
 }
@@ -538,8 +527,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
        dget(lower_dentry);
        rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
        dput(lower_dentry);
-        if (!rc)
-                d_delete(lower_dentry);
        fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
        dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
        unlock_dir(lower_dir_dentry);
@@ -610,8 +597,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
 out_lock:
        unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
-        dput(lower_new_dentry->d_parent);
+        dput(lower_new_dir_dentry);
-        dput(lower_old_dentry->d_parent);
+        dput(lower_old_dir_dentry);
        dput(lower_new_dentry);
        dput(lower_old_dentry);
        return rc;
@@ -759,8 +746,11 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
        if (unlikely((ia->ia_size == i_size))) {
                lower_ia->ia_valid &= ~ATTR_SIZE;
-                goto out;
+                return 0;
        }
+        rc = ecryptfs_get_lower_file(dentry);
+        if (rc)
+                return rc;
        crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
        /* Switch on growing or shrinking file */
        if (ia->ia_size > i_size) {
@@ -838,6 +828,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
                        lower_ia->ia_valid &= ~ATTR_SIZE;
        }
 out:
+        ecryptfs_put_lower_file(inode);
        return rc;
 }
@@ -913,7 +904,13 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
                mount_crypt_stat = &ecryptfs_superblock_to_private(
                        dentry->d_sb)->mount_crypt_stat;
+                rc = ecryptfs_get_lower_file(dentry);
+                if (rc) {
+                        mutex_unlock(&crypt_stat->cs_mutex);
+                        goto out;
+                }
                rc = ecryptfs_read_metadata(dentry);
+                ecryptfs_put_lower_file(inode);
                if (rc) {
                        if (!(mount_crypt_stat->flags
                              & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
@@ -927,10 +924,17 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
                                goto out;
                        }
                        rc = 0;
-                        crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
+                        crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
+                                               | ECRYPTFS_ENCRYPTED);
                }
        }
        mutex_unlock(&crypt_stat->cs_mutex);
+        if (S_ISREG(inode->i_mode)) {
+                rc = filemap_write_and_wait(inode->i_mapping);
+                if (rc)
+                        goto out;
+                fsstack_copy_attr_all(inode, lower_inode);
+        }
        memcpy(&lower_ia, ia, sizeof(lower_ia));
        if (ia->ia_valid & ATTR_FILE)
                lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index 0851ab6980f5..69f994a7d524 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -44,7 +44,7 @@ static struct task_struct *ecryptfs_kthread;
 * @ignored: ignored
 *
 * The eCryptfs kernel thread that has the responsibility of getting
- * the lower persistent file with RW permissions.
+ * the lower file with RW permissions.
 *
 * Returns zero on success; non-zero otherwise
 */
@@ -141,8 +141,8 @@ int ecryptfs_privileged_open(struct file **lower_file,
        int rc = 0;
        /* Corresponding dput() and mntput() are done when the
-         * persistent file is fput() when the eCryptfs inode is
+         * lower file is fput() when all eCryptfs files for the inode are
-         * destroyed. */
+         * released. */
        dget(lower_dentry);
        mntget(lower_mnt);
        flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fdb2eb0ad09e..89b93389af8e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -96,7 +96,7 @@ void __ecryptfs_printk(const char *fmt, ...)
 }
 /**
- * ecryptfs_init_persistent_file
+ * ecryptfs_init_lower_file
 * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with
 *                   the lower dentry and the lower mount set
 *
@@ -104,42 +104,70 @@ void __ecryptfs_printk(const char *fmt, ...)
 * inode. All I/O operations to the lower inode occur through that
 * file. When the first eCryptfs dentry that interposes with the first
 * lower dentry for that inode is created, this function creates the
- * persistent file struct and associates it with the eCryptfs
+ * lower file struct and associates it with the eCryptfs
- * inode. When the eCryptfs inode is destroyed, the file is closed.
+ * inode. When all eCryptfs files associated with the inode are released, the
+ * file is closed.
 *
- * The persistent file will be opened with read/write permissions, if
+ * The lower file will be opened with read/write permissions, if
 * possible. Otherwise, it is opened read-only.
 *
- * This function does nothing if a lower persistent file is already
+ * This function does nothing if a lower file is already
 * associated with the eCryptfs inode.
 *
 * Returns zero on success; non-zero otherwise
 */
-int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
+static int ecryptfs_init_lower_file(struct dentry *dentry,
+                                    struct file **lower_file)
 {
        const struct cred *cred = current_cred();
-        struct ecryptfs_inode_info *inode_info =
+        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-                ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+        struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
-        int rc = 0;
+        int rc;
-        if (!inode_info->lower_file) {
+        rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt,
-                struct dentry *lower_dentry;
+                                      cred);
-                struct vfsmount *lower_mnt =
+        if (rc) {
-                        ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
+                printk(KERN_ERR "Error opening lower file "
+                       "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
+                       "rc = [%d]\n", lower_dentry, lower_mnt, rc);
+                (*lower_file) = NULL;
+        }
+        return rc;
+}
-                lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+int ecryptfs_get_lower_file(struct dentry *dentry)
-                rc = ecryptfs_privileged_open(&inode_info->lower_file,
+{
-                                              lower_dentry, lower_mnt, cred);
+        struct ecryptfs_inode_info *inode_info =
-                if (rc) {
+                ecryptfs_inode_to_private(dentry->d_inode);
-                        printk(KERN_ERR "Error opening lower persistent file "
+        int count, rc = 0;
-                               "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
-                               "rc = [%d]\n", lower_dentry, lower_mnt, rc);
+        mutex_lock(&inode_info->lower_file_mutex);
-                        inode_info->lower_file = NULL;
+        count = atomic_inc_return(&inode_info->lower_file_count);
-                }
+        if (WARN_ON_ONCE(count < 1))
+                rc = -EINVAL;
+        else if (count == 1) {
+                rc = ecryptfs_init_lower_file(dentry,
+                                              &inode_info->lower_file);
+                if (rc)
+                        atomic_set(&inode_info->lower_file_count, 0);
        }
+        mutex_unlock(&inode_info->lower_file_mutex);
        return rc;
 }
+void ecryptfs_put_lower_file(struct inode *inode)
+{
+        struct ecryptfs_inode_info *inode_info;
+        inode_info = ecryptfs_inode_to_private(inode);
+        if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count,
+                                      &inode_info->lower_file_mutex)) {
+                fput(inode_info->lower_file);
+                inode_info->lower_file = NULL;
+                mutex_unlock(&inode_info->lower_file_mutex);
+        }
+}
 static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
                       struct super_block *sb)
 {
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index bacc882e1ae4..245b517bf1b6 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -55,6 +55,8 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
        if (unlikely(!inode_info))
                goto out;
        ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
+        mutex_init(&inode_info->lower_file_mutex);
+        atomic_set(&inode_info->lower_file_count, 0);
        inode_info->lower_file = NULL;
        inode = &inode_info->vfs_inode;
 out:
@@ -77,8 +79,7 @@ static void ecryptfs_i_callback(struct rcu_head *head)
 *
 * This is used during the final destruction of the inode.  All
 * allocation of memory related to the inode, including allocated
- * memory in the crypt_stat struct, will be released here. This
+ * memory in the crypt_stat struct, will be released here.
- * function also fput()'s the persistent file for the lower inode.
 * There should be no chance that this deallocation will be missed.
 */
 static void ecryptfs_destroy_inode(struct inode *inode)
@@ -86,16 +87,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
        struct ecryptfs_inode_info *inode_info;
        inode_info = ecryptfs_inode_to_private(inode);
-        if (inode_info->lower_file) {
+        BUG_ON(inode_info->lower_file);
-                struct dentry *lower_dentry =
-                        inode_info->lower_file->f_dentry;
-                BUG_ON(!lower_dentry);
-                if (lower_dentry->d_inode) {
-                        fput(inode_info->lower_file);
-                        inode_info->lower_file = NULL;
-                }
-        }
        ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
        call_rcu(&inode->i_rcu, ecryptfs_i_callback);
 }
diff --git a/fs/file.c b/fs/file.c
index 0be344755c02..4c6992d8f3ba 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/mmzone.h>
 #include <linux/time.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -39,14 +40,17 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
 */
 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-static inline void *alloc_fdmem(unsigned int size)
+static void *alloc_fdmem(unsigned int size)
 {
-        void *data;
+        /*
+         * Very large allocations can stress page reclaim, so fall back to
-        data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+         * vmalloc() if the allocation size will be considered "large" by the VM.
-        if (data != NULL)
+         */
-                return data;
+        if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+                void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+                if (data != NULL)
+                        return data;
+        }
        return vmalloc(size);
 }
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c71995b111bf..0f5c4f9d5d62 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -884,8 +884,8 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        }
        brelse(dibh);
-        gfs2_trans_end(sdp);
 failed:
+        gfs2_trans_end(sdp);
        if (al) {
                gfs2_inplace_release(ip);
                gfs2_quota_unlock(ip);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5c356d09c321..f789c5732b7c 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1506,7 +1506,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
                inode = gfs2_inode_lookup(dir->i_sb, 
                                be16_to_cpu(dent->de_type),
                                be64_to_cpu(dent->de_inum.no_addr),
-                                be64_to_cpu(dent->de_inum.no_formal_ino));
+                                be64_to_cpu(dent->de_inum.no_formal_ino), 0);
                brelse(bh);
                return inode;
        }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b2682e073eee..e48310885c48 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -617,18 +617,51 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
-static void empty_write_end(struct page *page, unsigned from,
+static int empty_write_end(struct page *page, unsigned from,
-                           unsigned to)
+                           unsigned to, int mode)
 {
-        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        struct inode *inode = page->mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct buffer_head *bh;
+        unsigned offset, blksize = 1 << inode->i_blkbits;
+        pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
        zero_user(page, from, to-from);
        mark_page_accessed(page);
-        if (!gfs2_is_writeback(ip))
+        if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
-                gfs2_page_add_databufs(ip, page, from, to);
+                if (!gfs2_is_writeback(ip))
+                        gfs2_page_add_databufs(ip, page, from, to);
+                block_commit_write(page, from, to);
+                return 0;
+        }
+        offset = 0;
+        bh = page_buffers(page);
+        while (offset < to) {
+                if (offset >= from) {
+                        set_buffer_uptodate(bh);
+                        mark_buffer_dirty(bh);
+                        clear_buffer_new(bh);
+                        write_dirty_buffer(bh, WRITE);
+                }
+                offset += blksize;
+                bh = bh->b_this_page;
+        }
-        block_commit_write(page, from, to);
+        offset = 0;
+        bh = page_buffers(page);
+        while (offset < to) {
+                if (offset >= from) {
+                        wait_on_buffer(bh);
+                        if (!buffer_uptodate(bh))
+                                return -EIO;
+                }
+                offset += blksize;
+                bh = bh->b_this_page;
+        }
+        return 0;
 }
 static int needs_empty_write(sector_t block, struct inode *inode)
@@ -643,7 +676,8 @@ static int needs_empty_write(sector_t block, struct inode *inode)
        return !buffer_mapped(&bh_map);
 }
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
+                              int mode)
 {
        struct inode *inode = page->mapping->host;
        unsigned start, end, next, blksize;
@@ -668,7 +702,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
                                                          gfs2_block_map);
                                if (unlikely(ret))
                                        return ret;
-                                empty_write_end(page, start, end);
+                                ret = empty_write_end(page, start, end, mode);
+                                if (unlikely(ret))
+                                        return ret;
                                end = 0;
                        }
                        start = next;
@@ -682,7 +718,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
                ret = __block_write_begin(page, start, end - start, gfs2_block_map);
                if (unlikely(ret))
                        return ret;
-                empty_write_end(page, start, end);
+                ret = empty_write_end(page, start, end, mode);
+                if (unlikely(ret))
+                        return ret;
        }
        return 0;
@@ -731,7 +769,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
                if (curr == end)
                        to = end_offset;
-                error = write_empty_blocks(page, from, to);
+                error = write_empty_blocks(page, from, to, mode);
                if (!error && offset + to > inode->i_size &&
                    !(mode & FALLOC_FL_KEEP_SIZE)) {
                        i_size_write(inode, offset + to);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f07643e21bfa..7a4fb630a320 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -93,14 +93,12 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
 static inline void spin_lock_bucket(unsigned int hash)
 {
-        struct hlist_bl_head *bl = &gl_hash_table[hash];
+        hlist_bl_lock(&gl_hash_table[hash]);
-        bit_spin_lock(0, (unsigned long *)bl);
 }
 static inline void spin_unlock_bucket(unsigned int hash)
 {
-        struct hlist_bl_head *bl = &gl_hash_table[hash];
+        hlist_bl_unlock(&gl_hash_table[hash]);
-        __bit_spin_unlock(0, (unsigned long *)bl);
 }
 static void gfs2_glock_dealloc(struct rcu_head *rcu)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 3754e3cbf02b..25eeb2bcee47 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -385,6 +385,10 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
 static void iopen_go_callback(struct gfs2_glock *gl)
 {
        struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        if (sdp->sd_vfs->s_flags & MS_RDONLY)
+                return;
        if (gl->gl_demote_state == LM_ST_UNLOCKED &&
            gl->gl_state == LM_ST_SHARED && ip) {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 97d54a28776a..9134dcb89479 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -40,37 +40,61 @@ struct gfs2_inum_range_host {
        u64 ir_length;
 };
+struct gfs2_skip_data {
+        u64 no_addr;
+        int skipped;
+        int non_block;
+};
 static int iget_test(struct inode *inode, void *opaque)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
-        u64 *no_addr = opaque;
+        struct gfs2_skip_data *data = opaque;
-        if (ip->i_no_addr == *no_addr)
+        if (ip->i_no_addr == data->no_addr) {
+                if (data->non_block &&
+                    inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
+                        data->skipped = 1;
+                        return 0;
+                }
                return 1;
+        }
        return 0;
 }
 static int iget_set(struct inode *inode, void *opaque)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
-        u64 *no_addr = opaque;
+        struct gfs2_skip_data *data = opaque;
-        inode->i_ino = (unsigned long)*no_addr;
+        if (data->skipped)
-        ip->i_no_addr = *no_addr;
+                return -ENOENT;
+        inode->i_ino = (unsigned long)(data->no_addr);
+        ip->i_no_addr = data->no_addr;
        return 0;
 }
 struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
 {
        unsigned long hash = (unsigned long)no_addr;
-        return ilookup5(sb, hash, iget_test, &no_addr);
+        struct gfs2_skip_data data;
+        data.no_addr = no_addr;
+        data.skipped = 0;
+        data.non_block = 0;
+        return ilookup5(sb, hash, iget_test, &data);
 }
-static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
+static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr,
+                               int non_block)
 {
+        struct gfs2_skip_data data;
        unsigned long hash = (unsigned long)no_addr;
-        return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
+        data.no_addr = no_addr;
+        data.skipped = 0;
+        data.non_block = non_block;
+        return iget5_locked(sb, hash, iget_test, iget_set, &data);
 }
 /**
@@ -111,19 +135,20 @@ static void gfs2_set_iop(struct inode *inode)
 * @sb: The super block
 * @no_addr: The inode number
 * @type: The type of the inode
+ * @non_block: If set, skip inodes that are being freed instead of blocking
 *
 * Returns: A VFS inode, or an error
 */
 struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
-                                u64 no_addr, u64 no_formal_ino)
+                                u64 no_addr, u64 no_formal_ino, int non_block)
 {
        struct inode *inode;
        struct gfs2_inode *ip;
        struct gfs2_glock *io_gl = NULL;
        int error;
-        inode = gfs2_iget(sb, no_addr);
+        inode = gfs2_iget(sb, no_addr, non_block);
        ip = GFS2_I(inode);
        if (!inode)
@@ -185,11 +210,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 {
        struct super_block *sb = sdp->sd_vfs;
        struct gfs2_holder i_gh;
-        struct inode *inode;
+        struct inode *inode = NULL;
        int error;
+        /* Must not read in block until block type is verified */
        error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
-                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+                                  LM_ST_EXCLUSIVE, GL_SKIP, &i_gh);
        if (error)
                return ERR_PTR(error);
@@ -197,7 +223,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
        if (error)
                goto fail;
-        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
+        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1);
        if (IS_ERR(inode))
                goto fail;
@@ -843,7 +869,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
                goto fail_gunlock2;
        inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
-                                  inum.no_formal_ino);
+                                  inum.no_formal_ino, 0);
        if (IS_ERR(inode))
                goto fail_gunlock2;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 3e00a66e7cbd..099ca305e518 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -97,7 +97,8 @@ err:
 }
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
-                                       u64 no_addr, u64 no_formal_ino);
+                                       u64 no_addr, u64 no_formal_ino,
+                                       int non_block);
 extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
                                         u64 *no_formal_ino,
                                         unsigned int blktype);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 42ef24355afb..d3c69eb91c74 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -430,7 +430,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
        struct dentry *dentry;
        struct inode *inode;
-        inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
+        inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
        if (IS_ERR(inode)) {
                fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
                return PTR_ERR(inode);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index cf930cd9664a..6fcae8469f6d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -945,7 +945,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
                /* rgblk_search can return a block < goal, so we need to
                   keep it marching forward. */
                no_addr = block + rgd->rd_data0;
-                goal++;
+                goal = max(block + 1, goal + 1);
                if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
                        continue;
                if (no_addr == skip)
@@ -971,7 +971,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
                        found++;
                /* Limit reclaim to sensible number of tasks */
-                if (found > 2*NR_CPUS)
+                if (found > NR_CPUS)
                        return;
        }
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index a4e23d68a398..b9f28e66dad1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1318,15 +1318,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 static void gfs2_evict_inode(struct inode *inode)
 {
-        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        struct super_block *sb = inode->i_sb;
+        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder gh;
        int error;
-        if (inode->i_nlink)
+        if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
                goto out;
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        /* Must not read inode block until block type has been verified */
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
        if (unlikely(error)) {
                gfs2_glock_dq_uninit(&ip->i_iopen_gh);
                goto out;
@@ -1336,6 +1338,12 @@ static void gfs2_evict_inode(struct inode *inode)
        if (error)
                goto out_truncate;
+        if (test_bit(GIF_INVALID, &ip->i_flags)) {
+                error = gfs2_inode_refresh(ip);
+                if (error)
+                        goto out_truncate;
+        }
        ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
        gfs2_glock_dq_wait(&ip->i_iopen_gh);
        gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 33435e4b14d2..ce03a182c771 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -480,10 +480,6 @@ static int logfs_read_sb(struct super_block *sb, int read_only)
                        !read_only)
                return -EIO;
-        mutex_init(&super->s_dirop_mutex);
-        mutex_init(&super->s_object_alias_mutex);
-        INIT_LIST_HEAD(&super->s_freeing_list);
        ret = logfs_init_rw(sb);
        if (ret)
                return ret;
@@ -601,6 +597,10 @@ static struct dentry *logfs_mount(struct file_system_type *type, int flags,
        if (!super)
                return ERR_PTR(-ENOMEM);
+        mutex_init(&super->s_dirop_mutex);
+        mutex_init(&super->s_object_alias_mutex);
+        INIT_LIST_HEAD(&super->s_freeing_list);
        if (!devname)
                err = logfs_get_sb_bdev(super, type, devname);
        else if (strncmp(devname, "mtd", 3))
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 89fc160fd5b0..1f063bacd285 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -119,7 +119,7 @@ Elong:
 }
 #ifdef CONFIG_NFS_V4
-static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode)
+static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 {
        struct gss_api_mech *mech;
        struct xdr_netobj oid;
@@ -166,7 +166,7 @@ static int nfs_negotiate_security(const struct dentry *parent,
                }
                flavors = page_address(page);
                ret = secinfo(parent->d_inode, &dentry->d_name, flavors);
-                *flavor = nfs_find_best_sec(flavors, dentry->d_inode);
+                *flavor = nfs_find_best_sec(flavors);
                put_page(page);
        }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index e1c261ddd65d..c4a69833dd0d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -47,6 +47,7 @@ enum nfs4_client_state {
        NFS4CLNT_LAYOUTRECALL,
        NFS4CLNT_SESSION_RESET,
        NFS4CLNT_RECALL_SLOT,
+        NFS4CLNT_LEASE_CONFIRM,
 };
 enum nfs4_session_state {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9bf41eab3e46..69c0f3c5ee7a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -46,6 +46,7 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
+#include <linux/nfs_mount.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/module.h>
@@ -443,8 +444,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
        if (res->sr_status == 1)
                res->sr_status = NFS_OK;
-        /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
+        /* don't increment the sequence number if the task wasn't sent */
-        if (!res->sr_slot)
+        if (!RPC_WAS_SENT(task))
                goto out;
        /* Check the SEQUENCE operation status */
@@ -2185,9 +2186,14 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
        struct nfs4_exception exception = { };
        int err;
        do {
-                err = nfs4_handle_exception(server,
+                err = _nfs4_lookup_root(server, fhandle, info);
-                                _nfs4_lookup_root(server, fhandle, info),
+                switch (err) {
-                                &exception);
+                case 0:
+                case -NFS4ERR_WRONGSEC:
+                        break;
+                default:
+                        err = nfs4_handle_exception(server, err, &exception);
+                }
        } while (exception.retry);
        return err;
 }
@@ -2208,25 +2214,47 @@ out:
        return ret;
 }
-/*
+static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
- * get the file handle for the "/" directory on the server
- */
-static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
                              struct nfs_fsinfo *info)
 {
        int i, len, status = 0;
-        rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2];
+        rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS];
-        flav_array[0] = RPC_AUTH_UNIX;
+        len = gss_mech_list_pseudoflavors(&flav_array[0]);
-        len = gss_mech_list_pseudoflavors(&flav_array[1]);
+        flav_array[len] = RPC_AUTH_NULL;
-        flav_array[1+len] = RPC_AUTH_NULL;
+        len += 1;
-        len += 2;
        for (i = 0; i < len; i++) {
                status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
-                if (status != -EPERM)
+                if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
-                        break;
+                        continue;
+                break;
        }
+        /*
+         * -EACCES could mean that the user doesn't have correct permissions
+         * to access the mount.  It could also mean that we tried to mount
+         * with a gss auth flavor, but rpc.gssd isn't running.  Either way,
+         * existing mount programs don't handle -EACCES very well so it should
+         * be mapped to -EPERM instead.
+         */
+        if (status == -EACCES)
+                status = -EPERM;
+        return status;
+}
+/*
+ * get the file handle for the "/" directory on the server
+ */
+static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+                              struct nfs_fsinfo *info)
+{
+        int status = nfs4_lookup_root(server, fhandle, info);
+        if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR))
+                /*
+                 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM
+                 * by nfs4_map_errors() as this function exits.
+                 */
+                status = nfs4_find_root_sec(server, fhandle, info);
        if (status == 0)
                status = nfs4_server_capabilities(server, fhandle);
        if (status == 0)
@@ -3723,21 +3751,20 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                                sizeof(setclientid.sc_uaddr), "%s.%u.%u",
                                clp->cl_ipaddr, port >> 8, port & 255);
-                status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+                status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
                if (status != -NFS4ERR_CLID_INUSE)
                        break;
-                if (signalled())
+                if (loop != 0) {
+                        ++clp->cl_id_uniquifier;
                        break;
-                if (loop++ & 1)
+                }
-                        ssleep(clp->cl_lease_time / HZ + 1);
+                ++loop;
-                else
+                ssleep(clp->cl_lease_time / HZ + 1);
-                        if (++clp->cl_id_uniquifier == 0)
-                                break;
        }
        return status;
 }
-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
                struct nfs4_setclientid_res *arg,
                struct rpc_cred *cred)
 {
@@ -3752,7 +3779,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
        int status;
        now = jiffies;
-        status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+        status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
        if (status == 0) {
                spin_lock(&clp->cl_lock);
                clp->cl_lease_time = fsinfo.lease_time * HZ;
@@ -3762,26 +3789,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
        return status;
 }
-int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
-                struct nfs4_setclientid_res *arg,
-                struct rpc_cred *cred)
-{
-        long timeout = 0;
-        int err;
-        do {
-                err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
-                switch (err) {
-                        case 0:
-                                return err;
-                        case -NFS4ERR_RESOURCE:
-                                /* The IBM lawyers misread another document! */
-                        case -NFS4ERR_DELAY:
-                                err = nfs4_delay(clp->cl_rpcclient, &timeout);
-                }
-        } while (err == 0);
-        return err;
-}
 struct nfs4_delegreturndata {
        struct nfs4_delegreturnargs args;
        struct nfs4_delegreturnres res;
@@ -4786,7 +4793,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
                                init_utsname()->domainname,
                                clp->cl_rpcclient->cl_auth->au_flavor);
-        status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+        status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
        if (!status)
                status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
        dprintk("<-- %s status= %d\n", __func__, status);
@@ -4869,7 +4876,8 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
                .rpc_client = clp->cl_rpcclient,
                .rpc_message = &msg,
                .callback_ops = &nfs4_get_lease_time_ops,
-                .callback_data = &data
+                .callback_data = &data,
+                .flags = RPC_TASK_TIMEOUT,
        };
        int status;
@@ -5171,7 +5179,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
        nfs4_init_channel_attrs(&args);
        args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
-        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
+        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
        if (!status)
                /* Verify the session's negotiated channel_attrs values */
@@ -5194,20 +5202,10 @@ int nfs4_proc_create_session(struct nfs_client *clp)
        int status;
        unsigned *ptr;
        struct nfs4_session *session = clp->cl_session;
-        long timeout = 0;
-        int err;
        dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
-        do {
+        status = _nfs4_proc_create_session(clp);
-                status = _nfs4_proc_create_session(clp);
-                if (status == -NFS4ERR_DELAY) {
-                        err = nfs4_delay(clp->cl_rpcclient, &timeout);
-                        if (err)
-                                status = err;
-                }
-        } while (status == -NFS4ERR_DELAY);
        if (status)
                goto out;
@@ -5248,7 +5246,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
        msg.rpc_argp = session;
        msg.rpc_resp = NULL;
        msg.rpc_cred = NULL;
-        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
+        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
        if (status)
                printk(KERN_WARNING
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a6804f704d9d..036f5adc9e1f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -64,10 +64,15 @@ static LIST_HEAD(nfs4_clientid_list);
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
-        struct nfs4_setclientid_res clid;
+        struct nfs4_setclientid_res clid = {
+                .clientid = clp->cl_clientid,
+                .confirm = clp->cl_confirm,
+        };
        unsigned short port;
        int status;
+        if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+                goto do_confirm;
        port = nfs_callback_tcpport;
        if (clp->cl_addr.ss_family == AF_INET6)
                port = nfs_callback_tcpport6;
@@ -75,10 +80,14 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
        status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
        if (status != 0)
                goto out;
+        clp->cl_clientid = clid.clientid;
+        clp->cl_confirm = clid.confirm;
+        set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
        status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
        if (status != 0)
                goto out;
-        clp->cl_clientid = clid.clientid;
+        clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
        nfs4_schedule_state_renewal(clp);
 out:
        return status;
@@ -230,13 +239,18 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
        int status;
+        if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+                goto do_confirm;
        nfs4_begin_drain_session(clp);
        status = nfs4_proc_exchange_id(clp, cred);
        if (status != 0)
                goto out;
+        set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
        status = nfs4_proc_create_session(clp);
        if (status != 0)
                goto out;
+        clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
        nfs41_setup_state_renewal(clp);
        nfs_mark_client_ready(clp, NFS_CS_READY);
 out:
@@ -1584,20 +1598,23 @@ static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
 */
 static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
 {
-        if (nfs4_has_session(clp)) {
+        switch (status) {
-                switch (status) {
+        case -NFS4ERR_CLID_INUSE:
-                case -NFS4ERR_DELAY:
+        case -NFS4ERR_STALE_CLIENTID:
-                case -NFS4ERR_CLID_INUSE:
+                clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-                case -EAGAIN:
+                break;
-                        break;
+        case -NFS4ERR_DELAY:
+        case -ETIMEDOUT:
+        case -EAGAIN:
+                ssleep(1);
+                break;
-                case -EKEYEXPIRED:
+        case -EKEYEXPIRED:
-                        nfs4_warn_keyexpired(clp->cl_hostname);
+                nfs4_warn_keyexpired(clp->cl_hostname);
-                case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+        case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
-                                         * in nfs4_exchange_id */
+                                 * in nfs4_exchange_id */
-                default:
+        default:
-                        return;
+                return;
-                }
        }
        set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 }
@@ -1607,7 +1624,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
        int status = 0;
        /* Ensure exclusive access to NFSv4 state */
-        for(;;) {
+        do {
                if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
                        /* We're going to have to re-establish a clientid */
                        status = nfs4_reclaim_lease(clp);
@@ -1691,7 +1708,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
                        break;
                if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
                        break;
-        }
+        } while (atomic_read(&clp->cl_count) > 1);
        return;
 out_error:
        printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index dddfb5795d7b..c3ccd2c46834 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1452,26 +1452,25 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-        uint32_t attrs[2] = {0, 0};
+        uint32_t attrs[2] = {
+                FATTR4_WORD0_RDATTR_ERROR,
+                FATTR4_WORD1_MOUNTED_ON_FILEID,
+        };
        uint32_t dircount = readdir->count >> 1;
        __be32 *p;
        if (readdir->plus) {
                attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
-                        FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
+                        FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID;
                attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
                        FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
                        FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
                        FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
                dircount >>= 1;
        }
-        attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
+        /* Use mounted_on_fileid only if the server supports it */
-        attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+        if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
-        /* Switch to mounted_on_fileid if the server supports it */
+                attrs[0] |= FATTR4_WORD0_FILEID;
-        if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-                attrs[0] &= ~FATTR4_WORD0_FILEID;
-        else
-                attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
        p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
        *p++ = cpu_to_be32(OP_READDIR);
@@ -3140,7 +3139,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
                        goto out_overflow;
                xdr_decode_hyper(p, fileid);
                bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
-                ret = NFS_ATTR_FATTR_FILEID;
+                ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
        }
        dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
        return ret;
@@ -4002,7 +4001,6 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 {
        int status;
        umode_t fmode = 0;
-        uint64_t fileid;
        uint32_t type;
        status = decode_attr_type(xdr, bitmap, &type);
@@ -4101,13 +4099,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
                goto xdr_error;
        fattr->valid |= status;
-        status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
+        status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid);
        if (status < 0)
                goto xdr_error;
-        if (status != 0 && !(fattr->valid & status)) {
+        fattr->valid |= status;
-                fattr->fileid = fileid;
-                fattr->valid |= status;
-        }
 xdr_error:
        dprintk("%s: xdr returned %d\n", __func__, -status);
@@ -4838,17 +4833,21 @@ static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
        struct nfs4_secinfo_flavor *sec_flavor;
        int status;
        __be32 *p;
-        int i;
+        int i, num_flavors;
        status = decode_op_hdr(xdr, OP_SECINFO);
+        if (status)
+                goto out;
        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                goto out_overflow;
-        res->flavors->num_flavors = be32_to_cpup(p);
-        for (i = 0; i < res->flavors->num_flavors; i++) {
+        res->flavors->num_flavors = 0;
+        num_flavors = be32_to_cpup(p);
+        for (i = 0; i < num_flavors; i++) {
                sec_flavor = &res->flavors->flavors[i];
-                if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE)
+                if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE)
                        break;
                p = xdr_inline_decode(xdr, 4);
@@ -4857,13 +4856,15 @@ static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
                sec_flavor->flavor = be32_to_cpup(p);
                if (sec_flavor->flavor == RPC_AUTH_GSS) {
-                        if (decode_secinfo_gss(xdr, sec_flavor))
+                        status = decode_secinfo_gss(xdr, sec_flavor);
-                                break;
+                        if (status)
+                                goto out;
                }
+                res->flavors->num_flavors++;
        }
-        return 0;
+out:
+        return status;
 out_overflow:
        print_overflow_msg(__func__, xdr);
        return -EIO;
@@ -6408,7 +6409,9 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
                                        entry->server, 1) < 0)
                goto out_overflow;
-        if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+        if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+                entry->ino = entry->fattr->mounted_on_fileid;
+        else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
                entry->ino = entry->fattr->fileid;
        entry->d_type = DT_UNKNOWN;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d9ab97269ce6..ff681ab65d31 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1004,6 +1004,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
        struct nfs_inode *nfsi = NFS_I(wdata->inode);
        loff_t end_pos = wdata->args.offset + wdata->res.count;
+        bool mark_as_dirty = false;
        spin_lock(&nfsi->vfs_inode.i_lock);
        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
@@ -1011,13 +1012,18 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
                get_lseg(wdata->lseg);
                wdata->lseg->pls_lc_cred =
                        get_rpccred(wdata->args.context->state->owner->so_cred);
-                mark_inode_dirty_sync(wdata->inode);
+                mark_as_dirty = true;
                dprintk("%s: Set layoutcommit for inode %lu ",
                        __func__, wdata->inode->i_ino);
        }
        if (end_pos > wdata->lseg->pls_end_pos)
                wdata->lseg->pls_end_pos = end_pos;
        spin_unlock(&nfsi->vfs_inode.i_lock);
+        /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+         * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+        if (mark_as_dirty)
+                mark_inode_dirty_sync(wdata->inode);
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2b8e9a5e366a..e288f06d3fa7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1004,6 +1004,7 @@ static int nfs_parse_security_flavors(char *value,
                return 0;
        }
+        mnt->flags |= NFS_MOUNT_SECFLAVOUR;
        mnt->auth_flavor_len = 1;
        return 1;
 }
@@ -1976,6 +1977,15 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
        if (error < 0)
                goto out;
+        /*
+         * noac is a special case. It implies -o sync, but that's not
+         * necessarily reflected in the mtab options. do_remount_sb
+         * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the
+         * remount options, so we have to explicitly reset it.
+         */
+        if (data->flags & NFS_MOUNT_NOAC)
+                *flags |= MS_SYNCHRONOUS;
        /* compare new mount options with old ones */
        error = nfs_compare_remount_data(nfss, data);
 out:
@@ -2235,8 +2245,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
        if (!s->s_root) {
                /* initial superblock/root creation */
                nfs_fill_super(s, data);
-                nfs_fscache_get_super_cookie(
+                nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
-                        s, data ? data->fscache_uniq : NULL, NULL);
        }
        mntroot = nfs_get_root(s, mntfh, dev_name);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e4cbc11a74ab..3bd5d7e80f6c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -680,7 +680,6 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
        req = nfs_setup_write_request(ctx, page, offset, count);
        if (IS_ERR(req))
                return PTR_ERR(req);
-        nfs_mark_request_dirty(req);
        /* Update file length */
        nfs_grow_file(page, offset, count);
        nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
@@ -1418,8 +1417,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
                                task->tk_pid, task->tk_status);
        /* Call the NFS version-specific code */
-        if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
+        NFS_PROTO(data->inode)->commit_done(task, data);
-                return;
 }
 void nfs_commit_release_pages(struct nfs_write_data *data)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index aa309aa93fe8..4cf04e11c66c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -258,6 +258,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
        if (atomic_dec_and_test(&fp->fi_delegees)) {
                vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
                fp->fi_lease = NULL;
+                fput(fp->fi_deleg_file);
                fp->fi_deleg_file = NULL;
        }
 }
@@ -402,8 +403,8 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
        if (stp->st_access_bmap) {
                oflag = nfs4_access_bmap_to_omode(stp);
                nfs4_file_put_access(stp->st_file, oflag);
-                put_nfs4_file(stp->st_file);
        }
+        put_nfs4_file(stp->st_file);
        kmem_cache_free(stateid_slab, stp);
 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2e1cebde90df..129f3c9f62d5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1363,7 +1363,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                goto out;
        if (!(iap->ia_valid & ATTR_MODE))
                iap->ia_mode = 0;
-        err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+        err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
        if (err)
                goto out;
@@ -1385,6 +1385,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (IS_ERR(dchild))
                goto out_nfserr;
+        /* If file doesn't exist, check for permissions to create one */
+        if (!dchild->d_inode) {
+                err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+                if (err)
+                        goto out;
+        }
        err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
        if (err)
                goto out;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index b68f87a83924..938387a10d5d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1019,7 +1019,7 @@ struct ocfs2_xattr_entry {
        __le16  xe_name_offset;  /* byte offset from the 1st entry in the
                                    local xattr storage(inode, xattr block or
                                    xattr bucket). */
-        __u8    xe_name_len;     /* xattr name len, does't include prefix. */
+        __u8    xe_name_len;     /* xattr name len, doesn't include prefix. */
        __u8    xe_type;         /* the low 7 bits indicate the name prefix
                                  * type and the highest bit indicates whether
                                  * the EA is stored in the local storage. */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd6628d3ba42..dfa532730e55 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3124,11 +3124,16 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-        unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
+        unsigned int nr;
-        struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
+        struct task_struct *reaper;
        struct tgid_iter iter;
        struct pid_namespace *ns;
+        if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
+                goto out_no_task;
+        nr = filp->f_pos - FIRST_PROCESS_ENTRY;
+        reaper = get_proc_task(filp->f_path.dentry->d_inode);
        if (!reaper)
                goto out_no_task;
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 4d0cb1241460..40fa780ebea7 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -175,26 +175,6 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
 }
 /**
- * ubifs_create_buds_lists - create journal head buds lists for remount rw.
- * @c: UBIFS file-system description object
- */
-void ubifs_create_buds_lists(struct ubifs_info *c)
-{
-        struct rb_node *p;
-        spin_lock(&c->buds_lock);
-        p = rb_first(&c->buds);
-        while (p) {
-                struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb);
-                struct ubifs_jhead *jhead = &c->jheads[bud->jhead];
-                list_add_tail(&bud->list, &jhead->buds_list);
-                p = rb_next(p);
-        }
-        spin_unlock(&c->buds_lock);
-}
-/**
 * ubifs_add_bud_to_log - add a new bud to the log.
 * @c: UBIFS file-system description object
 * @jhead: journal head the bud belongs to
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 936f2cbfe6b6..3dbad6fbd1eb 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -317,6 +317,32 @@ int ubifs_recover_master_node(struct ubifs_info *c)
                        goto out_free;
                }
                memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
+                /*
+                 * We had to recover the master node, which means there was an
+                 * unclean reboot. However, it is possible that the master node
+                 * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set.
+                 * E.g., consider the following chain of events:
+                 *
+                 * 1. UBIFS was cleanly unmounted, so the master node is clean
+                 * 2. UBIFS is being mounted R/W and starts changing the master
+                 *    node in the first (%UBIFS_MST_LNUM). A power cut happens,
+                 *    so this LEB ends up with some amount of garbage at the
+                 *    end.
+                 * 3. UBIFS is being mounted R/O. We reach this place and
+                 *    recover the master node from the second LEB
+                 *    (%UBIFS_MST_LNUM + 1). But we cannot update the media
+                 *    because we are being mounted R/O. We have to defer the
+                 *    operation.
+                 * 4. However, this master node (@c->mst_node) is marked as
+                 *    clean (since the step 1). And if we just return, the
+                 *    mount code will be confused and won't recover the master
+                 *    node when it is re-mounted R/W later.
+                 *
+                 *    Thus, force the recovery by marking the master node as
+                 *    dirty.
+                 */
+                c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
        } else {
                /* Write the recovered master node */
                c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index eed0fcff8d73..d3d6d365bfc1 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -59,6 +59,7 @@ enum {
 * @new_size: truncation new size
 * @free: amount of free space in a bud
 * @dirty: amount of dirty space in a bud from padding and deletion nodes
+ * @jhead: journal head number of the bud
 *
 * UBIFS journal replay must compare node sequence numbers, which means it must
 * build a tree of node information to insert into the TNC.
@@ -80,6 +81,7 @@ struct replay_entry {
                struct {
                        int free;
                        int dirty;
+                        int jhead;
                };
        };
 };
@@ -159,6 +161,11 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
                err = PTR_ERR(lp);
                goto out;
        }
+        /* Make sure the journal head points to the latest bud */
+        err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum,
+                                     c->leb_size - r->free, UBI_SHORTTERM);
 out:
        ubifs_release_lprops(c);
        return err;
@@ -627,10 +634,6 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
        ubifs_assert(sleb->endpt - offs >= used);
        ubifs_assert(sleb->endpt % c->min_io_size == 0);
-        if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
-                err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
-                                             sleb->endpt, UBI_SHORTTERM);
        *dirty = sleb->endpt - offs - used;
        *free = c->leb_size - sleb->endpt;
@@ -653,12 +656,14 @@ out_dump:
 * @sqnum: sequence number
 * @free: amount of free space in bud
 * @dirty: amount of dirty space from padding and deletion nodes
+ * @jhead: journal head number for the bud
 *
 * This function inserts a reference node to the replay tree and returns zero
 * in case of success or a negative error code in case of failure.
 */
 static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
-                           unsigned long long sqnum, int free, int dirty)
+                           unsigned long long sqnum, int free, int dirty,
+                           int jhead)
 {
        struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
        struct replay_entry *r;
@@ -688,6 +693,7 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
        r->flags = REPLAY_REF;
        r->free = free;
        r->dirty = dirty;
+        r->jhead = jhead;
        rb_link_node(&r->rb, parent, p);
        rb_insert_color(&r->rb, &c->replay_tree);
@@ -712,7 +718,7 @@ static int replay_buds(struct ubifs_info *c)
                if (err)
                        return err;
                err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
-                                      free, dirty);
+                                      free, dirty, b->bud->jhead);
                if (err)
                        return err;
        }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c75f6133206c..04ad07f4fcc3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1257,12 +1257,12 @@ static int mount_ubifs(struct ubifs_info *c)
                goto out_free;
        }
+        err = alloc_wbufs(c);
+        if (err)
+                goto out_cbuf;
        sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
        if (!c->ro_mount) {
-                err = alloc_wbufs(c);
-                if (err)
-                        goto out_cbuf;
                /* Create background thread */
                c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
                if (IS_ERR(c->bgt)) {
@@ -1631,12 +1631,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
        if (err)
                goto out;
-        err = alloc_wbufs(c);
-        if (err)
-                goto out;
-        ubifs_create_buds_lists(c);
        /* Create background thread */
        c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
        if (IS_ERR(c->bgt)) {
@@ -1671,14 +1665,25 @@ static int ubifs_remount_rw(struct ubifs_info *c)
        if (err)
                goto out;
+        dbg_gen("re-mounted read-write");
+        c->remounting_rw = 0;
        if (c->need_recovery) {
                c->need_recovery = 0;
                ubifs_msg("deferred recovery completed");
+        } else {
+                /*
+                 * Do not run the debugging space check if we were doing
+                 * recovery, because when we saved the information we had the
+                 * file-system in a state where the TNC and lprops had been
+                 * modified in memory, but all the I/O operations (including a
+                 * commit) were deferred. So the file-system was in
+                 * "non-committed" state. Now the file-system is in committed
+                 * state, and of course the amount of free space will change
+                 * because, for example, the old index size was imprecise.
+                 */
+                err = dbg_check_space_info(c);
        }
-        dbg_gen("re-mounted read-write");
-        c->remounting_rw = 0;
-        err = dbg_check_space_info(c);
        mutex_unlock(&c->umount_mutex);
        return err;
@@ -1733,7 +1738,6 @@ static void ubifs_remount_ro(struct ubifs_info *c)
        if (err)
                ubifs_ro_mode(c, err);
-        free_wbufs(c);
        vfree(c->orph_buf);
        c->orph_buf = NULL;
        kfree(c->write_reserve_buf);
@@ -1761,10 +1765,12 @@ static void ubifs_put_super(struct super_block *sb)
         * of the media. For example, there will be dirty inodes if we failed
         * to write them back because of I/O errors.
         */
-        ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
+        if (!c->ro_error) {
-        ubifs_assert(c->budg_idx_growth == 0);
+                ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
-        ubifs_assert(c->budg_dd_growth == 0);
+                ubifs_assert(c->budg_idx_growth == 0);
-        ubifs_assert(c->budg_data_growth == 0);
+                ubifs_assert(c->budg_dd_growth == 0);
+                ubifs_assert(c->budg_data_growth == 0);
+        }
        /*
         * The 'c->umount_lock' prevents races between UBIFS memory shrinker
diff --git a/fs/xattr.c b/fs/xattr.c
index a19acdb81cd1..f1ef94974dea 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -666,7 +666,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
        handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
        if (!handler)
                return -EOPNOTSUPP;
-        return handler->set(dentry, name, value, size, 0, handler->flags);
+        return handler->set(dentry, name, value, size, flags, handler->flags);
 }
 /*
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 3ca795609113..9f76cceb678d 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -34,8 +34,10 @@ __xfs_printk(
        const struct xfs_mount  *mp,
        struct va_format        *vaf)
 {
-        if (mp && mp->m_fsname)
+        if (mp && mp->m_fsname) {
                printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+                return;
+        }
        printk("%sXFS: %pV\n", level, vaf);
 }