45 files changed, 734 insertions, 441 deletions
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index cde698a07d21..a2ae42720a6a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1802,6 +1802,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
        set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
        inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
        BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
+        BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
        inode->i_version = btrfs_stack_inode_sequence(inode_item);
        inode->i_rdev = 0;
        *rdev = btrfs_stack_inode_rdev(inode_item);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1eef4ee01d1a..7effed6f2fa6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3178,10 +3178,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
        btrfs_mark_buffer_dirty(leaf);
-        btrfs_release_path(path);
 fail:
-        if (ret)
+        btrfs_release_path(path);
-                btrfs_abort_transaction(trans, root, ret);
        return ret;
 }
@@ -3305,8 +3303,7 @@ again:
        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED ||
-            !btrfs_test_opt(root, SPACE_CACHE) ||
+            !btrfs_test_opt(root, SPACE_CACHE)) {
-            block_group->delalloc_bytes) {
                /*
                 * don't bother trying to write stuff out _if_
                 * a) we're not cached,
@@ -3408,17 +3405,14 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
        int loops = 0;
        spin_lock(&cur_trans->dirty_bgs_lock);
-        if (!list_empty(&cur_trans->dirty_bgs)) {
+        if (list_empty(&cur_trans->dirty_bgs)) {
-                list_splice_init(&cur_trans->dirty_bgs, &dirty);
+                spin_unlock(&cur_trans->dirty_bgs_lock);
+                return 0;
        }
+        list_splice_init(&cur_trans->dirty_bgs, &dirty);
        spin_unlock(&cur_trans->dirty_bgs_lock);
 again:
-        if (list_empty(&dirty)) {
-                btrfs_free_path(path);
-                return 0;
-        }
        /*
         * make sure all the block groups on our dirty list actually
         * exist
@@ -3431,18 +3425,16 @@ again:
                        return -ENOMEM;
        }
+        /*
+         * cache_write_mutex is here only to save us from balance or automatic
+         * removal of empty block groups deleting this block group while we are
+         * writing out the cache
+         */
+        mutex_lock(&trans->transaction->cache_write_mutex);
        while (!list_empty(&dirty)) {
                cache = list_first_entry(&dirty,
                                         struct btrfs_block_group_cache,
                                         dirty_list);
-                /*
-                 * cache_write_mutex is here only to save us from balance
-                 * deleting this block group while we are writing out the
-                 * cache
-                 */
-                mutex_lock(&trans->transaction->cache_write_mutex);
                /*
                 * this can happen if something re-dirties a block
                 * group that is already under IO.  Just wait for it to
@@ -3493,9 +3485,30 @@ again:
                                ret = 0;
                        }
                }
-                if (!ret)
+                if (!ret) {
                        ret = write_one_cache_group(trans, root, path, cache);
-                mutex_unlock(&trans->transaction->cache_write_mutex);
+                        /*
+                         * Our block group might still be attached to the list
+                         * of new block groups in the transaction handle of some
+                         * other task (struct btrfs_trans_handle->new_bgs). This
+                         * means its block group item isn't yet in the extent
+                         * tree. If this happens ignore the error, as we will
+                         * try again later in the critical section of the
+                         * transaction commit.
+                         */
+                        if (ret == -ENOENT) {
+                                ret = 0;
+                                spin_lock(&cur_trans->dirty_bgs_lock);
+                                if (list_empty(&cache->dirty_list)) {
+                                        list_add_tail(&cache->dirty_list,
+                                                      &cur_trans->dirty_bgs);
+                                        btrfs_get_block_group(cache);
+                                }
+                                spin_unlock(&cur_trans->dirty_bgs_lock);
+                        } else if (ret) {
+                                btrfs_abort_transaction(trans, root, ret);
+                        }
+                }
                /* if its not on the io list, we need to put the block group */
                if (should_put)
@@ -3503,7 +3516,16 @@ again:
                if (ret)
                        break;
+                /*
+                 * Avoid blocking other tasks for too long. It might even save
+                 * us from writing caches for block groups that are going to be
+                 * removed.
+                 */
+                mutex_unlock(&trans->transaction->cache_write_mutex);
+                mutex_lock(&trans->transaction->cache_write_mutex);
        }
+        mutex_unlock(&trans->transaction->cache_write_mutex);
        /*
         * go through delayed refs for all the stuff we've just kicked off
@@ -3514,8 +3536,15 @@ again:
                loops++;
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_splice_init(&cur_trans->dirty_bgs, &dirty);
+                /*
+                 * dirty_bgs_lock protects us from concurrent block group
+                 * deletes too (not just cache_write_mutex).
+                 */
+                if (!list_empty(&dirty)) {
+                        spin_unlock(&cur_trans->dirty_bgs_lock);
+                        goto again;
+                }
                spin_unlock(&cur_trans->dirty_bgs_lock);
-                goto again;
        }
        btrfs_free_path(path);
@@ -3588,8 +3617,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                ret = 0;
                        }
                }
-                if (!ret)
+                if (!ret) {
                        ret = write_one_cache_group(trans, root, path, cache);
+                        if (ret)
+                                btrfs_abort_transaction(trans, root, ret);
+                }
                /* if its not on the io list, we need to put the block group */
                if (should_put)
@@ -7537,7 +7569,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 * returns the key for the extent through ins, and a tree buffer for
 * the first block of the extent through buf.
 *
- * returns the tree buffer or NULL.
+ * returns the tree buffer or an ERR_PTR on error.
 */
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
@@ -7548,6 +7580,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
        struct btrfs_key ins;
        struct btrfs_block_rsv *block_rsv;
        struct extent_buffer *buf;
+        struct btrfs_delayed_extent_op *extent_op;
        u64 flags = 0;
        int ret;
        u32 blocksize = root->nodesize;
@@ -7568,13 +7601,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
        ret = btrfs_reserve_extent(root, blocksize, blocksize,
                                   empty_size, hint, &ins, 0, 0);
-        if (ret) {
+        if (ret)
-                unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+                goto out_unuse;
-                return ERR_PTR(ret);
-        }
        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
-        BUG_ON(IS_ERR(buf)); /* -ENOMEM */
+        if (IS_ERR(buf)) {
+                ret = PTR_ERR(buf);
+                goto out_free_reserved;
+        }
        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent == 0)
@@ -7584,9 +7618,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                BUG_ON(parent > 0);
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-                struct btrfs_delayed_extent_op *extent_op;
                extent_op = btrfs_alloc_delayed_extent_op();
-                BUG_ON(!extent_op); /* -ENOMEM */
+                if (!extent_op) {
+                        ret = -ENOMEM;
+                        goto out_free_buf;
+                }
                if (key)
                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
                else
@@ -7601,13 +7637,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                extent_op->level = level;
                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
-                                        ins.objectid,
+                                                 ins.objectid, ins.offset,
-                                        ins.offset, parent, root_objectid,
+                                                 parent, root_objectid, level,
-                                        level, BTRFS_ADD_DELAYED_EXTENT,
+                                                 BTRFS_ADD_DELAYED_EXTENT,
-                                        extent_op, 0);
+                                                 extent_op, 0);
-                BUG_ON(ret); /* -ENOMEM */
+                if (ret)
+                        goto out_free_delayed;
        }
        return buf;
+out_free_delayed:
+        btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+        free_extent_buffer(buf);
+out_free_reserved:
+        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+out_unuse:
+        unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+        return ERR_PTR(ret);
 }
 struct walk_control {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 782f3bc4651d..c32d226bfecc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4560,36 +4560,37 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
        do {
                index--;
                page = eb->pages[index];
-                if (page && mapped) {
+                if (!page)
+                        continue;
+                if (mapped)
                        spin_lock(&page->mapping->private_lock);
+                /*
+                 * We do this since we'll remove the pages after we've
+                 * removed the eb from the radix tree, so we could race
+                 * and have this page now attached to the new eb.  So
+                 * only clear page_private if it's still connected to
+                 * this eb.
+                 */
+                if (PagePrivate(page) &&
+                    page->private == (unsigned long)eb) {
+                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+                        BUG_ON(PageDirty(page));
+                        BUG_ON(PageWriteback(page));
                        /*
-                         * We do this since we'll remove the pages after we've
+                         * We need to make sure we haven't been attached
-                         * removed the eb from the radix tree, so we could race
+                         * to a new eb.
-                         * and have this page now attached to the new eb.  So
-                         * only clear page_private if it's still connected to
-                         * this eb.
                         */
-                        if (PagePrivate(page) &&
+                        ClearPagePrivate(page);
-                            page->private == (unsigned long)eb) {
+                        set_page_private(page, 0);
-                                BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+                        /* One for the page private */
-                                BUG_ON(PageDirty(page));
-                                BUG_ON(PageWriteback(page));
-                                /*
-                                 * We need to make sure we haven't be attached
-                                 * to a new eb.
-                                 */
-                                ClearPagePrivate(page);
-                                set_page_private(page, 0);
-                                /* One for the page private */
-                                page_cache_release(page);
-                        }
-                        spin_unlock(&page->mapping->private_lock);
-                }
-                if (page) {
-                        /* One for when we alloced the page */
                        page_cache_release(page);
                }
+                if (mapped)
+                        spin_unlock(&page->mapping->private_lock);
+                /* One for when we alloced the page */
+                page_cache_release(page);
        } while (index != 0);
 }
@@ -4771,6 +4772,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                               start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
+                /*
+                 * Lock our eb's refs_lock to avoid races with
+                 * free_extent_buffer. When we get our eb it might be flagged
+                 * with EXTENT_BUFFER_STALE and another task running
+                 * free_extent_buffer might have seen that flag set,
+                 * eb->refs == 2, that the buffer isn't under IO (dirty and
+                 * writeback flags not set) and it's still in the tree (flag
+                 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
+                 * of decrementing the extent buffer's reference count twice.
+                 * So here we could race and increment the eb's reference count,
+                 * clear its stale flag, mark it as dirty and drop our reference
+                 * before the other task finishes executing free_extent_buffer,
+                 * which would later result in an attempt to free an extent
+                 * buffer that is dirty.
+                 */
+                if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+                        spin_lock(&eb->refs_lock);
+                        spin_unlock(&eb->refs_lock);
+                }
                mark_extent_buffer_accessed(eb, NULL);
                return eb;
        }
@@ -4870,6 +4890,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                                mark_extent_buffer_accessed(exists, p);
                                goto free_eb;
                        }
+                        exists = NULL;
                        /*
                         * Do this so attach doesn't complain and we need to
@@ -4933,12 +4954,12 @@ again:
        return eb;
 free_eb:
+        WARN_ON(!atomic_dec_and_test(&eb->refs));
        for (i = 0; i < num_pages; i++) {
                if (eb->pages[i])
                        unlock_page(eb->pages[i]);
        }
-        WARN_ON(!atomic_dec_and_test(&eb->refs));
        btrfs_release_extent_buffer(eb);
        return exists;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 81fa75a8e1f3..9dbe5b548fa6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -86,7 +86,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
        mapping_set_gfp_mask(inode->i_mapping,
                        mapping_gfp_mask(inode->i_mapping) &
-                        ~(GFP_NOFS & ~__GFP_HIGHMEM));
+                        ~(__GFP_FS | __GFP_HIGHMEM));
        return inode;
 }
@@ -1218,7 +1218,7 @@ out:
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successfull in writing the cache out,
- * and -1 if it was not.
+ * or an errno if it was not.
 */
 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                                   struct btrfs_free_space_ctl *ctl,
@@ -1235,12 +1235,12 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        int must_iput = 0;
        if (!i_size_read(inode))
-                return -1;
+                return -EIO;
        WARN_ON(io_ctl->pages);
        ret = io_ctl_init(io_ctl, inode, root, 1);
        if (ret)
-                return -1;
+                return ret;
        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
                down_write(&block_group->data_rwsem);
@@ -1258,7 +1258,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        }
        /* Lock all pages first so we can lock the extent safely. */
-        io_ctl_prepare_pages(io_ctl, inode, 0);
+        ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+        if (ret)
+                goto out;
        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
                         0, &cached_state);
@@ -3464,6 +3466,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
        int ret;
        struct btrfs_io_ctl io_ctl;
+        bool release_metadata = true;
        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
                return 0;
@@ -3471,11 +3474,20 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
        memset(&io_ctl, 0, sizeof(io_ctl));
        ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
                                      trans, path, 0);
-        if (!ret)
+        if (!ret) {
+                /*
+                 * At this point writepages() didn't error out, so our metadata
+                 * reservation is released when the writeback finishes, at
+                 * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
+                 * with or without an error.
+                 */
+                release_metadata = false;
                ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+        }
        if (ret) {
-                btrfs_delalloc_release_metadata(inode, inode->i_size);
+                if (release_metadata)
+                        btrfs_delalloc_release_metadata(inode, inode->i_size);
 #ifdef DEBUG
                btrfs_err(root->fs_info,
                        "failed to write free ino cache for root %llu",
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ada4d24ed11b..8bb013672aee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3632,25 +3632,28 @@ static void btrfs_read_locked_inode(struct inode *inode)
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+        inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+        inode->i_generation = BTRFS_I(inode)->generation;
+        inode->i_rdev = 0;
+        rdev = btrfs_inode_rdev(leaf, inode_item);
+        BTRFS_I(inode)->index_cnt = (u64)-1;
+        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+cache_index:
        /*
         * If we were modified in the current generation and evicted from memory
         * and then re-read we need to do a full sync since we don't have any
         * idea about which extents were modified before we were evicted from
         * cache.
+         *
+         * This is required for both inode re-read from disk and delayed inode
+         * in delayed_nodes_tree.
         */
        if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                        &BTRFS_I(inode)->runtime_flags);
-        inode->i_version = btrfs_inode_sequence(leaf, inode_item);
-        inode->i_generation = BTRFS_I(inode)->generation;
-        inode->i_rdev = 0;
-        rdev = btrfs_inode_rdev(leaf, inode_item);
-        BTRFS_I(inode)->index_cnt = (u64)-1;
-        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-cache_index:
        path->slots[0]++;
        if (inode->i_nlink != 1 ||
            path->slots[0] >= btrfs_header_nritems(leaf))
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b05653f182c2..1c22c6518504 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2410,7 +2410,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                        "Attempt to delete subvolume %llu during send",
                        dest->root_key.objectid);
                err = -EPERM;
-                goto out_dput;
+                goto out_unlock_inode;
        }
        d_invalidate(dentry);
@@ -2505,6 +2505,7 @@ out_up_write:
                                root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
                spin_unlock(&dest->root_item_lock);
        }
+out_unlock_inode:
        mutex_unlock(&inode->i_mutex);
        if (!err) {
                shrink_dcache_sb(root->fs_info->sb);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 157cc54fc634..760c4a5e096b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -722,6 +722,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
        int ret = 0;
+        int ret_wb = 0;
        u64 end;
        u64 orig_end;
        struct btrfs_ordered_extent *ordered;
@@ -741,9 +742,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        if (ret)
                return ret;
-        ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+        /*
-        if (ret)
+         * If we have a writeback error don't return immediately. Wait first
-                return ret;
+         * for any ordered extents that haven't completed yet. This is to make
+         * sure no one can dirty the same page ranges and call writepages()
+         * before the ordered extents complete - to avoid failures (-EEXIST)
+         * when adding the new ordered extents to the ordered tree.
+         */
+        ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
        end = orig_end;
        while (1) {
@@ -767,7 +773,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
                        break;
                end--;
        }
-        return ret;
+        return ret_wb ? ret_wb : ret;
 }
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bcd2a007517..96aebf3bcd5b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1058,6 +1058,7 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
        struct extent_map *em;
        struct list_head *search_list = &trans->transaction->pending_chunks;
        int ret = 0;
+        u64 physical_start = *start;
 again:
        list_for_each_entry(em, search_list, list) {
@@ -1068,9 +1069,9 @@ again:
                for (i = 0; i < map->num_stripes; i++) {
                        if (map->stripes[i].dev != device)
                                continue;
-                        if (map->stripes[i].physical >= *start + len ||
+                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
-                            *start)
+                            physical_start)
                                continue;
                        *start = map->stripes[i].physical +
                                em->orig_block_len;
@@ -1193,8 +1194,14 @@ again:
                         */
                        if (contains_pending_extent(trans, device,
                                                    &search_start,
-                                                    hole_size))
+                                                    hole_size)) {
-                                hole_size = 0;
+                                if (key.offset >= search_start) {
+                                        hole_size = key.offset - search_start;
+                                } else {
+                                        WARN_ON_ONCE(1);
+                                        hole_size = 0;
+                                }
+                        }
                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index da94e41bdbf6..537356742091 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -173,5 +173,5 @@ MODULE_LICENSE("GPL");
 MODULE_VERSION("0.0.2");
 MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
-module_init(configfs_init);
+core_initcall(configfs_init);
 module_exit(configfs_exit);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 59fedbcf8798..86a2121828c3 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -121,7 +121,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
        int len, i;
        int err = -ENOMEM;
-        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return err;
diff --git a/fs/exec.c b/fs/exec.c
index 49a1c61433b7..1977c2a553ac 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -659,6 +659,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
        if (stack_base > STACK_SIZE_MAX)
                stack_base = STACK_SIZE_MAX;
+        /* Add space for stack randomization. */
+        stack_base += (STACK_RND_MASK << PAGE_SHIFT);
        /* Make sure we didn't let the argument array grow too large. */
        if (vma->vm_end - vma->vm_start > stack_base)
                return -ENOMEM;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 18228c201f7f..024f2284d3f6 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -64,8 +64,8 @@ config EXT4_FS_SECURITY
          If you are not using a security module that requires using
          extended attributes for file security labels, say N.
-config EXT4_FS_ENCRYPTION
+config EXT4_ENCRYPTION
-        bool "Ext4 Encryption"
+        tristate "Ext4 Encryption"
        depends on EXT4_FS
        select CRYPTO_AES
        select CRYPTO_CBC
@@ -81,6 +81,11 @@ config EXT4_FS_ENCRYPTION
          efficient since it avoids caching the encrypted and
          decrypted pages in the page cache.
+config EXT4_FS_ENCRYPTION
+        bool
+        default y
+        depends on EXT4_ENCRYPTION
 config EXT4_DEBUG
        bool "EXT4 debugging support"
        depends on EXT4_FS
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index ca2f5948c1ac..fded02f72299 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -66,6 +66,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
        int res = 0;
        char iv[EXT4_CRYPTO_BLOCK_SIZE];
        struct scatterlist sg[1];
+        int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
        char *workbuf;
        if (iname->len <= 0 || iname->len > ctx->lim)
@@ -73,6 +74,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
        ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
                EXT4_CRYPTO_BLOCK_SIZE : iname->len;
+        ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
        ciphertext_len = (ciphertext_len > ctx->lim)
                        ? ctx->lim : ciphertext_len;
@@ -101,7 +103,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
        /* Create encryption request */
        sg_init_table(sg, 1);
        sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
-        ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv);
+        ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv);
        res = crypto_ablkcipher_encrypt(req);
        if (res == -EINPROGRESS || res == -EBUSY) {
                BUG_ON(req->base.data != &ecr);
@@ -198,106 +200,57 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx,
        return oname->len;
 }
+static const char *lookup_table =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
 /**
 * ext4_fname_encode_digest() -
 *
 * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
 * The encoded string is roughly 4/3 times the size of the input string.
 */
-int ext4_fname_encode_digest(char *dst, char *src, u32 len)
+static int digest_encode(const char *src, int len, char *dst)
 {
-        static const char *lookup_table =
+        int i = 0, bits = 0, ac = 0;
-                "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+";
+        char *cp = dst;
-        u32 current_chunk, num_chunks, i;
-        char tmp_buf[3];
+        while (i < len) {
-        u32 c0, c1, c2, c3;
+                ac += (((unsigned char) src[i]) << bits);
+                bits += 8;
-        current_chunk = 0;
+                do {
-        num_chunks = len/3;
+                        *cp++ = lookup_table[ac & 0x3f];
-        for (i = 0; i < num_chunks; i++) {
+                        ac >>= 6;
-                c0 = src[3*i] & 0x3f;
+                        bits -= 6;
-                c1 = (((src[3*i]>>6)&0x3) | ((src[3*i+1] & 0xf)<<2)) & 0x3f;
+                } while (bits >= 6);
-                c2 = (((src[3*i+1]>>4)&0xf) | ((src[3*i+2] & 0x3)<<4)) & 0x3f;
-                c3 = (src[3*i+2]>>2) & 0x3f;
-                dst[4*i] = lookup_table[c0];
-                dst[4*i+1] = lookup_table[c1];
-                dst[4*i+2] = lookup_table[c2];
-                dst[4*i+3] = lookup_table[c3];
-        }
-        if (i*3 < len) {
-                memset(tmp_buf, 0, 3);
-                memcpy(tmp_buf, &src[3*i], len-3*i);
-                c0 = tmp_buf[0] & 0x3f;
-                c1 = (((tmp_buf[0]>>6)&0x3) | ((tmp_buf[1] & 0xf)<<2)) & 0x3f;
-                c2 = (((tmp_buf[1]>>4)&0xf) | ((tmp_buf[2] & 0x3)<<4)) & 0x3f;
-                c3 = (tmp_buf[2]>>2) & 0x3f;
-                dst[4*i] = lookup_table[c0];
-                dst[4*i+1] = lookup_table[c1];
-                dst[4*i+2] = lookup_table[c2];
-                dst[4*i+3] = lookup_table[c3];
                i++;
        }
-        return (i * 4);
+        if (bits)
+                *cp++ = lookup_table[ac & 0x3f];
+        return cp - dst;
 }
-/**
+static int digest_decode(const char *src, int len, char *dst)
- * ext4_fname_hash() -
- *
- * This function computes the hash of the input filename, and sets the output
- * buffer to the *encoded* digest.  It returns the length of the digest as its
- * return value.  Errors are returned as negative numbers.  We trust the caller
- * to allocate sufficient memory to oname string.
- */
-static int ext4_fname_hash(struct ext4_fname_crypto_ctx *ctx,
-                           const struct ext4_str *iname,
-                           struct ext4_str *oname)
 {
-        struct scatterlist sg;
+        int i = 0, bits = 0, ac = 0;
-        struct hash_desc desc = {
+        const char *p;
-                .tfm = (struct crypto_hash *)ctx->htfm,
+        char *cp = dst;
-                .flags = CRYPTO_TFM_REQ_MAY_SLEEP
-        };
+        while (i < len) {
-        int res = 0;
+                p = strchr(lookup_table, src[i]);
+                if (p == NULL || src[i] == 0)
-        if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
+                        return -2;
-                res = ext4_fname_encode_digest(oname->name, iname->name,
+                ac += (p - lookup_table) << bits;
-                                               iname->len);
+                bits += 6;
-                oname->len = res;
+                if (bits >= 8) {
-                return res;
+                        *cp++ = ac & 0xff;
-        }
+                        ac >>= 8;
+                        bits -= 8;
-        sg_init_one(&sg, iname->name, iname->len);
+                }
-        res = crypto_hash_init(&desc);
+                i++;
-        if (res) {
-                printk(KERN_ERR
-                       "%s: Error initializing crypto hash; res = [%d]\n",
-                       __func__, res);
-                goto out;
-        }
-        res = crypto_hash_update(&desc, &sg, iname->len);
-        if (res) {
-                printk(KERN_ERR
-                       "%s: Error updating crypto hash; res = [%d]\n",
-                       __func__, res);
-                goto out;
-        }
-        res = crypto_hash_final(&desc,
-                &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE]);
-        if (res) {
-                printk(KERN_ERR
-                       "%s: Error finalizing crypto hash; res = [%d]\n",
-                       __func__, res);
-                goto out;
        }
-        /* Encode the digest as a printable string--this will increase the
+        if (ac)
-         * size of the digest */
+                return -1;
-        oname->name[0] = 'I';
+        return cp - dst;
-        res = ext4_fname_encode_digest(oname->name+1,
-                &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE],
-                EXT4_FNAME_CRYPTO_DIGEST_SIZE) + 1;
-        oname->len = res;
-out:
-        return res;
 }
 /**
@@ -405,6 +358,7 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(
        if (IS_ERR(ctx))
                return ctx;
+        ctx->flags = ei->i_crypt_policy_flags;
        if (ctx->has_valid_key) {
                if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) {
                        printk_once(KERN_WARNING
@@ -517,6 +471,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
                                      u32 namelen)
 {
        u32 ciphertext_len;
+        int padding;
        if (ctx == NULL)
                return -EIO;
+        /* Read the policy flags only once ctx is known to be non-NULL. */
+        padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
@@ -524,6 +479,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
                return -EACCES;
        ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ?
                EXT4_CRYPTO_BLOCK_SIZE : namelen;
+        ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
        ciphertext_len = (ciphertext_len > ctx->lim)
                        ? ctx->lim : ciphertext_len;
        return (int) ciphertext_len;
@@ -539,10 +495,13 @@ int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
                                   u32 ilen, struct ext4_str *crypto_str)
 {
        unsigned int olen;
+        int padding;
        if (!ctx)
                return -EIO;
-        olen = ext4_fname_crypto_round_up(ilen, EXT4_CRYPTO_BLOCK_SIZE);
+        /* ctx is non-NULL from here on, so its flags are safe to read. */
+        padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
+        if (padding < EXT4_CRYPTO_BLOCK_SIZE)
+                padding = EXT4_CRYPTO_BLOCK_SIZE;
+        olen = ext4_fname_crypto_round_up(ilen, padding);
        crypto_str->len = olen;
        if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
                olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
@@ -571,9 +530,13 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
 * ext4_fname_disk_to_usr() - converts a filename from disk space to user space
 */
 int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
-                           const struct ext4_str *iname,
+                            struct dx_hash_info *hinfo,
-                           struct ext4_str *oname)
+                            const struct ext4_str *iname,
+                            struct ext4_str *oname)
 {
+        char buf[24];
+        int ret;
        if (ctx == NULL)
                return -EIO;
        if (iname->len < 3) {
@@ -587,18 +550,33 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
        }
        if (ctx->has_valid_key)
                return ext4_fname_decrypt(ctx, iname, oname);
-        else
-                return ext4_fname_hash(ctx, iname, oname);
+        if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
+                ret = digest_encode(iname->name, iname->len, oname->name);
+                oname->len = ret;
+                return ret;
+        }
+        if (hinfo) {
+                memcpy(buf, &hinfo->hash, 4);
+                memcpy(buf+4, &hinfo->minor_hash, 4);
+        } else
+                memset(buf, 0, 8);
+        memcpy(buf + 8, iname->name + iname->len - 16, 16);
+        oname->name[0] = '_';
+        ret = digest_encode(buf, 24, oname->name+1);
+        oname->len = ret + 1;
+        return ret + 1;
 }
 int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+                           struct dx_hash_info *hinfo,
                           const struct ext4_dir_entry_2 *de,
                           struct ext4_str *oname)
 {
        struct ext4_str iname = {.name = (unsigned char *) de->name,
                                 .len = de->name_len };
-        return _ext4_fname_disk_to_usr(ctx, &iname, oname);
+        return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname);
 }
@@ -640,10 +618,11 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
                            const struct qstr *iname,
                            struct dx_hash_info *hinfo)
 {
-        struct ext4_str tmp, tmp2;
+        struct ext4_str tmp;
        int ret = 0;
+        char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1];
-        if (!ctx || !ctx->has_valid_key ||
+        if (!ctx ||
            ((iname->name[0] == '.') &&
             ((iname->len == 1) ||
              ((iname->name[1] == '.') && (iname->len == 2))))) {
@@ -651,59 +630,90 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
                return 0;
        }
+        if (!ctx->has_valid_key && iname->name[0] == '_') {
+                if (iname->len != 33)
+                        return -ENOENT;
+                /* The leading '_' is a marker, not part of the digest. */
+                ret = digest_decode(iname->name+1, iname->len - 1, buf);
+                if (ret != 24)
+                        return -ENOENT;
+                memcpy(&hinfo->hash, buf, 4);
+                memcpy(&hinfo->minor_hash, buf + 4, 4);
+                return 0;
+        }
+        if (!ctx->has_valid_key && iname->name[0] != '_') {
+                if (iname->len > 43)
+                        return -ENOENT;
+                ret = digest_decode(iname->name, iname->len, buf);
+                /* Never hand a negative (error) length to the hash. */
+                if (ret < 0)
+                        return -ENOENT;
+                ext4fs_dirhash(buf, ret, hinfo);
+                return 0;
+        }
        /* First encrypt the plaintext name */
        ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp);
        if (ret < 0)
                return ret;
        ret = ext4_fname_encrypt(ctx, iname, &tmp);
-        if (ret < 0)
+        if (ret >= 0) {
-                goto out;
+                ext4fs_dirhash(tmp.name, tmp.len, hinfo);
+                ret = 0;
-        tmp2.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1;
-        tmp2.name = kmalloc(tmp2.len + 1, GFP_KERNEL);
-        if (tmp2.name == NULL) {
-                ret = -ENOMEM;
-                goto out;
        }
-        ret = ext4_fname_hash(ctx, &tmp, &tmp2);
-        if (ret > 0)
-                ext4fs_dirhash(tmp2.name, tmp2.len, hinfo);
-        ext4_fname_crypto_free_buffer(&tmp2);
-out:
        ext4_fname_crypto_free_buffer(&tmp);
        return ret;
 }
-/**
+int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr,
- * ext4_fname_disk_to_htree() - converts a filename from disk space to htree-access string
+                     int len, const char * const name,
- */
+                     struct ext4_dir_entry_2 *de)
-int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx,
-                            const struct ext4_dir_entry_2 *de,
-                            struct dx_hash_info *hinfo)
 {
-        struct ext4_str iname = {.name = (unsigned char *) de->name,
+        int ret = -ENOENT;
-                                 .len = de->name_len};
+        int bigname = (*name == '_');
-        struct ext4_str tmp;
-        int ret;
-        if (!ctx ||
+        if (ctx->has_valid_key) {
-            ((iname.name[0] == '.') &&
+                if (cstr->name == NULL) {
-             ((iname.len == 1) ||
+                        struct qstr istr;
-              ((iname.name[1] == '.') && (iname.len == 2))))) {
-                ext4fs_dirhash(iname.name, iname.len, hinfo);
+                        ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr);
-                return 0;
+                        if (ret < 0)
+                                goto errout;
+                        istr.name = name;
+                        istr.len = len;
+                        ret = ext4_fname_encrypt(ctx, &istr, cstr);
+                        if (ret < 0)
+                                goto errout;
+                }
+        } else {
+                if (cstr->name == NULL) {
+                        cstr->name = kmalloc(32, GFP_KERNEL);
+                        if (cstr->name == NULL)
+                                return -ENOMEM;
+                        if ((bigname && (len != 33)) ||
+                            (!bigname && (len > 43)))
+                                goto errout;
+                        ret = digest_decode(name+bigname, len-bigname,
+                                            cstr->name);
+                        if (ret < 0) {
+                                ret = -ENOENT;
+                                goto errout;
+                        }
+                        cstr->len = ret;
+                }
+                if (bigname) {
+                        if (de->name_len < 16)
+                                return 0;
+                        ret = memcmp(de->name + de->name_len - 16,
+                                     cstr->name + 8, 16);
+                        return (ret == 0) ? 1 : 0;
+                }
        }
+        if (de->name_len != cstr->len)
-        tmp.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1;
+                return 0;
-        tmp.name = kmalloc(tmp.len + 1, GFP_KERNEL);
+        ret = memcmp(de->name, cstr->name, cstr->len);
-        if (tmp.name == NULL)
+        return (ret == 0) ? 1 : 0;
-                return -ENOMEM;
+errout:
+        kfree(cstr->name);
-        ret = ext4_fname_hash(ctx, &iname, &tmp);
+        cstr->name = NULL;
-        if (ret > 0)
-                ext4fs_dirhash(tmp.name, tmp.len, hinfo);
-        ext4_fname_crypto_free_buffer(&tmp);
        return ret;
 }
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index c8392af8abbb..52170d0b7c40 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -110,6 +110,7 @@ int ext4_generate_encryption_key(struct inode *inode)
        }
        res = 0;
+        ei->i_crypt_policy_flags = ctx.flags;
        if (S_ISREG(inode->i_mode))
                crypt_key->mode = ctx.contents_encryption_mode;
        else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index 30eaf9e9864a..a6d6291aea16 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -37,6 +37,8 @@ static int ext4_is_encryption_context_consistent_with_policy(
                return 0;
        return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
                        EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+                (ctx.flags ==
+                 policy->flags) &&
                (ctx.contents_encryption_mode ==
                 policy->contents_encryption_mode) &&
                (ctx.filenames_encryption_mode ==
@@ -56,25 +58,25 @@ static int ext4_create_encryption_context_from_policy(
                printk(KERN_WARNING
                       "%s: Invalid contents encryption mode %d\n", __func__,
                        policy->contents_encryption_mode);
-                res = -EINVAL;
+                return -EINVAL;
-                goto out;
        }
        if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
                printk(KERN_WARNING
                       "%s: Invalid filenames encryption mode %d\n", __func__,
                        policy->filenames_encryption_mode);
-                res = -EINVAL;
+                return -EINVAL;
-                goto out;
        }
+        if (policy->flags & ~EXT4_POLICY_FLAGS_VALID)
+                return -EINVAL;
        ctx.contents_encryption_mode = policy->contents_encryption_mode;
        ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+        ctx.flags = policy->flags;
        BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
        get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
        res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
                             EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
                             sizeof(ctx), 0);
-out:
        if (!res)
                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
        return res;
@@ -115,6 +117,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
        policy->version = 0;
        policy->contents_encryption_mode = ctx.contents_encryption_mode;
        policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+        policy->flags = ctx.flags;
        memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
               EXT4_KEY_DESCRIPTOR_SIZE);
        return 0;
@@ -176,6 +179,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child)
                                EXT4_ENCRYPTION_MODE_AES_256_XTS;
                        ctx.filenames_encryption_mode =
                                EXT4_ENCRYPTION_MODE_AES_256_CTS;
+                        ctx.flags = 0;
                        memset(ctx.master_key_descriptor, 0x42,
                               EXT4_KEY_DESCRIPTOR_SIZE);
                        res = 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 61db51a5ce4c..5665d82d2332 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -249,7 +249,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
                                } else {
                                        /* Directory is encrypted */
                                        err = ext4_fname_disk_to_usr(enc_ctx,
-                                                        de, &fname_crypto_str);
+                                                NULL, de, &fname_crypto_str);
                                        if (err < 0)
                                                goto errout;
                                        if (!dir_emit(ctx,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ef267adce19a..9a83f149ac85 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -911,6 +911,7 @@ struct ext4_inode_info {
        /* on-disk additional length */
        __u16 i_extra_isize;
+        char i_crypt_policy_flags;
        /* Indicate the inline data space. */
        u16 i_inline_off;
@@ -1066,12 +1067,6 @@ extern void ext4_set_bits(void *bm, int cur, int len);
 /* Metadata checksum algorithm codes */
 #define EXT4_CRC32C_CHKSUM              1
-/* Encryption algorithms */
-#define EXT4_ENCRYPTION_MODE_INVALID            0
-#define EXT4_ENCRYPTION_MODE_AES_256_XTS        1
-#define EXT4_ENCRYPTION_MODE_AES_256_GCM        2
-#define EXT4_ENCRYPTION_MODE_AES_256_CBC        3
 /*
 * Structure of the super block
 */
@@ -2093,9 +2088,11 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
 int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
                                   u32 ilen, struct ext4_str *crypto_str);
 int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+                            struct dx_hash_info *hinfo,
                            const struct ext4_str *iname,
                            struct ext4_str *oname);
 int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+                           struct dx_hash_info *hinfo,
                           const struct ext4_dir_entry_2 *de,
                           struct ext4_str *oname);
 int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
@@ -2104,11 +2101,12 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
 int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
                           const struct qstr *iname,
                           struct dx_hash_info *hinfo);
-int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx,
-                            const struct ext4_dir_entry_2 *de,
-                            struct dx_hash_info *hinfo);
 int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
                                      u32 namelen);
+int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr,
+                     int len, const char * const name,
+                     struct ext4_dir_entry_2 *de);
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx);
@@ -2891,7 +2889,6 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
                                         ext4_lblk_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
                                                   struct ext4_ext_path *path);
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
index c2ba35a914b6..d75159c101ce 100644
--- a/fs/ext4/ext4_crypto.h
+++ b/fs/ext4/ext4_crypto.h
@@ -20,12 +20,20 @@ struct ext4_encryption_policy {
        char version;
        char contents_encryption_mode;
        char filenames_encryption_mode;
+        char flags;
        char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
 } __attribute__((__packed__));
 #define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1
 #define EXT4_KEY_DERIVATION_NONCE_SIZE 16
+#define EXT4_POLICY_FLAGS_PAD_4         0x00
+#define EXT4_POLICY_FLAGS_PAD_8         0x01
+#define EXT4_POLICY_FLAGS_PAD_16        0x02
+#define EXT4_POLICY_FLAGS_PAD_32        0x03
+#define EXT4_POLICY_FLAGS_PAD_MASK      0x03
+#define EXT4_POLICY_FLAGS_VALID         0x03
 /**
 * Encryption context for inode
 *
@@ -41,7 +49,7 @@ struct ext4_encryption_context {
        char format;
        char contents_encryption_mode;
        char filenames_encryption_mode;
-        char reserved;
+        char flags;
        char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
        char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE];
 } __attribute__((__packed__));
@@ -120,6 +128,7 @@ struct ext4_fname_crypto_ctx {
        struct crypto_hash *htfm;
        struct page *workpage;
        struct ext4_encryption_key key;
+        unsigned flags : 8;
        unsigned has_valid_key : 1;
        unsigned ctfm_key_is_ready : 1;
 };
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3445035c7e01..d41843181818 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -87,6 +87,12 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
                ext4_put_nojournal(handle);
                return 0;
        }
+        if (!handle->h_transaction) {
+                /* Read h_err first: the handle is gone after the stop. */
+                err = handle->h_err;
+                rc = jbd2_journal_stop(handle);
+                return err ? err : rc;
+        }
        sb = handle->h_transaction->t_journal->j_private;
        err = handle->h_err;
        rc = jbd2_journal_stop(handle);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 973816bfe4a9..e003a1e81dc3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -377,7 +377,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
        ext4_lblk_t last = lblock + len - 1;
-        if (lblock > last)
+        if (len == 0 || lblock > last)
                return 0;
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
@@ -4927,13 +4927,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (ret)
                return ret;
-        /*
-         * currently supporting (pre)allocate mode for extent-based
-         * files _only_
-         */
-        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-                return -EOPNOTSUPP;
        if (mode & FALLOC_FL_COLLAPSE_RANGE)
                return ext4_collapse_range(inode, offset, len);
@@ -4955,6 +4948,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        mutex_lock(&inode->i_mutex);
+        /*
+         * We support preallocation for extent-based files only
+         */
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+                ret = -EOPNOTSUPP;
+                goto out;
+        }
        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
             offset + len > i_size_read(inode)) {
                new_size = offset + len;
@@ -5395,6 +5396,14 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
        loff_t new_size, ioffset;
        int ret;
+        /*
+         * We need to test this early because xfstests assumes that a
+         * collapse range of (0, 1) will return EOPNOTSUPP if the file
+         * system does not support collapse range.
+         */
+        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+                return -EOPNOTSUPP;
        /* Collapse range works only on fs block size aligned offsets. */
        if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
            len & (EXT4_CLUSTER_SIZE(sb) - 1))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d33d5a6852b9..26724aeece73 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -703,6 +703,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
        BUG_ON(end < lblk);
+        if ((status & EXTENT_STATUS_DELAYED) &&
+            (status & EXTENT_STATUS_WRITTEN)) {
+                ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
+                                " delayed and written which can potentially "
+                                " cause data loss.\n", lblk, len);
+                WARN_ON(1);
+        }
        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, pblk, status);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cbd0654a2675..0554b0b5957b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -531,6 +531,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_find_delalloc_range(inode, map->m_lblk,
                                             map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
@@ -635,6 +636,7 @@ found:
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_find_delalloc_range(inode, map->m_lblk,
                                             map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
@@ -4343,7 +4345,7 @@ static void ext4_update_other_inodes_time(struct super_block *sb,
        int inode_size = EXT4_INODE_SIZE(sb);
        oi.orig_ino = orig_ino;
-        ino = orig_ino & ~(inodes_per_block - 1);
+        /* Inode numbers are 1-based: round (ino - 1) down, then add 1. */
+        ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
        for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
                if (ino == orig_ino)
                        continue;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 7223b0b4bc38..814f3beb4369 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -640,7 +640,7 @@ static struct stats dx_show_leaf(struct inode *dir,
                                                ext4_put_fname_crypto_ctx(&ctx);
                                                ctx = NULL;
                                        }
-                                        res = ext4_fname_disk_to_usr(ctx, de,
+                                        res = ext4_fname_disk_to_usr(ctx, NULL, de,
                                                        &fname_crypto_str);
                                        if (res < 0) {
                                                printk(KERN_WARNING "Error "
@@ -653,15 +653,8 @@ static struct stats dx_show_leaf(struct inode *dir,
                                                name = fname_crypto_str.name;
                                                len = fname_crypto_str.len;
                                        }
-                                        res = ext4_fname_disk_to_hash(ctx, de,
+                                        ext4fs_dirhash(de->name, de->name_len,
-                                                                      &h);
+                                                       &h);
-                                        if (res < 0) {
-                                                printk(KERN_WARNING "Error "
-                                                        "converting filename "
-                                                        "from disk to htree"
-                                                        "\n");
-                                                h.hash = 0xDEADBEEF;
-                                        }
                                        printk("%*.s:(E)%x.%u ", len, name,
                                               h.hash, (unsigned) ((char *) de
                                                                   - base));
@@ -1008,15 +1001,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                        /* silently ignore the rest of the block */
                        break;
                }
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-                err = ext4_fname_disk_to_hash(ctx, de, hinfo);
-                if (err < 0) {
-                        count = err;
-                        goto errout;
-                }
-#else
                ext4fs_dirhash(de->name, de->name_len, hinfo);
-#endif
                if ((hinfo->hash < start_hash) ||
                    ((hinfo->hash == start_hash) &&
                     (hinfo->minor_hash < start_minor_hash)))
@@ -1032,7 +1017,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                                   &tmp_str);
                } else {
                        /* Directory is encrypted */
-                        err = ext4_fname_disk_to_usr(ctx, de,
+                        err = ext4_fname_disk_to_usr(ctx, hinfo, de,
                                                     &fname_crypto_str);
                        if (err < 0) {
                                count = err;
@@ -1193,26 +1178,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
        int count = 0;
        char *base = (char *) de;
        struct dx_hash_info h = *hinfo;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-        struct ext4_fname_crypto_ctx *ctx = NULL;
-        int err;
-        ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
-        if (IS_ERR(ctx))
-                return PTR_ERR(ctx);
-#endif
        while ((char *) de < base + blocksize) {
                if (de->name_len && de->inode) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-                        err = ext4_fname_disk_to_hash(ctx, de, &h);
-                        if (err < 0) {
-                                ext4_put_fname_crypto_ctx(&ctx);
-                                return err;
-                        }
-#else
                        ext4fs_dirhash(de->name, de->name_len, &h);
-#endif
                        map_tail--;
                        map_tail->hash = h.hash;
                        map_tail->offs = ((char *) de - base)>>2;
@@ -1223,9 +1192,6 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
                /* XXX: do we need to check rec_len == 0 case? -Chris */
                de = ext4_next_entry(de, blocksize);
        }
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-        ext4_put_fname_crypto_ctx(&ctx);
-#endif
        return count;
 }
@@ -1287,16 +1253,8 @@ static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx,
                return 0;
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-        if (ctx) {
+        if (ctx)
-                /* Directory is encrypted */
+                return ext4_fname_match(ctx, fname_crypto_str, len, name, de);
-                res = ext4_fname_disk_to_usr(ctx, de, fname_crypto_str);
-                if (res < 0)
-                        return res;
-                if (len != res)
-                        return 0;
-                res = memcmp(name, fname_crypto_str->name, len);
-                return (res == 0) ? 1 : 0;
-        }
 #endif
        if (len != de->name_len)
                return 0;
@@ -1324,16 +1282,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
        if (IS_ERR(ctx))
                return -1;
-        if (ctx != NULL) {
-                /* Allocate buffer to hold maximum name length */
-                res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
-                                                     &fname_crypto_str);
-                if (res < 0) {
-                        ext4_put_fname_crypto_ctx(&ctx);
-                        return -1;
-                }
-        }
        de = (struct ext4_dir_entry_2 *)search_buf;
        dlimit = search_buf + buf_size;
        while ((char *) de < dlimit) {
@@ -1872,14 +1820,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
                        return res;
                }
                reclen = EXT4_DIR_REC_LEN(res);
-                /* Allocate buffer to hold maximum name length */
-                res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
-                                                     &fname_crypto_str);
-                if (res < 0) {
-                        ext4_put_fname_crypto_ctx(&ctx);
-                        return -1;
-                }
        }
        de = (struct ext4_dir_entry_2 *)buf;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 8a8ec6293b19..cf0c472047e3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1432,12 +1432,15 @@ static int ext4_flex_group_add(struct super_block *sb,
                goto exit;
        /*
         * We will always be modifying at least the superblock and  GDT
-         * block.  If we are adding a group past the last current GDT block,
+         * blocks.  If we are adding a group past the last current GDT block,
         * we will also modify the inode and the dindirect block.  If we
         * are adding a group with superblock/GDT backups  we will also
         * modify each of the reserved GDT dindirect blocks.
         */
-        credit = flex_gd->count * 4 + reserved_gdb;
+        credit = 3;     /* sb, resize inode, resize inode dindirect */
+        /* GDT blocks */
+        credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb));
+        credit += reserved_gdb; /* Reserved GDT dindirect blocks */
        handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f06d0589ddba..ca9d4a2fed41 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -294,6 +294,8 @@ static void __save_error_info(struct super_block *sb, const char *func,
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+        if (bdev_read_only(sb->s_bdev))
+                return;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
        es->s_last_error_time = cpu_to_le32(get_seconds());
        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 19f78f20975e..187b78920314 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -74,7 +74,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
                goto errout;
        }
        pstr.name = paddr;
-        res = _ext4_fname_disk_to_usr(ctx, &cstr, &pstr);
+        res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr);
        if (res < 0)
                goto errout;
        /* Null-terminate the name */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b91b0e10678e..1e1aae669fa8 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1513,6 +1513,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 {
        struct inode *inode = mapping->host;
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+        bool locked = false;
        int ret;
        long diff;
@@ -1533,7 +1534,13 @@ static int f2fs_write_data_pages(struct address_space *mapping,
        diff = nr_pages_to_write(sbi, DATA, wbc);
+        if (!S_ISDIR(inode->i_mode)) {
+                mutex_lock(&sbi->writepages);
+                locked = true;
+        }
        ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
+        if (locked)
+                mutex_unlock(&sbi->writepages);
        f2fs_submit_merged_bio(sbi, DATA, WRITE);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d8921cf2ba9a..8de34ab6d5b1 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -625,6 +625,7 @@ struct f2fs_sb_info {
        struct mutex cp_mutex;                  /* checkpoint procedure lock */
        struct rw_semaphore cp_rwsem;           /* blocking FS operations */
        struct rw_semaphore node_write;         /* locking node writes */
+        struct mutex writepages;                /* mutex for writepages() */
        wait_queue_head_t cp_wait;
        struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 7e3794edae42..658e8079aaf9 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -298,16 +298,14 @@ fail:
 static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-        struct page *page;
+        struct page *page = page_follow_link_light(dentry, nd);
-        page = page_follow_link_light(dentry, nd);
+        if (IS_ERR_OR_NULL(page))
-        if (IS_ERR(page))
                return page;
        /* this is broken symlink case */
        if (*nd_get_link(nd) == 0) {
-                kunmap(page);
+                page_put_link(dentry, nd, page);
-                page_cache_release(page);
                return ERR_PTR(-ENOENT);
        }
        return page;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 160b88346b24..b2dd1b01f076 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1035,6 +1035,7 @@ try_onemore:
        sbi->raw_super = raw_super;
        sbi->raw_super_buf = raw_super_buf;
        mutex_init(&sbi->gc_mutex);
+        mutex_init(&sbi->writepages);
        mutex_init(&sbi->cp_mutex);
        init_rwsem(&sbi->node_write);
        clear_sbi_flag(sbi, SBI_POR_DOING);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index ef263174acd2..07d8d8f52faf 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -581,7 +581,7 @@ static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
        if (name == NULL)
                goto out_put;
-        fd = file_create(name, mode & S_IFMT);
+        fd = file_create(name, mode & 0777);
        if (fd < 0)
                error = fd;
        else
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index b5128c6e63ad..a9079d035ae5 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -842,15 +842,23 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 {
        jbd2_journal_revoke_header_t *header;
        int offset, max;
+        int csum_size = 0;
+        __u32 rcount;
        int record_len = 4;
        header = (jbd2_journal_revoke_header_t *) bh->b_data;
        offset = sizeof(jbd2_journal_revoke_header_t);
-        max = be32_to_cpu(header->r_count);
+        rcount = be32_to_cpu(header->r_count);
        if (!jbd2_revoke_block_csum_verify(journal, header))
                return -EINVAL;
+        if (jbd2_journal_has_csum_v2or3(journal))
+                csum_size = sizeof(struct jbd2_journal_revoke_tail);
+        if (rcount > journal->j_blocksize - csum_size)
+                return -EINVAL;
+        max = rcount;
        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
                record_len = 8;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index c6cbaef2bda1..14214da80eb8 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -577,7 +577,7 @@ static void write_one_revoke_record(journal_t *journal,
 {
        int csum_size = 0;
        struct buffer_head *descriptor;
-        int offset;
+        int sz, offset;
        journal_header_t *header;
        /* If we are already aborting, this all becomes a noop.  We
@@ -594,9 +594,14 @@ static void write_one_revoke_record(journal_t *journal,
        if (jbd2_journal_has_csum_v2or3(journal))
                csum_size = sizeof(struct jbd2_journal_revoke_tail);
+        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+                sz = 8;
+        else
+                sz = 4;
        /* Make sure we have a descriptor with space left for the record */
        if (descriptor) {
-                if (offset >= journal->j_blocksize - csum_size) {
+                if (offset + sz > journal->j_blocksize - csum_size) {
                        flush_descriptor(journal, descriptor, offset, write_op);
                        descriptor = NULL;
                }
@@ -619,16 +624,13 @@ static void write_one_revoke_record(journal_t *journal,
                *descriptorp = descriptor;
        }
-        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
+        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
                * ((__be64 *)(&descriptor->b_data[offset])) =
                        cpu_to_be64(record->blocknr);
-                offset += 8;
+        else
-        } else {
                * ((__be32 *)(&descriptor->b_data[offset])) =
                        cpu_to_be32(record->blocknr);
-                offset += 4;
+        offset += sz;
-        }
        *offsetp = offset;
 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5f09370c90a8..ff2f2e6ad311 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -551,7 +551,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
        int result;
        int wanted;
-        WARN_ON(!transaction);
        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;
@@ -627,7 +626,6 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
        tid_t           tid;
        int             need_to_start, ret;
-        WARN_ON(!transaction);
        /* If we've had an abort of any type, don't even think about
         * actually doing the restart! */
        if (is_handle_aborted(handle))
@@ -785,7 +783,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
        int need_copy = 0;
        unsigned long start_lock, time_lock;
-        WARN_ON(!transaction);
        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;
@@ -1051,7 +1048,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
        int err;
        jbd_debug(5, "journal_head %p\n", jh);
-        WARN_ON(!transaction);
        err = -EROFS;
        if (is_handle_aborted(handle))
                goto out;
@@ -1266,7 +1262,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
        struct journal_head *jh;
        int ret = 0;
-        WARN_ON(!transaction);
        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;
@@ -1397,7 +1392,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
        int err = 0;
        int was_modified = 0;
-        WARN_ON(!transaction);
        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;
@@ -1530,8 +1524,22 @@ int jbd2_journal_stop(handle_t *handle)
        tid_t tid;
        pid_t pid;
-        if (!transaction)
+        if (!transaction) {
-                goto free_and_exit;
+                /*
+                 * The handle is already detached from the transaction, so
+                 * there is nothing to do other than drop a reference,
+                 * or free the handle if the refcount drops to zero.
+                 */
+                if (--handle->h_ref > 0) {
+                        jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+                                                         handle->h_ref);
+                        return err;
+                } else {
+                        if (handle->h_rsv_handle)
+                                jbd2_free_handle(handle->h_rsv_handle);
+                        goto free_and_exit;
+                }
+        }
        journal = transaction->t_journal;
        J_ASSERT(journal_current_handle() == handle);
@@ -2373,7 +2381,6 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
-        WARN_ON(!transaction);
        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index f131fc23ffc4..fffca9517321 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -518,7 +518,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
        if (!kn)
                goto err_out1;
-        ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
+        /*
+         * If the ino of the sysfs entry created for a kmem cache gets
+         * allocated from an ida layer, which is accounted to the memcg that
+         * owns the cache, the memcg will get pinned forever. So do not account
+         * ino ida allocations.
+         */
+        ret = ida_simple_get(&root->ino_ida, 1, 0,
+                             GFP_KERNEL | __GFP_NOACCOUNT);
        if (ret < 0)
                goto err_out2;
        kn->ino = ret;
diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..fe30d3be43a8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1415,6 +1415,7 @@ static int lookup_fast(struct nameidata *nd,
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
+                bool negative;
                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
                if (!dentry)
                        goto unlazy;
@@ -1424,8 +1425,11 @@ static int lookup_fast(struct nameidata *nd,
                 * the dentry name information from lookup.
                 */
                *inode = dentry->d_inode;
+                negative = d_is_negative(dentry);
                if (read_seqcount_retry(&dentry->d_seq, seq))
                        return -ECHILD;
+                if (negative)
+                        return -ENOENT;
                /*
                 * This sequence count validates that the parent had no
@@ -1472,6 +1476,10 @@ unlazy:
                goto need_lookup;
        }
+        if (unlikely(d_is_negative(dentry))) {
+                dput(dentry);
+                return -ENOENT;
+        }
        path->mnt = mnt;
        path->dentry = dentry;
        err = follow_managed(path, nd->flags);
@@ -1583,10 +1591,10 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
                        goto out_err;
                inode = path->dentry->d_inode;
+                err = -ENOENT;
+                if (d_is_negative(path->dentry))
+                        goto out_path_put;
        }
-        err = -ENOENT;
-        if (d_is_negative(path->dentry))
-                goto out_path_put;
        if (should_follow_link(path->dentry, follow)) {
                if (nd->flags & LOOKUP_RCU) {
@@ -3036,14 +3044,13 @@ retry_lookup:
        BUG_ON(nd->flags & LOOKUP_RCU);
        inode = path->dentry->d_inode;
-finish_lookup:
-        /* we _can_ be in RCU mode here */
        error = -ENOENT;
        if (d_is_negative(path->dentry)) {
                path_to_nameidata(path, nd);
                goto out;
        }
+finish_lookup:
+        /* we _can_ be in RCU mode here */
        if (should_follow_link(path->dentry, !symlink_ok)) {
                if (nd->flags & LOOKUP_RCU) {
                        if (unlikely(nd->path.mnt != path->mnt ||
@@ -3226,7 +3233,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
        if (unlikely(file->f_flags & __O_TMPFILE)) {
                error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
-                goto out;
+                goto out2;
        }
        error = path_init(dfd, pathname, flags, nd);
@@ -3256,6 +3263,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
        }
 out:
        path_cleanup(nd);
+out2:
        if (!(opened & FILE_OPENED)) {
                BUG_ON(!error);
                put_filp(file);
diff --git a/fs/namespace.c b/fs/namespace.c
index 1f4f9dac6e5a..1b9e11167bae 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3179,6 +3179,12 @@ bool fs_fully_visible(struct file_system_type *type)
                if (mnt->mnt.mnt_sb->s_type != type)
                        continue;
+                /* This mount is not fully visible if its root directory
+                 * is not the root directory of the filesystem.
+                 */
+                if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
+                        continue;
                /* This mount is not fully visible if there are any child mounts
                 * that cover anything except for empty directories.
                 */
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 03d647bf195d..cdefaa331a07 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -181,6 +181,17 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
 }
 const struct nfsd4_layout_ops bl_layout_ops = {
+        /*
+         * Pretend that we send notification to the client.  This is a blatant
+         * lie to force recent Linux clients to cache our device IDs.
+         * We rarely ever change the device ID, so the harm of leaking deviceids
+         * for a while isn't too bad.  Unfortunately RFC5661 is a complete mess
+         * in this regard, but I filed errata 4119 for this a while ago, and
+         * hopefully the Linux client will eventually start caching deviceids
+         * without this again.
+         */
+        .notify_types           =
+                        NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
        .proc_getdeviceinfo     = nfsd4_block_proc_getdeviceinfo,
        .encode_getdeviceinfo   = nfsd4_block_encode_getdeviceinfo,
        .proc_layoutget         = nfsd4_block_proc_layoutget,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 58277859a467..5694cfb7a47b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -224,7 +224,7 @@ static int nfs_cb_stat_to_errno(int status)
 }
 static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
-                               enum nfsstat4 *status)
+                               int *status)
 {
        __be32 *p;
        u32 op;
@@ -235,7 +235,7 @@ static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
        op = be32_to_cpup(p++);
        if (unlikely(op != expected))
                goto out_unexpected;
-        *status = be32_to_cpup(p);
+        *status = nfs_cb_stat_to_errno(be32_to_cpup(p));
        return 0;
 out_overflow:
        print_overflow_msg(__func__, xdr);
@@ -446,22 +446,16 @@ out_overflow:
 static int decode_cb_sequence4res(struct xdr_stream *xdr,
                                  struct nfsd4_callback *cb)
 {
-        enum nfsstat4 nfserr;
        int status;
        if (cb->cb_minorversion == 0)
                return 0;
-        status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
+        status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status);
-        if (unlikely(status))
+        if (unlikely(status || cb->cb_status))
-                goto out;
+                return status;
-        if (unlikely(nfserr != NFS4_OK))
-                goto out_default;
+        return decode_cb_sequence4resok(xdr, cb);
-        status = decode_cb_sequence4resok(xdr, cb);
-out:
-        return status;
-out_default:
-        return nfs_cb_stat_to_errno(nfserr);
 }
 /*
@@ -524,26 +518,19 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
                                  struct nfsd4_callback *cb)
 {
        struct nfs4_cb_compound_hdr hdr;
-        enum nfsstat4 nfserr;
        int status;
        status = decode_cb_compound4res(xdr, &hdr);
        if (unlikely(status))
-                goto out;
+                return status;
        if (cb != NULL) {
                status = decode_cb_sequence4res(xdr, cb);
-                if (unlikely(status))
+                if (unlikely(status || cb->cb_status))
-                        goto out;
+                        return status;
        }
-        status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
+        return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
-        if (unlikely(status))
-                goto out;
-        if (unlikely(nfserr != NFS4_OK))
-                status = nfs_cb_stat_to_errno(nfserr);
-out:
-        return status;
 }
 #ifdef CONFIG_NFSD_PNFS
@@ -621,24 +608,18 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
                                  struct nfsd4_callback *cb)
 {
        struct nfs4_cb_compound_hdr hdr;
-        enum nfsstat4 nfserr;
        int status;
        status = decode_cb_compound4res(xdr, &hdr);
        if (unlikely(status))
-                goto out;
+                return status;
        if (cb) {
                status = decode_cb_sequence4res(xdr, cb);
-                if (unlikely(status))
+                if (unlikely(status || cb->cb_status))
-                        goto out;
+                        return status;
        }
-        status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+        return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
-        if (unlikely(status))
-                goto out;
-        if (unlikely(nfserr != NFS4_OK))
-                status = nfs_cb_stat_to_errno(nfserr);
-out:
-        return status;
 }
 #endif /* CONFIG_NFSD_PNFS */
@@ -898,13 +879,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
                if (!nfsd41_cb_get_slot(clp, task))
                        return;
        }
-        spin_lock(&clp->cl_lock);
-        if (list_empty(&cb->cb_per_client)) {
-                /* This is the first call, not a restart */
-                cb->cb_done = false;
-                list_add(&cb->cb_per_client, &clp->cl_callbacks);
-        }
-        spin_unlock(&clp->cl_lock);
        rpc_call_start(task);
 }
@@ -918,22 +892,33 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
        if (clp->cl_minorversion) {
                /* No need for lock, access serialized in nfsd4_cb_prepare */
-                ++clp->cl_cb_session->se_cb_seq_nr;
+                if (!task->tk_status)
+                        ++clp->cl_cb_session->se_cb_seq_nr;
                clear_bit(0, &clp->cl_cb_slot_busy);
                rpc_wake_up_next(&clp->cl_cb_waitq);
                dprintk("%s: freed slot, new seqid=%d\n", __func__,
                        clp->cl_cb_session->se_cb_seq_nr);
        }
-        if (clp->cl_cb_client != task->tk_client) {
+        /*
-                /* We're shutting down or changing cl_cb_client; leave
+         * If the backchannel connection was shut down while this
-                 * it to nfsd4_process_cb_update to restart the call if
+         * task was queued, we need to resubmit it after setting up
-                 * necessary. */
+         * a new backchannel connection.
+         *
+         * Note that if we lost our callback connection permanently
+         * the submission code will error out, so we don't need to
+         * handle that case here.
+         */
+        if (task->tk_flags & RPC_TASK_KILLED) {
+                task->tk_status = 0;
+                cb->cb_need_restart = true;
                return;
        }
-        if (cb->cb_done)
+        if (cb->cb_status) {
-                return;
+                WARN_ON_ONCE(task->tk_status);
+                task->tk_status = cb->cb_status;
+        }
        switch (cb->cb_ops->done(cb, task)) {
        case 0:
@@ -949,21 +934,17 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
        default:
                BUG();
        }
-        cb->cb_done = true;
 }
 static void nfsd4_cb_release(void *calldata)
 {
        struct nfsd4_callback *cb = calldata;
-        struct nfs4_client *clp = cb->cb_clp;
-        if (cb->cb_done) {
-                spin_lock(&clp->cl_lock);
-                list_del(&cb->cb_per_client);
-                spin_unlock(&clp->cl_lock);
+        if (cb->cb_need_restart)
+                nfsd4_run_cb(cb);
+        else
                cb->cb_ops->release(cb);
-        }
 }
 static const struct rpc_call_ops nfsd4_cb_ops = {
@@ -1058,9 +1039,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
                nfsd4_mark_cb_down(clp, err);
                return;
        }
-        /* Yay, the callback channel's back! Restart any callbacks: */
-        list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
-                queue_work(callback_wq, &cb->cb_work);
 }
 static void
@@ -1071,8 +1049,12 @@ nfsd4_run_cb_work(struct work_struct *work)
        struct nfs4_client *clp = cb->cb_clp;
        struct rpc_clnt *clnt;
-        if (cb->cb_ops && cb->cb_ops->prepare)
+        if (cb->cb_need_restart) {
-                cb->cb_ops->prepare(cb);
+                cb->cb_need_restart = false;
+        } else {
+                if (cb->cb_ops && cb->cb_ops->prepare)
+                        cb->cb_ops->prepare(cb);
+        }
        if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
                nfsd4_process_cb_update(cb);
@@ -1084,6 +1066,15 @@ nfsd4_run_cb_work(struct work_struct *work)
                        cb->cb_ops->release(cb);
                return;
        }
+        /*
+         * Don't send probe messages for 4.1 or later.
+         */
+        if (!cb->cb_ops && clp->cl_minorversion) {
+                clp->cl_cb_state = NFSD4_CB_UP;
+                return;
+        }
        cb->cb_msg.rpc_cred = clp->cl_cb_cred;
        rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
                        cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
@@ -1098,8 +1089,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
        cb->cb_msg.rpc_resp = cb;
        cb->cb_ops = ops;
        INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
-        INIT_LIST_HEAD(&cb->cb_per_client);
+        cb->cb_status = 0;
-        cb->cb_done = true;
+        cb->cb_need_restart = false;
 }
 void nfsd4_run_cb(struct nfsd4_callback *cb)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 38f2d7abe3a7..039f9c8a95e8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -94,6 +94,7 @@ static struct kmem_cache *lockowner_slab;
 static struct kmem_cache *file_slab;
 static struct kmem_cache *stateid_slab;
 static struct kmem_cache *deleg_slab;
+static struct kmem_cache *odstate_slab;
 static void free_session(struct nfsd4_session *);
@@ -281,6 +282,7 @@ put_nfs4_file(struct nfs4_file *fi)
        if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
                hlist_del_rcu(&fi->fi_hash);
                spin_unlock(&state_lock);
+                WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
                WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
                call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
        }
@@ -471,6 +473,86 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
                __nfs4_file_put_access(fp, O_RDONLY);
 }
+/*
+ * Allocate a new open/delegation state counter. pNFS needs this to
+ * implement return-on-close semantics correctly.
+ *
+ * Note that we only allocate it for pNFS-enabled exports, otherwise
+ * all pointers to struct nfs4_clnt_odstate are always NULL.
+ */
+static struct nfs4_clnt_odstate *
+alloc_clnt_odstate(struct nfs4_client *clp)
+{
+        struct nfs4_clnt_odstate *co;
+        co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL);
+        if (co) {
+                co->co_client = clp;
+                atomic_set(&co->co_odcount, 1);
+        }
+        return co;
+}
+static void
+hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co)
+{
+        struct nfs4_file *fp = co->co_file;
+        lockdep_assert_held(&fp->fi_lock);
+        list_add(&co->co_perfile, &fp->fi_clnt_odstate);
+}
+static inline void
+get_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+        if (co)
+                atomic_inc(&co->co_odcount);
+}
+static void
+put_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+        struct nfs4_file *fp;
+        if (!co)
+                return;
+        fp = co->co_file;
+        if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) {
+                list_del(&co->co_perfile);
+                spin_unlock(&fp->fi_lock);
+                nfsd4_return_all_file_layouts(co->co_client, fp);
+                kmem_cache_free(odstate_slab, co);
+        }
+}
+static struct nfs4_clnt_odstate *
+find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new)
+{
+        struct nfs4_clnt_odstate *co;
+        struct nfs4_client *cl;
+        if (!new)
+                return NULL;
+        cl = new->co_client;
+        spin_lock(&fp->fi_lock);
+        list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) {
+                if (co->co_client == cl) {
+                        get_clnt_odstate(co);
+                        goto out;
+                }
+        }
+        co = new;
+        co->co_file = fp;
+        hash_clnt_odstate_locked(new);
+out:
+        spin_unlock(&fp->fi_lock);
+        return co;
+}
 struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
                                         struct kmem_cache *slab)
 {
@@ -606,7 +688,8 @@ static void block_delegations(struct knfsd_fh *fh)
 }
 static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
+alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
+                 struct nfs4_clnt_odstate *odstate)
 {
        struct nfs4_delegation *dp;
        long n;
@@ -631,6 +714,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
        INIT_LIST_HEAD(&dp->dl_perfile);
        INIT_LIST_HEAD(&dp->dl_perclnt);
        INIT_LIST_HEAD(&dp->dl_recall_lru);
+        dp->dl_clnt_odstate = odstate;
+        get_clnt_odstate(odstate);
        dp->dl_type = NFS4_OPEN_DELEGATE_READ;
        dp->dl_retries = 1;
        nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
@@ -714,6 +799,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
        spin_lock(&state_lock);
        unhash_delegation_locked(dp);
        spin_unlock(&state_lock);
+        put_clnt_odstate(dp->dl_clnt_odstate);
        nfs4_put_deleg_lease(dp->dl_stid.sc_file);
        nfs4_put_stid(&dp->dl_stid);
 }
@@ -724,6 +810,7 @@ static void revoke_delegation(struct nfs4_delegation *dp)
        WARN_ON(!list_empty(&dp->dl_recall_lru));
+        put_clnt_odstate(dp->dl_clnt_odstate);
        nfs4_put_deleg_lease(dp->dl_stid.sc_file);
        if (clp->cl_minorversion == 0)
@@ -933,6 +1020,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
 {
        struct nfs4_ol_stateid *stp = openlockstateid(stid);
+        put_clnt_odstate(stp->st_clnt_odstate);
        release_all_access(stp);
        if (stp->st_stateowner)
                nfs4_put_stateowner(stp->st_stateowner);
@@ -1538,7 +1626,6 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
        INIT_LIST_HEAD(&clp->cl_openowners);
        INIT_LIST_HEAD(&clp->cl_delegations);
        INIT_LIST_HEAD(&clp->cl_lru);
-        INIT_LIST_HEAD(&clp->cl_callbacks);
        INIT_LIST_HEAD(&clp->cl_revoked);
 #ifdef CONFIG_NFSD_PNFS
        INIT_LIST_HEAD(&clp->cl_lo_states);
@@ -1634,6 +1721,7 @@ __destroy_client(struct nfs4_client *clp)
        while (!list_empty(&reaplist)) {
                dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
                list_del_init(&dp->dl_recall_lru);
+                put_clnt_odstate(dp->dl_clnt_odstate);
                nfs4_put_deleg_lease(dp->dl_stid.sc_file);
                nfs4_put_stid(&dp->dl_stid);
        }
@@ -3057,6 +3145,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
        spin_lock_init(&fp->fi_lock);
        INIT_LIST_HEAD(&fp->fi_stateids);
        INIT_LIST_HEAD(&fp->fi_delegations);
+        INIT_LIST_HEAD(&fp->fi_clnt_odstate);
        fh_copy_shallow(&fp->fi_fhandle, fh);
        fp->fi_deleg_file = NULL;
        fp->fi_had_conflict = false;
@@ -3073,6 +3162,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
 void
 nfsd4_free_slabs(void)
 {
+        kmem_cache_destroy(odstate_slab);
        kmem_cache_destroy(openowner_slab);
        kmem_cache_destroy(lockowner_slab);
        kmem_cache_destroy(file_slab);
@@ -3103,8 +3193,14 @@ nfsd4_init_slabs(void)
                        sizeof(struct nfs4_delegation), 0, 0, NULL);
        if (deleg_slab == NULL)
                goto out_free_stateid_slab;
+        odstate_slab = kmem_cache_create("nfsd4_odstate",
+                        sizeof(struct nfs4_clnt_odstate), 0, 0, NULL);
+        if (odstate_slab == NULL)
+                goto out_free_deleg_slab;
        return 0;
+out_free_deleg_slab:
+        kmem_cache_destroy(deleg_slab);
 out_free_stateid_slab:
        kmem_cache_destroy(stateid_slab);
 out_free_file_slab:
@@ -3581,6 +3677,14 @@ alloc_stateid:
        open->op_stp = nfs4_alloc_open_stateid(clp);
        if (!open->op_stp)
                return nfserr_jukebox;
+        if (nfsd4_has_session(cstate) &&
+            (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) {
+                open->op_odstate = alloc_clnt_odstate(clp);
+                if (!open->op_odstate)
+                        return nfserr_jukebox;
+        }
        return nfs_ok;
 }
@@ -3869,7 +3973,7 @@ out_fput:
 static struct nfs4_delegation *
 nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
-                    struct nfs4_file *fp)
+                    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
 {
        int status;
        struct nfs4_delegation *dp;
@@ -3877,7 +3981,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
        if (fp->fi_had_conflict)
                return ERR_PTR(-EAGAIN);
-        dp = alloc_init_deleg(clp, fh);
+        dp = alloc_init_deleg(clp, fh, odstate);
        if (!dp)
                return ERR_PTR(-ENOMEM);
@@ -3903,6 +4007,7 @@ out_unlock:
        spin_unlock(&state_lock);
 out:
        if (status) {
+                put_clnt_odstate(dp->dl_clnt_odstate);
                nfs4_put_stid(&dp->dl_stid);
                return ERR_PTR(status);
        }
@@ -3980,7 +4085,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
                default:
                        goto out_no_deleg;
        }
-        dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file);
+        dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate);
        if (IS_ERR(dp))
                goto out_no_deleg;
@@ -4069,6 +4174,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
                        release_open_stateid(stp);
                        goto out;
                }
+                stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp,
+                                                        open->op_odstate);
+                if (stp->st_clnt_odstate == open->op_odstate)
+                        open->op_odstate = NULL;
        }
        update_stateid(&stp->st_stid.sc_stateid);
        memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -4129,6 +4239,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
                kmem_cache_free(file_slab, open->op_file);
        if (open->op_stp)
                nfs4_put_stid(&open->op_stp->st_stid);
+        if (open->op_odstate)
+                kmem_cache_free(odstate_slab, open->op_odstate);
 }
 __be32
@@ -4385,10 +4497,17 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s
        return nfserr_old_stateid;
 }
+static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols)
+{
+        if (ols->st_stateowner->so_is_open_owner &&
+            !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+                return nfserr_bad_stateid;
+        return nfs_ok;
+}
 static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 {
        struct nfs4_stid *s;
-        struct nfs4_ol_stateid *ols;
        __be32 status = nfserr_bad_stateid;
        if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
@@ -4418,13 +4537,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
                break;
        case NFS4_OPEN_STID:
        case NFS4_LOCK_STID:
-                ols = openlockstateid(s);
+                status = nfsd4_check_openowner_confirmed(openlockstateid(s));
-                if (ols->st_stateowner->so_is_open_owner
-                                && !(openowner(ols->st_stateowner)->oo_flags
-                                                & NFS4_OO_CONFIRMED))
-                        status = nfserr_bad_stateid;
-                else
-                        status = nfs_ok;
                break;
        default:
                printk("unknown stateid type %x\n", s->sc_type);
@@ -4516,8 +4629,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
                status = nfs4_check_fh(current_fh, stp);
                if (status)
                        goto out;
-                if (stp->st_stateowner->so_is_open_owner
+                status = nfsd4_check_openowner_confirmed(stp);
-                    && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+                if (status)
                        goto out;
                status = nfs4_check_openmode(stp, flags);
                if (status)
@@ -4852,9 +4965,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        update_stateid(&stp->st_stid.sc_stateid);
        memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
-        nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
-                                      stp->st_stid.sc_file);
        nfsd4_close_open_stateid(stp);
        /* put reference from nfs4_preprocess_seqid_op */
@@ -6488,6 +6598,7 @@ nfs4_state_shutdown_net(struct net *net)
        list_for_each_safe(pos, next, &reaplist) {
                dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
                list_del_init(&dp->dl_recall_lru);
+                put_clnt_odstate(dp->dl_clnt_odstate);
                nfs4_put_deleg_lease(dp->dl_stid.sc_file);
                nfs4_put_stid(&dp->dl_stid);
        }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4f3bfeb11766..dbc4f85a5008 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -63,12 +63,12 @@ typedef struct {
 struct nfsd4_callback {
        struct nfs4_client *cb_clp;
-        struct list_head cb_per_client;
        u32 cb_minorversion;
        struct rpc_message cb_msg;
        struct nfsd4_callback_ops *cb_ops;
        struct work_struct cb_work;
-        bool cb_done;
+        int cb_status;
+        bool cb_need_restart;
 };
 struct nfsd4_callback_ops {
@@ -126,6 +126,7 @@ struct nfs4_delegation {
        struct list_head        dl_perfile;
        struct list_head        dl_perclnt;
        struct list_head        dl_recall_lru;  /* delegation recalled */
+        struct nfs4_clnt_odstate *dl_clnt_odstate;
        u32                     dl_type;
        time_t                  dl_time;
 /* For recall: */
@@ -332,7 +333,6 @@ struct nfs4_client {
        int                     cl_cb_state;
        struct nfsd4_callback   cl_cb_null;
        struct nfsd4_session    *cl_cb_session;
-        struct list_head        cl_callbacks; /* list of in-progress callbacks */
        /* for all client information that callback code might need: */
        spinlock_t              cl_lock;
@@ -465,6 +465,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
 }
 /*
+ * Per-client state indicating no. of opens and outstanding delegations
+ * on a file from a particular client. 'od' stands for 'open & delegation'.
+ */
+struct nfs4_clnt_odstate {
+        struct nfs4_client      *co_client;
+        struct nfs4_file        *co_file;
+        struct list_head        co_perfile;
+        atomic_t                co_odcount;
+};
+/*
 * nfs4_file: a file opened by some number of (open) nfs4_stateowners.
 *
 * These objects are global. nfsd keeps one instance of a nfs4_file per
@@ -485,6 +496,7 @@ struct nfs4_file {
                struct list_head        fi_delegations;
                struct rcu_head         fi_rcu;
        };
+        struct list_head        fi_clnt_odstate;
        /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
        struct file *           fi_fds[3];
        /*
@@ -526,6 +538,7 @@ struct nfs4_ol_stateid {
        struct list_head              st_perstateowner;
        struct list_head              st_locks;
        struct nfs4_stateowner      * st_stateowner;
+        struct nfs4_clnt_odstate    * st_clnt_odstate;
        unsigned char                 st_access_bmap;
        unsigned char                 st_deny_bmap;
        struct nfs4_ol_stateid         * st_openstp;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index f982ae84f0cd..2f8c092be2b3 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -247,6 +247,7 @@ struct nfsd4_open {
        struct nfs4_openowner *op_openowner; /* used during processing */
        struct nfs4_file *op_file;          /* used during processing */
        struct nfs4_ol_stateid *op_stp;     /* used during processing */
+        struct nfs4_clnt_odstate *op_odstate; /* used during processing */
        struct nfs4_acl *op_acl;
        struct xdr_netobj op_label;
 };
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 059f37137f9a..919fd5bb14a8 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
        nchildren = nilfs_btree_node_get_nchildren(node);
        if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
-                     level > NILFS_BTREE_LEVEL_MAX ||
+                     level >= NILFS_BTREE_LEVEL_MAX ||
                     nchildren < 0 ||
                     nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
                pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a6944b25fd5b..fdf4b41d0609 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -757,6 +757,19 @@ lookup:
        if (tmpres) {
                spin_unlock(&dlm->spinlock);
                spin_lock(&tmpres->spinlock);
+                /*
+                 * Right after dlm spinlock was released, dlm_thread could have
+                 * purged the lockres. Check if lockres got unhashed. If so,
+                 * start over.
+                 */
+                if (hlist_unhashed(&tmpres->hash_node)) {
+                        spin_unlock(&tmpres->spinlock);
+                        dlm_lockres_put(tmpres);
+                        tmpres = NULL;
+                        goto lookup;
+                }
                /* Wait on the thread that is mastering the resource */
                if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                        __dlm_wait_on_lockres(tmpres);
diff --git a/fs/splice.c b/fs/splice.c
index 476024bb6546..bfe62ae40f40 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1161,7 +1161,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
        long ret, bytes;
        umode_t i_mode;
        size_t len;
-        int i, flags;
+        int i, flags, more;
        /*
         * We require the input being a regular file, as we don't want to
@@ -1204,6 +1204,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
         * Don't block on output, we have to drain the direct pipe.
         */
        sd->flags &= ~SPLICE_F_NONBLOCK;
+        more = sd->flags & SPLICE_F_MORE;
        while (len) {
                size_t read_len;
@@ -1217,6 +1218,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                sd->total_len = read_len;
                /*
+                 * If more data is pending, set SPLICE_F_MORE.
+                 * If this is the last data and SPLICE_F_MORE was not set
+                 * initially, clear it.
+                 */
+                if (read_len < len)
+                        sd->flags |= SPLICE_F_MORE;
+                else if (!more)
+                        sd->flags &= ~SPLICE_F_MORE;
+                /*
                 * NOTE: nonblocking mode only applies to the input. We
                 * must not do the output in nonblocking mode as then we
                 * could get stuck data in the internal pipe: