57 files changed, 918 insertions, 752 deletions
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0548c53f41d5..22fc7c802d69 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
        pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
                 page->index, to);
        BUG_ON(to > PAGE_CACHE_SIZE);
-        kmap(page);
-        data = page_address(page);
        bsize = AFFS_SB(sb)->s_data_blksize;
        tmp = page->index << PAGE_CACHE_SHIFT;
        bidx = tmp / bsize;
@@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
                        return PTR_ERR(bh);
                tmp = min(bsize - boff, to - pos);
                BUG_ON(pos + tmp > to || tmp > bsize);
+                data = kmap_atomic(page);
                memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
+                kunmap_atomic(data);
                affs_brelse(bh);
                bidx++;
                pos += tmp;
                boff = 0;
        }
        flush_dcache_page(page);
-        kunmap(page);
        return 0;
 }
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..75dd739ac3e6 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -537,8 +537,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
                ino->dentry = dentry;
                autofs4_add_active(dentry);
-                d_instantiate(dentry, NULL);
        }
        return NULL;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7b9cd49622b1..39b3a174a425 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1730,43 +1730,25 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return __dax_fault(vma, vmf, blkdev_get_block, NULL);
 }
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
-                pmd_t *pmd, unsigned int flags)
+                struct vm_fault *vmf)
-{
-        return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-static void blkdev_vm_open(struct vm_area_struct *vma)
 {
-        struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+        return dax_pfn_mkwrite(vma, vmf);
-        struct block_device *bdev = I_BDEV(bd_inode);
-        inode_lock(bd_inode);
-        bdev->bd_map_count++;
-        inode_unlock(bd_inode);
 }
-static void blkdev_vm_close(struct vm_area_struct *vma)
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+                pmd_t *pmd, unsigned int flags)
 {
-        struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+        return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-        struct block_device *bdev = I_BDEV(bd_inode);
-        inode_lock(bd_inode);
-        bdev->bd_map_count--;
-        inode_unlock(bd_inode);
 }
 static const struct vm_operations_struct blkdev_dax_vm_ops = {
-        .open           = blkdev_vm_open,
-        .close          = blkdev_vm_close,
        .fault          = blkdev_dax_fault,
        .pmd_fault      = blkdev_dax_pmd_fault,
-        .pfn_mkwrite    = blkdev_dax_fault,
+        .pfn_mkwrite    = blkdev_dax_pfn_mkwrite,
 };
 static const struct vm_operations_struct blkdev_default_vm_ops = {
-        .open           = blkdev_vm_open,
-        .close          = blkdev_vm_close,
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
 };
@@ -1774,18 +1756,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = {
 static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct inode *bd_inode = bdev_file_inode(file);
-        struct block_device *bdev = I_BDEV(bd_inode);
        file_accessed(file);
-        inode_lock(bd_inode);
-        bdev->bd_map_count++;
        if (IS_DAX(bd_inode)) {
                vma->vm_ops = &blkdev_dax_vm_ops;
                vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        } else {
                vma->vm_ops = &blkdev_default_vm_ops;
        }
-        inode_unlock(bd_inode);
        return 0;
 }
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 88d9af3d4581..5fb60ea7eee2 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
                list_add_tail(&work->ordered_list, &wq->ordered_list);
                spin_unlock_irqrestore(&wq->list_lock, flags);
        }
-        queue_work(wq->normal_wq, &work->normal_work);
        trace_btrfs_work_queued(work);
+        queue_work(wq->normal_wq, &work->normal_work);
 }
 void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b90cd3776f8e..f6dac40f87ff 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1406,7 +1406,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        read_extent_buffer(eb, dest + bytes_left,
                                           name_off, name_len);
                if (eb != eb_in) {
-                        btrfs_tree_read_unlock_blocking(eb);
+                        if (!path->skip_locking)
+                                btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                }
                ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1426,9 +1427,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                eb = path->nodes[0];
                /* make sure we can use eb after releasing the path */
                if (eb != eb_in) {
-                        atomic_inc(&eb->refs);
+                        if (!path->skip_locking)
-                        btrfs_tree_read_lock(eb);
+                                btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
-                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                        path->nodes[0] = NULL;
+                        path->locks[0] = 0;
                }
                btrfs_release_path(path);
                iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c473c42d7d6c..3346cd8f9910 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        faili = nr_pages - 1;
        cb->nr_pages = nr_pages;
-        /* In the parent-locked case, we only locked the range we are
+        add_ra_bio_pages(inode, em_start + em_len, cb);
-         * interested in.  In all other cases, we can opportunistically
-         * cache decompressed data that goes beyond the requested range. */
-        if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
-                add_ra_bio_pages(inode, em_start + em_len, cb);
        /* include any pages we added in add_ra-bio_pages */
        uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0be47e4b8136..b57daa895cea 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
 *
 */
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
-                                    struct list_head *ins_list)
+                                    struct list_head *ins_list, bool *emitted)
 {
        struct btrfs_dir_item *di;
        struct btrfs_delayed_item *curr, *next;
@@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
                if (over)
                        return 1;
+                *emitted = true;
        }
        return 0;
 }
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f70119f25421..0167853c84ae 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
 int btrfs_should_delete_dir_index(struct list_head *del_list,
                                  u64 index);
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
-                                    struct list_head *ins_list);
+                                    struct list_head *ins_list, bool *emitted);
 /* for init */
 int __init btrfs_delayed_inode_init(void);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dd08e29f5117..4545e2e2ad45 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -182,6 +182,7 @@ static struct btrfs_lockdep_keyset {
        { .id = BTRFS_TREE_RELOC_OBJECTID,      .name_stem = "treloc"   },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc"   },
        { .id = BTRFS_UUID_TREE_OBJECTID,       .name_stem = "uuid"     },
+        { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
        { .id = 0,                              .name_stem = "tree"     },
 };
@@ -1787,7 +1788,6 @@ static int cleaner_kthread(void *arg)
        int again;
        struct btrfs_trans_handle *trans;
-        set_freezable();
        do {
                again = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e7c97a3f344..392592dc7010 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2897,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree,
        struct block_device *bdev;
        int ret;
        int nr = 0;
-        int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
        size_t pg_offset = 0;
        size_t iosize;
        size_t disk_io_size;
        size_t blocksize = inode->i_sb->s_blocksize;
-        unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+        unsigned long this_bio_flag = 0;
        set_page_extent_mapped(page);
@@ -2942,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree,
                        kunmap_atomic(userpage);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                        if (!parent_locked)
+                        unlock_extent_cached(tree, cur,
-                                unlock_extent_cached(tree, cur,
+                                             cur + iosize - 1,
-                                                     cur + iosize - 1,
+                                             &cached, GFP_NOFS);
-                                                     &cached, GFP_NOFS);
                        break;
                }
                em = __get_extent_map(inode, page, pg_offset, cur,
                                      end - cur + 1, get_extent, em_cached);
                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
-                        if (!parent_locked)
+                        unlock_extent(tree, cur, end);
-                                unlock_extent(tree, cur, end);
                        break;
                }
                extent_offset = cur - em->start;
@@ -3038,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree,
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                        if (parent_locked)
+                        unlock_extent_cached(tree, cur,
-                                free_extent_state(cached);
+                                             cur + iosize - 1,
-                        else
+                                             &cached, GFP_NOFS);
-                                unlock_extent_cached(tree, cur,
-                                                     cur + iosize - 1,
-                                                     &cached, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -3052,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                if (test_range_bit(tree, cur, cur_end,
                                   EXTENT_UPTODATE, 1, NULL)) {
                        check_page_uptodate(tree, page);
-                        if (!parent_locked)
+                        unlock_extent(tree, cur, cur + iosize - 1);
-                                unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -3063,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                 */
                if (block_start == EXTENT_MAP_INLINE) {
                        SetPageError(page);
-                        if (!parent_locked)
+                        unlock_extent(tree, cur, cur + iosize - 1);
-                                unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -3083,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                        *bio_flags = this_bio_flag;
                } else {
                        SetPageError(page);
-                        if (!parent_locked)
+                        unlock_extent(tree, cur, cur + iosize - 1);
-                                unlock_extent(tree, cur, cur + iosize - 1);
                }
                cur = cur + iosize;
                pg_offset += iosize;
@@ -3213,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        return ret;
 }
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
-                                 get_extent_t *get_extent, int mirror_num)
-{
-        struct bio *bio = NULL;
-        unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
-        int ret;
-        ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
-                            &bio_flags, READ, NULL);
-        if (bio)
-                ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
-        return ret;
-}
 static noinline void update_nr_written(struct page *page,
                                      struct writeback_control *wbc,
                                      unsigned long nr_written)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0377413bd4b9..880d5292e972 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,7 +29,6 @@
 */
 #define EXTENT_BIO_COMPRESSED 1
 #define EXTENT_BIO_TREE_LOG 2
-#define EXTENT_BIO_PARENT_LOCKED 4
 #define EXTENT_BIO_FLAG_SHIFT 16
 /* these are bit numbers for test/set bit */
@@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent, int mirror_num);
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
-                                 get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
 void extent_io_exit(void);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 393e36bd5845..53dbeaf6ce94 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -153,6 +153,20 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
 static unsigned long *alloc_bitmap(u32 bitmap_size)
 {
+        void *mem;
+        /*
+         * The allocation size varies, observed numbers were < 4K up to 16K.
+         * Using vmalloc unconditionally would be too heavy, we'll try
+         * contiguous allocations first.
+         *
+         * Requests of at most PAGE_SIZE bytes use kzalloc() only and return
+         * its result directly, without a vmalloc fallback. Larger requests
+         * try kzalloc() with __GFP_NOWARN first and fall back to __vmalloc()
+         * with __GFP_ZERO only when that attempt returns NULL.
+         *
+         * Because the buffer may come from either allocator, the callers in
+         * this file release it with kvfree() rather than vfree().
+         */
+        if  (bitmap_size <= PAGE_SIZE)
+                return kzalloc(bitmap_size, GFP_NOFS);
+        mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
+        if (mem)
+                return mem;
        return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
                         PAGE_KERNEL);
 }
@@ -289,7 +303,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
        ret = 0;
 out:
-        vfree(bitmap);
+        kvfree(bitmap);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
        return ret;
@@ -438,7 +452,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
        ret = 0;
 out:
-        vfree(bitmap);
+        kvfree(bitmap);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
        return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e28f3d4691af..151b7c71b868 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5717,6 +5717,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
        char *name_ptr;
        int name_len;
        int is_curr = 0;        /* ctx->pos points to the current index? */
+        bool emitted;
        /* FIXME, use a real flag for deciding about the key type */
        if (root->fs_info->tree_root == root)
@@ -5745,6 +5746,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
        if (ret < 0)
                goto err;
+        emitted = false;
        while (1) {
                leaf = path->nodes[0];
                slot = path->slots[0];
@@ -5824,6 +5826,7 @@ skip:
                        if (over)
                                goto nopos;
+                        emitted = true;
                        di_len = btrfs_dir_name_len(leaf, di) +
                                 btrfs_dir_data_len(leaf, di) + sizeof(*di);
                        di_cur += di_len;
@@ -5836,11 +5839,20 @@ next:
        if (key_type == BTRFS_DIR_INDEX_KEY) {
                if (is_curr)
                        ctx->pos++;
-                ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+                ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
                if (ret)
                        goto nopos;
        }
+        /*
+         * If we haven't emitted any dir entry, we must not touch ctx->pos as
+         * it was set to the termination value in previous call. We assume
+         * that "." and ".." were emitted if we reach this point and set the
+         * termination value as well for an empty directory.
+         */
+        if (ctx->pos > 2 && !emitted)
+                goto nopos;
        /* Reached end of directory/root. Bump pos past the last item. */
        ctx->pos++;
@@ -7116,21 +7128,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        if (ret)
                return ERR_PTR(ret);
-        em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+        /*
-                              ins.offset, ins.offset, ins.offset, 0);
+         * Create the ordered extent before the extent map. This is to avoid
-        if (IS_ERR(em)) {
+         * races with the fast fsync path that would lead to it logging file
-                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+         * extent items that point to disk extents that were not yet written to.
-                return em;
+         * The fast fsync path collects ordered extents into a local list and
-        }
+         * then collects all the new extent maps, so we must create the ordered
+         * extent first and make sure the fast fsync path collects any new
+         * ordered extents after collecting new extent maps as well.
+         * The fsync path simply can not rely on inode_dio_wait() because it
+         * causes deadlock with AIO.
+         */
        ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
                                           ins.offset, ins.offset, 0);
        if (ret) {
                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-                free_extent_map(em);
                return ERR_PTR(ret);
        }
+        em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+                              ins.offset, ins.offset, ins.offset, 0);
+        if (IS_ERR(em)) {
+                struct btrfs_ordered_extent *oe;
+                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+                oe = btrfs_lookup_ordered_extent(inode, start);
+                ASSERT(oe);
+                if (WARN_ON(!oe))
+                        return em;
+                set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+                set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+                btrfs_remove_ordered_extent(inode, oe);
+                /* Once for our lookup and once for the ordered extents tree. */
+                btrfs_put_ordered_extent(oe);
+                btrfs_put_ordered_extent(oe);
+        }
        return em;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 952172ca7e45..48aee9846329 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2794,24 +2794,29 @@ out:
 static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 {
        struct page *page;
-        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
        page = grab_cache_page(inode->i_mapping, index);
        if (!page)
-                return NULL;
+                return ERR_PTR(-ENOMEM); /* grab_cache_page() gave us no page */
        if (!PageUptodate(page)) {
-                if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
+                int ret; /* result of btrfs_readpage() */
-                                                 0))
-                        return NULL;
+                ret = btrfs_readpage(NULL, page);
+                if (ret)
+                        return ERR_PTR(ret); /* NOTE(review): no page_cache_release() on this path - confirm the page reference is not leaked */
                lock_page(page);
                if (!PageUptodate(page)) {
                        unlock_page(page);
                        page_cache_release(page);
-                        return NULL;
+                        return ERR_PTR(-EIO); /* page still not uptodate after the read */
+                }
+                if (page->mapping != inode->i_mapping) { /* page no longer belongs to this inode's mapping */
+                        unlock_page(page);
+                        page_cache_release(page);
+                        return ERR_PTR(-EAGAIN); /* gather_extent_pages() retries the lookup on -EAGAIN */
                }
        }
-        unlock_page(page);
        return page;
 }
@@ -2823,17 +2828,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
        pgoff_t index = off >> PAGE_CACHE_SHIFT;
        for (i = 0; i < num_pages; i++) {
+again:
                pages[i] = extent_same_get_page(inode, index + i);
-                if (!pages[i])
+                if (IS_ERR(pages[i])) {
-                        return -ENOMEM;
+                        int err = PTR_ERR(pages[i]);
+                        if (err == -EAGAIN)
+                                goto again;
+                        pages[i] = NULL;
+                        return err;
+                }
        }
        return 0;
 }
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+                             bool retry_range_locking)
 {
-        /* do any pending delalloc/csum calc on src, one way or
+        /*
-           another, and lock file content */
+         * Do any pending delalloc/csum calculations on inode, one way or
+         * another, and lock file content.
+         * The locking order is:
+         *
+         *   1) pages
+         *   2) range in the inode's io tree
+         */
        while (1) {
                struct btrfs_ordered_extent *ordered;
                lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2851,8 +2870,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
                unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
+                if (!retry_range_locking)
+                        return -EAGAIN;
                btrfs_wait_ordered_range(inode, off, len);
        }
+        return 0;
 }
 static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
@@ -2877,15 +2899,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
        unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 }
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
-                                     struct inode *inode2, u64 loff2, u64 len)
+                                    struct inode *inode2, u64 loff2, u64 len,
+                                    bool retry_range_locking) /* passed unchanged to both lock_extent_range() calls */
 {
+        int ret; /* result of the most recent lock_extent_range() call */
        if (inode1 < inode2) {
                swap(inode1, inode2);
                swap(loff1, loff2);
        }
-        lock_extent_range(inode1, loff1, len);
+        ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
-        lock_extent_range(inode2, loff2, len);
+        if (ret)
+                return ret; /* first range was not locked, so there is nothing to undo here */
+        ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+        if (ret) /* second lock failed: release the range already locked on the first inode */
+                unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+                              loff1 + len - 1);
+        return ret; /* 0 when both ranges are locked, otherwise the error from lock_extent_range() */
 }
 struct cmp_pages {
@@ -2901,11 +2932,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
        for (i = 0; i < cmp->num_pages; i++) {
                pg = cmp->src_pages[i];
-                if (pg)
+                if (pg) {
+                        unlock_page(pg);
                        page_cache_release(pg);
+                }
                pg = cmp->dst_pages[i];
-                if (pg)
+                if (pg) {
+                        unlock_page(pg);
                        page_cache_release(pg);
+                }
        }
        kfree(cmp->src_pages);
        kfree(cmp->dst_pages);
@@ -2966,6 +3001,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
                src_page = cmp->src_pages[i];
                dst_page = cmp->dst_pages[i];
+                ASSERT(PageLocked(src_page));
+                ASSERT(PageLocked(dst_page));
                addr = kmap_atomic(src_page);
                dst_addr = kmap_atomic(dst_page);
@@ -3078,14 +3115,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
                goto out_unlock;
        }
+again:
        ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
        if (ret)
                goto out_unlock;
        if (same_inode)
-                lock_extent_range(src, same_lock_start, same_lock_len);
+                ret = lock_extent_range(src, same_lock_start, same_lock_len,
+                                        false);
        else
-                btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+                ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+                                               false);
+        /*
+         * If one of the inodes has dirty pages in the respective range or
+         * ordered extents, we need to flush delalloc and wait for all ordered
+         * extents in the range. We must unlock the pages and the ranges in the
+         * io trees to avoid deadlocks when flushing delalloc (requires locking
+         * pages) and when waiting for ordered extents to complete (they require
+         * range locking).
+         */
+        if (ret == -EAGAIN) {
+                /*
+                 * Ranges in the io trees already unlocked. Now unlock all
+                 * pages before waiting for all IO to complete.
+                 */
+                btrfs_cmp_data_free(&cmp);
+                if (same_inode) {
+                        btrfs_wait_ordered_range(src, same_lock_start,
+                                                 same_lock_len);
+                } else {
+                        btrfs_wait_ordered_range(src, loff, len);
+                        btrfs_wait_ordered_range(dst, dst_loff, len);
+                }
+                goto again;
+        }
+        ASSERT(ret == 0);
+        if (WARN_ON(ret)) {
+                /* ranges in the io trees already unlocked */
+                btrfs_cmp_data_free(&cmp);
+                return ret;
+        }
        /* pass original length for comparison so we stay within i_size */
        ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3795,9 +3864,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
                u64 lock_start = min_t(u64, off, destoff);
                u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
-                lock_extent_range(src, lock_start, lock_len);
+                ret = lock_extent_range(src, lock_start, lock_len, true);
        } else {
-                btrfs_double_extent_lock(src, off, inode, destoff, len);
+                ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+                                               true);
+        }
+        ASSERT(ret == 0);
+        if (WARN_ON(ret)) {
+                /* ranges in the io trees already unlocked */
+                goto out_unlock;
        }
        ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fd1c4d982463..2bd0011450df 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid)
            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
            root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
            root_objectid == BTRFS_UUID_TREE_OBJECTID ||
-            root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
+            root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
+            root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
                return 1;
        return 0;
 }
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e0ac85949067..539e7b5e3f86 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
 BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
 BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
 BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
 static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(raid56),
        BTRFS_FEAT_ATTR_PTR(skinny_metadata),
        BTRFS_FEAT_ATTR_PTR(no_holes),
+        BTRFS_FEAT_ATTR_PTR(free_space_tree),
        NULL
 };
@@ -780,6 +782,39 @@ failure:
        return error;
 }
+/*
+ * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
+ * values in superblock. Call after any changes to incompat/compat_ro flags
+ *
+ * @fs_info: filesystem whose feature attributes are refreshed; a NULL
+ *           pointer makes the call a no-op
+ * @bit:     feature bit that changed; only used by the ASSERT() against
+ *           supported_feature_masks[set]
+ * @set:     feature set the bit belongs to
+ *
+ * Returns nothing. The call is also a no-op while the fsid kobject has not
+ * been initialized. Otherwise the whole btrfs_feature_attr_group is removed
+ * and created again.
+ *
+ * NOTE(review): 'features' is assigned but never read, and the result of
+ * sysfs_create_group() is stored in 'ret' without being checked, so a
+ * failure to recreate the group goes unreported.
+ */
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+                u64 bit, enum btrfs_feature_set set)
+{
+        struct btrfs_fs_devices *fs_devs;
+        struct kobject *fsid_kobj;
+        u64 features;
+        int ret;
+        if (!fs_info)
+                return;
+        features = get_features(fs_info, set);
+        ASSERT(bit & supported_feature_masks[set]);
+        fs_devs = fs_info->fs_devices;
+        fsid_kobj = &fs_devs->fsid_kobj;
+        if (!fsid_kobj->state_initialized)
+                return;
+        /*
+         * FIXME: this is too heavy to update just one value, ideally we'd like
+         * to use sysfs_update_group but some refactoring is needed first.
+         */
+        sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+        ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+}
 static int btrfs_init_debugfs(void)
 {
 #ifdef CONFIG_DEBUG_FS
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9c09522125a6..d7da1a4c2f6c 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = {			     \
 #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
        BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
 #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
-        BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+        BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
 #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
        BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
@@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
                                struct kobject *parent);
 int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
 void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+                u64 bit, enum btrfs_feature_set set);
 #endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index b1d920b30070..0e1e61a7ec23 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -82,18 +82,18 @@ void btrfs_destroy_test_fs(void)
 struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 {
        struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
-                                                GFP_NOFS);
+                                                GFP_KERNEL);
        if (!fs_info)
                return fs_info;
        fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
-                                      GFP_NOFS);
+                                      GFP_KERNEL);
        if (!fs_info->fs_devices) {
                kfree(fs_info);
                return NULL;
        }
        fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
-                                      GFP_NOFS);
+                                      GFP_KERNEL);
        if (!fs_info->super_copy) {
                kfree(fs_info->fs_devices);
                kfree(fs_info);
@@ -180,11 +180,11 @@ btrfs_alloc_dummy_block_group(unsigned long length)
 {
        struct btrfs_block_group_cache *cache;
-        cache = kzalloc(sizeof(*cache), GFP_NOFS);
+        cache = kzalloc(sizeof(*cache), GFP_KERNEL);
        if (!cache)
                return NULL;
        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-                                        GFP_NOFS);
+                                        GFP_KERNEL);
        if (!cache->free_space_ctl) {
                kfree(cache);
                return NULL;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index e29fa297e053..669b58201e36 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -94,7 +94,7 @@ static int test_find_delalloc(void)
         * test.
         */
        for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
-                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+                page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
                if (!page) {
                        test_msg("Failed to allocate test page\n");
                        ret = -ENOMEM;
@@ -113,7 +113,7 @@ static int test_find_delalloc(void)
         * |--- delalloc ---|
         * |---  search  ---|
         */
-        set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+        set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
        start = 0;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -144,7 +144,7 @@ static int test_find_delalloc(void)
                test_msg("Couldn't find the locked page\n");
                goto out_bits;
        }
-        set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+        set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
        start = test_start;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -199,7 +199,7 @@ static int test_find_delalloc(void)
         *
         * We are re-using our test_start from above since it works out well.
         */
-        set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+        set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
        start = test_start;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -262,7 +262,7 @@ static int test_find_delalloc(void)
        }
        ret = 0;
 out_bits:
-        clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
+        clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
 out:
        if (locked_page)
                page_cache_release(locked_page);
@@ -360,7 +360,7 @@ static int test_eb_bitmaps(void)
        test_msg("Running extent buffer bitmap tests\n");
-        bitmap = kmalloc(len, GFP_NOFS);
+        bitmap = kmalloc(len, GFP_KERNEL);
        if (!bitmap) {
                test_msg("Couldn't allocate test bitmap\n");
                return -ENOMEM;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 5de55fdd28bc..e2d3da02deee 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -974,7 +974,7 @@ static int test_extent_accounting(void)
                               (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
                               EXTENT_DELALLOC | EXTENT_DIRTY |
                               EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
-                               NULL, GFP_NOFS);
+                               NULL, GFP_KERNEL);
        if (ret) {
                test_msg("clear_extent_bit returned %d\n", ret);
                goto out;
@@ -1045,7 +1045,7 @@ static int test_extent_accounting(void)
                               BTRFS_MAX_EXTENT_SIZE+8191,
                               EXTENT_DIRTY | EXTENT_DELALLOC |
                               EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-                               NULL, GFP_NOFS);
+                               NULL, GFP_KERNEL);
        if (ret) {
                test_msg("clear_extent_bit returned %d\n", ret);
                goto out;
@@ -1079,7 +1079,7 @@ static int test_extent_accounting(void)
        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
                               EXTENT_DIRTY | EXTENT_DELALLOC |
                               EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-                               NULL, GFP_NOFS);
+                               NULL, GFP_KERNEL);
        if (ret) {
                test_msg("clear_extent_bit returned %d\n", ret);
                goto out;
@@ -1096,7 +1096,7 @@ out:
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
                                 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-                                 NULL, GFP_NOFS);
+                                 NULL, GFP_KERNEL);
        iput(inode);
        btrfs_free_dummy_root(root);
        return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     struct btrfs_path *path,
                                     struct list_head *logged_list,
-                                     struct btrfs_log_ctx *ctx)
+                                     struct btrfs_log_ctx *ctx,
+                                     const u64 start,
+                                     const u64 end)
 {
        struct extent_map *em, *n;
        struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
        }
        list_sort(NULL, &extents, extent_cmp);
+        /*
+         * Collect any new ordered extents within the range. This is to
+         * prevent logging file extent items without waiting for the disk
+         * location they point to being written. We do this only to deal
+         * with races against concurrent lockless direct IO writes.
+         */
+        btrfs_get_logged_extents(inode, logged_list, start, end);
 process:
        while (!list_empty(&extents)) {
                em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
                        goto out_unlock;
                }
                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-                                                &logged_list, ctx);
+                                                &logged_list, ctx, start, end);
                if (ret) {
                        err = ret;
                        goto out_unlock;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 86a9c383955e..eb9028e8cfc5 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
        req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
                        false, GFP_NOFS);
-        if (IS_ERR(req)) {
+        if (!req) {
-                ret = PTR_ERR(req);
+                ret = -ENOMEM;
                req = orig_req;
                goto out;
        }
@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
        ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
                                snapc, CEPH_NOSNAP, &aio_req->mtime);
-        ceph_put_snap_context(snapc);
        ceph_osdc_put_request(orig_req);
        req->r_callback = ceph_aio_complete_req;
@@ -731,6 +730,7 @@ out:
                ceph_aio_complete_req(req, NULL);
        }
+        ceph_put_snap_context(snapc);
        kfree(aio_work);
 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fb4ba2e4e2a5..be2d87f33177 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -975,13 +975,8 @@ out_unlock:
 /*
 * splice a dentry to an inode.
 * caller must hold directory i_mutex for this to be safe.
- *
- * we will only rehash the resulting dentry if @prehash is
- * true; @prehash will be set to false (for the benefit of
- * the caller) if we fail.
 */
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
-                                    bool *prehash)
 {
        struct dentry *realdn;
@@ -994,8 +989,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
        if (IS_ERR(realdn)) {
                pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
                       PTR_ERR(realdn), dn, in, ceph_vinop(in));
-                if (prehash)
-                        *prehash = false; /* don't rehash on error */
                dn = realdn; /* note realdn contains the error */
                goto out;
        } else if (realdn) {
@@ -1011,8 +1004,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
                dout("dn %p attached to %p ino %llx.%llx\n",
                     dn, d_inode(dn), ceph_vinop(d_inode(dn)));
        }
-        if ((!prehash || *prehash) && d_unhashed(dn))
-                d_rehash(dn);
 out:
        return dn;
 }
@@ -1245,10 +1236,8 @@ retry_lookup:
                                dout("d_delete %p\n", dn);
                                d_delete(dn);
                        } else {
-                                dout("d_instantiate %p NULL\n", dn);
-                                d_instantiate(dn, NULL);
                                if (have_lease && d_unhashed(dn))
-                                        d_rehash(dn);
+                                        d_add(dn, NULL);
                                update_dentry_lease(dn, rinfo->dlease,
                                                    session,
                                                    req->r_request_started);
@@ -1260,7 +1249,7 @@ retry_lookup:
                if (d_really_is_negative(dn)) {
                        ceph_dir_clear_ordered(dir);
                        ihold(in);
-                        dn = splice_dentry(dn, in, &have_lease);
+                        dn = splice_dentry(dn, in);
                        if (IS_ERR(dn)) {
                                err = PTR_ERR(dn);
                                goto done;
@@ -1290,7 +1279,7 @@ retry_lookup:
                dout(" linking snapped dir %p to dn %p\n", in, dn);
                ceph_dir_clear_ordered(dir);
                ihold(in);
-                dn = splice_dentry(dn, in, NULL);
+                dn = splice_dentry(dn, in);
                if (IS_ERR(dn)) {
                        err = PTR_ERR(dn);
                        goto done;
@@ -1501,7 +1490,7 @@ retry_lookup:
                }
                if (d_really_is_negative(dn)) {
-                        struct dentry *realdn = splice_dentry(dn, in, NULL);
+                        struct dentry *realdn = splice_dentry(dn, in);
                        if (IS_ERR(realdn)) {
                                err = PTR_ERR(realdn);
                                d_drop(dn);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c48ca13673e3..09b1db2cac31 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -642,9 +642,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
                while (*s && *s != sep)
                        s++;
-                inode_lock(dir);
+                child = lookup_one_len_unlocked(p, dentry, s - p);
-                child = lookup_one_len(p, dentry, s - p);
-                inode_unlock(dir);
                dput(dentry);
                dentry = child;
        } while (!IS_ERR(dentry));
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a5b8eb69a8f4..6402eaf8ab95 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1261,6 +1261,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
 COMPATIBLE_IOCTL(HCIINQUIRY)
 COMPATIBLE_IOCTL(HCIUARTSETPROTO)
 COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
+COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
+COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
 COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
 COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
 COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index f419519ec41f..214ec14149d9 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -432,14 +432,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
                                (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
                                        configfs_init_bin_file :
                                        configfs_init_file);
-        if (error) {
+        if (error)
                configfs_put(sd);
-                return error;
+        return error;
-        }
-        d_rehash(dentry);
-        return 0;
 }
 static struct dentry * configfs_lookup(struct inode *dir,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cee087d8f7e0..45811ea3fd87 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -199,9 +199,17 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
        configfs_set_inode_lock_class(sd, inode);
        init(inode);
-        d_instantiate(dentry, inode);
+        if (S_ISDIR(mode) || S_ISLNK(mode)) {
-        if (S_ISDIR(mode) || S_ISLNK(mode))
+                /*
+                 * ->symlink(), ->mkdir(), configfs_register_subsystem() or
+                 * create_default_group() - already hashed.
+                 */
+                d_instantiate(dentry, inode);
                dget(dentry);  /* pin link and directory dentries in core */
+        } else {
+                /* ->lookup() */
+                d_add(dentry, inode);
+        }
        return error;
 }
diff --git a/fs/dax.c b/fs/dax.c
index 4fd6b0c5c6b5..fc2e3141138b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -58,6 +58,26 @@ static void dax_unmap_atomic(struct block_device *bdev,
        blk_queue_exit(bdev->bd_queue);
 }
+/*
+ * Copy the page-sized range of @bdev that contains sector @n into a newly
+ * allocated page, going through the DAX mapping of the device.
+ *
+ * @n is rounded down to a multiple of PAGE_SIZE / 512 sectors before mapping.
+ * Returns the filled page on success, ERR_PTR(-ENOMEM) if no page could be
+ * allocated, or ERR_PTR() of the negative dax_map_atomic() result. The page
+ * is released on the error path, so the caller only owns a page on success.
+ */
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+        struct page *page = alloc_pages(GFP_KERNEL, 0);
+        struct blk_dax_ctl dax = {
+                .size = PAGE_SIZE,
+                .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+        };
+        long rc;
+        if (!page)
+                return ERR_PTR(-ENOMEM);
+        rc = dax_map_atomic(bdev, &dax);
+        if (rc < 0) {
+                /* mapping failed: do not leak the page allocated above */
+                __free_pages(page, 0);
+                return ERR_PTR(rc);
+        }
+        memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+        dax_unmap_atomic(bdev, &dax);
+        return page;
+}
 /*
 * dax_clear_blocks() is called from within transaction context from XFS,
 * and hence this means the stack from this point must follow GFP_NOFS
@@ -338,7 +358,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
        void *entry;
        WARN_ON_ONCE(pmd_entry && !dirty);
-        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+        if (dirty)
+                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        spin_lock_irq(&mapping->tree_lock);
diff --git a/fs/dcache.c b/fs/dcache.c
index 92d5140de851..32ceae3e6112 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry)
        return dentry->d_name.name != dentry->d_iname;
 }
-/*
- * Make sure other CPUs see the inode attached before the type is set.
- */
 static inline void __d_set_inode_and_type(struct dentry *dentry,
                                          struct inode *inode,
                                          unsigned type_flags)
@@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
        unsigned flags;
        dentry->d_inode = inode;
-        smp_wmb();
        flags = READ_ONCE(dentry->d_flags);
        flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
        flags |= type_flags;
        WRITE_ONCE(dentry->d_flags, flags);
 }
-/*
- * Ideally, we want to make sure that other CPUs see the flags cleared before
- * the inode is detached, but this is really a violation of RCU principles
- * since the ordering suggests we should always set inode before flags.
- *
- * We should instead replace or discard the entire dentry - but that sucks
- * performancewise on mass deletion/rename.
- */
 static inline void __d_clear_type_and_inode(struct dentry *dentry)
 {
        unsigned flags = READ_ONCE(dentry->d_flags);
        flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
        WRITE_ONCE(dentry->d_flags, flags);
-        smp_wmb();
        dentry->d_inode = NULL;
 }
@@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_inode->i_lock)
 {
        struct inode *inode = dentry->d_inode;
+        raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
-        dentry_rcuwalk_invalidate(dentry);
+        raw_write_seqcount_end(&dentry->d_seq);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        if (!inode->i_nlink)
@@ -1756,12 +1745,12 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
        unsigned add_flags = d_flags_for_inode(inode);
        spin_lock(&dentry->d_lock);
-        if (inode)
+        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
-                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+        raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
-        dentry_rcuwalk_invalidate(dentry);
+        raw_write_seqcount_end(&dentry->d_seq);
+        __fsnotify_d_instantiate(dentry);
        spin_unlock(&dentry->d_lock);
-        fsnotify_d_instantiate(dentry, inode);
 }
 /**
@@ -1782,91 +1771,16 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 void d_instantiate(struct dentry *entry, struct inode * inode)
 {
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
-        if (inode)
+        if (inode) {
                spin_lock(&inode->i_lock);
-        __d_instantiate(entry, inode);
+                __d_instantiate(entry, inode);
-        if (inode)
                spin_unlock(&inode->i_lock);
+        }
        security_d_instantiate(entry, inode);
 }
 EXPORT_SYMBOL(d_instantiate);
 /**
- * d_instantiate_unique - instantiate a non-aliased dentry
- * @entry: dentry to instantiate
- * @inode: inode to attach to this dentry
- *
- * Fill in inode information in the entry. On success, it returns NULL.
- * If an unhashed alias of "entry" already exists, then we return the
- * aliased dentry instead and drop one reference to inode.
- *
- * Note that in order to avoid conflicts with rename() etc, the caller
- * had better be holding the parent directory semaphore.
- *
- * This also assumes that the inode count has been incremented
- * (or otherwise set) by the caller to indicate that it is now
- * in use by the dcache.
- */
-static struct dentry *__d_instantiate_unique(struct dentry *entry,
-                                             struct inode *inode)
-{
-        struct dentry *alias;
-        int len = entry->d_name.len;
-        const char *name = entry->d_name.name;
-        unsigned int hash = entry->d_name.hash;
-        if (!inode) {
-                __d_instantiate(entry, NULL);
-                return NULL;
-        }
-        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
-                /*
-                 * Don't need alias->d_lock here, because aliases with
-                 * d_parent == entry->d_parent are not subject to name or
-                 * parent changes, because the parent inode i_mutex is held.
-                 */
-                if (alias->d_name.hash != hash)
-                        continue;
-                if (alias->d_parent != entry->d_parent)
-                        continue;
-                if (alias->d_name.len != len)
-                        continue;
-                if (dentry_cmp(alias, name, len))
-                        continue;
-                __dget(alias);
-                return alias;
-        }
-        __d_instantiate(entry, inode);
-        return NULL;
-}
-struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
-{
-        struct dentry *result;
-        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
-        if (inode)
-                spin_lock(&inode->i_lock);
-        result = __d_instantiate_unique(entry, inode);
-        if (inode)
-                spin_unlock(&inode->i_lock);
-        if (!result) {
-                security_d_instantiate(entry, inode);
-                return NULL;
-        }
-        BUG_ON(!d_unhashed(result));
-        iput(inode);
-        return result;
-}
-EXPORT_SYMBOL(d_instantiate_unique);
-/**
 * d_instantiate_no_diralias - instantiate a non-aliased dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
@@ -2446,6 +2360,86 @@ void d_rehash(struct dentry * entry)
 }
 EXPORT_SYMBOL(d_rehash);
+/*
+ * Common tail of d_add() and d_splice_alias(): attach @inode to @dentry when
+ * it is non-NULL, then call security_d_instantiate() and d_rehash().
+ *
+ * inode->i_lock held if inode is non-NULL; it is released in here, before
+ * security_d_instantiate() and d_rehash() run. With a NULL @inode no lock is
+ * touched and __d_instantiate() is skipped.
+ */
+static inline void __d_add(struct dentry *dentry, struct inode *inode)
+{
+        if (inode) {
+                __d_instantiate(dentry, inode);
+                spin_unlock(&inode->i_lock);        /* drops the caller's lock */
+        }
+        security_d_instantiate(dentry, inode);
+        d_rehash(dentry);
+}
+/**
+ * d_add - add dentry to hash queues
+ * @entry: dentry to add
+ * @inode: The inode to attach to this dentry, may be NULL
+ *
+ * This adds the entry to the hash queues and initializes @inode.
+ * The entry was actually filled in earlier during d_alloc().
+ *
+ * When @inode is non-NULL its i_lock is taken here and released inside
+ * __d_add(), so the lock/unlock pair is deliberately split across the two
+ * functions.
+ */
+void d_add(struct dentry *entry, struct inode *inode)
+{
+        if (inode)
+                spin_lock(&inode->i_lock);
+        __d_add(entry, inode);        /* unlocks inode->i_lock if it was taken */
+}
+EXPORT_SYMBOL(d_add);
+/**
+ * d_exact_alias - find and hash an exact unhashed alias
+ * @entry: dentry to add
+ * @inode: The inode to go with this dentry; must not be NULL, its i_lock
+ *         is taken unconditionally
+ *
+ * If an unhashed dentry with the same name/parent and desired
+ * inode already exists, hash and return it.  Otherwise, return
+ * NULL.
+ *
+ * Only the first alias of @inode whose hash, parent, length and name match
+ * @entry is considered: if that alias is already hashed, NULL is returned
+ * without looking at further aliases.
+ *
+ * Parent directory should be locked.
+ *
+ * Return: the matching alias after __dget_dlock() and _d_rehash() were
+ * applied to it under its d_lock (presumably the caller owns the reference
+ * and must drop it - confirm against callers), or NULL. inode->i_lock is
+ * released on every return path.
+ */
+struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
+{
+        struct dentry *alias;
+        int len = entry->d_name.len;
+        const char *name = entry->d_name.name;
+        unsigned int hash = entry->d_name.hash;
+        spin_lock(&inode->i_lock);
+        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+                /*
+                 * Don't need alias->d_lock here, because aliases with
+                 * d_parent == entry->d_parent are not subject to name or
+                 * parent changes, because the parent inode i_mutex is held.
+                 */
+                if (alias->d_name.hash != hash)
+                        continue;
+                if (alias->d_parent != entry->d_parent)
+                        continue;
+                if (alias->d_name.len != len)
+                        continue;
+                if (dentry_cmp(alias, name, len))
+                        continue;
+                /* exact match: d_lock is needed to test and change hashing */
+                spin_lock(&alias->d_lock);
+                if (!d_unhashed(alias)) {
+                        /* already hashed, so not the alias we are after */
+                        spin_unlock(&alias->d_lock);
+                        alias = NULL;
+                } else {
+                        __dget_dlock(alias);
+                        _d_rehash(alias);
+                        spin_unlock(&alias->d_lock);
+                }
+                spin_unlock(&inode->i_lock);
+                return alias;        /* first match decides; no further scan */
+        }
+        spin_unlock(&inode->i_lock);
+        return NULL;
+}
+EXPORT_SYMBOL(d_exact_alias);
 /**
 * dentry_update_name_case - update case insensitive dentry with a new name
 * @dentry: dentry to be updated
@@ -2782,10 +2776,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
        BUG_ON(!d_unhashed(dentry));
-        if (!inode) {
+        if (!inode)
-                __d_instantiate(dentry, NULL);
                goto out;
-        }
        spin_lock(&inode->i_lock);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *new = __d_find_any_alias(inode);
@@ -2819,12 +2812,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
                        return new;
                }
        }
-        /* already taking inode->i_lock, so d_add() by hand */
-        __d_instantiate(dentry, inode);
-        spin_unlock(&inode->i_lock);
 out:
-        security_d_instantiate(dentry, inode);
+        __d_add(dentry, inode);
-        d_rehash(dentry);
        return NULL;
 }
 EXPORT_SYMBOL(d_splice_alias);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1f107fd51328..655f21f99160 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
        mutex_unlock(&allocated_ptys_lock);
 }
+/*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ *
+ * devpts_add_ref() bumps s_active on the super block returned by
+ * pts_sb_from_inode() and calls ihold() on @ptmx_inode; devpts_del_ref()
+ * is the inverse and undoes both with iput() and deactivate_super().
+ * Calls are expected to be paired.
+ */
+void devpts_add_ref(struct inode *ptmx_inode)
+{
+        struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+        atomic_inc(&sb->s_active);
+        ihold(ptmx_inode);
+}
+void devpts_del_ref(struct inode *ptmx_inode)
+{
+        /*
+         * NOTE(review): sb is looked up before iput(), presumably so the
+         * inode is not dereferenced after its reference is dropped - confirm.
+         */
+        struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+        iput(ptmx_inode);
+        deactivate_super(sb);
+}
 /**
 * devpts_pty_new -- create a new inode in /dev/pts/
 * @ptmx_inode: inode of the master
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1b2f7ffc8b84..85463171053b 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -445,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio)
                __set_current_state(TASK_UNINTERRUPTIBLE);
                dio->waiter = current;
                spin_unlock_irqrestore(&dio->bio_lock, flags);
-                if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+                if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
+                    !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
                        io_schedule();
                /* wake up sets us TASK_RUNNING */
                spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 80d6901493cf..87dbdd4881ab 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1499,16 +1499,14 @@ out:
 */
 static int
 ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
-                          struct ecryptfs_crypt_stat *crypt_stat,
                          struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
 {
        int rc = 0;
        filename->encrypted_filename = NULL;
        filename->encrypted_filename_size = 0;
-        if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+        if (mount_crypt_stat && (mount_crypt_stat->flags
-            || (mount_crypt_stat && (mount_crypt_stat->flags
+                                     & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
-                                     & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
                size_t packet_size;
                size_t remaining_bytes;
@@ -1944,7 +1942,6 @@ out:
 int ecryptfs_encrypt_and_encode_filename(
        char **encoded_name,
        size_t *encoded_name_size,
-        struct ecryptfs_crypt_stat *crypt_stat,
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
        const char *name, size_t name_size)
 {
@@ -1953,9 +1950,8 @@ int ecryptfs_encrypt_and_encode_filename(
        (*encoded_name) = NULL;
        (*encoded_name_size) = 0;
-        if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+        if (mount_crypt_stat && (mount_crypt_stat->flags
-            || (mount_crypt_stat && (mount_crypt_stat->flags
+                                     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
-                                     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
                struct ecryptfs_filename *filename;
                filename = kzalloc(sizeof(*filename), GFP_KERNEL);
@@ -1968,8 +1964,7 @@ int ecryptfs_encrypt_and_encode_filename(
                }
                filename->filename = (char *)name;
                filename->filename_size = name_size;
-                rc = ecryptfs_encrypt_filename(filename, crypt_stat,
+                rc = ecryptfs_encrypt_filename(filename, mount_crypt_stat);
-                                               mount_crypt_stat);
                if (rc) {
                        printk(KERN_ERR "%s: Error attempting to encrypt "
                               "filename; rc = [%d]\n", __func__, rc);
@@ -1980,11 +1975,9 @@ int ecryptfs_encrypt_and_encode_filename(
                        NULL, &encoded_name_no_prefix_size,
                        filename->encrypted_filename,
                        filename->encrypted_filename_size);
-                if ((crypt_stat && (crypt_stat->flags
+                if (mount_crypt_stat
-                                    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
-                    || (mount_crypt_stat
                        && (mount_crypt_stat->flags
-                            & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+                            & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))
                        (*encoded_name_size) =
                                (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
                                 + encoded_name_no_prefix_size);
@@ -2002,11 +1995,9 @@ int ecryptfs_encrypt_and_encode_filename(
                        kfree(filename);
                        goto out;
                }
-                if ((crypt_stat && (crypt_stat->flags
+                if (mount_crypt_stat
-                                    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
-                    || (mount_crypt_stat
                        && (mount_crypt_stat->flags
-                            & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+                            & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
                        memcpy((*encoded_name),
                               ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
                               ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 7b39260c7bba..67e16128c572 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -569,7 +569,6 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
 int ecryptfs_encrypt_and_encode_filename(
        char **encoded_name,
        size_t *encoded_name_size,
-        struct ecryptfs_crypt_stat *crypt_stat,
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
        const char *name, size_t name_size);
 struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4e685ac1024d..26651636cd1d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -397,11 +397,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        int rc = 0;
        lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-        inode_lock(d_inode(lower_dir_dentry));
+        lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name,
-        lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
                                      lower_dir_dentry,
                                      ecryptfs_dentry->d_name.len);
-        inode_unlock(d_inode(lower_dir_dentry));
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -419,18 +417,16 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        dput(lower_dentry);
        rc = ecryptfs_encrypt_and_encode_filename(
                &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
-                NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+                mount_crypt_stat, ecryptfs_dentry->d_name.name,
                ecryptfs_dentry->d_name.len);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to encrypt and encode "
                       "filename; rc = [%d]\n", __func__, rc);
                goto out;
        }
-        inode_lock(d_inode(lower_dir_dentry));
+        lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name,
-        lower_dentry = lookup_one_len(encrypted_and_encoded_name,
                                      lower_dir_dentry,
                                      encrypted_and_encoded_name_size);
-        inode_unlock(d_inode(lower_dir_dentry));
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -502,7 +498,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
                dir->i_sb)->mount_crypt_stat;
        rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
                                                  &encoded_symlen,
-                                                  NULL,
                                                  mount_crypt_stat, symname,
                                                  strlen(symname));
        if (rc)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ae1dbcf47e97..cde60741cad2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -94,6 +94,11 @@
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+                                EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
@@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
         * wait list.
         */
        if (waitqueue_active(&ep->wq)) {
-                ewake = 1;
+                if ((epi->event.events & EPOLLEXCLUSIVE) &&
+                                        !((unsigned long)key & POLLFREE)) {
+                        switch ((unsigned long)key & EPOLLINOUT_BITS) {
+                        case POLLIN:
+                                if (epi->event.events & POLLIN)
+                                        ewake = 1;
+                                break;
+                        case POLLOUT:
+                                if (epi->event.events & POLLOUT)
+                                        ewake = 1;
+                                break;
+                        case 0:
+                                ewake = 1;
+                                break;
+                        }
+                }
                wake_up_locked(&ep->wq);
        }
        if (waitqueue_active(&ep->poll_wait))
@@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently supported nested exclusive wakeups.
         */
-        if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
+        if (epds.events & EPOLLEXCLUSIVE) {
-                (op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
+                if (op == EPOLL_CTL_MOD)
-                goto error_tgt_fput;
+                        goto error_tgt_fput;
+                if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+                                (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+                        goto error_tgt_fput;
+        }
        /*
         * At this point it is safe to assume that the "private_data" contains
@@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                break;
        case EPOLL_CTL_MOD:
                if (epi) {
-                        epds.events |= POLLERR | POLLHUP;
+                        if (!(epi->event.events & EPOLLEXCLUSIVE)) {
-                        error = ep_modify(ep, epi, &epds);
+                                epds.events |= POLLERR | POLLHUP;
+                                error = ep_modify(ep, epi, &epds);
+                        }
                } else
                        error = -ENOENT;
                break;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 506765afa1a3..bb8d67e2740a 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -376,12 +376,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
        struct inode *inode = d_inode(dentry);
        dnode_secno dno;
        int r;
-        int rep = 0;
        int err;
        hpfs_lock(dir->i_sb);
        hpfs_adjust_length(name, &len);
-again:
        err = -ENOENT;
        de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
        if (!de)
@@ -401,33 +400,9 @@ again:
                hpfs_error(dir->i_sb, "there was error when removing dirent");
                err = -EFSERROR;
                break;
-        case 2:         /* no space for deleting, try to truncate file */
+        case 2:         /* no space for deleting */
                err = -ENOSPC;
-                if (rep++)
+                break;
-                        break;
-                dentry_unhash(dentry);
-                if (!d_unhashed(dentry)) {
-                        hpfs_unlock(dir->i_sb);
-                        return -ENOSPC;
-                }
-                if (generic_permission(inode, MAY_WRITE) ||
-                    !S_ISREG(inode->i_mode) ||
-                    get_write_access(inode)) {
-                        d_rehash(dentry);
-                } else {
-                        struct iattr newattrs;
-                        /*pr_info("truncating file before delete.\n");*/
-                        newattrs.ia_size = 0;
-                        newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-                        err = notify_change(dentry, &newattrs, NULL);
-                        put_write_access(inode);
-                        if (!err)
-                                goto again;
-                }
-                hpfs_unlock(dir->i_sb);
-                return -ENOSPC;
        default:
                drop_nlink(inode);
                err = 0;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index d211b8e18566..30c4c9ebb693 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
                pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
                          __func__, ret);
-                /* Might as well let the VFS know */
+                /*
-                d_instantiate(new_dentry, d_inode(old_dentry));
+                 * We can't keep the target in dcache after that.
-                ihold(d_inode(old_dentry));
+                 * For one thing, we can't afford dentry aliases for directories.
+                 * For another, if there was a victim, we _can't_ set new inode
+                 * for that sucker and we have to trigger mount eviction - the
+                 * caller won't do it on its own since we are returning an error.
+                 */
+                d_invalidate(new_dentry);
                new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
                return ret;
        }
diff --git a/fs/namei.c b/fs/namei.c
index f624d132e01e..794f81dce766 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1220,8 +1220,8 @@ static int follow_managed(struct path *path, struct nameidata *nd)
        if (need_mntput && path->mnt == mnt)
                mntput(path->mnt);
-        if (ret == -EISDIR)
+        if (ret == -EISDIR || !ret)
-                ret = 0;
+                ret = 1;
        if (need_mntput)
                nd->flags |= LOOKUP_JUMPED;
        if (unlikely(ret < 0))
@@ -1444,40 +1444,26 @@ static int follow_dotdot(struct nameidata *nd)
 * This looks up the name in dcache, possibly revalidates the old dentry and
 * allocates a new one if not found or not valid.  In the need_lookup argument
 * returns whether i_op->lookup is necessary.
- *
- * dir->d_inode->i_mutex must be held
 */
-static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
+static struct dentry *lookup_dcache(const struct qstr *name,
-                                    unsigned int flags, bool *need_lookup)
+                                    struct dentry *dir,
+                                    unsigned int flags)
 {
        struct dentry *dentry;
        int error;
-        *need_lookup = false;
        dentry = d_lookup(dir, name);
        if (dentry) {
                if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
                        error = d_revalidate(dentry, flags);
                        if (unlikely(error <= 0)) {
-                                if (error < 0) {
+                                if (!error)
-                                        dput(dentry);
-                                        return ERR_PTR(error);
-                                } else {
                                        d_invalidate(dentry);
-                                        dput(dentry);
+                                dput(dentry);
-                                        dentry = NULL;
+                                return ERR_PTR(error);
-                                }
                        }
                }
        }
-        if (!dentry) {
-                dentry = d_alloc(dir, name);
-                if (unlikely(!dentry))
-                        return ERR_PTR(-ENOMEM);
-                *need_lookup = true;
-        }
        return dentry;
 }
@@ -1506,45 +1492,44 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
        return dentry;
 }
-static struct dentry *__lookup_hash(struct qstr *name,
+static struct dentry *__lookup_hash(const struct qstr *name,
                struct dentry *base, unsigned int flags)
 {
-        bool need_lookup;
+        struct dentry *dentry = lookup_dcache(name, base, flags);
-        struct dentry *dentry;
-        dentry = lookup_dcache(name, base, flags, &need_lookup);
+        if (dentry)
-        if (!need_lookup)
                return dentry;
+        dentry = d_alloc(base, name);
+        if (unlikely(!dentry))
+                return ERR_PTR(-ENOMEM);
        return lookup_real(base->d_inode, dentry, flags);
 }
-/*
- *  It's more convoluted than I'd like it to be, but... it's still fairly
- *  small and for now I'd prefer to have fast path as straight as possible.
- *  It _is_ time-critical.
- */
 static int lookup_fast(struct nameidata *nd,
                       struct path *path, struct inode **inode,
                       unsigned *seqp)
 {
        struct vfsmount *mnt = nd->path.mnt;
        struct dentry *dentry, *parent = nd->path.dentry;
-        int need_reval = 1;
        int status = 1;
        int err;
        /*
         * Rename seqlock is not required here because in the off chance
-         * of a false negative due to a concurrent rename, we're going to
+         * of a false negative due to a concurrent rename, the caller is
-         * do the non-racy lookup, below.
+         * going to fall back to non-racy lookup.
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
                bool negative;
                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
-                if (!dentry)
+                if (unlikely(!dentry)) {
-                        goto unlazy;
+                        if (unlazy_walk(nd, NULL, 0))
+                                return -ECHILD;
+                        return 0;
+                }
                /*
                 * This sequence count validates that the inode matches
@@ -1552,7 +1537,7 @@ static int lookup_fast(struct nameidata *nd,
                 */
                *inode = d_backing_inode(dentry);
                negative = d_is_negative(dentry);
-                if (read_seqcount_retry(&dentry->d_seq, seq))
+                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                        return -ECHILD;
                /*
@@ -1562,81 +1547,89 @@ static int lookup_fast(struct nameidata *nd,
                 * The memory barrier in read_seqcount_begin of child is
                 *  enough, we can use __read_seqcount_retry here.
                 */
-                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
                        return -ECHILD;
                *seqp = seq;
-                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
                        status = d_revalidate(dentry, nd->flags);
-                        if (unlikely(status <= 0)) {
+                if (unlikely(status <= 0)) {
-                                if (status != -ECHILD)
+                        if (unlazy_walk(nd, dentry, seq))
-                                        need_reval = 0;
+                                return -ECHILD;
-                                goto unlazy;
+                        if (status == -ECHILD)
-                        }
+                                status = d_revalidate(dentry, nd->flags);
+                } else {
+                        /*
+                         * Note: do negative dentry check after revalidation in
+                         * case that drops it.
+                         */
+                        if (unlikely(negative))
+                                return -ENOENT;
+                        path->mnt = mnt;
+                        path->dentry = dentry;
+                        if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+                                return 1;
+                        if (unlazy_walk(nd, dentry, seq))
+                                return -ECHILD;
                }
-                /*
-                 * Note: do negative dentry check after revalidation in
-                 * case that drops it.
-                 */
-                if (negative)
-                        return -ENOENT;
-                path->mnt = mnt;
-                path->dentry = dentry;
-                if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
-                        return 0;
-unlazy:
-                if (unlazy_walk(nd, dentry, seq))
-                        return -ECHILD;
        } else {
                dentry = __d_lookup(parent, &nd->last);
+                if (unlikely(!dentry))
+                        return 0;
+                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
+                        status = d_revalidate(dentry, nd->flags);
        }
-        if (unlikely(!dentry))
-                goto need_lookup;
-        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
-                status = d_revalidate(dentry, nd->flags);
        if (unlikely(status <= 0)) {
-                if (status < 0) {
+                if (!status)
-                        dput(dentry);
+                        d_invalidate(dentry);
-                        return status;
-                }
-                d_invalidate(dentry);
                dput(dentry);
-                goto need_lookup;
+                return status;
        }
        if (unlikely(d_is_negative(dentry))) {
                dput(dentry);
                return -ENOENT;
        }
        path->mnt = mnt;
        path->dentry = dentry;
        err = follow_managed(path, nd);
-        if (likely(!err))
+        if (likely(err > 0))
                *inode = d_backing_inode(path->dentry);
        return err;
-need_lookup:
-        return 1;
 }
 /* Fast lookup failed, do it the slow way */
-static int lookup_slow(struct nameidata *nd, struct path *path)
+static struct dentry *lookup_slow(const struct qstr *name,
+                                  struct dentry *dir,
+                                  unsigned int flags)
 {
-        struct dentry *dentry, *parent;
+        struct dentry *dentry;
+        inode_lock(dir->d_inode);
-        parent = nd->path.dentry;
+        dentry = d_lookup(dir, name);
-        BUG_ON(nd->inode != parent->d_inode);
+        if (unlikely(dentry)) {
+                if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
-        inode_lock(parent->d_inode);
+                    !(flags & LOOKUP_NO_REVAL)) {
-        dentry = __lookup_hash(&nd->last, parent, nd->flags);
+                        int error = d_revalidate(dentry, flags);
-        inode_unlock(parent->d_inode);
+                        if (unlikely(error <= 0)) {
-        if (IS_ERR(dentry))
+                                if (!error)
-                return PTR_ERR(dentry);
+                                        d_invalidate(dentry);
-        path->mnt = nd->path.mnt;
+                                dput(dentry);
-        path->dentry = dentry;
+                                dentry = ERR_PTR(error);
-        return follow_managed(path, nd);
+                        }
+                }
+                if (dentry) {
+                        inode_unlock(dir->d_inode);
+                        return dentry;
+                }
+        }
+        dentry = d_alloc(dir, name);
+        if (unlikely(!dentry)) {
+                inode_unlock(dir->d_inode);
+                return ERR_PTR(-ENOMEM);
+        }
+        dentry = lookup_real(dir->d_inode, dentry, flags);
+        inode_unlock(dir->d_inode);
+        return dentry;
 }
 static inline int may_lookup(struct nameidata *nd)
@@ -1712,6 +1705,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link,
                return 0;
        if (!follow)
                return 0;
+        /* make sure that d_is_symlink above matches inode */
+        if (nd->flags & LOOKUP_RCU) {
+                if (read_seqcount_retry(&link->dentry->d_seq, seq))
+                        return -ECHILD;
+        }
        return pick_link(nd, link, inode, seq);
 }
@@ -1735,19 +1733,24 @@ static int walk_component(struct nameidata *nd, int flags)
                return err;
        }
        err = lookup_fast(nd, &path, &inode, &seq);
-        if (unlikely(err)) {
+        if (unlikely(err <= 0)) {
                if (err < 0)
                        return err;
+                path.dentry = lookup_slow(&nd->last, nd->path.dentry,
-                err = lookup_slow(nd, &path);
+                                          nd->flags);
-                if (err < 0)
+                if (IS_ERR(path.dentry))
+                        return PTR_ERR(path.dentry);
+                if (unlikely(d_is_negative(path.dentry))) {
+                        dput(path.dentry);
+                        return -ENOENT;
+                }
+                path.mnt = nd->path.mnt;
+                err = follow_managed(&path, nd);
+                if (unlikely(err < 0))
                        return err;
-                inode = d_backing_inode(path.dentry);
                seq = 0;        /* we are already out of RCU mode */
-                err = -ENOENT;
+                inode = d_backing_inode(path.dentry);
-                if (d_is_negative(path.dentry))
-                        goto out_path_put;
        }
        if (flags & WALK_PUT)
@@ -1759,10 +1762,6 @@ static int walk_component(struct nameidata *nd, int flags)
        nd->inode = inode;
        nd->seq = seq;
        return 0;
-out_path_put:
-        path_to_nameidata(&path, nd);
-        return err;
 }
 /*
@@ -2368,21 +2367,9 @@ struct dentry *lookup_one_len_unlocked(const char *name,
        if (err)
                return ERR_PTR(err);
-        /*
+        ret = lookup_dcache(&this, base, 0);
-         * __d_lookup() is used to try to get a quick answer and avoid the
+        if (!ret)
-         * mutex.  A false-negative does no harm.
+                ret = lookup_slow(&this, base, 0);
-         */
-        ret = __d_lookup(base, &this);
-        if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
-                dput(ret);
-                ret = NULL;
-        }
-        if (ret)
-                return ret;
-        inode_lock(base->d_inode);
-        ret =  __lookup_hash(&this, base, 0);
-        inode_unlock(base->d_inode);
        return ret;
 }
 EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2460,31 +2447,21 @@ mountpoint_last(struct nameidata *nd, struct path *path)
                if (error)
                        return error;
                dentry = dget(nd->path.dentry);
-                goto done;
+        } else {
-        }
+                dentry = d_lookup(dir, &nd->last);
-        inode_lock(dir->d_inode);
-        dentry = d_lookup(dir, &nd->last);
-        if (!dentry) {
-                /*
-                 * No cached dentry. Mounted dentries are pinned in the cache,
-                 * so that means that this dentry is probably a symlink or the
-                 * path doesn't actually point to a mounted dentry.
-                 */
-                dentry = d_alloc(dir, &nd->last);
                if (!dentry) {
-                        inode_unlock(dir->d_inode);
+                        /*
-                        return -ENOMEM;
+                         * No cached dentry. Mounted dentries are pinned in the
-                }
+                         * cache, so that means that this dentry is probably
-                dentry = lookup_real(dir->d_inode, dentry, nd->flags);
+                         * a symlink or the path doesn't actually point
-                if (IS_ERR(dentry)) {
+                         * to a mounted dentry.
-                        inode_unlock(dir->d_inode);
+                         */
-                        return PTR_ERR(dentry);
+                        dentry = lookup_slow(&nd->last, dir,
+                                             nd->flags | LOOKUP_NO_REVAL);
+                        if (IS_ERR(dentry))
+                                return PTR_ERR(dentry);
                }
        }
-        inode_unlock(dir->d_inode);
-done:
        if (d_is_negative(dentry)) {
                dput(dentry);
                return -ENOENT;
@@ -3013,16 +2990,22 @@ static int lookup_open(struct nameidata *nd, struct path *path,
        struct inode *dir_inode = dir->d_inode;
        struct dentry *dentry;
        int error;
-        bool need_lookup;
+        bool need_lookup = false;
        *opened &= ~FILE_CREATED;
-        dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
+        dentry = lookup_dcache(&nd->last, dir, nd->flags);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
-        /* Cached positive dentry: will open in f_op->open */
+        if (!dentry) {
-        if (!need_lookup && dentry->d_inode)
+                dentry = d_alloc(dir, &nd->last);
+                if (unlikely(!dentry))
+                        return -ENOMEM;
+                need_lookup = true;
+        } else if (dentry->d_inode) {
+                /* Cached positive dentry: will open in f_op->open */
                goto out_no_open;
+        }
        if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
                return atomic_open(nd, dentry, path, file, op, got_write,
@@ -3106,13 +3089,14 @@ static int do_last(struct nameidata *nd,
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                /* we _can_ be in RCU mode here */
                error = lookup_fast(nd, &path, &inode, &seq);
-                if (likely(!error))
+                if (likely(error > 0))
                        goto finish_lookup;
                if (error < 0)
                        return error;
                BUG_ON(nd->inode != dir->d_inode);
+                BUG_ON(nd->flags & LOOKUP_RCU);
        } else {
                /* create side of things */
                /*
@@ -3167,12 +3151,6 @@ retry_lookup:
        }
        /*
-         * create/update audit record if it already exists.
-         */
-        if (d_is_positive(path.dentry))
-                audit_inode(nd->name, path.dentry, 0);
-        /*
         * If atomic_open() acquired write access it is dropped now due to
         * possible mount and symlink following (this might be optimized away if
         * necessary...)
@@ -3182,6 +3160,16 @@ retry_lookup:
                got_write = false;
        }
+        if (unlikely(d_is_negative(path.dentry))) {
+                path_to_nameidata(&path, nd);
+                return -ENOENT;
+        }
+        /*
+         * create/update audit record if it already exists.
+         */
+        audit_inode(nd->name, path.dentry, 0);
        if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
                path_to_nameidata(&path, nd);
                return -EEXIST;
@@ -3191,13 +3179,8 @@ retry_lookup:
        if (unlikely(error < 0))
                return error;
-        BUG_ON(nd->flags & LOOKUP_RCU);
-        inode = d_backing_inode(path.dentry);
        seq = 0;        /* out of RCU mode, so the value doesn't matter */
-        if (unlikely(d_is_negative(path.dentry))) {
+        inode = d_backing_inode(path.dentry);
-                path_to_nameidata(&path, nd);
-                return -ENOENT;
-        }
 finish_lookup:
        if (nd->depth)
                put_link(nd);
@@ -3206,11 +3189,6 @@ finish_lookup:
        if (unlikely(error))
                return error;
-        if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
-                path_to_nameidata(&path, nd);
-                return -ELOOP;
-        }
        if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
                path_to_nameidata(&path, nd);
        } else {
@@ -3229,6 +3207,10 @@ finish_open:
                return error;
        }
        audit_inode(nd->name, nd->path.dentry, 0);
+        if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
+                error = -ELOOP;
+                goto out;
+        }
        error = -EISDIR;
        if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
                goto out;
@@ -3273,6 +3255,10 @@ opened:
                        goto exit_fput;
        }
 out:
+        if (unlikely(error > 0)) {
+                WARN_ON(1);
+                error = -EINVAL;
+        }
        if (got_write)
                mnt_drop_write(nd->path.mnt);
        path_put(&save_parent);
@@ -3699,31 +3685,6 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
        return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
-/*
- * The dentry_unhash() helper will try to drop the dentry early: we
- * should have a usage count of 1 if we're the only user of this
- * dentry, and if that is true (possibly after pruning the dcache),
- * then we drop the dentry now.
- *
- * A low-level filesystem can, if it choses, legally
- * do a
- *
- *      if (!d_unhashed(dentry))
- *              return -EBUSY;
- *
- * if it cannot handle the case of removing a directory
- * that is still in use by something else..
- */
-void dentry_unhash(struct dentry *dentry)
-{
-        shrink_dcache_parent(dentry);
-        spin_lock(&dentry->d_lock);
-        if (dentry->d_lockref.count == 1)
-                __d_drop(dentry);
-        spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(dentry_unhash);
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        int error = may_delete(dir, dentry, 1);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 26c2de2de13f..b7f8eaeea5d8 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
                                d_rehash(newdent);
                } else {
                        spin_lock(&dentry->d_lock);
-                        NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+                        NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
                        spin_unlock(&dentry->d_lock);
                }
        } else {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9cce67043f92..4bfa7d8bcade 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1360,19 +1360,15 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
        dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
        nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
-        res = ERR_PTR(-ENAMETOOLONG);
+        if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen))
-        if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+                return ERR_PTR(-ENAMETOOLONG);
-                goto out;
        /*
         * If we're doing an exclusive create, optimize away the lookup
         * but don't hash the dentry.
         */
-        if (nfs_is_exclusive_create(dir, flags)) {
+        if (nfs_is_exclusive_create(dir, flags))
-                d_instantiate(dentry, NULL);
+                return NULL;
-                res = NULL;
-                goto out;
-        }
        res = ERR_PTR(-ENOMEM);
        fhandle = nfs_alloc_fhandle();
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 5bcd92d50e82..0cb1abd535e3 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1215,7 +1215,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
                                        hdr->pgio_mirror_idx + 1,
                                        &hdr->pgio_mirror_idx))
                        goto out_eagain;
-                set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+                set_bit(NFS_LAYOUT_RETURN_REQUESTED,
                        &hdr->lseg->pls_layout->plh_flags);
                pnfs_read_resend_pnfs(hdr);
                return task->tk_status;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 29898a9550fa..eb370460ce20 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -412,7 +412,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                                         OP_ILLEGAL, GFP_NOIO);
                if (!fail_return) {
                        if (ff_layout_has_available_ds(lseg))
-                                set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+                                set_bit(NFS_LAYOUT_RETURN_REQUESTED,
                                        &lseg->pls_layout->plh_flags);
                        else
                                pnfs_error_mark_layout_for_return(ino, lseg);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bfc33ad0563..400a70b3be7b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2461,14 +2461,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
        dentry = opendata->dentry;
        if (d_really_is_negative(dentry)) {
-                /* FIXME: Is this d_drop() ever needed? */
+                struct dentry *alias;
                d_drop(dentry);
-                dentry = d_add_unique(dentry, igrab(state->inode));
+                alias = d_exact_alias(dentry, state->inode);
-                if (dentry == NULL) {
+                if (!alias)
-                        dentry = opendata->dentry;
+                        alias = d_splice_alias(igrab(state->inode), dentry);
-                } else if (dentry != ctx->dentry) {
+                /* d_splice_alias() can't fail here - it's a non-directory */
+                if (alias) {
                        dput(ctx->dentry);
-                        ctx->dentry = dget(dentry);
+                        ctx->dentry = dentry = alias;
                }
                nfs_set_verifier(dentry,
                                nfs_save_change_attribute(d_inode(opendata->dir)));
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3592cc34a20..482b6e94bb37 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
 */
 static LIST_HEAD(pnfs_modules_tbl);
-static int
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
-                       enum pnfs_iomode iomode, bool sync);
 /* Return the registered pnfs layout driver module matching given id */
 static struct pnfs_layoutdriver_type *
@@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
        struct inode *inode = lo->plh_inode;
+        pnfs_layoutreturn_before_put_layout_hdr(lo);
        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                if (!list_empty(&lo->plh_segs))
                        WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
@@ -345,58 +345,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
 }
-/* Return true if layoutreturn is needed */
-static bool
-pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
-                        struct pnfs_layout_segment *lseg)
-{
-        struct pnfs_layout_segment *s;
-        if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
-                return false;
-        list_for_each_entry(s, &lo->plh_segs, pls_list)
-                if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
-                        return false;
-        return true;
-}
-static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
-{
-        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
-                return false;
-        lo->plh_return_iomode = 0;
-        pnfs_get_layout_hdr(lo);
-        clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
-        return true;
-}
-static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
-                struct pnfs_layout_hdr *lo, struct inode *inode)
-{
-        lo = lseg->pls_layout;
-        inode = lo->plh_inode;
-        spin_lock(&inode->i_lock);
-        if (pnfs_layout_need_return(lo, lseg)) {
-                nfs4_stateid stateid;
-                enum pnfs_iomode iomode;
-                bool send;
-                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
-                iomode = lo->plh_return_iomode;
-                send = pnfs_prepare_layoutreturn(lo);
-                spin_unlock(&inode->i_lock);
-                if (send) {
-                        /* Send an async layoutreturn so we dont deadlock */
-                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
-                }
-        } else
-                spin_unlock(&inode->i_lock);
-}
 void
 pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -410,15 +358,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-        /* Handle the case where refcount != 1 */
-        if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
-                return;
        lo = lseg->pls_layout;
        inode = lo->plh_inode;
-        /* Do we need a layoutreturn? */
-        if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
-                pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
@@ -937,6 +878,17 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+                return false;
+        lo->plh_return_iomode = 0;
+        pnfs_get_layout_hdr(lo);
+        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+        return true;
+}
 static int
 pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
                       enum pnfs_iomode iomode, bool sync)
@@ -971,6 +923,48 @@ out:
        return status;
 }
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+        struct pnfs_layout_segment *s;
+        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+                return false;
+        /* Defer layoutreturn until all lsegs are done */
+        list_for_each_entry(s, &lo->plh_segs, pls_list) {
+                if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+                        return false;
+        }
+        return true;
+}
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+        struct inode *inode= lo->plh_inode;
+        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+                return;
+        spin_lock(&inode->i_lock);
+        if (pnfs_layout_need_return(lo)) {
+                nfs4_stateid stateid;
+                enum pnfs_iomode iomode;
+                bool send;
+                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+                iomode = lo->plh_return_iomode;
+                send = pnfs_prepare_layoutreturn(lo);
+                spin_unlock(&inode->i_lock);
+                if (send) {
+                        /* Send an async layoutreturn so we dont deadlock */
+                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+                }
+        } else
+                spin_unlock(&inode->i_lock);
+}
 /*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
@@ -1091,7 +1085,7 @@ bool pnfs_roc(struct inode *ino)
        nfs4_stateid_copy(&stateid, &lo->plh_stateid);
        /* always send layoutreturn if being marked so */
-        if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+        if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
                                   &lo->plh_flags))
                layoutreturn = pnfs_prepare_layoutreturn(lo);
@@ -1772,7 +1766,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                        pnfs_set_plh_return_iomode(lo, return_range->iomode);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
-                        set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+                        set_bit(NFS_LAYOUT_RETURN_REQUESTED,
                                        &lo->plh_flags);
                }
        return remaining;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9f4e2a47f4aa..1ac1db5f6dad 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,8 +94,8 @@ enum {
        NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
        NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
        NFS_LAYOUT_BULK_RECALL,         /* bulk recall affecting layout */
-        NFS_LAYOUT_RETURN,              /* Return this layout ASAP */
+        NFS_LAYOUT_RETURN,              /* layoutreturn in progress */
-        NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
+        NFS_LAYOUT_RETURN_REQUESTED,    /* Return this layout ASAP */
        NFS_LAYOUT_INVALID_STID,        /* layout stateid id is invalid */
        NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first layoutget */
 };
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5d2a57e4c03a..d40010e4f1a9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -870,7 +870,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
        oldfs = get_fs();
        set_fs(KERNEL_DS);
-        host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+        host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
        set_fs(oldfs);
        return nfsd_finish_read(file, count, host_err);
 }
@@ -957,7 +957,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        /* Write the data. */
        oldfs = get_fs(); set_fs(KERNEL_DS);
-        host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);
+        host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
        set_fs(oldfs);
        if (host_err < 0)
                goto out_nfserr;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a3cc6d2fc896..a76b9ea7722e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1254,15 +1254,15 @@ static const struct file_operations o2hb_debug_fops = {
 void o2hb_exit(void)
 {
-        kfree(o2hb_db_livenodes);
-        kfree(o2hb_db_liveregions);
-        kfree(o2hb_db_quorumregions);
-        kfree(o2hb_db_failedregions);
        debugfs_remove(o2hb_debug_failedregions);
        debugfs_remove(o2hb_debug_quorumregions);
        debugfs_remove(o2hb_debug_liveregions);
        debugfs_remove(o2hb_debug_livenodes);
        debugfs_remove(o2hb_debug_dir);
+        kfree(o2hb_db_livenodes);
+        kfree(o2hb_db_liveregions);
+        kfree(o2hb_db_quorumregions);
+        kfree(o2hb_db_failedregions);
 }
 static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
@@ -1438,13 +1438,19 @@ static void o2hb_region_release(struct config_item *item)
        kfree(reg->hr_slots);
-        kfree(reg->hr_db_regnum);
-        kfree(reg->hr_db_livenodes);
        debugfs_remove(reg->hr_debug_livenodes);
        debugfs_remove(reg->hr_debug_regnum);
        debugfs_remove(reg->hr_debug_elapsed_time);
        debugfs_remove(reg->hr_debug_pinned);
        debugfs_remove(reg->hr_debug_dir);
+        /*
+         * Free only the hr_db_* buffers, and only after the debugfs files are
+         * gone; the hr_debug_* pointers were handed to debugfs_remove() above.
+         */
+        kfree(reg->hr_db_livenodes);
+        kfree(reg->hr_db_regnum);
+        kfree(reg->hr_db_elapsed_time);
+        kfree(reg->hr_db_pinned);
        spin_lock(&o2hb_live_lock);
        list_del(&reg->hr_all_item);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index c5bdf02c213b..b94a425f0175 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2367,6 +2367,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                                                break;
                                        }
                                }
+                                dlm_lockres_clear_refmap_bit(dlm, res,
+                                                dead_node);
                                spin_unlock(&res->spinlock);
                                continue;
                        }
diff --git a/fs/pnode.c b/fs/pnode.c
index 6367e1e435c6..c524fdddc7fb 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master;
 static struct mountpoint *mp;
 static struct hlist_head *list;
+static inline bool peers(struct mount *m1, struct mount *m2)
+{
+        return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
 static int propagate_one(struct mount *m)
 {
        struct mount *child;
@@ -212,7 +217,7 @@ static int propagate_one(struct mount *m)
        /* skip if mountpoint isn't covered by it */
        if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
                return 0;
-        if (m->mnt_group_id == last_dest->mnt_group_id) {
+        if (peers(m, last_dest)) {
                type = CL_MAKE_SHARED;
        } else {
                struct mount *n, *p;
@@ -223,7 +228,7 @@ static int propagate_one(struct mount *m)
                                        last_source = last_source->mnt_master;
                                        last_dest = last_source->mnt_parent;
                                }
-                                if (n->mnt_group_id != last_dest->mnt_group_id) {
+                                if (!peers(n, last_dest)) {
                                        last_source = last_source->mnt_master;
                                        last_dest = last_source->mnt_parent;
                                }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 85d16c67c33e..fa95ab2d3674 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
                                sizeof(struct proc_maps_private));
 }
-static pid_t pid_of_stack(struct proc_maps_private *priv,
+/*
-                                struct vm_area_struct *vma, bool is_pid)
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct proc_maps_private *priv,
+                    struct vm_area_struct *vma, int is_pid)
 {
-        struct inode *inode = priv->inode;
+        int stack = 0;
-        struct task_struct *task;
-        pid_t ret = 0;
+        if (is_pid) {
+                stack = vma->vm_start <= vma->vm_mm->start_stack &&
+                        vma->vm_end >= vma->vm_mm->start_stack;
+        } else {
+                struct inode *inode = priv->inode;
+                struct task_struct *task;
-        rcu_read_lock();
+                rcu_read_lock();
-        task = pid_task(proc_pid(inode), PIDTYPE_PID);
+                task = pid_task(proc_pid(inode), PIDTYPE_PID);
-        if (task) {
-                task = task_of_stack(task, vma, is_pid);
                if (task)
-                        ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+                        stack = vma_is_stack_for_task(vma, task);
+                rcu_read_unlock();
        }
-        rcu_read_unlock();
+        return stack;
-        return ret;
 }
 static void
@@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
        name = arch_vma_name(vma);
        if (!name) {
-                pid_t tid;
                if (!mm) {
                        name = "[vdso]";
                        goto done;
@@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
                        goto done;
                }
-                tid = pid_of_stack(priv, vma, is_pid);
+                if (is_stack(priv, vma, is_pid))
-                if (tid != 0) {
+                        name = "[stack]";
-                        /*
-                         * Thread stack in /proc/PID/task/TID/maps or
-                         * the main process stack.
-                         */
-                        if (!is_pid || (vma->vm_start <= mm->start_stack &&
-                            vma->vm_end >= mm->start_stack)) {
-                                name = "[stack]";
-                        } else {
-                                /* Thread stack in /proc/PID/maps */
-                                seq_pad(m, ' ');
-                                seq_printf(m, "[stack:%d]", tid);
-                        }
-                }
        }
 done:
@@ -1552,18 +1543,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
                unsigned long addr, unsigned long end, struct mm_walk *walk)
 {
+        pte_t huge_pte = huge_ptep_get(pte);
        struct numa_maps *md;
        struct page *page;
-        if (!pte_present(*pte))
+        if (!pte_present(huge_pte))
                return 0;
-        page = pte_page(*pte);
+        page = pte_page(huge_pte);
        if (!page)
                return 0;
        md = walk->private;
-        gather_stats(page, md, pte_dirty(*pte), 1);
+        gather_stats(page, md, pte_dirty(huge_pte), 1);
        return 0;
 }
@@ -1617,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
                seq_file_path(m, file, "\n\t= ");
        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
                seq_puts(m, " heap");
-        } else {
+        } else if (is_stack(proc_priv, vma, is_pid)) {
-                pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
+                seq_puts(m, " stack");
-                if (tid != 0) {
-                        /*
-                         * Thread stack in /proc/PID/task/TID/maps or
-                         * the main process stack.
-                         */
-                        if (!is_pid || (vma->vm_start <= mm->start_stack &&
-                            vma->vm_end >= mm->start_stack))
-                                seq_puts(m, " stack");
-                        else
-                                seq_printf(m, " stack:%d", tid);
-                }
        }
        if (is_vm_hugetlb_page(vma))
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index e0d64c92e4f6..faacb0c0d857 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
        return size;
 }
-static pid_t pid_of_stack(struct proc_maps_private *priv,
+static int is_stack(struct proc_maps_private *priv,
-                                struct vm_area_struct *vma, bool is_pid)
+                    struct vm_area_struct *vma, int is_pid)
 {
-        struct inode *inode = priv->inode;
+        struct mm_struct *mm = vma->vm_mm;
-        struct task_struct *task;
+        int stack = 0;
-        pid_t ret = 0;
+        if (is_pid) {
-        rcu_read_lock();
+                stack = vma->vm_start <= mm->start_stack &&
-        task = pid_task(proc_pid(inode), PIDTYPE_PID);
+                        vma->vm_end >= mm->start_stack;
-        if (task) {
+        } else {
-                task = task_of_stack(task, vma, is_pid);
+                struct inode *inode = priv->inode;
+                struct task_struct *task;
+                rcu_read_lock();
+                task = pid_task(proc_pid(inode), PIDTYPE_PID);
                if (task)
-                        ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+                        stack = vma_is_stack_for_task(vma, task);
+                rcu_read_unlock();
        }
-        rcu_read_unlock();
+        return stack;
-        return ret;
 }
 /*
@@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
        if (file) {
                seq_pad(m, ' ');
                seq_file_path(m, file, "");
-        } else if (mm) {
+        } else if (mm && is_stack(priv, vma, is_pid)) {
-                pid_t tid = pid_of_stack(priv, vma, is_pid);
+                seq_pad(m, ' ');
+                seq_printf(m, "[stack]");
-                if (tid != 0) {
-                        seq_pad(m, ' ');
-                        /*
-                         * Thread stack in /proc/PID/task/TID/maps or
-                         * the main process stack.
-                         */
-                        if (!is_pid || (vma->vm_start <= mm->start_stack &&
-                            vma->vm_end >= mm->start_stack))
-                                seq_printf(m, "[stack]");
-                        else
-                                seq_printf(m, "[stack:%d]", tid);
-                }
        }
        seq_putc(m, '\n');
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3c3b81bb6dfe..04ca0cc6d065 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2430,9 +2430,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
        struct dentry *dentry;
        int error;
-        inode_lock(d_inode(sb->s_root));
+        dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
-        dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
-        inode_unlock(d_inode(sb->s_root));
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
diff --git a/fs/read_write.c b/fs/read_write.c
index 324ec271cc4e..cf377cf9dfe3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -17,6 +17,7 @@
 #include <linux/splice.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
+#include <linux/fs.h>
 #include "internal.h"
 #include <asm/uaccess.h>
@@ -183,7 +184,7 @@ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
        switch (whence) {
        case SEEK_SET: case SEEK_CUR:
                return generic_file_llseek_size(file, offset, whence,
-                                                ~0ULL, 0);
+                                                OFFSET_MAX, 0);
        default:
                return -EINVAL;
        }
@@ -692,12 +693,17 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 EXPORT_SYMBOL(iov_shorten);
 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
-                loff_t *ppos, iter_fn_t fn)
+                loff_t *ppos, iter_fn_t fn, int flags)
 {
        struct kiocb kiocb;
        ssize_t ret;
+        if (flags & ~RWF_HIPRI)
+                return -EOPNOTSUPP;
        init_sync_kiocb(&kiocb, filp);
+        if (flags & RWF_HIPRI)
+                kiocb.ki_flags |= IOCB_HIPRI;
        kiocb.ki_pos = *ppos;
        ret = fn(&kiocb, iter);
@@ -708,10 +714,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 /* Do it by hand, with file-ops */
 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
-                loff_t *ppos, io_fn_t fn)
+                loff_t *ppos, io_fn_t fn, int flags)
 {
        ssize_t ret = 0;
+        if (flags & ~RWF_HIPRI)
+                return -EOPNOTSUPP;
        while (iov_iter_count(iter)) {
                struct iovec iovec = iov_iter_iovec(iter);
                ssize_t nr;
@@ -812,7 +821,8 @@ out:
 static ssize_t do_readv_writev(int type, struct file *file,
                               const struct iovec __user * uvector,
-                               unsigned long nr_segs, loff_t *pos)
+                               unsigned long nr_segs, loff_t *pos,
+                               int flags)
 {
        size_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
@@ -844,9 +854,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
        }
        if (iter_fn)
-                ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+                ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
        else
-                ret = do_loop_readv_writev(file, &iter, pos, fn);
+                ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
        if (type != READ)
                file_end_write(file);
@@ -863,40 +873,40 @@ out:
 }
 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
-                  unsigned long vlen, loff_t *pos)
+                  unsigned long vlen, loff_t *pos, int flags)
 {
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
-        return do_readv_writev(READ, file, vec, vlen, pos);
+        return do_readv_writev(READ, file, vec, vlen, pos, flags);
 }
 EXPORT_SYMBOL(vfs_readv);
 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
-                   unsigned long vlen, loff_t *pos)
+                   unsigned long vlen, loff_t *pos, int flags)
 {
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
-        return do_readv_writev(WRITE, file, vec, vlen, pos);
+        return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
 }
 EXPORT_SYMBOL(vfs_writev);
-SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
-                unsigned long, vlen)
+                        unsigned long vlen, int flags)
 {
        struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
        if (f.file) {
                loff_t pos = file_pos_read(f.file);
-                ret = vfs_readv(f.file, vec, vlen, &pos);
+                ret = vfs_readv(f.file, vec, vlen, &pos, flags);
                if (ret >= 0)
                        file_pos_write(f.file, pos);
                fdput_pos(f);
@@ -908,15 +918,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
        return ret;
 }
-SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
-                unsigned long, vlen)
+                         unsigned long vlen, int flags)
 {
        struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
        if (f.file) {
                loff_t pos = file_pos_read(f.file);
-                ret = vfs_writev(f.file, vec, vlen, &pos);
+                ret = vfs_writev(f.file, vec, vlen, &pos, flags);
                if (ret >= 0)
                        file_pos_write(f.file, pos);
                fdput_pos(f);
@@ -934,10 +944,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 }
-SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
-                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+                         unsigned long vlen, loff_t pos, int flags)
 {
-        loff_t pos = pos_from_hilo(pos_h, pos_l);
        struct fd f;
        ssize_t ret = -EBADF;
@@ -948,7 +957,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
        if (f.file) {
                ret = -ESPIPE;
                if (f.file->f_mode & FMODE_PREAD)
-                        ret = vfs_readv(f.file, vec, vlen, &pos);
+                        ret = vfs_readv(f.file, vec, vlen, &pos, flags);
                fdput(f);
        }
@@ -958,10 +967,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
        return ret;
 }
-SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
-                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+                          unsigned long vlen, loff_t pos, int flags)
 {
-        loff_t pos = pos_from_hilo(pos_h, pos_l);
        struct fd f;
        ssize_t ret = -EBADF;
@@ -972,7 +980,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
        if (f.file) {
                ret = -ESPIPE;
                if (f.file->f_mode & FMODE_PWRITE)
-                        ret = vfs_writev(f.file, vec, vlen, &pos);
+                        ret = vfs_writev(f.file, vec, vlen, &pos, flags);
                fdput(f);
        }
@@ -982,11 +990,64 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
        return ret;
 }
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+                unsigned long, vlen)
+{
+        return do_readv(fd, vec, vlen, 0);
+}
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+                unsigned long, vlen)
+{
+        return do_writev(fd, vec, vlen, 0);
+}
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+        loff_t pos = pos_from_hilo(pos_h, pos_l);
+        return do_preadv(fd, vec, vlen, pos, 0);
+}
+SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
+                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+                int, flags)
+{
+        loff_t pos = pos_from_hilo(pos_h, pos_l);
+        if (pos == -1)
+                return do_readv(fd, vec, vlen, flags);
+        return do_preadv(fd, vec, vlen, pos, flags);
+}
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+        loff_t pos = pos_from_hilo(pos_h, pos_l);
+        return do_pwritev(fd, vec, vlen, pos, 0);
+}
+SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
+                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+                int, flags)
+{
+        loff_t pos = pos_from_hilo(pos_h, pos_l);
+        if (pos == -1)
+                return do_writev(fd, vec, vlen, flags);
+        return do_pwritev(fd, vec, vlen, pos, flags);
+}
 #ifdef CONFIG_COMPAT
 static ssize_t compat_do_readv_writev(int type, struct file *file,
                               const struct compat_iovec __user *uvector,
-                               unsigned long nr_segs, loff_t *pos)
+                               unsigned long nr_segs, loff_t *pos,
+                               int flags)
 {
        compat_ssize_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
@@ -1018,9 +1079,9 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
        }
        if (iter_fn)
-                ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+                ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
        else
-                ret = do_loop_readv_writev(file, &iter, pos, fn);
+                ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
        if (type != READ)
                file_end_write(file);
@@ -1038,7 +1099,7 @@ out:
 static size_t compat_readv(struct file *file,
                           const struct compat_iovec __user *vec,
-                           unsigned long vlen, loff_t *pos)
+                           unsigned long vlen, loff_t *pos, int flags)
 {
        ssize_t ret = -EBADF;
@@ -1049,7 +1110,7 @@ static size_t compat_readv(struct file *file,
        if (!(file->f_mode & FMODE_CAN_READ))
                goto out;
-        ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
+        ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
 out:
        if (ret > 0)
@@ -1058,9 +1119,9 @@ out:
        return ret;
 }
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
+static size_t do_compat_readv(compat_ulong_t fd,
-                const struct compat_iovec __user *,vec,
+                                 const struct compat_iovec __user *vec,
-                compat_ulong_t, vlen)
+                                 compat_ulong_t vlen, int flags)
 {
        struct fd f = fdget_pos(fd);
        ssize_t ret;
@@ -1069,16 +1130,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
        if (!f.file)
                return -EBADF;
        pos = f.file->f_pos;
-        ret = compat_readv(f.file, vec, vlen, &pos);
+        ret = compat_readv(f.file, vec, vlen, &pos, flags);
        if (ret >= 0)
                f.file->f_pos = pos;
        fdput_pos(f);
        return ret;
 }
-static long __compat_sys_preadv64(unsigned long fd,
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
+                const struct compat_iovec __user *,vec,
+                compat_ulong_t, vlen)
+{
+        return do_compat_readv(fd, vec, vlen, 0);
+}
+static long do_compat_preadv64(unsigned long fd,
                                  const struct compat_iovec __user *vec,
-                                  unsigned long vlen, loff_t pos)
+                                  unsigned long vlen, loff_t pos, int flags)
 {
        struct fd f;
        ssize_t ret;
@@ -1090,7 +1159,7 @@ static long __compat_sys_preadv64(unsigned long fd,
                return -EBADF;
        ret = -ESPIPE;
        if (f.file->f_mode & FMODE_PREAD)
-                ret = compat_readv(f.file, vec, vlen, &pos);
+                ret = compat_readv(f.file, vec, vlen, &pos, flags);
        fdput(f);
        return ret;
 }
@@ -1100,7 +1169,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos)
 {
-        return __compat_sys_preadv64(fd, vec, vlen, pos);
+        return do_compat_preadv64(fd, vec, vlen, pos, 0);
 }
 #endif
@@ -1110,12 +1179,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
 {
        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
-        return __compat_sys_preadv64(fd, vec, vlen, pos);
+        return do_compat_preadv64(fd, vec, vlen, pos, 0);
+}
+COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
+                const struct compat_iovec __user *,vec,
+                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
+                int, flags)
+{
+        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+        if (pos == -1)
+                return do_compat_readv(fd, vec, vlen, flags);
+        return do_compat_preadv64(fd, vec, vlen, pos, flags);
 }
 static size_t compat_writev(struct file *file,
                            const struct compat_iovec __user *vec,
-                            unsigned long vlen, loff_t *pos)
+                            unsigned long vlen, loff_t *pos, int flags)
 {
        ssize_t ret = -EBADF;
@@ -1126,7 +1208,7 @@ static size_t compat_writev(struct file *file,
        if (!(file->f_mode & FMODE_CAN_WRITE))
                goto out;
-        ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
+        ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
 out:
        if (ret > 0)
@@ -1135,9 +1217,9 @@ out:
        return ret;
 }
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
+static size_t do_compat_writev(compat_ulong_t fd,
-                const struct compat_iovec __user *, vec,
+                                  const struct compat_iovec __user *vec,
-                compat_ulong_t, vlen)
+                                  compat_ulong_t vlen, int flags)
 {
        struct fd f = fdget_pos(fd);
        ssize_t ret;
@@ -1146,16 +1228,23 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
        if (!f.file)
                return -EBADF;
        pos = f.file->f_pos;
-        ret = compat_writev(f.file, vec, vlen, &pos);
+        ret = compat_writev(f.file, vec, vlen, &pos, flags);
        if (ret >= 0)
                f.file->f_pos = pos;
        fdput_pos(f);
        return ret;
 }
-static long __compat_sys_pwritev64(unsigned long fd,
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
+                const struct compat_iovec __user *, vec,
+                compat_ulong_t, vlen)
+{
+        return do_compat_writev(fd, vec, vlen, 0);
+}
+static long do_compat_pwritev64(unsigned long fd,
                                   const struct compat_iovec __user *vec,
-                                   unsigned long vlen, loff_t pos)
+                                   unsigned long vlen, loff_t pos, int flags)
 {
        struct fd f;
        ssize_t ret;
@@ -1167,7 +1256,7 @@ static long __compat_sys_pwritev64(unsigned long fd,
                return -EBADF;
        ret = -ESPIPE;
        if (f.file->f_mode & FMODE_PWRITE)
-                ret = compat_writev(f.file, vec, vlen, &pos);
+                ret = compat_writev(f.file, vec, vlen, &pos, flags);
        fdput(f);
        return ret;
 }
@@ -1177,7 +1266,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos)
 {
-        return __compat_sys_pwritev64(fd, vec, vlen, pos);
+        return do_compat_pwritev64(fd, vec, vlen, pos, 0);
 }
 #endif
@@ -1187,8 +1276,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
 {
        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
-        return __compat_sys_pwritev64(fd, vec, vlen, pos);
+        return do_compat_pwritev64(fd, vec, vlen, pos, 0);
 }
+COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
+                const struct compat_iovec __user *,vec,
+                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
+{
+        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+        if (pos == -1)
+                return do_compat_writev(fd, vec, vlen, flags);
+        return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+}
 #endif
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
@@ -1532,10 +1634,12 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
        if (!(file_in->f_mode & FMODE_READ) ||
            !(file_out->f_mode & FMODE_WRITE) ||
-            (file_out->f_flags & O_APPEND) ||
+            (file_out->f_flags & O_APPEND))
-            !file_in->f_op->clone_file_range)
                return -EBADF;
+        if (!file_in->f_op->clone_file_range)
+                return -EOPNOTSUPP;
        ret = clone_verify_area(file_in, pos_in, len, false);
        if (ret)
                return ret;
diff --git a/fs/splice.c b/fs/splice.c
index 19e0b103d253..9947b5c69664 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -580,7 +580,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
        old_fs = get_fs();
        set_fs(get_ds());
        /* The cast to a user pointer is valid due to the set_fs() */
-        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
        set_fs(old_fs);
        return res;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b94fa6c3c6eb..053818dd6c18 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -153,7 +153,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
        if (isalarm(ctx))
                remaining = alarm_expires_remaining(&ctx->t.alarm);
        else
-                remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+                remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr);
        return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
diff --git a/fs/xattr.c b/fs/xattr.c
index 07d0e47f6a7f..4861322e28e8 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -940,7 +940,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
        bool trusted = capable(CAP_SYS_ADMIN);
        struct simple_xattr *xattr;
        ssize_t remaining_size = size;
-        int err;
+        int err = 0;
 #ifdef CONFIG_FS_POSIX_ACL
        if (inode->i_acl) {
@@ -965,11 +965,11 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
                err = xattr_list_one(&buffer, &remaining_size, xattr->name);
                if (err)
-                        return err;
+                        break;
        }
        spin_unlock(&xattrs->lock);
-        return size - remaining_size;
+        return err ? err : size - remaining_size;
 }
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index da37beb76f6e..594f7e63b432 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4491,7 +4491,7 @@ xlog_recover_process(
         * know precisely what failed.
         */
        if (pass == XLOG_RECOVER_CRCPASS) {
-                if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc))
+                if (rhead->h_crc && crc != rhead->h_crc)
                        return -EFSBADCRC;
                return 0;
        }
@@ -4502,7 +4502,7 @@ xlog_recover_process(
         * zero CRC check prevents warnings from being emitted when upgrading
         * the kernel from one that does not add CRCs by default.
         */
-        if (crc != le32_to_cpu(rhead->h_crc)) {
+        if (crc != rhead->h_crc) {
                if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
                        xfs_alert(log->l_mp,
                "log record CRC mismatch: found 0x%x, expected 0x%x.",