22 files changed, 436 insertions, 185 deletions
diff --git a/fs/aio.c b/fs/aio.c
index a062d75109cb..6bcd3fb5265a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,9 +68,9 @@ struct aio_ring {
 #define AIO_RING_PAGES  8
+/*
+ * Table of kioctx pointers reached through mm->ioctx_table; a context is
+ * stored in the slot whose index equals its ctx->id.
+ */
 struct kioctx_table {
-        struct rcu_head rcu;
+        struct rcu_head         rcu;
-        unsigned        nr;
+        unsigned                nr;             /* number of slots in table[] */
-        struct kioctx   *table[];
+        struct kioctx __rcu     *table[];       /* accessed via RCU accessors */
 };
 struct kioctx_cpu {
@@ -115,7 +115,8 @@ struct kioctx {
        struct page             **ring_pages;
        long                    nr_pages;
-        struct work_struct      free_work;
+        struct rcu_head         free_rcu;
+        struct work_struct      free_work;      /* see free_ioctx() */
        /*
         * signals when all in-flight requests are done
@@ -329,7 +330,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
        for (i = 0; i < table->nr; i++) {
                struct kioctx *ctx;
-                ctx = table->table[i];
+                ctx = rcu_dereference(table->table[i]);
                if (ctx && ctx->aio_ring_file == file) {
                        if (!atomic_read(&ctx->dead)) {
                                ctx->user_id = ctx->mmap_base = vma->vm_start;
@@ -588,6 +589,12 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
        return cancel(&kiocb->common);
 }
+/*
+ * free_ioctx() should be RCU delayed to synchronize against the RCU
+ * protected lookup_ioctx() and also needs process context to call
+ * aio_free_ring(), so the double bouncing through kioctx->free_rcu and
+ * ->free_work.
+ */
 static void free_ioctx(struct work_struct *work)
 {
        struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -601,6 +608,14 @@ static void free_ioctx(struct work_struct *work)
        kmem_cache_free(kioctx_cachep, ctx);
 }
+/*
+ * RCU callback registered on kioctx->free_rcu: recover the owning kioctx
+ * from @head and queue free_ioctx() on its ->free_work item.
+ */
+static void free_ioctx_rcufn(struct rcu_head *head)
+{
+        struct kioctx *ctx = container_of(head, struct kioctx, free_rcu);
+        INIT_WORK(&ctx->free_work, free_ioctx);
+        schedule_work(&ctx->free_work);
+}
 static void free_ioctx_reqs(struct percpu_ref *ref)
 {
        struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
@@ -609,8 +624,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
        if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
                complete(&ctx->rq_wait->comp);
-        INIT_WORK(&ctx->free_work, free_ioctx);
+        /* Synchronize against RCU protected table->table[] dereferences */
-        schedule_work(&ctx->free_work);
+        call_rcu(&ctx->free_rcu, free_ioctx_rcufn);
 }
 /*
@@ -651,9 +666,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
        while (1) {
                if (table)
                        for (i = 0; i < table->nr; i++)
-                                if (!table->table[i]) {
+                                if (!rcu_access_pointer(table->table[i])) {
                                        ctx->id = i;
-                                        table->table[i] = ctx;
+                                        rcu_assign_pointer(table->table[i], ctx);
                                        spin_unlock(&mm->ioctx_lock);
                                        /* While kioctx setup is in progress,
@@ -834,11 +849,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
        }
        table = rcu_dereference_raw(mm->ioctx_table);
-        WARN_ON(ctx != table->table[ctx->id]);
+        WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
-        table->table[ctx->id] = NULL;
+        RCU_INIT_POINTER(table->table[ctx->id], NULL);
        spin_unlock(&mm->ioctx_lock);
-        /* percpu_ref_kill() will do the necessary call_rcu() */
+        /* free_ioctx_reqs() will do the necessary RCU synchronization */
        wake_up_all(&ctx->wait);
        /*
@@ -880,7 +895,8 @@ void exit_aio(struct mm_struct *mm)
        skipped = 0;
        for (i = 0; i < table->nr; ++i) {
-                struct kioctx *ctx = table->table[i];
+                struct kioctx *ctx =
+                        rcu_dereference_protected(table->table[i], true);
                if (!ctx) {
                        skipped++;
@@ -1069,7 +1085,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
        if (!table || id >= table->nr)
                goto out;
-        ctx = table->table[id];
+        ctx = rcu_dereference(table->table[id]);
        if (ctx && ctx->user_id == ctx_id) {
                percpu_ref_get(&ctx->users);
                ret = ctx;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f94b2d8c744a..26484648d090 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1519,6 +1519,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
                if (!node)
                        break;
                bytenr = node->val;
+                shared.share_count = 0;
                cond_resched();
        }
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index dec0907dfb8a..fcfc20de2df3 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1370,6 +1370,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
                stripe_start = stripe->physical;
                if (physical >= stripe_start &&
                    physical < stripe_start + rbio->stripe_len &&
+                    stripe->dev->bdev &&
                    bio->bi_disk == stripe->dev->bdev->bd_disk &&
                    bio->bi_partno == stripe->dev->bdev->bd_partno) {
                        return i;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index d11c70bff5a9..a8bafed931f4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -423,7 +423,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
-        return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->nodesize);
+        return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
 }
 BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -433,7 +433,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
-        return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->sectorsize);
+        return snprintf(buf, PAGE_SIZE, "%u\n",
+                        fs_info->super_copy->sectorsize);
 }
 BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -443,7 +444,8 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
-        return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->sectorsize);
+        return snprintf(buf, PAGE_SIZE, "%u\n",
+                        fs_info->super_copy->sectorsize);
 }
 BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9220f004001c..04f07144b45c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1722,23 +1722,19 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
        super = fs_info->super_copy;
-        /* update latest btrfs_super_block::chunk_root refs */
        root_item = &fs_info->chunk_root->root_item;
-        btrfs_set_super_chunk_root(super, root_item->bytenr);
+        super->chunk_root = root_item->bytenr;
-        btrfs_set_super_chunk_root_generation(super, root_item->generation);
+        super->chunk_root_generation = root_item->generation;
-        btrfs_set_super_chunk_root_level(super, root_item->level);
+        super->chunk_root_level = root_item->level;
-        /* update latest btrfs_super_block::root refs */
        root_item = &fs_info->tree_root->root_item;
-        btrfs_set_super_root(super, root_item->bytenr);
+        super->root = root_item->bytenr;
-        btrfs_set_super_generation(super, root_item->generation);
+        super->generation = root_item->generation;
-        btrfs_set_super_root_level(super, root_item->level);
+        super->root_level = root_item->level;
        if (btrfs_test_opt(fs_info, SPACE_CACHE))
-                btrfs_set_super_cache_generation(super, root_item->generation);
+                super->cache_generation = root_item->generation;
        if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
-                btrfs_set_super_uuid_tree_generation(super,
+                super->uuid_tree_generation = root_item->generation;
-                                                     root_item->generation);
 }
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
diff --git a/fs/dcache.c b/fs/dcache.c
index 7c38f39958bc..8945e6cabd93 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -647,11 +647,16 @@ again:
                spin_unlock(&parent->d_lock);
                goto again;
        }
-        rcu_read_unlock();
+        if (parent != dentry) {
-        if (parent != dentry)
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-        else
+                if (unlikely(dentry->d_lockref.count < 0)) {
+                        spin_unlock(&parent->d_lock);
+                        parent = NULL;
+                }
+        } else {
                parent = NULL;
+        }
+        rcu_read_unlock();
        return parent;
 }
@@ -2474,7 +2479,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
 retry:
        rcu_read_lock();
-        seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1;
+        seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
        r_seq = read_seqbegin(&rename_lock);
        dentry = __d_lookup_rcu(parent, name, &d_seq);
        if (unlikely(dentry)) {
@@ -2495,8 +2500,14 @@ retry:
                rcu_read_unlock();
                goto retry;
        }
+        if (unlikely(seq & 1)) {
+                rcu_read_unlock();
+                goto retry;
+        }
        hlist_bl_lock(b);
-        if (unlikely(parent->d_inode->i_dir_seq != seq)) {
+        if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
                hlist_bl_unlock(b);
                rcu_read_unlock();
                goto retry;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 86d6a4435c87..51f940e76c5e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -807,9 +807,6 @@ do_alloc:
                        iomap->length = hole_size(inode, lblock, &mp);
                else
                        iomap->length = size - pos;
-        } else {
-                if (height <= ip->i_height)
-                        iomap->length = hole_size(inode, lblock, &mp);
        }
        goto out_release;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8fe1b0aa2896..b9a254dcc0e7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec)
        pagevec_reinit(pvec);
 }
+/*
+ * Mask used when checking the page offset value passed in via system
+ * calls.  This value will be converted to a loff_t which is signed.
+ * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
+ * value.  The extra bit (the "+ 1" next to PAGE_SHIFT, which shows up as
+ * an additional "- 1" in the shift value) is to take the sign bit into
+ * account.
+ *
+ * NOTE(review): the mask is positioned using BITS_PER_LONG rather than the
+ * width of loff_t -- confirm the check is not overly strict on
+ * configurations where those two widths differ.
+ */
+#define PGOFF_LOFFT_MAX \
+        (((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct inode *inode = file_inode(file);
@@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        vma->vm_ops = &hugetlb_vm_ops;
        /*
-         * Offset passed to mmap (before page shift) could have been
+         * page based offset in vm_pgoff could be sufficiently large to
-         * negative when represented as a (l)off_t.
+         * overflow a (l)off_t when converted to byte offset.
         */
-        if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
+        if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
                return -EINVAL;
+        /* must be huge page aligned */
        if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                return -EINVAL;
diff --git a/fs/namei.c b/fs/namei.c
index 921ae32dbc80..cafa365eeb70 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd)
 static bool path_connected(const struct path *path)
 {
        struct vfsmount *mnt = path->mnt;
+        struct super_block *sb = mnt->mnt_sb;
-        /* Only bind mounts can have disconnected paths */
+        /* Bind mounts and multi-root filesystems can have disconnected paths */
-        if (mnt->mnt_root == mnt->mnt_sb->s_root)
+        if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
                return true;
        return is_subdir(path->dentry, mnt->mnt_root);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8c10b0562e75..621c517b325c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -86,10 +86,10 @@ struct nfs_direct_req {
        struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
        int                     mirror_count;
+        loff_t                  io_start;       /* Start offset for I/O */
        ssize_t                 count,          /* bytes actually processed */
                                max_count,      /* max expected count */
                                bytes_left,     /* bytes left to be sent */
-                                io_start,       /* start of IO */
                                error;          /* any reported error */
        struct completion       completion;     /* wait for i/o completion */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c13e826614b5..ee723aa153a3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -292,8 +292,11 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
 void
 pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-        struct inode *inode = lo->plh_inode;
+        struct inode *inode;
+        if (!lo)
+                return;
+        inode = lo->plh_inode;
        pnfs_layoutreturn_before_put_layout_hdr(lo);
        if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
@@ -1241,10 +1244,12 @@ retry:
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || !pnfs_layout_is_valid(lo) ||
-            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+                lo = NULL;
                goto out_noroc;
+        }
+        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
-                pnfs_get_layout_hdr(lo);
                spin_unlock(&ino->i_lock);
                wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                TASK_UNINTERRUPTIBLE);
@@ -1312,10 +1317,12 @@ out_noroc:
                struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
                if (ld->prepare_layoutreturn)
                        ld->prepare_layoutreturn(args);
+                pnfs_put_layout_hdr(lo);
                return true;
        }
        if (layoutreturn)
                pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+        pnfs_put_layout_hdr(lo);
        return false;
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29bacdc56f6a..5e470e233c83 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
                /* initial superblock/root creation */
                mount_info->fill_super(s, mount_info);
                nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
+                if (!(server->flags & NFS_MOUNT_UNSHARED))
+                        s->s_iflags |= SB_I_MULTIROOT;
        }
        mntroot = nfs_get_root(s, mount_info->mntfh, dev_name);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7428a669d7a7..e7d8ceae8f26 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1876,40 +1876,43 @@ int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
        return status;
 }
-int nfs_commit_inode(struct inode *inode, int how)
+/*
+ * Scan @inode for commit requests and send them, repeating while a scan
+ * returns INT_MAX entries.
+ *
+ * @how: flush flags; FLUSH_SYNC makes the function wait for the commit.
+ * @wbc: optional; for WB_SYNC_NONE its nr_to_write is reduced by the number
+ *       of scanned requests, clamped at zero.
+ *
+ * Returns the non-zero result of the scan or a negative result of the
+ * commit if one occurred; otherwise 0 when not waiting, or the result of
+ * wait_on_commit().
+ */
+static int __nfs_commit_inode(struct inode *inode, int how,
+                struct writeback_control *wbc)
 {
         LIST_HEAD(head);
         struct nfs_commit_info cinfo;
         int may_wait = how & FLUSH_SYNC;
-        int error = 0;
+        int ret, nscan;
-        int res;
         nfs_init_cinfo_from_inode(&cinfo, inode);
         nfs_commit_begin(cinfo.mds);
-        res = nfs_scan_commit(inode, &head, &cinfo);
+        for (;;) {
-        if (res)
+                ret = nscan = nfs_scan_commit(inode, &head, &cinfo);
-                error = nfs_generic_commit_list(inode, &head, how, &cinfo);
+                if (ret <= 0)
+                        break;
+                ret = nfs_generic_commit_list(inode, &head, how, &cinfo);
+                if (ret < 0)
+                        break;
+                /* Non-negative commit results are reported as 0 */
+                ret = 0;
+                if (wbc && wbc->sync_mode == WB_SYNC_NONE) {
+                        if (nscan < wbc->nr_to_write)
+                                wbc->nr_to_write -= nscan;
+                        else
+                                wbc->nr_to_write = 0;
+                }
+                /* Only a scan that returned INT_MAX triggers another pass */
+                if (nscan < INT_MAX)
+                        break;
+                cond_resched();
+        }
         nfs_commit_end(cinfo.mds);
-        if (res == 0)
+        if (ret || !may_wait)
-                return res;
+                return ret;
-        if (error < 0)
+        return wait_on_commit(cinfo.mds);
-                goto out_error;
+}
-        if (!may_wait)
-                goto out_mark_dirty;
+/* Exported entry point: __nfs_commit_inode() without a writeback_control */
+int nfs_commit_inode(struct inode *inode, int how)
-        error = wait_on_commit(cinfo.mds);
+{
-        if (error < 0)
+        return __nfs_commit_inode(inode, how, NULL);
-                return error;
-        return res;
-out_error:
-        res = error;
-        /* Note: If we exit without ensuring that the commit is complete,
-         * we must mark the inode as dirty. Otherwise, future calls to
-         * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
-         * that the data is on the disk.
-         */
-out_mark_dirty:
-        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-        return res;
 }
 EXPORT_SYMBOL_GPL(nfs_commit_inode);
@@ -1919,11 +1922,11 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
        int flags = FLUSH_SYNC;
        int ret = 0;
-        /* no commits means nothing needs to be done */
-        if (!atomic_long_read(&nfsi->commit_info.ncommit))
-                return ret;
        if (wbc->sync_mode == WB_SYNC_NONE) {
+                /* no commits means nothing needs to be done */
+                if (!atomic_long_read(&nfsi->commit_info.ncommit))
+                        goto check_requests_outstanding;
                /* Don't commit yet if this is a non-blocking flush and there
                 * are a lot of outstanding writes for this mapping.
                 */
@@ -1934,16 +1937,16 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
                flags = 0;
        }
-        ret = nfs_commit_inode(inode, flags);
+        ret = __nfs_commit_inode(inode, flags, wbc);
-        if (ret >= 0) {
+        if (!ret) {
-                if (wbc->sync_mode == WB_SYNC_NONE) {
+                if (flags & FLUSH_SYNC)
-                        if (ret < wbc->nr_to_write)
+                        return 0;
-                                wbc->nr_to_write -= ret;
+        } else if (atomic_long_read(&nfsi->commit_info.ncommit))
-                        else
+                goto out_mark_dirty;
-                                wbc->nr_to_write = 0;
-                }
+check_requests_outstanding:
-                return 0;
+        if (!atomic_read(&nfsi->commit_info.rpcs_out))
-        }
+                return ret;
 out_mark_dirty:
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
        return ret;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 150521c9671b..61b770e39809 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -268,6 +268,35 @@ free_blocked_lock(struct nfsd4_blocked_lock *nbl)
        kfree(nbl);
 }
+/*
+ * Tear down every blocked lock queued on @lo->lo_blocked.
+ *
+ * Entries are unlinked from lo_blocked and moved (through nbl_lru) onto a
+ * private reaplist while nn->blocked_locks_lock is held; after the lock is
+ * dropped each one is passed to posix_unblock_lock() and then freed.
+ */
+static void
+remove_blocked_locks(struct nfs4_lockowner *lo)
+{
+        struct nfs4_client *clp = lo->lo_owner.so_client;
+        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+        struct nfsd4_blocked_lock *nbl;
+        LIST_HEAD(reaplist);
+        /* Dequeue all blocked locks */
+        spin_lock(&nn->blocked_locks_lock);
+        while (!list_empty(&lo->lo_blocked)) {
+                nbl = list_first_entry(&lo->lo_blocked,
+                                        struct nfsd4_blocked_lock,
+                                        nbl_list);
+                list_del_init(&nbl->nbl_list);
+                list_move(&nbl->nbl_lru, &reaplist);
+        }
+        spin_unlock(&nn->blocked_locks_lock);
+        /* Now free them */
+        while (!list_empty(&reaplist)) {
+                nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock,
+                                        nbl_lru);
+                list_del_init(&nbl->nbl_lru);
+                posix_unblock_lock(&nbl->nbl_lock);
+                free_blocked_lock(nbl);
+        }
+}
 static int
 nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
 {
@@ -1866,6 +1895,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp)
 static void
 __destroy_client(struct nfs4_client *clp)
 {
+        int i;
        struct nfs4_openowner *oo;
        struct nfs4_delegation *dp;
        struct list_head reaplist;
@@ -1895,6 +1925,16 @@ __destroy_client(struct nfs4_client *clp)
                nfs4_get_stateowner(&oo->oo_owner);
                release_openowner(oo);
        }
+        for (i = 0; i < OWNER_HASH_SIZE; i++) {
+                struct nfs4_stateowner *so, *tmp;
+                list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i],
+                                         so_strhash) {
+                        /* Should be no openowners at this point */
+                        WARN_ON_ONCE(so->so_is_open_owner);
+                        remove_blocked_locks(lockowner(so));
+                }
+        }
        nfsd4_return_all_client_layouts(clp);
        nfsd4_shutdown_callback(clp);
        if (clp->cl_cb_conn.cb_xprt)
@@ -6355,6 +6395,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
        }
        spin_unlock(&clp->cl_lock);
        free_ol_stateid_reaplist(&reaplist);
+        remove_blocked_locks(lo);
        nfs4_put_stateowner(&lo->lo_owner);
        return status;
@@ -7140,6 +7181,8 @@ nfs4_state_destroy_net(struct net *net)
                }
        }
+        WARN_ON(!list_empty(&nn->blocked_locks_lru));
        for (i = 0; i < CLIENT_HASH_SIZE; i++) {
                while (!list_empty(&nn->unconf_id_hashtbl[i])) {
                        clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
@@ -7206,7 +7249,6 @@ nfs4_state_shutdown_net(struct net *net)
        struct nfs4_delegation *dp = NULL;
        struct list_head *pos, *next, reaplist;
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-        struct nfsd4_blocked_lock *nbl;
        cancel_delayed_work_sync(&nn->laundromat_work);
        locks_end_grace(&nn->nfsd4_manager);
@@ -7227,24 +7269,6 @@ nfs4_state_shutdown_net(struct net *net)
                nfs4_put_stid(&dp->dl_stid);
        }
-        BUG_ON(!list_empty(&reaplist));
-        spin_lock(&nn->blocked_locks_lock);
-        while (!list_empty(&nn->blocked_locks_lru)) {
-                nbl = list_first_entry(&nn->blocked_locks_lru,
-                                        struct nfsd4_blocked_lock, nbl_lru);
-                list_move(&nbl->nbl_lru, &reaplist);
-                list_del_init(&nbl->nbl_list);
-        }
-        spin_unlock(&nn->blocked_locks_lock);
-        while (!list_empty(&reaplist)) {
-                nbl = list_first_entry(&reaplist,
-                                        struct nfsd4_blocked_lock, nbl_lru);
-                list_del_init(&nbl->nbl_lru);
-                posix_unblock_lock(&nbl->nbl_lock);
-                free_blocked_lock(nbl);
-        }
        nfsd4_client_tracking_exit(net);
        nfs4_state_destroy_net(net);
 }
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 406e72de88f6..ce6ff5a0a6e4 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -24,6 +24,8 @@ config OVERLAY_FS_REDIRECT_DIR
          an overlay which has redirects on a kernel that doesn't support this
          feature will have unexpected results.
+          If unsure, say N.
 config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
        bool "Overlayfs: follow redirects even if redirects are turned off"
        default y
@@ -32,8 +34,13 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
          Disable this to get a possibly more secure configuration, but that
          might not be backward compatible with previous kernels.
+          If backward compatibility is not an issue, then it is safe and
+          recommended to say N here.
          For more information, see Documentation/filesystems/overlayfs.txt
+          If unsure, say Y.
 config OVERLAY_FS_INDEX
        bool "Overlayfs: turn on inodes index feature by default"
        depends on OVERLAY_FS
@@ -51,6 +58,8 @@ config OVERLAY_FS_INDEX
          That is, mounting an overlay which has an inodes index on a kernel
          that doesn't support this feature will have unexpected results.
+          If unsure, say N.
 config OVERLAY_FS_NFS_EXPORT
        bool "Overlayfs: turn on NFS export feature by default"
        depends on OVERLAY_FS
@@ -72,3 +81,8 @@ config OVERLAY_FS_NFS_EXPORT
          Note, that the NFS export feature is not backward compatible.
          That is, mounting an overlay which has a full index on a kernel
          that doesn't support this feature will have unexpected results.
+          Most users should say N here and enable this feature on a case-by-
+          case basis with the "nfs_export=on" mount option.
+          Say N unless you fully understand the consequences.
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index bb94ce9da5c8..87bd4148f4fb 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -19,6 +19,142 @@
 #include <linux/ratelimit.h>
 #include "overlayfs.h"
+/*
+ * Copy up @dentry for encoding unless it already has an upper dentry.
+ *
+ * ovl_copy_up() is bracketed by ovl_want_write()/ovl_drop_write(); a failure
+ * of either ovl_want_write() or ovl_copy_up() is logged with a rate-limited
+ * warning.  Returns 0 when nothing had to be done or the copy up succeeded,
+ * otherwise the error that was hit.
+ */
+static int ovl_encode_maybe_copy_up(struct dentry *dentry)
+{
+        int err;
+        if (ovl_dentry_upper(dentry))
+                return 0;
+        err = ovl_want_write(dentry);
+        if (!err) {
+                err = ovl_copy_up(dentry);
+                ovl_drop_write(dentry);
+        }
+        if (err) {
+                pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
+                                    dentry, err);
+        }
+        return err;
+}
+/*
+ * Before encoding a non-upper directory file handle from real layer N, we need
+ * to check if it will be possible to reconnect an overlay dentry from the real
+ * lower decoded dentry. This is done by following the overlay ancestry up to a
+ * "layer N connected" ancestor and verifying that all parents along the way are
+ * "layer N connectable". If an ancestor that is NOT "layer N connectable" is
+ * found, we need to copy up an ancestor, which is "layer N connectable", thus
+ * making that ancestor "layer N connected". For example:
+ *
+ * layer 1: /a
+ * layer 2: /a/b/c
+ *
+ * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
+ * copied up and renamed, upper dir /a will be indexed by lower dir /a from
+ * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
+ * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
+ * dentry from the connected lower dentry /a/b/c.
+ *
+ * To avoid this problem on decode time, we need to copy up an ancestor of
+ * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
+ * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
+ * and when the time comes to decode the file handle from lower dentry /a/b/c,
+ * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
+ * a connected overlay dentry will be accomplished.
+ *
+ * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
+ * entry /a in the lower layers above layer N and find the indexed dir /a from
+ * layer 1. If that improvement is made, then the check for "layer N connected"
+ * will need to verify there are no redirects in lower layers above N. In the
+ * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
+ * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
+ *
+ * layer 1: /A (redirect = /a)
+ * layer 2: /a/b/c
+ */
+/*
+ * Return the lowest layer for encoding a connectable file handle.
+ *
+ * The result is oe->numlower for the overlay root, 0 for a dentry that has
+ * an upper dentry but whose inode lacks the OVL_INDEX flag, and otherwise
+ * the layer index of the first lowerstack entry.
+ */
+static int ovl_connectable_layer(struct dentry *dentry)
+{
+        struct ovl_entry *oe = OVL_E(dentry);
+        /* We can get overlay root from root of any layer */
+        if (dentry == dentry->d_sb->s_root)
+                return oe->numlower;
+        /*
+         * If it's an unindexed merge dir, then it's not connectable with any
+         * lower layer
+         */
+        if (ovl_dentry_upper(dentry) &&
+            !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+                return 0;
+        /* We can get upper/overlay path from indexed/lower dentry */
+        return oe->lowerstack[0].layer->idx;
+}
+/*
+ * @dentry is "connected" if all ancestors up to root or a "connected" ancestor
+ * have the same uppermost lower layer as the origin's layer. We may need to
+ * copy up a "connectable" ancestor to make it "connected". A "connected" dentry
+ * cannot become non "connected", so cache positive result in dentry flags.
+ *
+ * Return the connected origin layer or < 0 on error.
+ */
+static int ovl_connect_layer(struct dentry *dentry)
+{
+        struct dentry *next, *parent = NULL;
+        int origin_layer;
+        int err = 0;
+        /* Not expected for the overlay root or a dentry without a lower */
+        if (WARN_ON(dentry == dentry->d_sb->s_root) ||
+            WARN_ON(!ovl_dentry_lower(dentry)))
+                return -EIO;
+        origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
+        /* Positive result cached by an earlier call */
+        if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
+                return origin_layer;
+        /* Find the topmost origin layer connectable ancestor of @dentry */
+        next = dget(dentry);
+        for (;;) {
+                parent = dget_parent(next);
+                if (WARN_ON(parent == next)) {
+                        err = -EIO;
+                        break;
+                }
+                /*
+                 * If @parent is not origin layer connectable, then copy up
+                 * @next which is origin layer connectable and we are done.
+                 */
+                if (ovl_connectable_layer(parent) < origin_layer) {
+                        err = ovl_encode_maybe_copy_up(next);
+                        break;
+                }
+                /* If @parent is connected or indexed we are done */
+                if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
+                    ovl_test_flag(OVL_INDEX, d_inode(parent)))
+                        break;
+                /* Step one level up; @parent's reference carries over to @next */
+                dput(next);
+                next = parent;
+        }
+        /* Drop the references still held from dget()/dget_parent() */
+        dput(parent);
+        dput(next);
+        if (!err)
+                ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);
+        /* err when it is non-zero, otherwise origin_layer */
+        return err ?: origin_layer;
+}
 /*
 * We only need to encode origin if there is a chance that the same object was
 * encoded pre copy up and then we need to stay consistent with the same
@@ -41,73 +177,59 @@
 * L = lower file handle
 *
 * (*) Connecting an overlay dir from real lower dentry is not always
- * possible when there are redirects in lower layers. To mitigate this case,
+ * possible when there are redirects in lower layers and non-indexed merge dirs.
- * we copy up the lower dir first and then encode an upper dir file handle.
+ * To mitigate those cases, we may copy up the lower dir ancestor before
+ * encoding a lower dir file handle.
+ *
+ * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
 */
-static bool ovl_should_encode_origin(struct dentry *dentry)
+static int ovl_check_encode_origin(struct dentry *dentry)
 {
        struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+        /* Upper file handle for pure upper */
        if (!ovl_dentry_lower(dentry))
-                return false;
+                return 0;
        /*
-         * Decoding a merge dir, whose origin's parent is under a redirected
+         * Upper file handle for non-indexed upper.
-         * lower dir is not always possible. As a simple aproximation, we do
-         * not encode lower dir file handles when overlay has multiple lower
-         * layers and origin is below the topmost lower layer.
         *
-         * TODO: copy up only the parent that is under redirected lower.
+         * Root is never indexed, so if there's an upper layer, encode upper for
+         * root.
         */
-        if (d_is_dir(dentry) && ofs->upper_mnt &&
-            OVL_E(dentry)->lowerstack[0].layer->idx > 1)
-                return false;
-        /* Decoding a non-indexed upper from origin is not implemented */
        if (ovl_dentry_upper(dentry) &&
            !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
-                return false;
-        return true;
-}
-static int ovl_encode_maybe_copy_up(struct dentry *dentry)
-{
-        int err;
-        if (ovl_dentry_upper(dentry))
                return 0;
-        err = ovl_want_write(dentry);
+        /*
-        if (err)
+         * Decoding a merge dir, whose origin's ancestor is under a redirected
-                return err;
+         * lower dir or under a non-indexed upper is not always possible.
+         * ovl_connect_layer() will try to make origin's layer "connected" by
-        err = ovl_copy_up(dentry);
+         * copying up a "connectable" ancestor.
+         */
+        if (d_is_dir(dentry) && ofs->upper_mnt)
+                return ovl_connect_layer(dentry);
-        ovl_drop_write(dentry);
+        /* Lower file handle for indexed and non-upper dir/non-dir */
-        return err;
+        return 1;
 }
 static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
 {
-        struct dentry *origin = ovl_dentry_lower(dentry);
        struct ovl_fh *fh = NULL;
-        int err;
+        int err, enc_lower;
        /*
-         * If we should not encode a lower dir file handle, copy up and encode
+         * Check if we should encode a lower or upper file handle and maybe
-         * an upper dir file handle.
+         * copy up an ancestor to make lower file handle connectable.
         */
-        if (!ovl_should_encode_origin(dentry)) {
+        err = enc_lower = ovl_check_encode_origin(dentry);
-                err = ovl_encode_maybe_copy_up(dentry);
+        if (enc_lower < 0)
-                if (err)
+                goto fail;
-                        goto fail;
-                origin = NULL;
-        }
-        /* Encode an upper or origin file handle */
+        /* Encode an upper or lower file handle */
-        fh = ovl_encode_fh(origin ?: ovl_dentry_upper(dentry), !origin);
+        fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) :
+                                       ovl_dentry_upper(dentry), !enc_lower);
        err = PTR_ERR(fh);
        if (IS_ERR(fh))
                goto fail;
@@ -355,8 +477,8 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
                dput(upper);
        }
-        if (!this)
+        if (IS_ERR_OR_NULL(this))
-                return NULL;
+                return this;
        if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
                dput(this);
@@ -498,7 +620,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb,
                        if (err == -ECHILD) {
                                this = ovl_lookup_real_ancestor(sb, real,
                                                                layer);
-                                err = IS_ERR(this) ? PTR_ERR(this) : 0;
+                                err = PTR_ERR_OR_ZERO(this);
                        }
                        if (!err) {
                                dput(connected);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index fcd97b783fa1..3b1bd469accd 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -669,38 +669,59 @@ struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
        return inode;
 }
+/*
+ * Does overlay inode need to be hashed by lower inode?
+ *
+ * @upper, @lower and @index are tested for NULL to tell whether the object
+ * has an upper dentry, a lower dentry and an index entry respectively.
+ * The checks run in order and the first one that applies decides.
+ * ovl_get_inode() uses the answer to pick the lower or the upper inode as the
+ * key passed to iget5_locked().
+ */
+static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
+                             struct dentry *lower, struct dentry *index)
+{
+        struct ovl_fs *ofs = sb->s_fs_info;
+        /* No, if pure upper */
+        if (!lower)
+                return false;
+        /* Yes, if already indexed */
+        if (index)
+                return true;
+        /* Yes, if won't be copied up */
+        if (!ofs->upper_mnt)
+                return true;
+        /*
+         * No, if lower hardlink is or will be broken on copy up: @lower is a
+         * non-dir with more than one link and either an upper already exists
+         * (without an index, see above) or ovl_indexdir() reports no index dir.
+         */
+        if ((upper || !ovl_indexdir(sb)) &&
+            !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
+                return false;
+        /* No, if non-indexed upper with NFS export */
+        if (sb->s_export_op && upper)
+                return false;
+        /* Otherwise, hash by lower inode for fsnotify */
+        return true;
+}
 struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
                            struct dentry *lowerdentry, struct dentry *index,
                            unsigned int numlower)
 {
-        struct ovl_fs *ofs = sb->s_fs_info;
        struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
        struct inode *inode;
-        /* Already indexed or could be indexed on copy up? */
+        bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index);
-        bool indexed = (index || (ovl_indexdir(sb) && !upperdentry));
-        struct dentry *origin = indexed ? lowerdentry : NULL;
        bool is_dir;
-        if (WARN_ON(upperdentry && indexed && !lowerdentry))
-                return ERR_PTR(-EIO);
        if (!realinode)
                realinode = d_inode(lowerdentry);
        /*
-         * Copy up origin (lower) may exist for non-indexed non-dir upper, but
+         * Copy up origin (lower) may exist for non-indexed upper, but we must
-         * we must not use lower as hash key in that case.
+         * not use lower as hash key if this is a broken hardlink.
-         * Hash non-dir that is or could be indexed by origin inode.
-         * Hash dir that is or could be merged by origin inode.
-         * Hash pure upper and non-indexed non-dir by upper inode.
-         * Hash non-indexed dir by upper inode for NFS export.
         */
        is_dir = S_ISDIR(realinode->i_mode);
-        if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt))
+        if (upperdentry || bylower) {
-                origin = lowerdentry;
+                struct inode *key = d_inode(bylower ? lowerdentry :
+                                                      upperdentry);
-        if (upperdentry || origin) {
-                struct inode *key = d_inode(origin ?: upperdentry);
                unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
                inode = iget5_locked(sb, (unsigned long) key,
@@ -728,6 +749,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
                        nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
                set_nlink(inode, nlink);
        } else {
+                /* Lower hardlink that will be broken on copy up */
                inode = new_inode(sb);
                if (!inode)
                        goto out_nomem;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index de3e6da1d5a5..70fcfcc684cc 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -913,9 +913,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                stack[ctr].layer = lower.layer;
                ctr++;
-                if (d.stop)
-                        break;
                /*
                 * Following redirects can have security consequences: it's like
                 * a symlink into the lower layer without the permission checks.
@@ -933,6 +930,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                        goto out_put;
                }
+                if (d.stop)
+                        break;
                if (d.redirect && d.redirect[0] == '/' && poe != roe) {
                        poe = roe;
                        /* Find the current layer on the root dentry */
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0df25a9c94bd..225ff1171147 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -40,6 +40,7 @@ enum ovl_inode_flag {
 enum ovl_entry_flag {
        OVL_E_UPPER_ALIAS,
        OVL_E_OPAQUE,
+        OVL_E_CONNECTED,        /* set by ovl_connect_layer(), always on root */
 };
 /*
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 9ee37c76091d..7c24619ae7fc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1359,6 +1359,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        /* Root is always merge -> can have whiteouts */
        ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
+        ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry);
        ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
                       ovl_dentry_lower(root_dentry));
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 8664db25a9a6..215c225b2ca1 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -106,6 +106,7 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
 {
        return sysfs_do_create_link(kobj, target, name, 0);
 }
+EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn);
 /**
 *      sysfs_delete_link - remove symlink in object's directory.
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 66e1edbfb2b2..046469fcc1b8 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -955,15 +955,29 @@ static inline bool imap_needs_alloc(struct inode *inode,
                (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
 }
+/*
+ * Tell whether zeroing the range mapped by @imap has to go through the COW
+ * path: only when a mapping was returned (@nimaps != 0) and it is neither a
+ * hole nor an unwritten extent. xfs_file_iomap_begin() consults this for
+ * IOMAP_ZERO on reflink inodes.
+ */
+static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps)
+{
+        return nimaps &&
+                imap->br_startblock != HOLESTARTBLOCK &&
+                imap->br_state != XFS_EXT_UNWRITTEN;
+}
 static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
 {
        /*
-         * COW writes will allocate delalloc space, so we need to make sure
+         * COW writes may allocate delalloc space or convert unwritten COW
-         * to take the lock exclusively here.
+         * extents, so we need to make sure to take the lock exclusively here.
         */
        if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
                return true;
-        if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE))
+        /*
+         * Extents not yet cached require exclusive access, don't block.
+         * This is an opencoded xfs_ilock_data_map_shared() to cater for the
+         * non-blocking behaviour.
+         */
+        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+            !(ip->i_df.if_flags & XFS_IFEXTENTS))
                return true;
        return false;
 }
@@ -993,16 +1007,18 @@ xfs_file_iomap_begin(
                return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
        }
-        if (need_excl_ilock(ip, flags)) {
+        if (need_excl_ilock(ip, flags))
                lockmode = XFS_ILOCK_EXCL;
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+        else
-        } else {
+                lockmode = XFS_ILOCK_SHARED;
-                lockmode = xfs_ilock_data_map_shared(ip);
-        }
-        if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+        if (flags & IOMAP_NOWAIT) {
-                error = -EAGAIN;
+                if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
-                goto out_unlock;
+                        return -EAGAIN;
+                if (!xfs_ilock_nowait(ip, lockmode))
+                        return -EAGAIN;
+        } else {
+                xfs_ilock(ip, lockmode);
        }
        ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -1024,7 +1040,9 @@ xfs_file_iomap_begin(
                        goto out_unlock;
        }
-        if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+        if (xfs_is_reflink_inode(ip) &&
+            ((flags & IOMAP_WRITE) ||
+             ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) {
                if (flags & IOMAP_DIRECT) {
                        /*
                         * A reflinked inode will result in CoW alloc.