Diffstat (limited to 'fs')

-rw-r--r--  fs/afs/rxrpc.c                             |  11
-rw-r--r--  fs/dax.c                                   |  60
-rw-r--r--  fs/exec.c                                  |   5
-rw-r--r--  fs/iomap.c                                 |  53
-rw-r--r--  fs/nfs/callback_proc.c                     |  22
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c     |  21
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h     |   4
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c  |  19
-rw-r--r--  fs/nfs/nfs42proc.c                         |  19
-rw-r--r--  fs/nfs/nfs4_fs.h                           |   2
-rw-r--r--  fs/nfs/nfs4state.c                         |  16
-rw-r--r--  fs/nilfs2/btnode.c                         |   4
-rw-r--r--  fs/read_write.c                            |  15
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c                   |   5
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c           |  11
-rw-r--r--  fs/xfs/xfs_bmap_util.c                     |  10
-rw-r--r--  fs/xfs/xfs_bmap_util.h                     |   3
-rw-r--r--  fs/xfs/xfs_buf_item.c                      |  28
-rw-r--r--  fs/xfs/xfs_file.c                          |   2
-rw-r--r--  fs/xfs/xfs_reflink.c                       |  18
-rw-r--r--  fs/xfs/xfs_trace.h                         |   5

21 files changed, 219 insertions, 114 deletions
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 59970886690f..a7b44863d502 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -576,6 +576,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
 {
 	signed long rtt2, timeout;
 	long ret;
+	bool stalled = false;
 	u64 rtt;
 	u32 life, last_life;
 
@@ -609,12 +610,20 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
 
 		life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
 		if (timeout == 0 &&
-		    life == last_life && signal_pending(current))
+		    life == last_life && signal_pending(current)) {
+			if (stalled)
 				break;
+			__set_current_state(TASK_RUNNING);
+			rxrpc_kernel_probe_life(call->net->socket, call->rxcall);
+			timeout = rtt2;
+			stalled = true;
+			continue;
+		}
 
 		if (life != last_life) {
 			timeout = rtt2;
 			last_life = life;
+			stalled = false;
 		}
 
 		timeout = schedule_timeout(timeout);
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -98,12 +98,6 @@ static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
 }
 
-static void *dax_make_page_entry(struct page *page)
-{
-	pfn_t pfn = page_to_pfn_t(page);
-	return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0);
-}
-
 static bool dax_is_locked(void *entry)
 {
 	return xa_to_value(entry) & DAX_LOCKED;
@@ -116,12 +110,12 @@ static unsigned int dax_entry_order(void *entry)
 	return 0;
 }
 
-static int dax_is_pmd_entry(void *entry)
+static unsigned long dax_is_pmd_entry(void *entry)
 {
 	return xa_to_value(entry) & DAX_PMD;
 }
 
-static int dax_is_pte_entry(void *entry)
+static bool dax_is_pte_entry(void *entry)
 {
 	return !(xa_to_value(entry) & DAX_PMD);
 }
@@ -222,9 +216,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
 	ewait.wait.func = wake_exceptional_entry_func;
 
 	for (;;) {
-		entry = xas_load(xas);
-		if (!entry || xa_is_internal(entry) ||
-				WARN_ON_ONCE(!xa_is_value(entry)) ||
+		entry = xas_find_conflict(xas);
+		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
 				!dax_is_locked(entry))
 			return entry;
 
@@ -255,6 +248,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry)
 {
 	void *old;
 
+	BUG_ON(dax_is_locked(entry));
 	xas_reset(xas);
 	xas_lock_irq(xas);
 	old = xas_store(xas, entry);
@@ -352,16 +346,27 @@ static struct page *dax_busy_page(void *entry)
 	return NULL;
 }
 
+/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * @page: The page whose entry we want to lock
+ *
+ * Context: Process context.
+ * Return: %true if the entry was locked or does not need to be locked.
+ */
 bool dax_lock_mapping_entry(struct page *page)
 {
 	XA_STATE(xas, NULL, 0);
 	void *entry;
+	bool locked;
 
+	/* Ensure page->mapping isn't freed while we look at it */
+	rcu_read_lock();
 	for (;;) {
 		struct address_space *mapping = READ_ONCE(page->mapping);
 
+		locked = false;
 		if (!dax_mapping(mapping))
-			return false;
+			break;
 
 		/*
 		 * In the device-dax case there's no need to lock, a
@@ -370,8 +375,9 @@ bool dax_lock_mapping_entry(struct page *page)
 		 * otherwise we would not have a valid pfn_to_page()
 		 * translation.
 		 */
+		locked = true;
 		if (S_ISCHR(mapping->host->i_mode))
-			return true;
+			break;
 
 		xas.xa = &mapping->i_pages;
 		xas_lock_irq(&xas);
@@ -382,28 +388,35 @@ bool dax_lock_mapping_entry(struct page *page)
 		xas_set(&xas, page->index);
 		entry = xas_load(&xas);
 		if (dax_is_locked(entry)) {
+			rcu_read_unlock();
 			entry = get_unlocked_entry(&xas);
-			/* Did the page move while we slept? */
-			if (dax_to_pfn(entry) != page_to_pfn(page)) {
-				xas_unlock_irq(&xas);
+			xas_unlock_irq(&xas);
+			put_unlocked_entry(&xas, entry);
+			rcu_read_lock();
 			continue;
-			}
 		}
 		dax_lock_entry(&xas, entry);
 		xas_unlock_irq(&xas);
-		return true;
+		break;
 	}
+	rcu_read_unlock();
+	return locked;
 }
 
 void dax_unlock_mapping_entry(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	XA_STATE(xas, &mapping->i_pages, page->index);
+	void *entry;
 
 	if (S_ISCHR(mapping->host->i_mode))
 		return;
 
-	dax_unlock_entry(&xas, dax_make_page_entry(page));
+	rcu_read_lock();
+	entry = xas_load(&xas);
+	rcu_read_unlock();
+	entry = dax_make_entry(page_to_pfn_t(page), dax_is_pmd_entry(entry));
+	dax_unlock_entry(&xas, entry);
 }
 
 /*
@@ -445,11 +458,9 @@ static void *grab_mapping_entry(struct xa_state *xas,
 retry:
 	xas_lock_irq(xas);
 	entry = get_unlocked_entry(xas);
-	if (xa_is_internal(entry))
-		goto fallback;
 
 	if (entry) {
-		if (WARN_ON_ONCE(!xa_is_value(entry))) {
+		if (!xa_is_value(entry)) {
 			xas_set_err(xas, EIO);
 			goto out_unlock;
 		}
@@ -1628,8 +1639,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	/* Did we race with someone splitting entry or so? */
 	if (!entry ||
 	    (order == 0 && !dax_is_pte_entry(entry)) ||
-	    (order == PMD_ORDER && (xa_is_internal(entry) ||
-				    !dax_is_pmd_entry(entry)))) {
+	    (order == PMD_ORDER && !dax_is_pmd_entry(entry))) {
 		put_unlocked_entry(&xas, entry);
 		xas_unlock_irq(&xas);
 		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -62,6 +62,7 @@
 #include <linux/oom.h>
 #include <linux/compat.h>
 #include <linux/vmalloc.h>
+#include <linux/freezer.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1083,7 +1084,7 @@ static int de_thread(struct task_struct *tsk)
 	while (sig->notify_count) {
 		__set_current_state(TASK_KILLABLE);
 		spin_unlock_irq(lock);
-		schedule();
+		freezable_schedule();
 		if (unlikely(__fatal_signal_pending(tsk)))
 			goto killed;
 		spin_lock_irq(lock);
@@ -1111,7 +1112,7 @@ static int de_thread(struct task_struct *tsk)
 		__set_current_state(TASK_KILLABLE);
 		write_unlock_irq(&tasklist_lock);
 		cgroup_threadgroup_change_end(tsk);
-		schedule();
+		freezable_schedule();
 		if (unlikely(__fatal_signal_pending(tsk)))
 			goto killed;
 	}
diff --git a/fs/iomap.c b/fs/iomap.c
index 64ce240217a1..3ffb776fbebe 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -142,13 +142,14 @@ static void
 iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
 {
+	loff_t orig_pos = *pos;
+	loff_t isize = i_size_read(inode);
 	unsigned block_bits = inode->i_blkbits;
 	unsigned block_size = (1 << block_bits);
 	unsigned poff = offset_in_page(*pos);
 	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
 	unsigned first = poff >> block_bits;
 	unsigned last = (poff + plen - 1) >> block_bits;
-	unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
 
 	/*
 	 * If the block size is smaller than the page size we need to check the
@@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 	 * handle both halves separately so that we properly zero data in the
 	 * page cache for blocks that are entirely outside of i_size.
 	 */
-	if (first <= end && last > end)
-		plen -= (last - end) * block_size;
+	if (orig_pos <= isize && orig_pos + length > isize) {
+		unsigned end = offset_in_page(isize - 1) >> block_bits;
+
+		if (first <= end && last > end)
+			plen -= (last - end) * block_size;
+	}
 
 	*offp = poff;
 	*lenp = plen;
@@ -1580,7 +1585,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct bio *bio;
 	bool need_zeroout = false;
 	bool use_fua = false;
-	int nr_pages, ret;
+	int nr_pages, ret = 0;
 	size_t copied = 0;
 
 	if ((pos | length | align) & ((1 << blkbits) - 1))
@@ -1596,12 +1601,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 	if (iomap->flags & IOMAP_F_NEW) {
 		need_zeroout = true;
-	} else {
+	} else if (iomap->type == IOMAP_MAPPED) {
 		/*
-		 * Use a FUA write if we need datasync semantics, this
-		 * is a pure data IO that doesn't require any metadata
-		 * updates and the underlying device supports FUA. This
-		 * allows us to avoid cache flushes on IO completion.
+		 * Use a FUA write if we need datasync semantics, this is a pure
+		 * data IO that doesn't require any metadata updates (including
+		 * after IO completion such as unwritten extent conversion) and
+		 * the underlying device supports FUA. This allows us to avoid
+		 * cache flushes on IO completion.
 		 */
 		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
 		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
@@ -1644,8 +1650,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		ret = bio_iov_iter_get_pages(bio, &iter);
 		if (unlikely(ret)) {
+			/*
+			 * We have to stop part way through an IO. We must fall
+			 * through to the sub-block tail zeroing here, otherwise
+			 * this short IO may expose stale data in the tail of
+			 * the block we haven't written data to.
+			 */
 			bio_put(bio);
-			return copied ? copied : ret;
+			goto zero_tail;
 		}
 
 		n = bio->bi_iter.bi_size;
@@ -1676,13 +1688,21 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		dio->submit.cookie = submit_bio(bio);
 	} while (nr_pages);
 
-	if (need_zeroout) {
+	/*
+	 * We need to zeroout the tail of a sub-block write if the extent type
+	 * requires zeroing or the write extends beyond EOF. If we don't zero
+	 * the block tail in the latter case, we can expose stale data via mmap
+	 * reads of the EOF block.
+	 */
+zero_tail:
+	if (need_zeroout ||
+	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
 		/* zero out from the end of the write to the end of the block */
 		pad = pos & (fs_block_size - 1);
 		if (pad)
 			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
 	}
-	return copied;
+	return copied ? copied : ret;
 }
 
 static loff_t
@@ -1857,6 +1877,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 				dio->wait_for_completion = true;
 				ret = 0;
 			}
+
+			/*
+			 * Splicing to pipes can fail on a full pipe. We have to
+			 * swallow this to make it look like a short IO
+			 * otherwise the higher splice layers will completely
+			 * mishandle the error and stop moving data.
+			 */
+			if (ret == -EFAULT)
+				ret = 0;
 			break;
 		}
 		pos += ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 7b861bbc0b43..315967354954 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -686,20 +686,24 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
 {
 	struct cb_offloadargs *args = data;
 	struct nfs_server *server;
-	struct nfs4_copy_state *copy;
+	struct nfs4_copy_state *copy, *tmp_copy;
 	bool found = false;
 
+	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+	if (!copy)
+		return htonl(NFS4ERR_SERVERFAULT);
+
 	spin_lock(&cps->clp->cl_lock);
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &cps->clp->cl_superblocks,
 				client_link) {
-		list_for_each_entry(copy, &server->ss_copies, copies) {
+		list_for_each_entry(tmp_copy, &server->ss_copies, copies) {
 			if (memcmp(args->coa_stateid.other,
-					copy->stateid.other,
+					tmp_copy->stateid.other,
 					sizeof(args->coa_stateid.other)))
 				continue;
-			nfs4_copy_cb_args(copy, args);
-			complete(&copy->completion);
+			nfs4_copy_cb_args(tmp_copy, args);
+			complete(&tmp_copy->completion);
 			found = true;
 			goto out;
 		}
@@ -707,15 +711,11 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
 out:
 	rcu_read_unlock();
 	if (!found) {
-		copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
-		if (!copy) {
-			spin_unlock(&cps->clp->cl_lock);
-			return htonl(NFS4ERR_SERVERFAULT);
-		}
 		memcpy(&copy->stateid, &args->coa_stateid, NFS4_STATEID_SIZE);
 		nfs4_copy_cb_args(copy, args);
 		list_add_tail(&copy->copies, &cps->clp->pending_cb_stateids);
-	}
+	} else
+		kfree(copy);
 	spin_unlock(&cps->clp->cl_lock);
 
 	return 0;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 86bcba40ca61..74b36ed883ca 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1361,12 +1361,7 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 			task))
 		return;
 
-	if (ff_layout_read_prepare_common(task, hdr))
-		return;
-
-	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
-			hdr->args.lock_context, FMODE_READ) == -EIO)
-		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+	ff_layout_read_prepare_common(task, hdr);
 }
 
 static void ff_layout_read_call_done(struct rpc_task *task, void *data)
@@ -1542,12 +1537,7 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
 			task))
 		return;
 
-	if (ff_layout_write_prepare_common(task, hdr))
-		return;
-
-	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
-			hdr->args.lock_context, FMODE_WRITE) == -EIO)
-		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+	ff_layout_write_prepare_common(task, hdr);
 }
 
 static void ff_layout_write_call_done(struct rpc_task *task, void *data)
@@ -1742,6 +1732,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
 	if (fh)
 		hdr->args.fh = fh;
+
+	if (!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
+		goto out_failed;
+
 	/*
 	 * Note that if we ever decide to split across DSes,
 	 * then we may need to handle dense-like offsets.
@@ -1804,6 +1798,9 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 	if (fh)
 		hdr->args.fh = fh;
 
+	if (!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
+		goto out_failed;
+
 	/*
 	 * Note that if we ever decide to split across DSes,
 	 * then we may need to handle dense-like offsets.
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 411798346e48..de50a342d5a5 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -215,6 +215,10 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
 		unsigned int maxnum);
 struct nfs_fh *
 nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
+int
+nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
+		u32 mirror_idx,
+		nfs4_stateid *stateid);
 
 struct nfs4_pnfs_ds *
 nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 74d8d5352438..d23347389626 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -370,6 +370,25 @@ out:
 	return fh;
 }
 
+int
+nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
+		u32 mirror_idx,
+		nfs4_stateid *stateid)
+{
+	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+
+	if (!ff_layout_mirror_valid(lseg, mirror, false)) {
+		pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
+			__func__, mirror_idx);
+		goto out;
+	}
+
+	nfs4_stateid_copy(stateid, &mirror->stateid);
+	return 1;
+out:
+	return 0;
+}
+
 /**
  * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
  * @lseg: the layout segment we're operating on
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index ac5b784a1de0..fed06fd9998d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -137,31 +137,32 @@ static int handle_async_copy(struct nfs42_copy_res *res,
 			     struct file *dst,
 			     nfs4_stateid *src_stateid)
 {
-	struct nfs4_copy_state *copy;
+	struct nfs4_copy_state *copy, *tmp_copy;
 	int status = NFS4_OK;
 	bool found_pending = false;
 	struct nfs_open_context *ctx = nfs_file_open_context(dst);
 
+	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+	if (!copy)
+		return -ENOMEM;
+
 	spin_lock(&server->nfs_client->cl_lock);
-	list_for_each_entry(copy, &server->nfs_client->pending_cb_stateids,
+	list_for_each_entry(tmp_copy, &server->nfs_client->pending_cb_stateids,
 				copies) {
-		if (memcmp(&res->write_res.stateid, &copy->stateid,
+		if (memcmp(&res->write_res.stateid, &tmp_copy->stateid,
 				NFS4_STATEID_SIZE))
 			continue;
 		found_pending = true;
-		list_del(&copy->copies);
+		list_del(&tmp_copy->copies);
 		break;
 	}
 	if (found_pending) {
 		spin_unlock(&server->nfs_client->cl_lock);
+		kfree(copy);
+		copy = tmp_copy;
 		goto out;
 	}
 
-	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
-	if (!copy) {
-		spin_unlock(&server->nfs_client->cl_lock);
-		return -ENOMEM;
-	}
 	memcpy(&copy->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE);
 	init_completion(&copy->completion);
 	copy->parent_state = ctx->state;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 8d59c9655ec4..1b994b527518 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -41,6 +41,8 @@ enum nfs4_client_state {
 	NFS4CLNT_MOVED,
 	NFS4CLNT_LEASE_MOVED,
 	NFS4CLNT_DELEGATION_EXPIRED,
+	NFS4CLNT_RUN_MANAGER,
+	NFS4CLNT_DELEGRETURN_RUNNING,
 };
 
 #define NFS4_RENEW_TIMEOUT 0x01
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ffea57885394..d8decf2ec48f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1210,6 +1210,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 	struct task_struct *task;
 	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
 
+	set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
 	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
 		return;
 	__module_get(THIS_MODULE);
@@ -2503,6 +2504,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 
 	/* Ensure exclusive access to NFSv4 state */
 	do {
+		clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
 		if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
 			section = "purge state";
 			status = nfs4_purge_lease(clp);
@@ -2593,14 +2595,18 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		}
 
 		nfs4_end_drain_session(clp);
-		if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
-			nfs_client_return_marked_delegations(clp);
-			continue;
+		nfs4_clear_state_manager_bit(clp);
+
+		if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) {
+			if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+				nfs_client_return_marked_delegations(clp);
+				set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+			}
+			clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state);
 		}
 
-		nfs4_clear_state_manager_bit(clp);
 		/* Did we race with an attempt to give us more work? */
-		if (clp->cl_state == 0)
+		if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
 			return;
 		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
 			return;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index de99db518571..f2129a5d9f23 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -266,9 +266,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
 		return;
 
 	if (nbh == NULL) {	/* blocksize == pagesize */
-		xa_lock_irq(&btnc->i_pages);
-		__xa_erase(&btnc->i_pages, newkey);
-		xa_unlock_irq(&btnc->i_pages);
+		xa_erase_irq(&btnc->i_pages, newkey);
 		unlock_page(ctxt->bh->b_page);
 	} else
 		brelse(nbh);
diff --git a/fs/read_write.c b/fs/read_write.c
index bfcb4ced5664..4dae0399c75a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 	off = same->src_offset;
 	len = same->src_length;
 
-	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode))
-		goto out;
+		return -EISDIR;
 
-	ret = -EINVAL;
 	if (!S_ISREG(src->i_mode))
-		goto out;
+		return -EINVAL;
+
+	if (!file->f_op->remap_file_range)
+		return -EOPNOTSUPP;
 
 	ret = remap_verify_area(file, off, len, false);
 	if (ret < 0)
-		goto out;
+		return ret;
 	ret = 0;
 
 	if (off + len > i_size_read(src))
@@ -2147,10 +2148,8 @@ next_fdput:
 		fdput(dst_fd);
 next_loop:
 		if (fatal_signal_pending(current))
-			goto out;
+			break;
 	}
-
-out:
 	return ret;
 }
 EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 74d7228e755b..19e921d1586f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real(
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
-		 * The right neighbor is contiguous, the left is not.
+		 * The right neighbor is contiguous, the left is not. Take care
+		 * with delay -> unwritten extent allocation here because the
+		 * delalloc record we are overwriting is always written.
 		 */
 		PREV.br_startblock = new->br_startblock;
 		PREV.br_blockcount += RIGHT.br_blockcount;
+		PREV.br_state = new->br_state;
 
 		xfs_iext_next(ifp, &bma->icur);
 		xfs_iext_remove(bma->ip, &bma->icur, state);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 86c50208a143..7fbf8af0b159 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -538,15 +538,18 @@ xfs_inobt_rec_check_count(
 
 static xfs_extlen_t
 xfs_inobt_max_size(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
 {
+	xfs_agblock_t		agblocks = xfs_ag_block_count(mp, agno);
+
 	/* Bail out if we're uninitialized, which can happen in mkfs. */
 	if (mp->m_inobt_mxr[0] == 0)
 		return 0;
 
 	return xfs_btree_calc_size(mp->m_inobt_mnr,
-			(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
+			(uint64_t)agblocks * mp->m_sb.sb_inopblock /
 				XFS_INODES_PER_CHUNK);
 }
 
 static int
@@ -594,7 +597,7 @@ xfs_finobt_calc_reserves(
 	if (error)
 		return error;
 
-	*ask += xfs_inobt_max_size(mp);
+	*ask += xfs_inobt_max_size(mp, agno);
 	*used += tree_len;
 	return 0;
 }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5d263dfdb3bc..404e581f1ea1 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1042,7 +1042,7 @@ out_trans_cancel:
 	goto out_unlock;
 }
 
-static int
+int
 xfs_flush_unmap_range(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
@@ -1195,13 +1195,7 @@ xfs_prepare_shift(
 	 * Writeback and invalidate cache for the remainder of the file as we're
 	 * about to shift down every extent from offset to EOF.
 	 */
-	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
-	if (error)
-		return error;
-	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-					offset >> PAGE_SHIFT, -1);
-	if (error)
-		return error;
+	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
 
 	/*
 	 * Clean out anything hanging around in the cow fork now that
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 87363d136bb6..7a78229cf1a7 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
 			  int whichfork, xfs_extnum_t *nextents,
 			  xfs_filblks_t *count);
 
+int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
+			  xfs_off_t len);
+
 #endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 12d8455bfbb2..010db5f8fb00 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1233,9 +1233,23 @@ xfs_buf_iodone(
 }
 
 /*
- * Requeue a failed buffer for writeback
+ * Requeue a failed buffer for writeback.
  *
- * Return true if the buffer has been re-queued properly, false otherwise
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ *
+ * Return true if the buffer was added to the buffer list, false if it was
+ * already on the buffer list.
 */
 bool
 xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers(
 	struct list_head	*buffer_list)
 {
 	struct xfs_log_item	*lip;
+	bool			ret;
+
+	ret = xfs_buf_delwri_queue(bp, buffer_list);
 
 	/*
-	 * Clear XFS_LI_FAILED flag from all items before resubmit
-	 *
-	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller this
+	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
 	 * function already have it acquired
 	 */
 	list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
 		xfs_clear_li_failed(lip);
 
-	/* Add this buffer back to the delayed write list */
-	return xfs_buf_delwri_queue(bp, buffer_list);
+	return ret;
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 53c9ab8fb777..e47425071e65 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -920,7 +920,7 @@ out_unlock:
 }
 
 
-loff_t
+STATIC loff_t
 xfs_file_remap_range(
 	struct file		*file_in,
 	loff_t			pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ecdb086bc23e..322a852ce284 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -296,6 +296,7 @@ xfs_reflink_reserve_cow(
 	if (error)
 		return error;
 
+	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
 	trace_xfs_reflink_cow_alloc(ip, &got);
 	return 0;
 }
@@ -1351,10 +1352,19 @@ xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
-	/* Zap any page cache for the destination file's range. */
-	truncate_inode_pages_range(&inode_out->i_data,
-			round_down(pos_out, PAGE_SIZE),
-			round_up(pos_out + *len, PAGE_SIZE) - 1);
+	/*
+	 * If pos_out > EOF, we may have dirtied blocks between EOF and
+	 * pos_out. In that case, we need to extend the flush and unmap to cover
+	 * from EOF to the end of the copy length.
+	 */
+	if (pos_out > XFS_ISIZE(dest)) {
+		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
+		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
+	} else {
+		ret = xfs_flush_unmap_range(dest, pos_out, *len);
+	}
+	if (ret)
+		goto out_unlock;
 
 	return 1;
 out_unlock:
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3043e5ed6495..8a6532aae779 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
 	),
 	TP_fast_assign(
 		__entry->dev = bp->b_target->bt_dev;
-		__entry->bno = bp->b_bn;
+		if (bp->b_bn == XFS_BUF_DADDR_NULL)
+			__entry->bno = bp->b_maps[0].bm_bn;
+		else
+			__entry->bno = bp->b_bn;
 		__entry->nblks = bp->b_length;
 		__entry->hold = atomic_read(&bp->b_hold);
 		__entry->pincount = atomic_read(&bp->b_pin_count);