22 files changed, 169 insertions, 172 deletions
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 7d398d300e97..9382db998ec9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -743,7 +743,7 @@ static int tcp_accept_from_sock(struct connection *con)
        newsock->type = con->sock->type;
        newsock->ops = con->sock->ops;
-        result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
+        result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true);
        if (result < 0)
                goto accept_err;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 338d2f73eb29..a2c05f2ada6d 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1359,6 +1359,16 @@ out:
        return 0;
 }
+static void fat_dummy_inode_init(struct inode *inode)
+{
+        /* Initialize this dummy inode to work as no-op: zero the MSDOS_I() fields below. */
+        MSDOS_I(inode)->mmu_private = 0;
+        MSDOS_I(inode)->i_start = 0;
+        MSDOS_I(inode)->i_logstart = 0;
+        MSDOS_I(inode)->i_attrs = 0;
+        MSDOS_I(inode)->i_pos = 0;
+}
 static int fat_read_root(struct inode *inode)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
@@ -1803,12 +1813,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
        fat_inode = new_inode(sb);
        if (!fat_inode)
                goto out_fail;
-        MSDOS_I(fat_inode)->i_pos = 0;
+        fat_dummy_inode_init(fat_inode);
        sbi->fat_inode = fat_inode;
        fsinfo_inode = new_inode(sb);
        if (!fsinfo_inode)
                goto out_fail;
+        fat_dummy_inode_init(fsinfo_inode);
        fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
        sbi->fsinfo_inode = fsinfo_inode;
        insert_inode_hash(fsinfo_inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ef600591d96f..63ee2940775c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -173,19 +173,33 @@ static void wb_wakeup(struct bdi_writeback *wb)
        spin_unlock_bh(&wb->work_lock);
 }
+static void finish_writeback_work(struct bdi_writeback *wb,
+                                  struct wb_writeback_work *work)
+{
+        struct wb_completion *done = work->done; /* read before kfree(work) */
+        if (work->auto_free)
+                kfree(work);
+        if (done && atomic_dec_and_test(&done->cnt))
+                wake_up_all(&wb->bdi->wb_waitq); /* cnt reached zero: wake waiters */
+}
 static void wb_queue_work(struct bdi_writeback *wb,
                           struct wb_writeback_work *work)
 {
         trace_writeback_queue(wb, work);
-        spin_lock_bh(&wb->work_lock);
-        if (!test_bit(WB_registered, &wb->state))
-                goto out_unlock;
         if (work->done)
                 atomic_inc(&work->done->cnt);
-        list_add_tail(&work->list, &wb->work_list);
-        mod_delayed_work(bdi_wq, &wb->dwork, 0);
+        spin_lock_bh(&wb->work_lock);
-out_unlock:
+        if (test_bit(WB_registered, &wb->state)) {
+                list_add_tail(&work->list, &wb->work_list);
+                mod_delayed_work(bdi_wq, &wb->dwork, 0);
+        } else
+                finish_writeback_work(wb, work); /* wb not registered: complete now, undoing the cnt bump above */
         spin_unlock_bh(&wb->work_lock);
 }
@@ -1873,16 +1887,9 @@ static long wb_do_writeback(struct bdi_writeback *wb)
        set_bit(WB_writeback_running, &wb->state);
        while ((work = get_next_work_item(wb)) != NULL) {
-                struct wb_completion *done = work->done;
                trace_writeback_exec(wb, work);
                wrote += wb_writeback(wb, work);
+                finish_writeback_work(wb, work);
-                if (work->auto_free)
-                        kfree(work);
-                if (done && atomic_dec_and_test(&done->cnt))
-                        wake_up_all(&wb->bdi->wb_waitq);
        }
        /*
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c45084ac642d..511e1ed7e2de 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,7 +207,7 @@ struct lm_lockname {
        struct gfs2_sbd *ln_sbd;
        u64 ln_number;
        unsigned int ln_type;
-};
+} __packed __aligned(sizeof(int));
 #define lm_name_equal(name1, name2) \
        (((name1)->ln_number == (name2)->ln_number) &&  \
diff --git a/fs/iomap.c b/fs/iomap.c
index 3ca1a8e44135..141c3cd55a8b 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -846,7 +846,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = file_inode(iocb->ki_filp);
        size_t count = iov_iter_count(iter);
-        loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
+        loff_t pos = iocb->ki_pos, start = pos;
+        loff_t end = iocb->ki_pos + count - 1, ret = 0;
        unsigned int flags = IOMAP_DIRECT;
        struct blk_plug plug;
        struct iomap_dio *dio;
@@ -887,12 +888,12 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        }
        if (mapping->nrpages) {
-                ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+                ret = filemap_write_and_wait_range(mapping, start, end);
                if (ret)
                        goto out_free_dio;
                ret = invalidate_inode_pages2_range(mapping,
-                                iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+                                start >> PAGE_SHIFT, end >> PAGE_SHIFT);
                WARN_ON_ONCE(ret);
                ret = 0;
        }
@@ -941,6 +942,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                __set_current_state(TASK_RUNNING);
        }
+        ret = iomap_dio_complete(dio);
        /*
         * Try again to invalidate clean pages which might have been cached by
         * non-direct readahead, or faulted in by get_user_pages() if the source
@@ -949,12 +952,12 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         * this invalidation fails, tough, the write still worked...
         */
        if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
-                ret = invalidate_inode_pages2_range(mapping,
+                int err = invalidate_inode_pages2_range(mapping,
-                                iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+                                start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-                WARN_ON_ONCE(ret);
+                WARN_ON_ONCE(err);
        }
-        return iomap_dio_complete(dio);
+        return ret;
 out_free_dio:
        kfree(dio);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 4348027384f5..d0ab7e56d0b4 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1863,7 +1863,7 @@ static int o2net_accept_one(struct socket *sock, int *more)
        new_sock->type = sock->type;
        new_sock->ops = sock->ops;
-        ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+        ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false);
        if (ret < 0)
                goto out;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 1953986ee6bc..6e610a205e15 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/cred.h>
 #include <linux/xattr.h>
-#include <linux/sched/signal.h>
 #include "overlayfs.h"
 #include "ovl_entry.h"
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 384fa759a563..c543cdb5f8ed 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -400,9 +400,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
             clockid != CLOCK_BOOTTIME_ALARM))
                return -EINVAL;
-        if (!capable(CAP_WAKE_ALARM) &&
+        if ((clockid == CLOCK_REALTIME_ALARM ||
-            (clockid == CLOCK_REALTIME_ALARM ||
+             clockid == CLOCK_BOOTTIME_ALARM) &&
-             clockid == CLOCK_BOOTTIME_ALARM))
+            !capable(CAP_WAKE_ALARM))
                return -EPERM;
        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -449,7 +449,7 @@ static int do_timerfd_settime(int ufd, int flags,
                return ret;
        ctx = f.file->private_data;
-        if (!capable(CAP_WAKE_ALARM) && isalarm(ctx)) {
+        if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) {
                fdput(f);
                return -EPERM;
        }
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 973607df579d..1d227b0fcf49 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -138,8 +138,6 @@ out:
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
- *
- * Returns: In case of success, returns not zero.
 */
 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
 {
@@ -267,6 +265,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 {
        struct mm_struct *mm = ctx->mm;
        pgd_t *pgd;
+        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd, _pmd;
        pte_t *pte;
@@ -277,7 +276,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                goto out;
-        pud = pud_offset(pgd, address);
+        p4d = p4d_offset(pgd, address);
+        if (!p4d_present(*p4d))
+                goto out;
+        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                goto out;
        pmd = pmd_offset(pud, address);
@@ -490,7 +492,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
                         * in such case.
                         */
                        down_read(&mm->mmap_sem);
-                        ret = 0;
+                        ret = VM_FAULT_NOPAGE;
                }
        }
@@ -527,10 +529,11 @@ out:
        return ret;
 }
-static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
-                                             struct userfaultfd_wait_queue *ewq)
+                                              struct userfaultfd_wait_queue *ewq)
 {
-        int ret = 0;
+        if (WARN_ON_ONCE(current->flags & PF_EXITING))
+                goto out;
        ewq->ctx = ctx;
        init_waitqueue_entry(&ewq->wq, current);
@@ -547,8 +550,16 @@ static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
                        break;
                if (ACCESS_ONCE(ctx->released) ||
                    fatal_signal_pending(current)) {
-                        ret = -1;
                        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+                        if (ewq->msg.event == UFFD_EVENT_FORK) {
+                                struct userfaultfd_ctx *new;
+                                new = (struct userfaultfd_ctx *)
+                                        (unsigned long)
+                                        ewq->msg.arg.reserved.reserved1;
+                                userfaultfd_ctx_put(new);
+                        }
                        break;
                }
@@ -566,9 +577,8 @@ static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
+out:
        userfaultfd_ctx_put(ctx);
-        return ret;
 }
 static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
@@ -626,7 +636,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
        return 0;
 }
-static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
 {
        struct userfaultfd_ctx *ctx = fctx->orig;
        struct userfaultfd_wait_queue ewq;
@@ -636,17 +646,15 @@ static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
        ewq.msg.event = UFFD_EVENT_FORK;
        ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
-        return userfaultfd_event_wait_completion(ctx, &ewq);
+        userfaultfd_event_wait_completion(ctx, &ewq);
 }
 void dup_userfaultfd_complete(struct list_head *fcs)
 {
-        int ret = 0;
        struct userfaultfd_fork_ctx *fctx, *n;
        list_for_each_entry_safe(fctx, n, fcs, list) {
-                if (!ret)
+                dup_fctx(fctx);
-                        ret = dup_fctx(fctx);
                list_del(&fctx->list);
                kfree(fctx);
        }
@@ -689,8 +697,7 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
        userfaultfd_event_wait_completion(ctx, &ewq);
 }
-void userfaultfd_remove(struct vm_area_struct *vma,
+bool userfaultfd_remove(struct vm_area_struct *vma,
-                        struct vm_area_struct **prev,
                        unsigned long start, unsigned long end)
 {
        struct mm_struct *mm = vma->vm_mm;
@@ -699,13 +706,11 @@ void userfaultfd_remove(struct vm_area_struct *vma,
        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
-                return;
+                return true;
        userfaultfd_ctx_get(ctx);
        up_read(&mm->mmap_sem);
-        *prev = NULL; /* We wait for ACK w/o the mmap semaphore */
        msg_init(&ewq.msg);
        ewq.msg.event = UFFD_EVENT_REMOVE;
@@ -714,7 +719,7 @@ void userfaultfd_remove(struct vm_area_struct *vma,
        userfaultfd_event_wait_completion(ctx, &ewq);
-        down_read(&mm->mmap_sem);
+        return false;
 }
 static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
@@ -775,34 +780,6 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
        }
 }
-void userfaultfd_exit(struct mm_struct *mm)
-{
-        struct vm_area_struct *vma = mm->mmap;
-        /*
-         * We can do the vma walk without locking because the caller
-         * (exit_mm) knows it now has exclusive access
-         */
-        while (vma) {
-                struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
-                if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) {
-                        struct userfaultfd_wait_queue ewq;
-                        userfaultfd_ctx_get(ctx);
-                        msg_init(&ewq.msg);
-                        ewq.msg.event = UFFD_EVENT_EXIT;
-                        userfaultfd_event_wait_completion(ctx, &ewq);
-                        ctx->features &= ~UFFD_FEATURE_EVENT_EXIT;
-                }
-                vma = vma->vm_next;
-        }
-}
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
        struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 2dfdc62f795e..70a5b55e0870 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -25,24 +25,6 @@
 #include "kmem.h"
 #include "xfs_message.h"
-/*
- * Greedy allocation.  May fail and may return vmalloced memory.
- */
-void *
-kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
-{
-        void            *ptr;
-        size_t          kmsize = maxsize;
-        while (!(ptr = vzalloc(kmsize))) {
-                if ((kmsize >>= 1) <= minsize)
-                        kmsize = minsize;
-        }
-        if (ptr)
-                *size = kmsize;
-        return ptr;
-}
 void *
 kmem_alloc(size_t size, xfs_km_flags_t flags)
 {
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 689f746224e7..f0fc84fcaac2 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -69,8 +69,6 @@ static inline void  kmem_free(const void *ptr)
 }
-extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
 static inline void *
 kmem_zalloc(size_t size, xfs_km_flags_t flags)
 {
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a9c66d47757a..9bd104f32908 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -763,8 +763,8 @@ xfs_bmap_extents_to_btree(
                args.type = XFS_ALLOCTYPE_START_BNO;
                args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
        } else if (dfops->dop_low) {
-try_another_ag:
                args.type = XFS_ALLOCTYPE_START_BNO;
+try_another_ag:
                args.fsbno = *firstblock;
        } else {
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -790,13 +790,17 @@ try_another_ag:
        if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
            args.fsbno == NULLFSBLOCK &&
            args.type == XFS_ALLOCTYPE_NEAR_BNO) {
-                dfops->dop_low = true;
+                args.type = XFS_ALLOCTYPE_FIRST_AG;
                goto try_another_ag;
        }
+        if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
+                xfs_iroot_realloc(ip, -1, whichfork);
+                xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+                return -ENOSPC;
+        }
        /*
         * Allocation can't fail, the space was reserved.
         */
-        ASSERT(args.fsbno != NULLFSBLOCK);
        ASSERT(*firstblock == NULLFSBLOCK ||
               args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
        *firstblock = cur->bc_private.b.firstblock = args.fsbno;
@@ -4150,6 +4154,19 @@ xfs_bmapi_read(
        return 0;
 }
+/*
+ * Add a delayed allocation extent to an inode. Blocks are reserved from the
+ * global pool and the extent inserted into the inode in-core extent tree.
+ *
+ * On entry, got refers to the first extent beyond the offset of the extent to
+ * allocate or eof is specified if no such extent exists. On return, got refers
+ * to the extent record that was inserted to the inode fork.
+ *
+ * Note that the allocated extent may have been merged with contiguous extents
+ * during insertion into the inode fork. Thus, got does not reflect the current
+ * state of the inode fork on return. If necessary, the caller can use lastx to
+ * look up the updated record in the inode fork.
+ */
 int
 xfs_bmapi_reserve_delalloc(
        struct xfs_inode        *ip,
@@ -4236,13 +4253,8 @@ xfs_bmapi_reserve_delalloc(
        got->br_startblock = nullstartblock(indlen);
        got->br_blockcount = alen;
        got->br_state = XFS_EXT_NORM;
-        xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
-        /*
+        xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
-         * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
-         * might have merged it into one of the neighbouring ones.
-         */
-        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
        /*
         * Tag the inode if blocks were preallocated. Note that COW fork
@@ -4254,10 +4266,6 @@ xfs_bmapi_reserve_delalloc(
        if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
                xfs_inode_set_cowblocks_tag(ip);
-        ASSERT(got->br_startoff <= aoff);
-        ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
-        ASSERT(isnullstartblock(got->br_startblock));
-        ASSERT(got->br_state == XFS_EXT_NORM);
        return 0;
 out_unreserve_blocks:
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index f93072b58a58..fd55db479385 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -447,8 +447,8 @@ xfs_bmbt_alloc_block(
        if (args.fsbno == NULLFSBLOCK) {
                args.fsbno = be64_to_cpu(start->l);
-try_another_ag:
                args.type = XFS_ALLOCTYPE_START_BNO;
+try_another_ag:
                /*
                 * Make sure there is sufficient room left in the AG to
                 * complete a full tree split for an extent insert.  If
@@ -488,8 +488,8 @@ try_another_ag:
        if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
            args.fsbno == NULLFSBLOCK &&
            args.type == XFS_ALLOCTYPE_NEAR_BNO) {
-                cur->bc_private.b.dfops->dop_low = true;
                args.fsbno = cur->bc_private.b.firstblock;
+                args.type = XFS_ALLOCTYPE_FIRST_AG;
                goto try_another_ag;
        }
@@ -506,7 +506,7 @@ try_another_ag:
                        goto error0;
                cur->bc_private.b.dfops->dop_low = true;
        }
-        if (args.fsbno == NULLFSBLOCK) {
+        if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
                *stat = 0;
                return 0;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index bf65a9ea8642..61494295d92f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -274,54 +274,49 @@ xfs_end_io(
        struct xfs_ioend        *ioend =
                container_of(work, struct xfs_ioend, io_work);
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+        xfs_off_t               offset = ioend->io_offset;
+        size_t                  size = ioend->io_size;
        int                     error = ioend->io_bio->bi_error;
        /*
-         * Set an error if the mount has shut down and proceed with end I/O
+         * Just clean up the in-memory structures if the fs has been shut down.
-         * processing so it can perform whatever cleanups are necessary.
         */
-        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                error = -EIO;
+                goto done;
+        }
        /*
-         * For a CoW extent, we need to move the mapping from the CoW fork
+         * Clean up any COW blocks on an I/O error.
-         * to the data fork.  If instead an error happened, just dump the
-         * new blocks.
         */
-        if (ioend->io_type == XFS_IO_COW) {
+        if (unlikely(error)) {
-                if (error)
+                switch (ioend->io_type) {
-                        goto done;
+                case XFS_IO_COW:
-                if (ioend->io_bio->bi_error) {
+                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
-                        error = xfs_reflink_cancel_cow_range(ip,
+                        break;
-                                        ioend->io_offset, ioend->io_size);
-                        goto done;
                }
-                error = xfs_reflink_end_cow(ip, ioend->io_offset,
-                                ioend->io_size);
+                goto done;
-                if (error)
-                        goto done;
        }
        /*
-         * For unwritten extents we need to issue transactions to convert a
+         * Success:  commit the COW or unwritten blocks if needed.
-         * range to normal written extens after the data I/O has finished.
-         * Detecting and handling completion IO errors is done individually
-         * for each case as different cleanup operations need to be performed
-         * on error.
         */
-        if (ioend->io_type == XFS_IO_UNWRITTEN) {
+        switch (ioend->io_type) {
-                if (error)
+        case XFS_IO_COW:
-                        goto done;
+                error = xfs_reflink_end_cow(ip, offset, size);
-                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+                break;
-                                                  ioend->io_size);
+        case XFS_IO_UNWRITTEN:
-        } else if (ioend->io_append_trans) {
+                error = xfs_iomap_write_unwritten(ip, offset, size);
-                error = xfs_setfilesize_ioend(ioend, error);
+                break;
-        } else {
+        default:
-                ASSERT(!xfs_ioend_is_append(ioend) ||
+                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
-                       ioend->io_type == XFS_IO_COW);
+                break;
        }
 done:
+        if (ioend->io_append_trans)
+                error = xfs_setfilesize_ioend(ioend, error);
        xfs_destroy_ioend(ioend, error);
 }
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 7234b9748c36..3531f8f72fa5 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1608,7 +1608,7 @@ xfs_inode_free_cowblocks(
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
        xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-        ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+        ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
        xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index edfa6a55b064..7eaf1ef74e3c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1615,7 +1615,7 @@ xfs_itruncate_extents(
        /* Remove all pending CoW reservations. */
        error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
-                        last_block);
+                        last_block, true);
        if (error)
                goto out;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 41662fb14e87..288ee5b840d7 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -630,6 +630,11 @@ retry:
                goto out_unlock;
        }
+        /*
+         * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+         * them out if the write happens to fail.
+         */
+        iomap->flags = IOMAP_F_NEW;
        trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
 done:
        if (isnullstartblock(got.br_startblock))
@@ -1071,16 +1076,22 @@ xfs_file_iomap_end_delalloc(
        struct xfs_inode        *ip,
        loff_t                  offset,
        loff_t                  length,
-        ssize_t                 written)
+        ssize_t                 written,
+        struct iomap            *iomap)
 {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           start_fsb;
        xfs_fileoff_t           end_fsb;
        int                     error = 0;
-        /* behave as if the write failed if drop writes is enabled */
+        /*
-        if (xfs_mp_drop_writes(mp))
+         * Behave as if the write failed if drop writes is enabled. Set the NEW
+         * flag to force delalloc cleanup.
+         */
+        if (xfs_mp_drop_writes(mp)) {
+                iomap->flags |= IOMAP_F_NEW;
                written = 0;
+        }
        /*
         * start_fsb refers to the first unused block after a short write. If
@@ -1094,14 +1105,14 @@ xfs_file_iomap_end_delalloc(
        end_fsb = XFS_B_TO_FSB(mp, offset + length);
        /*
-         * Trim back delalloc blocks if we didn't manage to write the whole
+         * Trim delalloc blocks if they were allocated by this write and we
-         * range reserved.
+         * didn't manage to write the whole range.
         *
         * We don't need to care about racing delalloc as we hold i_mutex
         * across the reserve/allocate/unreserve calls. If there are delalloc
         * blocks in the range, they are ours.
         */
-        if (start_fsb < end_fsb) {
+        if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
                truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
                                         XFS_FSB_TO_B(mp, end_fsb) - 1);
@@ -1131,7 +1142,7 @@ xfs_file_iomap_end(
 {
        if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
                return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
-                                length, written);
+                                length, written, iomap);
        return 0;
 }
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 66e881790c17..2a6d9b1558e0 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -361,7 +361,6 @@ xfs_bulkstat(
        xfs_agino_t             agino;  /* inode # in allocation group */
        xfs_agnumber_t          agno;   /* allocation group number */
        xfs_btree_cur_t         *cur;   /* btree cursor for ialloc btree */
-        size_t                  irbsize; /* size of irec buffer in bytes */
        xfs_inobt_rec_incore_t  *irbuf; /* start of irec buffer */
        int                     nirbuf; /* size of irbuf */
        int                     ubcount; /* size of user's buffer */
@@ -388,11 +387,10 @@ xfs_bulkstat(
        *ubcountp = 0;
        *done = 0;
-        irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
+        irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP);
        if (!irbuf)
                return -ENOMEM;
+        nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf);
-        nirbuf = irbsize / sizeof(*irbuf);
        /*
         * Loop over the allocation groups, starting from the last
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 450bde68bb75..688ebff1f663 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -513,8 +513,7 @@ STATIC void
 xfs_set_inoalignment(xfs_mount_t *mp)
 {
        if (xfs_sb_version_hasalign(&mp->m_sb) &&
-            mp->m_sb.sb_inoalignmt >=
+                mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
-            XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
                mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
        else
                mp->m_inoalign_mask = 0;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index da6d08fb359c..4a84c5ea266d 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -548,14 +548,18 @@ xfs_reflink_trim_irec_to_next_cow(
 }
 /*
- * Cancel all pending CoW reservations for some block range of an inode.
+ * Cancel CoW reservations for some block range of an inode.
+ *
+ * If cancel_real is true this function cancels all COW fork extents for the
+ * inode; if cancel_real is false, real extents are not cleared.
 */
 int
 xfs_reflink_cancel_cow_blocks(
        struct xfs_inode                *ip,
        struct xfs_trans                **tpp,
        xfs_fileoff_t                   offset_fsb,
-        xfs_fileoff_t                   end_fsb)
+        xfs_fileoff_t                   end_fsb,
+        bool                            cancel_real)
 {
        struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
        struct xfs_bmbt_irec            got, del;
@@ -579,7 +583,7 @@ xfs_reflink_cancel_cow_blocks(
                                        &idx, &got, &del);
                        if (error)
                                break;
-                } else {
+                } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
                        xfs_trans_ijoin(*tpp, ip, 0);
                        xfs_defer_init(&dfops, &firstfsb);
@@ -621,13 +625,17 @@ xfs_reflink_cancel_cow_blocks(
 }
 /*
- * Cancel all pending CoW reservations for some byte range of an inode.
+ * Cancel CoW reservations for some byte range of an inode.
+ *
+ * If cancel_real is true this function cancels all COW fork extents for the
+ * inode; if cancel_real is false, real extents are not cleared.
 */
 int
 xfs_reflink_cancel_cow_range(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
-        xfs_off_t               count)
+        xfs_off_t               count,
+        bool                    cancel_real)
 {
        struct xfs_trans        *tp;
        xfs_fileoff_t           offset_fsb;
@@ -653,7 +661,8 @@ xfs_reflink_cancel_cow_range(
        xfs_trans_ijoin(tp, ip, 0);
        /* Scrape out the old CoW reservations */
-        error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb);
+        error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
+                        cancel_real);
        if (error)
                goto out_cancel;
@@ -1450,7 +1459,7 @@ next:
         * We didn't find any shared blocks so turn off the reflink flag.
         * First, get rid of any leftover CoW mappings.
         */
-        error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
+        error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
        if (error)
                return error;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 33ac9b8db683..d29a7967f029 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -39,9 +39,9 @@ extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
 extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
                struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
-                xfs_fileoff_t end_fsb);
+                xfs_fileoff_t end_fsb, bool cancel_real);
 extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
-                xfs_off_t count);
+                xfs_off_t count, bool cancel_real);
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 890862f2447c..685c042a120f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -953,7 +953,7 @@ xfs_fs_destroy_inode(
        XFS_STATS_INC(ip->i_mount, vn_remove);
        if (xfs_is_reflink_inode(ip)) {
-                error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+                error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
                if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
                        xfs_warn(ip->i_mount,
 "Error %d while evicting CoW blocks for inode %llu.",