author     Dave Chinner <david@fromorbit.com>  2014-10-01 19:11:14 -0400
committer  Dave Chinner <david@fromorbit.com>  2014-10-01 19:11:14 -0400
commit     75e58ce4c8f354f1a68a8bb8a9692827cdaf3d21 (patch)
tree       4bbcd993a6e75d199c82958ffe15c0e0d03f4bbf
parent     bd438f825f7badafe56d117ed906488c8541f95f (diff)
parent     8c15612546bce1ecafb7dee3cce8a2a9b560e15e (diff)
Merge branch 'xfs-buf-iosubmit' into for-next
 fs/xfs/xfs_bmap_util.c   |  56
 fs/xfs/xfs_buf.c         | 353
 fs/xfs/xfs_buf.h         |  15
 fs/xfs/xfs_buf_item.c    |   8
 fs/xfs/xfs_fsops.c       |  11
 fs/xfs/xfs_inode.c       |   2
 fs/xfs/xfs_log.c         |  59
 fs/xfs/xfs_log_recover.c |  32
 fs/xfs/xfs_mount.c       |  55
 fs/xfs/xfs_rtalloc.c     |  30
 fs/xfs/xfs_trace.h       |   3
 fs/xfs/xfs_trans_buf.c   |  16
 12 files changed, 282 insertions(+), 358 deletions(-)
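This merge pulls in the xfs-buf-iosubmit series, which replaces the two-step
xfs_buf_iorequest()/xfs_buf_iowait() buffer submission interface with
xfs_buf_submit() for asynchronous IO and xfs_buf_submit_wait() for synchronous
IO, captures bio errors in a new b_io_error field, and converts
xfs_buf_read_uncached() to return an error code. As a rough before/after
sketch of a synchronous caller (distilled from the hunks below, not itself
part of the diff):

	/* before: the shutdown check and the wait were the caller's problem */
	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;
	xfs_buf_iorequest(bp);
	error = xfs_buf_iowait(bp);

	/* after: one call handles shutdown, submission and waiting */
	error = xfs_buf_submit_wait(bp);
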
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index d8b77b5bf4d9..c2aaa58e59ee 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes(
 	if (endoff > XFS_ISIZE(ip))
 		endoff = XFS_ISIZE(ip);
 
-	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
-			mp->m_rtdev_targp : mp->m_ddev_targp,
-			BTOBB(mp->m_sb.sb_blocksize), 0);
-	if (!bp)
-		return -ENOMEM;
-
-	xfs_buf_unlock(bp);
-
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		uint lock_mode;
 
@@ -1152,42 +1144,24 @@ xfs_zero_remaining_bytes(
 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
 		if (imap.br_state == XFS_EXT_UNWRITTEN)
 			continue;
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNWRITE(bp);
-		XFS_BUF_READ(bp);
-		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = -EIO;
-			break;
-		}
-		xfs_buf_iorequest(bp);
-		error = xfs_buf_iowait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(read)");
-			break;
-		}
+		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp,
+				xfs_fsb_to_db(ip, imap.br_startblock),
+				BTOBB(mp->m_sb.sb_blocksize),
+				0, &bp, NULL);
+		if (error)
+			return error;
+
 		memset(bp->b_addr +
 			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
 		       0, lastoffset - offset + 1);
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNREAD(bp);
-		XFS_BUF_WRITE(bp);
 
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = -EIO;
-			break;
-		}
-		xfs_buf_iorequest(bp);
-		error = xfs_buf_iowait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(write)");
-			break;
-		}
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
 	}
-	xfs_buf_free(bp);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ec6505056b2c..017b6afe340b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -623,10 +623,11 @@ _xfs_buf_read(
 	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 
-	xfs_buf_iorequest(bp);
-	if (flags & XBF_ASYNC)
+	if (flags & XBF_ASYNC) {
+		xfs_buf_submit(bp);
 		return 0;
-	return xfs_buf_iowait(bp);
+	}
+	return xfs_buf_submit_wait(bp);
 }
 
 xfs_buf_t *
@@ -687,34 +688,39 @@ xfs_buf_readahead_map(
  * Read an uncached buffer from disk. Allocates and returns a locked
  * buffer containing the disk contents or nothing.
  */
-struct xfs_buf *
+int
 xfs_buf_read_uncached(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		daddr,
 	size_t			numblks,
 	int			flags,
+	struct xfs_buf		**bpp,
 	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
+	*bpp = NULL;
+
 	bp = xfs_buf_get_uncached(target, numblks, flags);
 	if (!bp)
-		return NULL;
+		return -ENOMEM;
 
 	/* set up the buffer for a read IO */
 	ASSERT(bp->b_map_count == 1);
-	bp->b_bn = daddr;
+	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
 	bp->b_maps[0].bm_bn = daddr;
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
-	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+	xfs_buf_submit_wait(bp);
+	if (bp->b_error) {
+		int	error = bp->b_error;
 		xfs_buf_relse(bp);
-		return NULL;
+		return error;
 	}
-	xfs_buf_iorequest(bp);
-	xfs_buf_iowait(bp);
-	return bp;
+
+	*bpp = bp;
+	return 0;
 }
 
 /*
@@ -998,53 +1004,56 @@ xfs_buf_wait_unpin(
  * Buffer Utility Routines
  */
 
-STATIC void
-xfs_buf_iodone_work(
-	struct work_struct	*work)
+void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
 {
-	struct xfs_buf		*bp =
-		container_of(work, xfs_buf_t, b_iodone_work);
-	bool			read = !!(bp->b_flags & XBF_READ);
+	bool		read = bp->b_flags & XBF_READ;
+
+	trace_xfs_buf_iodone(bp, _RET_IP_);
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 
-	/* only validate buffers that were read without errors */
-	if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+	/*
+	 * Pull in IO completion errors now. We are guaranteed to be running
+	 * single threaded, so we don't need the lock to read b_io_error.
+	 */
+	if (!bp->b_error && bp->b_io_error)
+		xfs_buf_ioerror(bp, bp->b_io_error);
+
+	/* Only validate buffers that were read without errors */
+	if (read && !bp->b_error && bp->b_ops) {
+		ASSERT(!bp->b_iodone);
 		bp->b_ops->verify_read(bp);
+	}
+
+	if (!bp->b_error)
+		bp->b_flags |= XBF_DONE;
 
 	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
-	else {
-		ASSERT(read && bp->b_ops);
+	else
 		complete(&bp->b_iowait);
-	}
 }
 
-void
-xfs_buf_ioend(
-	struct xfs_buf	*bp,
-	int		schedule)
+static void
+xfs_buf_ioend_work(
+	struct work_struct	*work)
 {
-	bool		read = !!(bp->b_flags & XBF_READ);
-
-	trace_xfs_buf_iodone(bp, _RET_IP_);
+	struct xfs_buf		*bp =
+		container_of(work, xfs_buf_t, b_iodone_work);
 
-	if (bp->b_error == 0)
-		bp->b_flags |= XBF_DONE;
+	xfs_buf_ioend(bp);
+}
 
-	if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
-		if (schedule) {
-			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
-			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
-		} else {
-			xfs_buf_iodone_work(&bp->b_iodone_work);
-		}
-	} else {
-		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-		complete(&bp->b_iowait);
-	}
+void
+xfs_buf_ioend_async(
+	struct xfs_buf	*bp)
+{
+	INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+	queue_work(xfslogd_workqueue, &bp->b_iodone_work);
 }
 
 void
@@ -1067,96 +1076,6 @@ xfs_buf_ioerror_alert(
 		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
 }
 
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned, we aren't flushing it.
-	 */
-	xfs_buf_ioerror(bp, -EIO);
-
-	/*
-	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDONE(bp);
-	xfs_buf_stale(bp);
-
-	xfs_buf_ioend(bp, 0);
-
-	return -EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	struct xfs_buf	*bp)
-{
-	int64_t		fl = bp->b_flags;
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_DONE(bp);
-	xfs_buf_stale(bp);
-	bp->b_iodone = NULL;
-	if (!(fl & XBF_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		xfs_buf_ioerror(bp, -EIO);
-		complete(&bp->b_iowait);
-	} else {
-		xfs_buf_relse(bp);
-	}
-
-	return -EIO;
-}
-
-STATIC int
-xfs_bdstrat_cb(
-	struct xfs_buf	*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
-			return xfs_bioerror_relse(bp);
-		else
-			return xfs_bioerror(bp);
-	}
-
-	xfs_buf_iorequest(bp);
-	return 0;
-}
-
 int
 xfs_bwrite(
 	struct xfs_buf		*bp)
@@ -1166,11 +1085,10 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));
 
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+			 XBF_WRITE_FAIL | XBF_DONE);
 
-	xfs_bdstrat_cb(bp);
-
-	error = xfs_buf_iowait(bp);
+	error = xfs_buf_submit_wait(bp);
 	if (error) {
 		xfs_force_shutdown(bp->b_target->bt_mount,
 				   SHUTDOWN_META_IO_ERROR);
@@ -1179,15 +1097,6 @@ xfs_bwrite(
 }
 
 STATIC void
-_xfs_buf_ioend(
-	xfs_buf_t	*bp,
-	int		schedule)
-{
-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
-		xfs_buf_ioend(bp, schedule);
-}
-
-STATIC void
 xfs_buf_bio_end_io(
 	struct bio		*bio,
 	int			error)
@@ -1198,13 +1107,18 @@ xfs_buf_bio_end_io(
 	 * don't overwrite existing errors - otherwise we can lose errors on
 	 * buffers that require multiple bios to complete.
 	 */
-	if (!bp->b_error)
-		xfs_buf_ioerror(bp, error);
+	if (error) {
+		spin_lock(&bp->b_lock);
+		if (!bp->b_io_error)
+			bp->b_io_error = error;
+		spin_unlock(&bp->b_lock);
+	}
 
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
-	_xfs_buf_ioend(bp, 1);
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend_async(bp);
 	bio_put(bio);
 }
 
@@ -1283,7 +1197,7 @@ next_chunk:
 	} else {
 		/*
 		 * This is guaranteed not to be the last io reference count
-		 * because the caller (xfs_buf_iorequest) holds a count itself.
+		 * because the caller (xfs_buf_submit) holds a count itself.
 		 */
 		atomic_dec(&bp->b_io_remaining);
 		xfs_buf_ioerror(bp, -EIO);
@@ -1373,53 +1287,131 @@ _xfs_buf_ioapply(
 	blk_finish_plug(&plug);
 }
 
+/*
+ * Asynchronous IO submission path. This transfers the buffer lock ownership
+ * and the current reference to the IO. It is not safe to reference the buffer
+ * after a call to this function unless the caller holds an additional
+ * reference itself.
+ */
 void
-xfs_buf_iorequest(
-	xfs_buf_t		*bp)
+xfs_buf_submit(
+	struct xfs_buf	*bp)
 {
-	trace_xfs_buf_iorequest(bp, _RET_IP_);
+	trace_xfs_buf_submit(bp, _RET_IP_);
 
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+	ASSERT(bp->b_flags & XBF_ASYNC);
+
+	/* on shutdown we stale and complete the buffer immediately */
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		bp->b_flags &= ~XBF_DONE;
+		xfs_buf_stale(bp);
+		xfs_buf_ioend(bp);
+		return;
+	}
 
 	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
+
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
+	/*
+	 * The caller's reference is released during I/O completion.
+	 * This occurs some time after the last b_io_remaining reference is
+	 * released, so after we drop our IO reference we have to have some
+	 * other reference to ensure the buffer doesn't go away from underneath
+	 * us. Take a direct reference to ensure we have safe access to the
+	 * buffer until we are finished with it.
+	 */
 	xfs_buf_hold(bp);
 
 	/*
-	 * Set the count to 1 initially, this will stop an I/O
-	 * completion callout which happens before we have started
-	 * all the I/O from calling xfs_buf_ioend too early.
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from
+	 * calling xfs_buf_ioend too early.
 	 */
 	atomic_set(&bp->b_io_remaining, 1);
 	_xfs_buf_ioapply(bp);
+
 	/*
-	 * If _xfs_buf_ioapply failed, we'll get back here with
-	 * only the reference we took above.  _xfs_buf_ioend will
-	 * drop it to zero, so we'd better not queue it for later,
-	 * or we'll free it before it's done.
+	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
+	 * reference we took above. If we drop it to zero, run completion so
+	 * that we don't return to the caller with completion still pending.
 	 */
-	_xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+		if (bp->b_error)
+			xfs_buf_ioend(bp);
+		else
+			xfs_buf_ioend_async(bp);
+	}
 
 	xfs_buf_rele(bp);
+	/* Note: it is not safe to reference bp now we've dropped our ref */
 }
 
 /*
- * Waits for I/O to complete on the buffer supplied.  It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer, in which
- * case nothing will ever complete.  It returns the I/O error code, if any, or
- * 0 if there was no error.
+ * Synchronous buffer IO submission path, read or write.
  */
 int
-xfs_buf_iowait(
-	xfs_buf_t		*bp)
+xfs_buf_submit_wait(
+	struct xfs_buf	*bp)
 {
-	trace_xfs_buf_iowait(bp, _RET_IP_);
+	int		error;
 
-	if (!bp->b_error)
-		wait_for_completion(&bp->b_iowait);
+	trace_xfs_buf_submit_wait(bp, _RET_IP_);
+
+	ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
+
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		xfs_buf_stale(bp);
+		bp->b_flags &= ~XBF_DONE;
+		return -EIO;
+	}
+
+	if (bp->b_flags & XBF_WRITE)
+		xfs_buf_wait_unpin(bp);
+
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
+	/*
+	 * For synchronous IO, the IO does not inherit the submitter's
+	 * reference count, nor the buffer lock. Hence we cannot release the
+	 * reference we are about to take until we've waited for all IO
+	 * completion to occur, including any xfs_buf_ioend_async() work that
+	 * may be pending.
+	 */
+	xfs_buf_hold(bp);
+
+	/*
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from
+	 * calling xfs_buf_ioend too early.
+	 */
+	atomic_set(&bp->b_io_remaining, 1);
+	_xfs_buf_ioapply(bp);
+
+	/*
+	 * make sure we run completion synchronously if it raced with us and is
+	 * already complete.
+	 */
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend(bp);
 
+	/* wait for completion before gathering the error from the buffer */
+	trace_xfs_buf_iowait(bp, _RET_IP_);
+	wait_for_completion(&bp->b_iowait);
 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
-	return bp->b_error;
+	error = bp->b_error;
+
+	/*
+	 * all done now, we can release the hold that keeps the buffer
+	 * referenced for the entire IO.
+	 */
+	xfs_buf_rele(bp);
+	return error;
 }
 
 xfs_caddr_t
@@ -1813,13 +1805,19 @@ __xfs_buf_delwri_submit(
 	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, io_list, b_list) {
 		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-		bp->b_flags |= XBF_WRITE;
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
 
-		if (!wait) {
-			bp->b_flags |= XBF_ASYNC;
+		/*
+		 * We do all IO submission async. This means if we need to wait
+		 * for IO completion we need to take an extra reference so the
+		 * buffer is still valid on the other side.
+		 */
+		if (wait)
+			xfs_buf_hold(bp);
+		else
 			list_del_init(&bp->b_list);
-		}
-		xfs_bdstrat_cb(bp);
+
+		xfs_buf_submit(bp);
 	}
 	blk_finish_plug(&plug);
 
@@ -1866,7 +1864,10 @@ xfs_buf_delwri_submit(
 		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
 
 		list_del_init(&bp->b_list);
-		error2 = xfs_buf_iowait(bp);
+
+		/* locking the buffer will wait for async IO completion. */
+		xfs_buf_lock(bp);
+		error2 = bp->b_error;
 		xfs_buf_relse(bp);
 		if (!error)
 			error = error2;
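
Note the delwri path above now submits everything async; a waiter takes an
extra hold before submission and later uses buffer lock acquisition as the
wait, since the lock cannot be taken until IO completion releases it. A
condensed sketch of that pattern, inferred from the two hunks above:

	xfs_buf_hold(bp);	/* keep bp valid across the async IO */
	xfs_buf_submit(bp);	/* lock and IO reference pass to the IO */
	...
	xfs_buf_lock(bp);	/* blocks until IO completion unlocks bp */
	error = bp->b_error;
	xfs_buf_relse(bp);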
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c753183900b3..82002c00af90 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -158,6 +158,7 @@ typedef struct xfs_buf {
 	struct list_head	b_lru;		/* lru list */
 	spinlock_t		b_lock;		/* internal state lock */
 	unsigned int		b_state;	/* internal state flags */
+	int			b_io_error;	/* internal IO error state */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
@@ -268,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
 
 struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
 				int flags);
-struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
-				xfs_daddr_t daddr, size_t numblks, int flags,
-				const struct xfs_buf_ops *ops);
+int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
+			  size_t numblks, int flags, struct xfs_buf **bpp,
+			  const struct xfs_buf_ops *ops);
 void xfs_buf_hold(struct xfs_buf *bp);
 
 /* Releasing Buffers */
@@ -286,18 +287,16 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_buf *bp);
-extern void xfs_buf_ioend(xfs_buf_t *,	int);
+extern void xfs_buf_ioend(struct xfs_buf *bp);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
-extern void xfs_buf_iorequest(xfs_buf_t *);
-extern int xfs_buf_iowait(xfs_buf_t *);
+extern void xfs_buf_submit(struct xfs_buf *bp);
+extern int xfs_buf_submit_wait(struct xfs_buf *bp);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 				xfs_buf_rw_t);
 #define xfs_buf_zero(bp, off, len) \
 	xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
-extern int xfs_bioerror_relse(struct xfs_buf *);
-
 /* Buffer Utility Routines */
 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
 
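
With the prototype change above, callers of xfs_buf_read_uncached() no longer
inspect bp->b_error or treat a NULL return as -EIO; allocation and IO failures
both come back as a single error code, and the buffer is returned through bpp
only on success. A typical converted call site looks roughly like this
(sketch; daddr and ops stand in for whatever the call site passes):

	struct xfs_buf	*bp;
	int		error;

	error = xfs_buf_read_uncached(mp->m_ddev_targp, daddr,
				      XFS_FSS_TO_BB(mp, 1), 0, &bp, ops);
	if (error)
		return error;	/* no buffer to release on failure */
	/* ... use bp->b_addr ... */
	xfs_buf_relse(bp);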
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 30fa5db9aea8..f15969543326 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -491,7 +491,7 @@ xfs_buf_item_unpin(
 		xfs_buf_ioerror(bp, -EIO);
 		XFS_BUF_UNDONE(bp);
 		xfs_buf_stale(bp);
-		xfs_buf_ioend(bp, 0);
+		xfs_buf_ioend(bp);
 	}
 }
 
@@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks(
 	 * a way to shut the filesystem down if the writes keep failing.
 	 *
 	 * In practice we'll shut the filesystem down soon as non-transient
-	 * erorrs tend to affect the whole device and a failing log write
+	 * errors tend to affect the whole device and a failing log write
 	 * will make us give up. But we really ought to do better here.
 	 */
 	if (XFS_BUF_ISASYNC(bp)) {
@@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks(
 		if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
 			bp->b_flags |= XBF_WRITE | XBF_ASYNC |
 				       XBF_DONE | XBF_WRITE_FAIL;
-			xfs_buf_iorequest(bp);
+			xfs_buf_submit(bp);
 		} else {
 			xfs_buf_relse(bp);
 		}
@@ -1115,7 +1115,7 @@ do_callbacks:
 	xfs_buf_do_callbacks(bp);
 	bp->b_fspriv = NULL;
 	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
+	xfs_buf_ioend(bp);
 }
 
 /*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f91de1ef05e1..c05ac8b70fa9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -172,16 +172,11 @@ xfs_growfs_data_private(
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
-	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-				XFS_FSS_TO_BB(mp, 1), 0, NULL);
-	if (!bp)
-		return -EIO;
-	if (bp->b_error) {
-		error = bp->b_error;
-		xfs_buf_relse(bp);
+				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
 		return error;
-	}
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c92cb48617d1..e5bbc1f30f16 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3062,7 +3062,7 @@ cluster_corrupt_out:
 		XFS_BUF_UNDONE(bp);
 		xfs_buf_stale(bp);
 		xfs_buf_ioerror(bp, -EIO);
-		xfs_buf_ioend(bp, 0);
+		xfs_buf_ioend(bp);
 	} else {
 		xfs_buf_stale(bp);
 		xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ca4fd5bd8522..fe88ef67f93a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1678,7 +1678,7 @@ xlog_bdstrat(
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		xfs_buf_ioerror(bp, -EIO);
 		xfs_buf_stale(bp);
-		xfs_buf_ioend(bp, 0);
+		xfs_buf_ioend(bp);
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
@@ -1688,7 +1688,7 @@ xlog_bdstrat(
 		return 0;
 	}
 
-	xfs_buf_iorequest(bp);
+	xfs_buf_submit(bp);
 	return 0;
 }
 
@@ -3867,18 +3867,17 @@ xlog_state_ioerror(
  * This is called from xfs_force_shutdown, when we're forcibly
  * shutting down the filesystem, typically because of an IO error.
  * Our main objectives here are to make sure that:
- *	a. the filesystem gets marked 'SHUTDOWN' for all interested
+ *	a. if !logerror, flush the logs to disk. Anything modified
+ *	   after this is ignored.
+ *	b. the filesystem gets marked 'SHUTDOWN' for all interested
  *	   parties to find out, 'atomically'.
- *	b. those who're sleeping on log reservations, pinned objects and
+ *	c. those who're sleeping on log reservations, pinned objects and
  *	   other resources get woken up, and be told the bad news.
- *	c. nothing new gets queued up after (a) and (b) are done.
- *	d. if !logerror, flush the iclogs to disk, then seal them off
- *	   for business.
+ *	d. nothing new gets queued up after (b) and (c) are done.
  *
- * Note: for delayed logging the !logerror case needs to flush the regions
- * held in memory out to the iclogs before flushing them to disk. This needs
- * to be done before the log is marked as shutdown, otherwise the flush to the
- * iclogs will fail.
+ * Note: for the !logerror case we need to flush the regions held in memory out
+ * to disk first. This needs to be done before the log is marked as shutdown,
+ * otherwise the iclog writes will fail.
  */
 int
 xfs_log_force_umount(
@@ -3910,16 +3909,16 @@ xfs_log_force_umount(
 		ASSERT(XLOG_FORCED_SHUTDOWN(log));
 		return 1;
 	}
-	retval = 0;
 
 	/*
-	 * Flush the in memory commit item list before marking the log as
-	 * being shut down. We need to do it in this order to ensure all the
-	 * completed transactions are flushed to disk with the xfs_log_force()
-	 * call below.
+	 * Flush all the completed transactions to disk before marking the log
+	 * being shut down. We need to do it in this order to ensure that
+	 * completed operations are safely on disk before we shut down, and
+	 * that we don't have to issue any buffer IO after the shutdown flags
+	 * are set to guarantee this.
 	 */
 	if (!logerror)
-		xlog_cil_force(log);
+		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
 
 	/*
 	 * mark the filesystem and the as in a shutdown state and wake
@@ -3931,18 +3930,11 @@ xfs_log_force_umount(
 	XFS_BUF_DONE(mp->m_sb_bp);
 
 	/*
-	 * This flag is sort of redundant because of the mount flag, but
-	 * it's good to maintain the separation between the log and the rest
-	 * of XFS.
+	 * Mark the log and the iclogs with IO error flags to prevent any
+	 * further log IO from being issued or completed.
 	 */
 	log->l_flags |= XLOG_IO_ERROR;
-
-	/*
-	 * If we hit a log error, we want to mark all the iclogs IOERROR
-	 * while we're still holding the loglock.
-	 */
-	if (logerror)
-		retval = xlog_state_ioerror(log);
+	retval = xlog_state_ioerror(log);
 	spin_unlock(&log->l_icloglock);
 
 	/*
@@ -3955,19 +3947,6 @@ xfs_log_force_umount(
 	xlog_grant_head_wake_all(&log->l_reserve_head);
 	xlog_grant_head_wake_all(&log->l_write_head);
 
-	if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
-		ASSERT(!logerror);
-		/*
-		 * Force the incore logs to disk before shutting the
-		 * log down completely.
-		 */
-		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
-
-		spin_lock(&log->l_icloglock);
-		retval = xlog_state_ioerror(log);
-		spin_unlock(&log->l_icloglock);
-	}
-
 	/*
 	 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
 	 * as if the log writes were completed. The abort handling in the log
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 79cfe7e6ec7a..00cd7f3a8f59 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,12 +193,8 @@ xlog_bread_noalign(
 	bp->b_io_length = nbblks;
 	bp->b_error = 0;
 
-	if (XFS_FORCED_SHUTDOWN(log->l_mp))
-		return -EIO;
-
-	xfs_buf_iorequest(bp);
-	error = xfs_buf_iowait(bp);
-	if (error)
+	error = xfs_buf_submit_wait(bp);
+	if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
 		xfs_buf_ioerror_alert(bp, __func__);
 	return error;
 }
@@ -378,12 +374,14 @@ xlog_recover_iodone(
 		 * We're not going to bother about retrying
 		 * this during recovery. One strike!
 		 */
-		xfs_buf_ioerror_alert(bp, __func__);
-		xfs_force_shutdown(bp->b_target->bt_mount,
-					SHUTDOWN_META_IO_ERROR);
+		if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			xfs_force_shutdown(bp->b_target->bt_mount,
+						SHUTDOWN_META_IO_ERROR);
+		}
 	}
 	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
+	xfs_buf_ioend(bp);
 }
 
 /*
@@ -4452,16 +4450,12 @@ xlog_do_recover(
 	XFS_BUF_UNASYNC(bp);
 	bp->b_ops = &xfs_sb_buf_ops;
 
-	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
-		xfs_buf_relse(bp);
-		return -EIO;
-	}
-
-	xfs_buf_iorequest(bp);
-	error = xfs_buf_iowait(bp);
+	error = xfs_buf_submit_wait(bp);
 	if (error) {
-		xfs_buf_ioerror_alert(bp, __func__);
-		ASSERT(0);
+		if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			ASSERT(0);
+		}
 		xfs_buf_relse(bp);
 		return error;
 	}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d36bdbc9eeb2..51435dbce9c4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -300,21 +300,15 @@ xfs_readsb(
 	 * access to the superblock.
 	 */
 reread:
-	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-				   BTOBB(sector_size), 0, buf_ops);
-	if (!bp) {
-		if (loud)
-			xfs_warn(mp, "SB buffer read failed");
-		return -EIO;
-	}
-	if (bp->b_error) {
-		error = bp->b_error;
+	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+				   BTOBB(sector_size), 0, &bp, buf_ops);
+	if (error) {
 		if (loud)
 			xfs_warn(mp, "SB validate failed with error %d.", error);
 		/* bad CRC means corrupted metadata */
 		if (error == -EFSBADCRC)
 			error = -EFSCORRUPTED;
-		goto release_buf;
+		return error;
 	}
 
 	/*
@@ -544,40 +538,43 @@ xfs_set_inoalignment(xfs_mount_t *mp)
  * Check that the data (and log if separate) is an ok size.
  */
 STATIC int
-xfs_check_sizes(xfs_mount_t *mp)
+xfs_check_sizes(
+	struct xfs_mount *mp)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf	*bp;
 	xfs_daddr_t	d;
+	int		error;
 
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
 		xfs_warn(mp, "filesystem size mismatch detected");
 		return -EFBIG;
 	}
-	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
 					d - XFS_FSS_TO_BB(mp, 1),
-					XFS_FSS_TO_BB(mp, 1), 0, NULL);
-	if (!bp) {
+					XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
 		xfs_warn(mp, "last sector read failed");
-		return -EIO;
+		return error;
 	}
 	xfs_buf_relse(bp);
 
-	if (mp->m_logdev_targp != mp->m_ddev_targp) {
-		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
-		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
-			xfs_warn(mp, "log size mismatch detected");
-			return -EFBIG;
-		}
-		bp = xfs_buf_read_uncached(mp->m_logdev_targp,
+	if (mp->m_logdev_targp == mp->m_ddev_targp)
+		return 0;
+
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+		xfs_warn(mp, "log size mismatch detected");
+		return -EFBIG;
+	}
+	error = xfs_buf_read_uncached(mp->m_logdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
-					XFS_FSB_TO_BB(mp, 1), 0, NULL);
-		if (!bp) {
-			xfs_warn(mp, "log device read failed");
-			return -EIO;
-		}
-		xfs_buf_relse(bp);
+					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
+		xfs_warn(mp, "log device read failed");
+		return error;
 	}
+	xfs_buf_relse(bp);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d45aebe04dde..e1175ea9b551 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -921,16 +921,11 @@ xfs_growfs_rt(
 	/*
 	 * Read in the last block of the device, make sure it exists.
 	 */
-	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
 				XFS_FSB_TO_BB(mp, nrblocks - 1),
-				XFS_FSB_TO_BB(mp, 1), 0, NULL);
-	if (!bp)
-		return -EIO;
-	if (bp->b_error) {
-		error = bp->b_error;
-		xfs_buf_relse(bp);
+				XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
 		return error;
-	}
 	xfs_buf_relse(bp);
 
 	/*
@@ -1184,11 +1179,12 @@ xfs_rtallocate_extent(
  */
 int					/* error */
 xfs_rtmount_init(
-	xfs_mount_t	*mp)		/* file system mount structure */
+	struct xfs_mount *mp)		/* file system mount structure */
 {
-	xfs_buf_t	*bp;		/* buffer for last block of subvolume */
-	xfs_daddr_t	d;		/* address of last block of subvolume */
-	xfs_sb_t	*sbp;		/* filesystem superblock copy in mount */
+	struct xfs_buf	*bp;		/* buffer for last block of subvolume */
+	struct xfs_sb	*sbp;		/* filesystem superblock copy in mount */
+	xfs_daddr_t	d;		/* address of last block of subvolume */
+	int		error;
 
 	sbp = &mp->m_sb;
 	if (sbp->sb_rblocks == 0)
@@ -1214,14 +1210,12 @@ xfs_rtmount_init(
 			(unsigned long long) mp->m_sb.sb_rblocks);
 		return -EFBIG;
 	}
-	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
 				d - XFS_FSB_TO_BB(mp, 1),
-				XFS_FSB_TO_BB(mp, 1), 0, NULL);
-	if (!bp || bp->b_error) {
+				XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
 		xfs_warn(mp, "realtime device size check failed");
-		if (bp)
-			xfs_buf_relse(bp);
-		return -EIO;
+		return error;
 	}
 	xfs_buf_relse(bp);
 	return 0;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 152f82782630..51372e34d988 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free);
 DEFINE_BUF_EVENT(xfs_buf_hold);
 DEFINE_BUF_EVENT(xfs_buf_rele);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
-DEFINE_BUF_EVENT(xfs_buf_iorequest);
+DEFINE_BUF_EVENT(xfs_buf_submit);
+DEFINE_BUF_EVENT(xfs_buf_submit_wait);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 96c898e7ac9a..e2b2216b1635 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -318,20 +318,10 @@ xfs_trans_read_buf_map(
 	XFS_BUF_READ(bp);
 	bp->b_ops = ops;
 
-	/*
-	 * XXX(hch): clean up the error handling here to be less
-	 * of a mess..
-	 */
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		xfs_bioerror_relse(bp);
-	} else {
-		xfs_buf_iorequest(bp);
-	}
-
-	error = xfs_buf_iowait(bp);
+	error = xfs_buf_submit_wait(bp);
 	if (error) {
-		xfs_buf_ioerror_alert(bp, __func__);
+		if (!XFS_FORCED_SHUTDOWN(mp))
+			xfs_buf_ioerror_alert(bp, __func__);
 		xfs_buf_relse(bp);
 		/*
 		 * We can gracefully recover from most read