author     Linus Torvalds <torvalds@linux-foundation.org>  2014-10-13 06:06:54 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-10-13 06:06:54 -0400
commit     5ff0b9e1a1da58b584aa4b8ea234be20b5a1164b (patch)
tree       4849a305c073d4add184c1474a6c000a847285e7
parent     77c688ac87183537ed0fb84ec2cb8fa8ec97c458 (diff)
parent     6889e783cd68b79f8330ad4d10a2571c67c3f7df (diff)
Merge tag 'xfs-for-linus-3.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
Pull xfs update from Dave Chinner:
 "This update contains:

   - various cleanups

   - log recovery debug hooks

   - seek hole/data implementation merge

   - extent shift rework to fix collapse range bugs

   - various sparse warning fixes

   - log recovery transaction processing rework to fix use after free
     bugs

   - metadata buffer IO infrastructure rework to ensure all buffers
     under IO have valid reference counts

   - various fixes for ondisk flags, writeback and zero range corner
     cases"

* tag 'xfs-for-linus-3.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (56 commits)
  xfs: fix agno increment in xfs_inumbers() loop
  xfs: xfs_iflush_done checks the wrong log item callback
  xfs: flush the range before zero range conversion
  xfs: restore buffer_head unwritten bit on ioend cancel
  xfs: check for null dquot in xfs_quota_calc_throttle()
  xfs: fix crc field handling in xfs_sb_to/from_disk
  xfs: don't send null bp to xfs_trans_brelse()
  xfs: check for inode size overflow in xfs_new_eof()
  xfs: only set extent size hint when asked
  xfs: project id inheritance is a directory only flag
  xfs: kill time.h
  xfs: compat_xfs_bstat does not have forkoff
  xfs: simplify xfs_zero_remaining_bytes
  xfs: check xfs_buf_read_uncached returns correctly
  xfs: introduce xfs_buf_submit[_wait]
  xfs: kill xfs_bioerror_relse
  xfs: xfs_bioerror can die.
  xfs: kill xfs_bdstrat_cb
  xfs: rework xfs_buf_bio_endio error handling
  xfs: xfs_buf_ioend and xfs_buf_iodone_work duplicate functionality
  ...
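The buffer IO rework called out above replaces the old xfs_buf_iorequest()/xfs_buf_iowait() pairing with xfs_buf_submit() and xfs_buf_submit_wait(), both defined in the xfs_buf.c hunks at the end of this diff. A rough sketch of the two caller patterns that result (not a literal call site from this series; error handling trimmed):

    /* asynchronous: the IO owns the buffer lock and reference after
     * submission, so the submitter must not touch bp again without
     * holding an extra reference of its own */
    bp->b_flags |= XBF_ASYNC;
    xfs_buf_submit(bp);

    /* synchronous: submit and wait in one call; the caller keeps the
     * lock and the reference and releases the buffer itself */
    error = xfs_buf_submit_wait(bp);
    if (error)
            xfs_buf_ioerror_alert(bp, __func__);
    xfs_buf_relse(bp);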
-rw-r--r--  fs/xfs/kmem.c                   1
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c       4
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c      365
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h        7
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c    3
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.c   1
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c       67
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h        2
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c      7
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c   49
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c          7
-rw-r--r--  fs/xfs/time.h                  36
-rw-r--r--  fs/xfs/xfs_aops.c              23
-rw-r--r--  fs/xfs/xfs_bmap_util.c        126
-rw-r--r--  fs/xfs/xfs_buf.c              355
-rw-r--r--  fs/xfs/xfs_buf.h               15
-rw-r--r--  fs/xfs/xfs_buf_item.c          10
-rw-r--r--  fs/xfs/xfs_file.c             178
-rw-r--r--  fs/xfs/xfs_fsops.c             11
-rw-r--r--  fs/xfs/xfs_globals.c            4
-rw-r--r--  fs/xfs/xfs_icache.c             1
-rw-r--r--  fs/xfs/xfs_inode.c             34
-rw-r--r--  fs/xfs/xfs_inode.h              2
-rw-r--r--  fs/xfs/xfs_inode_item.c         2
-rw-r--r--  fs/xfs/xfs_ioctl.c             28
-rw-r--r--  fs/xfs/xfs_ioctl32.c            2
-rw-r--r--  fs/xfs/xfs_ioctl32.h            3
-rw-r--r--  fs/xfs/xfs_iomap.c              4
-rw-r--r--  fs/xfs/xfs_iops.c              30
-rw-r--r--  fs/xfs/xfs_itable.c             3
-rw-r--r--  fs/xfs/xfs_linux.h              6
-rw-r--r--  fs/xfs/xfs_log.c               59
-rw-r--r--  fs/xfs/xfs_log_cil.c           47
-rw-r--r--  fs/xfs/xfs_log_recover.c      689
-rw-r--r--  fs/xfs/xfs_mount.c             58
-rw-r--r--  fs/xfs/xfs_mru_cache.c          3
-rw-r--r--  fs/xfs/xfs_qm.c                 1
-rw-r--r--  fs/xfs/xfs_rtalloc.c           85
-rw-r--r--  fs/xfs/xfs_rtalloc.h            4
-rw-r--r--  fs/xfs/xfs_super.c             39
-rw-r--r--  fs/xfs/xfs_symlink.c            8
-rw-r--r--  fs/xfs/xfs_sysctl.h             5
-rw-r--r--  fs/xfs/xfs_sysfs.c             74
-rw-r--r--  fs/xfs/xfs_sysfs.h              1
-rw-r--r--  fs/xfs/xfs_trace.h              3
-rw-r--r--  fs/xfs/xfs_trans_buf.c         16
-rw-r--r--  fs/xfs/xfs_trans_inode.c        2
47 files changed, 1304 insertions, 1176 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 844e288b9576..53e95b2a1369 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -21,7 +21,6 @@
 #include <linux/swap.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
-#include "time.h"
 #include "kmem.h"
 #include "xfs_message.h"
 
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 4bffffe038a1..eff34218f405 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2209,6 +2209,10 @@ xfs_agf_verify(
 	    be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
 		return false;
 
+	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
+	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
+		return false;
+
 	/*
 	 * during growfs operations, the perag is not fully initialised,
 	 * so we can't use it for any useful checking. growfs ensures we can't
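The AGF check added here, like the matching AGI check in the xfs_ialloc.c hunk further down, bounds the on-disk btree height fields before anything walks the trees. A simplified, self-contained restatement of the pattern (MAX_BTREE_LEVELS is a hypothetical stand-in for the kernel's XFS_BTREE_MAXLEVELS):

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_BTREE_LEVELS	9	/* stand-in for XFS_BTREE_MAXLEVELS */

    struct agf_levels {
    	uint32_t	bno_level;	/* by-bno btree height */
    	uint32_t	cnt_level;	/* by-size btree height */
    };

    /* Reject a header whose recorded btree heights exceed the format
     * maximum; a corrupt height would otherwise drive out-of-bounds
     * accesses in the btree walking code. */
    static bool
    agf_levels_ok(const struct agf_levels *l)
    {
    	return l->bno_level <= MAX_BTREE_LEVELS &&
    	       l->cnt_level <= MAX_BTREE_LEVELS;
    }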
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 86df952d3e24..79c981984dca 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5404,22 +5404,223 @@ error0:
 }
 
 /*
+ * Determine whether an extent shift can be accomplished by a merge with the
+ * extent that precedes the target hole of the shift.
+ */
+STATIC bool
+xfs_bmse_can_merge(
+	struct xfs_bmbt_irec	*left,	/* preceding extent */
+	struct xfs_bmbt_irec	*got,	/* current extent to shift */
+	xfs_fileoff_t		shift)	/* shift fsb */
+{
+	xfs_fileoff_t		startoff;
+
+	startoff = got->br_startoff - shift;
+
+	/*
+	 * The extent, once shifted, must be adjacent in-file and on-disk with
+	 * the preceding extent.
+	 */
+	if ((left->br_startoff + left->br_blockcount != startoff) ||
+	    (left->br_startblock + left->br_blockcount != got->br_startblock) ||
+	    (left->br_state != got->br_state) ||
+	    (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+		return false;
+
+	return true;
+}
+
+/*
+ * A bmap extent shift adjusts the file offset of an extent to fill a preceding
+ * hole in the file. If an extent shift would result in the extent being fully
+ * adjacent to the extent that currently precedes the hole, we can merge with
+ * the preceding extent rather than do the shift.
+ *
+ * This function assumes the caller has verified a shift-by-merge is possible
+ * with the provided extents via xfs_bmse_can_merge().
+ */
+STATIC int
+xfs_bmse_merge(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			shift,		/* shift fsb */
+	int				current_ext,	/* idx of gotp */
+	struct xfs_bmbt_rec_host	*gotp,		/* extent to shift */
+	struct xfs_bmbt_rec_host	*leftp,		/* preceding extent */
+	struct xfs_btree_cur		*cur,
+	int				*logflags)	/* output */
+{
+	struct xfs_ifork		*ifp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	xfs_filblks_t			blockcount;
+	int				error, i;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmbt_get_all(gotp, &got);
+	xfs_bmbt_get_all(leftp, &left);
+	blockcount = left.br_blockcount + got.br_blockcount;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_bmse_can_merge(&left, &got, shift));
+
+	/*
+	 * Merge the in-core extents. Note that the host record pointers and
+	 * current_ext index are invalid once the extent has been removed via
+	 * xfs_iext_remove().
+	 */
+	xfs_bmbt_set_blockcount(leftp, blockcount);
+	xfs_iext_remove(ip, current_ext, 1, 0);
+
+	/*
+	 * Update the on-disk extent count, the btree if necessary and log the
+	 * inode.
+	 */
+	XFS_IFORK_NEXT_SET(ip, whichfork,
+			   XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+	*logflags |= XFS_ILOG_CORE;
+	if (!cur) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	/* lookup and remove the extent to merge */
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	error = xfs_btree_delete(cur, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	/* lookup and update size of the previous extent */
+	error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
+				   left.br_blockcount, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	left.br_blockcount = blockcount;
+
+	error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
+				left.br_blockcount, left.br_state);
+	if (error)
+		goto out_error;
+
+	return 0;
+
+out_error:
+	return error;
+}
+
+/*
+ * Shift a single extent.
+ */
+STATIC int
+xfs_bmse_shift_one(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			offset_shift_fsb,
+	int				*current_ext,
+	struct xfs_bmbt_rec_host	*gotp,
+	struct xfs_btree_cur		*cur,
+	int				*logflags)
+{
+	struct xfs_ifork		*ifp;
+	xfs_fileoff_t			startoff;
+	struct xfs_bmbt_rec_host	*leftp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	int				error;
+	int				i;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	xfs_bmbt_get_all(gotp, &got);
+	startoff = got.br_startoff - offset_shift_fsb;
+
+	/* delalloc extents should be prevented by caller */
+	XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock),
+				out_error);
+
+	/*
+	 * If this is the first extent in the file, make sure there's enough
+	 * room at the start of the file and jump right to the shift as there's
+	 * no left extent to merge.
+	 */
+	if (*current_ext == 0) {
+		if (got.br_startoff < offset_shift_fsb)
+			return -EINVAL;
+		goto shift_extent;
+	}
+
+	/* grab the left extent and check for a large enough hole */
+	leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
+	xfs_bmbt_get_all(leftp, &left);
+
+	if (startoff < left.br_startoff + left.br_blockcount)
+		return -EINVAL;
+
+	/* check whether to merge the extent or shift it down */
+	if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb))
+		goto shift_extent;
+
+	return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext,
+			      gotp, leftp, cur, logflags);
+
+shift_extent:
+	/*
+	 * Increment the extent index for the next iteration, update the start
+	 * offset of the in-core extent and update the btree if applicable.
+	 */
+	(*current_ext)++;
+	xfs_bmbt_set_startoff(gotp, startoff);
+	*logflags |= XFS_ILOG_CORE;
+	if (!cur) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	got.br_startoff = startoff;
+	error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+				got.br_blockcount, got.br_state);
+	if (error)
+		return error;
+
+	return 0;
+
+out_error:
+	return error;
+}
+
+/*
  * Shift extent records to the left to cover a hole.
  *
- * The maximum number of extents to be shifted in a single operation
- * is @num_exts, and @current_ext keeps track of the current extent
- * index we have shifted. @offset_shift_fsb is the length by which each
- * extent is shifted. If there is no hole to shift the extents
- * into, this will be considered invalid operation and we abort immediately.
+ * The maximum number of extents to be shifted in a single operation is
+ * @num_exts. @start_fsb specifies the file offset to start the shift and the
+ * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
+ * is the length by which each extent is shifted. If there is no hole to shift
+ * the extents into, this will be considered invalid operation and we abort
+ * immediately.
  */
 int
 xfs_bmap_shift_extents(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
-	int			*done,
 	xfs_fileoff_t		start_fsb,
 	xfs_fileoff_t		offset_shift_fsb,
-	xfs_extnum_t		*current_ext,
+	int			*done,
+	xfs_fileoff_t		*next_fsb,
 	xfs_fsblock_t		*firstblock,
 	struct xfs_bmap_free	*flist,
 	int			num_exts)
@@ -5427,16 +5628,13 @@ xfs_bmap_shift_extents(
 	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_rec_host	*gotp;
 	struct xfs_bmbt_irec		got;
-	struct xfs_bmbt_irec		left;
 	struct xfs_mount		*mp = ip->i_mount;
 	struct xfs_ifork		*ifp;
 	xfs_extnum_t			nexts = 0;
-	xfs_fileoff_t			startoff;
+	xfs_extnum_t			current_ext;
 	int				error = 0;
-	int				i;
 	int				whichfork = XFS_DATA_FORK;
 	int				logflags = 0;
-	xfs_filblks_t			blockcount = 0;
 	int				total_extents;
 
 	if (unlikely(XFS_TEST_ERROR(
@@ -5451,7 +5649,8 @@ xfs_bmap_shift_extents(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	ASSERT(current_ext != NULL);
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5461,23 +5660,6 @@ xfs_bmap_shift_extents(
 			return error;
 	}
 
-	/*
-	 * If *current_ext is 0, we would need to lookup the extent
-	 * from where we would start shifting and store it in gotp.
-	 */
-	if (!*current_ext) {
-		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
-		/*
-		 * gotp can be null in 2 cases: 1) if there are no extents
-		 * or 2) start_fsb lies in a hole beyond which there are
-		 * no extents. Either way, we are done.
-		 */
-		if (!gotp) {
-			*done = 1;
-			return 0;
-		}
-	}
-
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
@@ -5486,112 +5668,46 @@ xfs_bmap_shift_extents(
 	}
 
 	/*
+	 * Look up the extent index for the fsb where we start shifting. We can
+	 * henceforth iterate with current_ext as extent list changes are locked
+	 * out via ilock.
+	 *
+	 * gotp can be null in 2 cases: 1) if there are no extents or 2)
+	 * start_fsb lies in a hole beyond which there are no extents. Either
+	 * way, we are done.
+	 */
+	gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext);
+	if (!gotp) {
+		*done = 1;
+		goto del_cursor;
+	}
+
+	/*
 	 * There may be delalloc extents in the data fork before the range we
-	 * are collapsing out, so we cannot
-	 * use the count of real extents here. Instead we have to calculate it
-	 * from the incore fork.
+	 * are collapsing out, so we cannot use the count of real extents here.
+	 * Instead we have to calculate it from the incore fork.
 	 */
 	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-	while (nexts++ < num_exts && *current_ext < total_extents) {
-
-		gotp = xfs_iext_get_ext(ifp, *current_ext);
-		xfs_bmbt_get_all(gotp, &got);
-		startoff = got.br_startoff - offset_shift_fsb;
-
-		/*
-		 * Before shifting extent into hole, make sure that the hole
-		 * is large enough to accomodate the shift.
-		 */
-		if (*current_ext) {
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
-					*current_ext - 1), &left);
-
-			if (startoff < left.br_startoff + left.br_blockcount)
-				error = -EINVAL;
-		} else if (offset_shift_fsb > got.br_startoff) {
-			/*
-			 * When first extent is shifted, offset_shift_fsb
-			 * should be less than the stating offset of
-			 * the first extent.
-			 */
-			error = -EINVAL;
-		}
-
+	while (nexts++ < num_exts && current_ext < total_extents) {
+		error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
+					   &current_ext, gotp, cur, &logflags);
 		if (error)
 			goto del_cursor;
 
-		if (cur) {
-			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-						   got.br_startblock,
-						   got.br_blockcount,
-						   &i);
-			if (error)
-				goto del_cursor;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-		}
-
-		/* Check if we can merge 2 adjacent extents */
-		if (*current_ext &&
-		    left.br_startoff + left.br_blockcount == startoff &&
-		    left.br_startblock + left.br_blockcount ==
-				got.br_startblock &&
-		    left.br_state == got.br_state &&
-		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
-			blockcount = left.br_blockcount +
-				got.br_blockcount;
-			xfs_iext_remove(ip, *current_ext, 1, 0);
-			logflags |= XFS_ILOG_CORE;
-			if (cur) {
-				error = xfs_btree_delete(cur, &i);
-				if (error)
-					goto del_cursor;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-			} else {
-				logflags |= XFS_ILOG_DEXT;
-			}
-			XFS_IFORK_NEXT_SET(ip, whichfork,
-				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-			gotp = xfs_iext_get_ext(ifp, --*current_ext);
-			xfs_bmbt_get_all(gotp, &got);
-
-			/* Make cursor point to the extent we will update */
-			if (cur) {
-				error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-							   got.br_startblock,
-							   got.br_blockcount,
-							   &i);
-				if (error)
-					goto del_cursor;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-			}
-
-			xfs_bmbt_set_blockcount(gotp, blockcount);
-			got.br_blockcount = blockcount;
-		} else {
-			/* We have to update the startoff */
-			xfs_bmbt_set_startoff(gotp, startoff);
-			got.br_startoff = startoff;
-		}
-
-		logflags |= XFS_ILOG_CORE;
-		if (cur) {
-			error = xfs_bmbt_update(cur, got.br_startoff,
-						got.br_startblock,
-						got.br_blockcount,
-						got.br_state);
-			if (error)
-				goto del_cursor;
-		} else {
-			logflags |= XFS_ILOG_DEXT;
-		}
-
-		(*current_ext)++;
+		/* update total extent count and grab the next record */
 		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+		if (current_ext >= total_extents)
+			break;
+		gotp = xfs_iext_get_ext(ifp, current_ext);
 	}
 
 	/* Check if we are done */
-	if (*current_ext == total_extents)
+	if (current_ext == total_extents) {
 		*done = 1;
+	} else if (next_fsb) {
+		xfs_bmbt_get_all(gotp, &got);
+		*next_fsb = got.br_startoff;
+	}
 
 del_cursor:
 	if (cur)
@@ -5600,5 +5716,6 @@ del_cursor:
 
 	if (logflags)
 		xfs_trans_log_inode(tp, ip, logflags);
+
 	return error;
 }
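The core of the new xfs_bmse_can_merge() above is a pure adjacency test. A minimal standalone restatement, assuming the same three-field extent record and a hypothetical length cap in place of MAXEXTLEN:

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_EXT_LEN	(1ULL << 21)	/* hypothetical, role of MAXEXTLEN */

    struct extent {
    	uint64_t	startoff;	/* file offset, in fs blocks */
    	uint64_t	startblock;	/* disk block */
    	uint64_t	blockcount;	/* length, in fs blocks */
    	int		state;		/* written/unwritten */
    };

    /* After shifting 'got' left by 'shift' blocks it must land exactly at
     * the end of 'left', both in the file and on disk, match its state,
     * and the combined extent must not overflow the length cap. */
    static bool
    can_merge(const struct extent *left, const struct extent *got,
    	      uint64_t shift)
    {
    	uint64_t startoff = got->startoff - shift;

    	return left->startoff + left->blockcount == startoff &&
    	       left->startblock + left->blockcount == got->startblock &&
    	       left->state == got->state &&
    	       left->blockcount + got->blockcount <= MAX_EXT_LEN;
    }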
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b879ca56a64c..44db6db86402 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -178,9 +178,8 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
 int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
-		int *done, xfs_fileoff_t start_fsb,
-		xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
-		xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
-		int num_exts);
+		xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb,
+		int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, int num_exts);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 2c42ae28d027..fd827530afec 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2563,7 +2563,8 @@ xfs_da_get_buf(
 					mapp, nmap, 0);
 	error = bp ? bp->b_error : -EIO;
 	if (error) {
-		xfs_trans_brelse(trans, bp);
+		if (bp)
+			xfs_trans_brelse(trans, bp);
 		goto out_free;
 	}
 
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index c9aee52a37e2..7e42fdfd2f1d 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -270,7 +270,6 @@ xfs_dir3_data_get_ftype(
 {
 	__uint8_t	ftype = dep->name[dep->namelen];
 
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
 	if (ftype >= XFS_DIR3_FT_MAX)
 		return XFS_DIR3_FT_UNKNOWN;
 	return ftype;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 6cef22152fd6..7075aaf131f4 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -237,7 +237,8 @@ xfs_dir_init(
 }
 
 /*
- Enter a name in a directory.
+ * Enter a name in a directory, or check for available space.
+ * If inum is 0, only the available space test is performed.
  */
 int
 xfs_dir_createname(
@@ -254,10 +255,12 @@ xfs_dir_createname(
 	int			v;		/* type-checking value */
 
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	rval = xfs_dir_ino_validate(tp->t_mountp, inum);
-	if (rval)
-		return rval;
-	XFS_STATS_INC(xs_dir_create);
+	if (inum) {
+		rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+		if (rval)
+			return rval;
+		XFS_STATS_INC(xs_dir_create);
+	}
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
@@ -276,6 +279,8 @@ xfs_dir_createname(
 	args->whichfork = XFS_DATA_FORK;
 	args->trans = tp;
 	args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+	if (!inum)
+		args->op_flags |= XFS_DA_OP_JUSTCHECK;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
 		rval = xfs_dir2_sf_addname(args);
@@ -535,62 +540,14 @@ out_free:
 
 /*
  * See if this entry can be added to the directory without allocating space.
- * First checks that the caller couldn't reserve enough space (resblks = 0).
  */
 int
 xfs_dir_canenter(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	struct xfs_name	*name,		/* name of entry to add */
-	uint		resblks)
+	struct xfs_name	*name)		/* name of entry to add */
 {
-	struct xfs_da_args	*args;
-	int			rval;
-	int			v;		/* type-checking value */
-
-	if (resblks)
-		return 0;
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	if (!args)
-		return -ENOMEM;
-
-	args->geo = dp->i_mount->m_dir_geo;
-	args->name = name->name;
-	args->namelen = name->len;
-	args->filetype = name->type;
-	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-	args->dp = dp;
-	args->whichfork = XFS_DATA_FORK;
-	args->trans = tp;
-	args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
-							XFS_DA_OP_OKNOENT;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		rval = xfs_dir2_sf_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isblock(args, &v);
-	if (rval)
-		goto out_free;
-	if (v) {
-		rval = xfs_dir2_block_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isleaf(args, &v);
-	if (rval)
-		goto out_free;
-	if (v)
-		rval = xfs_dir2_leaf_addname(args);
-	else
-		rval = xfs_dir2_node_addname(args);
-out_free:
-	kmem_free(args);
-	return rval;
+	return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0);
 }
 
 /*
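With this change, passing inum == 0 to xfs_dir_createname() turns the call into a pure free-space probe (XFS_DA_OP_JUSTCHECK) and xfs_dir_canenter() becomes the one-line wrapper above. The resblks short-circuit that used to live here moves out to the callers; a hedged sketch of that caller-side shape (dir_check_space is a hypothetical helper, not a function from this series):

    static int
    dir_check_space(struct xfs_trans *tp, struct xfs_inode *dp,
    		struct xfs_name *name, uint resblks)
    {
    	/* with a block reservation in hand the entry add cannot fail
    	 * for lack of space, so only probe when resblks == 0 */
    	if (resblks)
    		return 0;
    	return xfs_dir_canenter(tp, dp, name);
    }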
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c8e86b0b5e99..4dff261e6ed5 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -136,7 +136,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-				struct xfs_name *name, uint resblks);
+				struct xfs_name *name);
 
 /*
  * Direct call from the bmap code, bypassing the generic directory layer.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b62771f1f4b5..23dcb72fc5e6 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1076,8 +1076,8 @@ xfs_dialloc_ag_finobt_newino(
 	int			i;
 
 	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
-		error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
-					 &i);
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
 		if (error)
 			return error;
 		if (i == 1) {
@@ -1085,7 +1085,6 @@ xfs_dialloc_ag_finobt_newino(
 			if (error)
 				return error;
 			XFS_WANT_CORRUPTED_RETURN(i == 1);
-
 			return 0;
 		}
 	}
@@ -2051,6 +2050,8 @@ xfs_agi_verify(
 	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
 		return false;
 
+	if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+		return false;
 	/*
 	 * during growfs operations, the perag is not fully initialised,
 	 * so we can't use it for any useful checking. growfs ensures we can't
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f4dd697cac08..7c818f1e4484 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -424,20 +424,24 @@ xfs_rtfind_forw(
 }
 
 /*
- * Read and modify the summary information for a given extent size,
+ * Read and/or modify the summary information for a given extent size,
  * bitmap block combination.
  * Keeps track of a current summary block, so we don't keep reading
  * it from the buffer cache.
+ *
+ * Summary information is returned in *sum if specified.
+ * If no delta is specified, returns summary only.
  */
 int
-xfs_rtmodify_summary(
-	xfs_mount_t	*mp,		/* file system mount point */
+xfs_rtmodify_summary_int(
+	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
 	int		log,		/* log2 of extent size */
 	xfs_rtblock_t	bbno,		/* bitmap block number */
 	int		delta,		/* change to make to summary info */
 	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
 {
 	xfs_buf_t	*bp;		/* buffer for the summary block */
 	int		error;		/* error value */
@@ -456,7 +460,7 @@ xfs_rtmodify_summary(
 	/*
 	 * If we have an old buffer, and the block number matches, use that.
 	 */
-	if (rbpp && *rbpp && *rsb == sb)
+	if (*rbpp && *rsb == sb)
 		bp = *rbpp;
 	/*
 	 * Otherwise we have to get the buffer.
@@ -465,7 +469,7 @@ xfs_rtmodify_summary(
 		/*
 		 * If there was an old one, get rid of it first.
 		 */
-		if (rbpp && *rbpp)
+		if (*rbpp)
 			xfs_trans_brelse(tp, *rbpp);
 		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
 		if (error) {
@@ -474,21 +478,38 @@ xfs_rtmodify_summary(
 		/*
 		 * Remember this buffer and block for the next call.
 		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
-		}
+		*rbpp = bp;
+		*rsb = sb;
 	}
 	/*
-	 * Point to the summary information, modify and log it.
+	 * Point to the summary information, modify/log it, and/or copy it out.
 	 */
 	sp = XFS_SUMPTR(mp, bp, so);
-	*sp += delta;
-	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
-		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+	if (delta) {
+		uint first = (uint)((char *)sp - (char *)bp->b_addr);
+
+		*sp += delta;
+		xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+	}
+	if (sum)
+		*sum = *sp;
 	return 0;
 }
 
+int
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno,
+					delta, rbpp, rsb, NULL);
+}
+
 /*
  * Set the given range of bitmap bits to the given value.
  * Do whatever I/O and logging is required.
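Because a zero delta now skips the modify/log step entirely, xfs_rtmodify_summary_int() doubles as a cached read of the summary counter. A hedged usage sketch (rt_summary_read is a hypothetical wrapper, shown only to illustrate the calling convention):

    static int
    rt_summary_read(
    	xfs_mount_t	*mp,		/* file system mount structure */
    	xfs_trans_t	*tp,		/* transaction pointer */
    	int		log,		/* log2 of extent size */
    	xfs_rtblock_t	bbno,		/* bitmap block number */
    	xfs_buf_t	**rbpp,		/* in/out: cached summary buffer */
    	xfs_fsblock_t	*rsb,		/* in/out: cached summary block */
    	xfs_suminfo_t	*sum)		/* out: summary count */
    {
    	/* delta == 0: nothing is modified or logged, *sum is filled in */
    	return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0,
    					rbpp, rsb, sum);
    }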
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ad525a5623a4..5f902fa7913f 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -279,11 +279,13 @@ xfs_mount_validate_sb(
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
 	    sbp->sb_blocksize != (1 << sbp->sb_blocklog)		||
+	    sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG			||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
 	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
 	    sbp->sb_inodesize != (1 << sbp->sb_inodelog)		||
+	    sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE			||
 	    sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
@@ -443,6 +445,8 @@ __xfs_sb_from_disk(
 	to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
 	to->sb_features_log_incompat =
 		be32_to_cpu(from->sb_features_log_incompat);
+	/* crc is only used on disk, not in memory; just init to 0 here. */
+	to->sb_crc = 0;
 	to->sb_pad = 0;
 	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
 	to->sb_lsn = be64_to_cpu(from->sb_lsn);
@@ -548,6 +552,9 @@ xfs_sb_to_disk(
 	if (!fields)
 		return;
 
+	/* We should never write the crc here, it's updated in the IO path */
+	fields &= ~XFS_SB_CRC;
+
 	xfs_sb_quota_to_disk(to, from, &fields);
 	while (fields) {
 		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
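Both superblock hunks enforce the same rule from opposite directions: the CRC is never carried through the field-by-field in-core/on-disk converters, because it is computed over the finished buffer in the write path. A compact restatement of the masking idiom (SB_FIELD_CRC is a hypothetical bit standing in for XFS_SB_CRC):

    #include <stdint.h>

    #define SB_FIELD_CRC	(1ULL << 40)	/* stand-in for XFS_SB_CRC */

    /* Drop the CRC from the set of fields the formatter copies out; the
     * IO path stamps the real checksum over the completed buffer. */
    static uint64_t
    sb_fields_to_write(uint64_t fields)
    {
    	return fields & ~SB_FIELD_CRC;
    }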
diff --git a/fs/xfs/time.h b/fs/xfs/time.h
deleted file mode 100644
index 387e695a184c..000000000000
--- a/fs/xfs/time.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_TIME_H__
-#define __XFS_SUPPORT_TIME_H__
-
-#include <linux/sched.h>
-#include <linux/time.h>
-
-typedef struct timespec timespec_t;
-
-static inline void delay(long ticks)
-{
-	schedule_timeout_uninterruptible(ticks);
-}
-
-static inline void nanotime(struct timespec *tvp)
-{
-	*tvp = CURRENT_TIME;
-}
-
-#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b984647c24db..f5b2453a43b2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -434,10 +434,22 @@ xfs_start_page_writeback(
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
-	if (clear_dirty)
+
+	/*
+	 * if the page was not fully cleaned, we need to ensure that the higher
+	 * layers come back to it correctly. That means we need to keep the page
+	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
+	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
+	 * write this page in this writeback sweep will be made.
+	 */
+	if (clear_dirty) {
 		clear_page_dirty_for_io(page);
-	set_page_writeback(page);
+		set_page_writeback(page);
+	} else
+		set_page_writeback_keepwrite(page);
+
 	unlock_page(page);
+
 	/* If no buffers on the page are to be written, finish it here */
 	if (!buffers)
 		end_page_writeback(page);
@@ -548,6 +560,13 @@ xfs_cancel_ioend(
 	do {
 		next_bh = bh->b_private;
 		clear_buffer_async_write(bh);
+		/*
+		 * The unwritten flag is cleared when added to the
+		 * ioend. We're not submitting for I/O so mark the
+		 * buffer unwritten again for next time around.
+		 */
+		if (ioend->io_type == XFS_IO_UNWRITTEN)
+			set_buffer_unwritten(bh);
 		unlock_buffer(bh);
 	} while ((bh = next_bh) != NULL);
 
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1707980f9a4b..92e8f99a5857 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes(
 	if (endoff > XFS_ISIZE(ip))
 		endoff = XFS_ISIZE(ip);
 
-	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
-					mp->m_rtdev_targp : mp->m_ddev_targp,
-				  BTOBB(mp->m_sb.sb_blocksize), 0);
-	if (!bp)
-		return -ENOMEM;
-
-	xfs_buf_unlock(bp);
-
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		uint lock_mode;
 
@@ -1152,42 +1144,24 @@ xfs_zero_remaining_bytes(
 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
 		if (imap.br_state == XFS_EXT_UNWRITTEN)
 			continue;
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNWRITE(bp);
-		XFS_BUF_READ(bp);
-		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = -EIO;
-			break;
-		}
-		xfs_buf_iorequest(bp);
-		error = xfs_buf_iowait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(read)");
-			break;
-		}
+		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp,
+				xfs_fsb_to_db(ip, imap.br_startblock),
+				BTOBB(mp->m_sb.sb_blocksize),
+				0, &bp, NULL);
+		if (error)
+			return error;
+
 		memset(bp->b_addr +
 			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
 			0, lastoffset - offset + 1);
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNREAD(bp);
-		XFS_BUF_WRITE(bp);
-
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = -EIO;
-			break;
-		}
-		xfs_buf_iorequest(bp);
-		error = xfs_buf_iowait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(write)");
-			break;
-		}
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
 	}
-	xfs_buf_free(bp);
 	return error;
 }
 
@@ -1205,6 +1179,7 @@ xfs_free_file_space(
 	xfs_bmap_free_t		free_list;
 	xfs_bmbt_irec_t		imap;
 	xfs_off_t		ioffset;
+	xfs_off_t		iendoffset;
 	xfs_extlen_t		mod=0;
 	xfs_mount_t		*mp;
 	int			nimap;
@@ -1233,12 +1208,13 @@ xfs_free_file_space(
 	inode_dio_wait(VFS_I(ip));
 
 	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-	ioffset = offset & ~(rounding - 1);
-	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-					      ioffset, -1);
+	ioffset = round_down(offset, rounding);
+	iendoffset = round_up(offset + len, rounding) - 1;
+	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
+					     iendoffset);
 	if (error)
 		goto out;
-	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+	truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
 
 	/*
 	 * Need to zero the stuff we're not freeing, on disk.
@@ -1392,14 +1368,14 @@ xfs_zero_file_space(
 
 	if (start_boundary < end_boundary - 1) {
 		/*
-		 * punch out delayed allocation blocks and the page cache over
-		 * the conversion range
+		 * Writeback the range to ensure any inode size updates due to
+		 * appending writes make it to disk (otherwise we could just
+		 * punch out the delalloc blocks).
 		 */
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_bmap_punch_delalloc_range(ip,
-				XFS_B_TO_FSBT(mp, start_boundary),
-				XFS_B_TO_FSB(mp, end_boundary - start_boundary));
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+				start_boundary, end_boundary - 1);
+		if (error)
+			goto out;
 		truncate_pagecache_range(VFS_I(ip), start_boundary,
 					 end_boundary - 1);
 
@@ -1456,41 +1432,47 @@ xfs_collapse_file_space(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	int			error;
-	xfs_extnum_t		current_ext = 0;
 	struct xfs_bmap_free	free_list;
 	xfs_fsblock_t		first_block;
 	int			committed;
 	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		next_fsb;
 	xfs_fileoff_t		shift_fsb;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 
 	trace_xfs_collapse_file_space(ip);
 
-	start_fsb = XFS_B_TO_FSB(mp, offset + len);
+	next_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);
 
-	/*
-	 * Writeback the entire file and force remove any post-eof blocks. The
-	 * writeback prevents changes to the extent list via concurrent
-	 * writeback and the eofblocks trim prevents the extent shift algorithm
-	 * from running into a post-eof delalloc extent.
-	 *
-	 * XXX: This is a temporary fix until the extent shift loop below is
-	 * converted to use offsets and lookups within the ILOCK rather than
-	 * carrying around the index into the extent list for the next
-	 * iteration.
-	 */
-	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	error = xfs_free_file_space(ip, offset, len);
 	if (error)
 		return error;
 
+	/*
+	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
+	 * into the accessible region of the file.
+	 */
 	if (xfs_can_free_eofblocks(ip, true)) {
 		error = xfs_free_eofblocks(mp, ip, false);
 		if (error)
 			return error;
 	}
 
-	error = xfs_free_file_space(ip, offset, len);
+	/*
+	 * Writeback and invalidate cache for the remainder of the file as we're
+	 * about to shift down every extent from the collapse range to EOF. The
+	 * free of the collapse range above might have already done some of
+	 * this, but we shouldn't rely on it to do anything outside of the range
+	 * that was freed.
+	 */
+	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+					     offset + len, -1);
+	if (error)
+		return error;
+	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					(offset + len) >> PAGE_CACHE_SHIFT, -1);
 	if (error)
 		return error;
 
@@ -1525,10 +1507,10 @@ xfs_collapse_file_space(
 		 * We are using the write transaction in which max 2 bmbt
 		 * updates are allowed
 		 */
-		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
-				shift_fsb, &current_ext,
-				&first_block, &free_list,
+		start_fsb = next_fsb;
+		error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
+				&done, &next_fsb, &first_block, &free_list,
 				XFS_BMAP_MAX_SHIFT_EXTENTS);
 		if (error)
 			goto out;
 
@@ -1638,7 +1620,7 @@ xfs_swap_extents_check_format(
 	return 0;
 }
 
-int
+static int
 xfs_swap_extent_flush(
 	struct xfs_inode *ip)
 {
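The hole-punch hunk above swaps an open-coded mask for round_down()/round_up() so that only the affected byte range is flushed and truncated rather than everything to EOF. A standalone check of the boundary arithmetic, with the rounding helpers restated for the power-of-two case (which holds here, since rounding is max(block size, PAGE_CACHE_SIZE)):

    #include <assert.h>
    #include <stdint.h>

    /* power-of-two rounding, as the kernel helpers behave */
    #define round_down(x, y)	((x) & ~((uint64_t)(y) - 1))
    #define round_up(x, y)	((((x) - 1) | ((uint64_t)(y) - 1)) + 1)

    int main(void)
    {
    	uint64_t rounding = 4096;	/* max(blocksize, page size) */
    	uint64_t offset = 3000, len = 5000;

    	uint64_t ioffset = round_down(offset, rounding);
    	uint64_t iendoffset = round_up(offset + len, rounding) - 1;

    	/* flush/invalidate [0, 8191] instead of [0, -1] (to EOF) */
    	assert(ioffset == 0);
    	assert(iendoffset == 8191);
    	return 0;
    }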
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cd7b8ca9b064..017b6afe340b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -623,10 +623,11 @@ _xfs_buf_read(
 	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 
-	xfs_buf_iorequest(bp);
-	if (flags & XBF_ASYNC)
+	if (flags & XBF_ASYNC) {
+		xfs_buf_submit(bp);
 		return 0;
-	return xfs_buf_iowait(bp);
+	}
+	return xfs_buf_submit_wait(bp);
 }
 
 xfs_buf_t *
@@ -687,34 +688,39 @@ xfs_buf_readahead_map(
  * Read an uncached buffer from disk. Allocates and returns a locked
  * buffer containing the disk contents or nothing.
  */
-struct xfs_buf *
+int
 xfs_buf_read_uncached(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		daddr,
 	size_t			numblks,
 	int			flags,
+	struct xfs_buf		**bpp,
 	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
+	*bpp = NULL;
+
 	bp = xfs_buf_get_uncached(target, numblks, flags);
 	if (!bp)
-		return NULL;
+		return -ENOMEM;
 
 	/* set up the buffer for a read IO */
 	ASSERT(bp->b_map_count == 1);
-	bp->b_bn = daddr;
+	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
 	bp->b_maps[0].bm_bn = daddr;
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
-	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+	xfs_buf_submit_wait(bp);
+	if (bp->b_error) {
+		int	error = bp->b_error;
 		xfs_buf_relse(bp);
-		return NULL;
+		return error;
 	}
-	xfs_buf_iorequest(bp);
-	xfs_buf_iowait(bp);
-	return bp;
+
+	*bpp = bp;
+	return 0;
 }
 
 /*
@@ -998,53 +1004,56 @@ xfs_buf_wait_unpin(
  * Buffer Utility Routines
  */
 
-STATIC void
-xfs_buf_iodone_work(
-	struct work_struct	*work)
+void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
 {
-	struct xfs_buf		*bp =
-		container_of(work, xfs_buf_t, b_iodone_work);
-	bool			read = !!(bp->b_flags & XBF_READ);
+	bool		read = bp->b_flags & XBF_READ;
+
+	trace_xfs_buf_iodone(bp, _RET_IP_);
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 
-	/* only validate buffers that were read without errors */
-	if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+	/*
+	 * Pull in IO completion errors now. We are guaranteed to be running
+	 * single threaded, so we don't need the lock to read b_io_error.
+	 */
+	if (!bp->b_error && bp->b_io_error)
+		xfs_buf_ioerror(bp, bp->b_io_error);
+
+	/* Only validate buffers that were read without errors */
+	if (read && !bp->b_error && bp->b_ops) {
+		ASSERT(!bp->b_iodone);
 		bp->b_ops->verify_read(bp);
+	}
+
+	if (!bp->b_error)
+		bp->b_flags |= XBF_DONE;
 
 	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
-	else {
-		ASSERT(read && bp->b_ops);
+	else
 		complete(&bp->b_iowait);
-	}
 }
 
-void
-xfs_buf_ioend(
-	struct xfs_buf	*bp,
-	int		schedule)
+static void
+xfs_buf_ioend_work(
+	struct work_struct	*work)
 {
-	bool		read = !!(bp->b_flags & XBF_READ);
-
-	trace_xfs_buf_iodone(bp, _RET_IP_);
+	struct xfs_buf		*bp =
+		container_of(work, xfs_buf_t, b_iodone_work);
 
-	if (bp->b_error == 0)
-		bp->b_flags |= XBF_DONE;
+	xfs_buf_ioend(bp);
+}
 
-	if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
-		if (schedule) {
-			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
-			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
-		} else {
-			xfs_buf_iodone_work(&bp->b_iodone_work);
-		}
-	} else {
-		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-		complete(&bp->b_iowait);
-	}
+void
+xfs_buf_ioend_async(
+	struct xfs_buf	*bp)
+{
+	INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+	queue_work(xfslogd_workqueue, &bp->b_iodone_work);
 }
 
 void
@@ -1067,96 +1076,6 @@ xfs_buf_ioerror_alert(
 		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
 }
 
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned, we aren't flushing it.
-	 */
-	xfs_buf_ioerror(bp, -EIO);
-
-	/*
-	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDONE(bp);
-	xfs_buf_stale(bp);
-
-	xfs_buf_ioend(bp, 0);
-
-	return -EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	struct xfs_buf	*bp)
-{
-	int64_t		fl = bp->b_flags;
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_DONE(bp);
-	xfs_buf_stale(bp);
-	bp->b_iodone = NULL;
-	if (!(fl & XBF_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		xfs_buf_ioerror(bp, -EIO);
-		complete(&bp->b_iowait);
-	} else {
-		xfs_buf_relse(bp);
-	}
-
-	return -EIO;
-}
-
-STATIC int
-xfs_bdstrat_cb(
-	struct xfs_buf	*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
-			return xfs_bioerror_relse(bp);
-		else
-			return xfs_bioerror(bp);
-	}
-
-	xfs_buf_iorequest(bp);
-	return 0;
-}
-
 int
 xfs_bwrite(
 	struct xfs_buf	*bp)
@@ -1166,11 +1085,10 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));
 
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+			 XBF_WRITE_FAIL | XBF_DONE);
 
-	xfs_bdstrat_cb(bp);
-
-	error = xfs_buf_iowait(bp);
+	error = xfs_buf_submit_wait(bp);
 	if (error) {
 		xfs_force_shutdown(bp->b_target->bt_mount,
 				   SHUTDOWN_META_IO_ERROR);
@@ -1179,15 +1097,6 @@ xfs_bwrite(
 }
 
 STATIC void
-_xfs_buf_ioend(
-	xfs_buf_t	*bp,
-	int		schedule)
-{
-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
-		xfs_buf_ioend(bp, schedule);
-}
-
-STATIC void
 xfs_buf_bio_end_io(
 	struct bio		*bio,
 	int			error)
@@ -1198,13 +1107,18 @@ xfs_buf_bio_end_io(
 	 * don't overwrite existing errors - otherwise we can lose errors on
 	 * buffers that require multiple bios to complete.
 	 */
-	if (!bp->b_error)
-		xfs_buf_ioerror(bp, error);
+	if (error) {
+		spin_lock(&bp->b_lock);
+		if (!bp->b_io_error)
+			bp->b_io_error = error;
+		spin_unlock(&bp->b_lock);
+	}
 
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
-	_xfs_buf_ioend(bp, 1);
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend_async(bp);
 	bio_put(bio);
 }
 
@@ -1283,7 +1197,7 @@ next_chunk:
 	} else {
 		/*
 		 * This is guaranteed not to be the last io reference count
-		 * because the caller (xfs_buf_iorequest) holds a count itself.
+		 * because the caller (xfs_buf_submit) holds a count itself.
 		 */
 		atomic_dec(&bp->b_io_remaining);
 		xfs_buf_ioerror(bp, -EIO);
@@ -1373,53 +1287,131 @@ _xfs_buf_ioapply(
1373 blk_finish_plug(&plug); 1287 blk_finish_plug(&plug);
1374} 1288}
1375 1289
1290/*
1291 * Asynchronous IO submission path. This transfers the buffer lock ownership and
1292 * the current reference to the IO. It is not safe to reference the buffer after
1293 * a call to this function unless the caller holds an additional reference
1294 * itself.
1295 */
1376void 1296void
1377xfs_buf_iorequest( 1297xfs_buf_submit(
1378 xfs_buf_t *bp) 1298 struct xfs_buf *bp)
1379{ 1299{
1380 trace_xfs_buf_iorequest(bp, _RET_IP_); 1300 trace_xfs_buf_submit(bp, _RET_IP_);
1381 1301
1382 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1302 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1303 ASSERT(bp->b_flags & XBF_ASYNC);
1304
1305 /* on shutdown we stale and complete the buffer immediately */
1306 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1307 xfs_buf_ioerror(bp, -EIO);
1308 bp->b_flags &= ~XBF_DONE;
1309 xfs_buf_stale(bp);
1310 xfs_buf_ioend(bp);
1311 return;
1312 }
1383 1313
1384 if (bp->b_flags & XBF_WRITE) 1314 if (bp->b_flags & XBF_WRITE)
1385 xfs_buf_wait_unpin(bp); 1315 xfs_buf_wait_unpin(bp);
1316
1317 /* clear the internal error state to avoid spurious errors */
1318 bp->b_io_error = 0;
1319
1320 /*
1321 * The caller's reference is released during I/O completion.
1322 * This occurs some time after the last b_io_remaining reference is
 1323 * released, so after we drop our IO reference we must hold some
1324 * other reference to ensure the buffer doesn't go away from underneath
1325 * us. Take a direct reference to ensure we have safe access to the
1326 * buffer until we are finished with it.
1327 */
1386 xfs_buf_hold(bp); 1328 xfs_buf_hold(bp);
1387 1329
1388 /* 1330 /*
 1389 * Set the count to 1 initially, this will stop an I/O 1331 * Set the count to 1 initially; this stops an I/O completion
1390 * completion callout which happens before we have started 1332 * callout which happens before we have started all the I/O from calling
1391 * all the I/O from calling xfs_buf_ioend too early. 1333 * xfs_buf_ioend too early.
1392 */ 1334 */
1393 atomic_set(&bp->b_io_remaining, 1); 1335 atomic_set(&bp->b_io_remaining, 1);
1394 _xfs_buf_ioapply(bp); 1336 _xfs_buf_ioapply(bp);
1337
1395 /* 1338 /*
1396 * If _xfs_buf_ioapply failed, we'll get back here with 1339 * If _xfs_buf_ioapply failed, we can get back here with only the IO
1397 * only the reference we took above. _xfs_buf_ioend will 1340 * reference we took above. If we drop it to zero, run completion so
1398 * drop it to zero, so we'd better not queue it for later, 1341 * that we don't return to the caller with completion still pending.
1399 * or we'll free it before it's done.
1400 */ 1342 */
1401 _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); 1343 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1344 if (bp->b_error)
1345 xfs_buf_ioend(bp);
1346 else
1347 xfs_buf_ioend_async(bp);
1348 }
1402 1349
1403 xfs_buf_rele(bp); 1350 xfs_buf_rele(bp);
1351 /* Note: it is not safe to reference bp now we've dropped our ref */
1404} 1352}
1405 1353
1406/* 1354/*
1407 * Waits for I/O to complete on the buffer supplied. It returns immediately if 1355 * Synchronous buffer IO submission path, read or write.
1408 * no I/O is pending or there is already a pending error on the buffer, in which
1409 * case nothing will ever complete. It returns the I/O error code, if any, or
1410 * 0 if there was no error.
1411 */ 1356 */
1412int 1357int
1413xfs_buf_iowait( 1358xfs_buf_submit_wait(
1414 xfs_buf_t *bp) 1359 struct xfs_buf *bp)
1415{ 1360{
1416 trace_xfs_buf_iowait(bp, _RET_IP_); 1361 int error;
1417 1362
1418 if (!bp->b_error) 1363 trace_xfs_buf_submit_wait(bp, _RET_IP_);
1419 wait_for_completion(&bp->b_iowait); 1364
1365 ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
1366
1367 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1368 xfs_buf_ioerror(bp, -EIO);
1369 xfs_buf_stale(bp);
1370 bp->b_flags &= ~XBF_DONE;
1371 return -EIO;
1372 }
1373
1374 if (bp->b_flags & XBF_WRITE)
1375 xfs_buf_wait_unpin(bp);
1376
1377 /* clear the internal error state to avoid spurious errors */
1378 bp->b_io_error = 0;
1379
1380 /*
 1381 * For synchronous IO, the IO does not inherit the submitter's reference
1382 * count, nor the buffer lock. Hence we cannot release the reference we
1383 * are about to take until we've waited for all IO completion to occur,
1384 * including any xfs_buf_ioend_async() work that may be pending.
1385 */
1386 xfs_buf_hold(bp);
1387
1388 /*
 1389 * Set the count to 1 initially; this stops an I/O completion
1390 * callout which happens before we have started all the I/O from calling
1391 * xfs_buf_ioend too early.
1392 */
1393 atomic_set(&bp->b_io_remaining, 1);
1394 _xfs_buf_ioapply(bp);
1395
1396 /*
1397 * make sure we run completion synchronously if it raced with us and is
1398 * already complete.
1399 */
1400 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1401 xfs_buf_ioend(bp);
1420 1402
1403 /* wait for completion before gathering the error from the buffer */
1404 trace_xfs_buf_iowait(bp, _RET_IP_);
1405 wait_for_completion(&bp->b_iowait);
1421 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1406 trace_xfs_buf_iowait_done(bp, _RET_IP_);
1422 return bp->b_error; 1407 error = bp->b_error;
1408
1409 /*
1410 * all done now, we can release the hold that keeps the buffer
1411 * referenced for the entire IO.
1412 */
1413 xfs_buf_rele(bp);
1414 return error;
1423} 1415}
1424 1416
1425xfs_caddr_t 1417xfs_caddr_t
@@ -1813,13 +1805,19 @@ __xfs_buf_delwri_submit(
1813 blk_start_plug(&plug); 1805 blk_start_plug(&plug);
1814 list_for_each_entry_safe(bp, n, io_list, b_list) { 1806 list_for_each_entry_safe(bp, n, io_list, b_list) {
1815 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); 1807 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
1816 bp->b_flags |= XBF_WRITE; 1808 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1817 1809
1818 if (!wait) { 1810 /*
 1819 bp->b_flags |= XBF_ASYNC; 1811 * we do all IO submission async. This means if we need to wait
1812 * for IO completion we need to take an extra reference so the
1813 * buffer is still valid on the other side.
1814 */
1815 if (wait)
1816 xfs_buf_hold(bp);
1817 else
1820 list_del_init(&bp->b_list); 1818 list_del_init(&bp->b_list);
1821 } 1819
1822 xfs_bdstrat_cb(bp); 1820 xfs_buf_submit(bp);
1823 } 1821 }
1824 blk_finish_plug(&plug); 1822 blk_finish_plug(&plug);
1825 1823
@@ -1866,7 +1864,10 @@ xfs_buf_delwri_submit(
1866 bp = list_first_entry(&io_list, struct xfs_buf, b_list); 1864 bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1867 1865
1868 list_del_init(&bp->b_list); 1866 list_del_init(&bp->b_list);
1869 error2 = xfs_buf_iowait(bp); 1867
1868 /* locking the buffer will wait for async IO completion. */
1869 xfs_buf_lock(bp);
1870 error2 = bp->b_error;
1870 xfs_buf_relse(bp); 1871 xfs_buf_relse(bp);
1871 if (!error) 1872 if (!error)
1872 error = error2; 1873 error = error2;
@@ -1884,7 +1885,7 @@ xfs_buf_init(void)
1884 goto out; 1885 goto out;
1885 1886
1886 xfslogd_workqueue = alloc_workqueue("xfslogd", 1887 xfslogd_workqueue = alloc_workqueue("xfslogd",
1887 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); 1888 WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
1888 if (!xfslogd_workqueue) 1889 if (!xfslogd_workqueue)
1889 goto out_free_buf_zone; 1890 goto out_free_buf_zone;
1890 1891
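
The net effect of the xfs_buf.c changes above is a clean ownership contract: xfs_buf_submit() hands both the buffer lock and the caller's reference over to the IO, while xfs_buf_submit_wait() keeps them with the caller and only takes a temporary hold across the wait. A minimal userspace sketch of the async half of that contract, using stand-in names (buf, hold/rele, submit_async are illustrative, not the kernel API):

	#include <stdatomic.h>

	struct buf {
		atomic_int refcount;		/* lifetime references (b_hold) */
		atomic_int io_remaining;	/* in-flight IOs + one submit ref */
		int error;
	};

	static void hold(struct buf *bp) { atomic_fetch_add(&bp->refcount, 1); }
	static void rele(struct buf *bp) { atomic_fetch_sub(&bp->refcount, 1); }
	static void start_io(struct buf *bp) { (void)bp; /* would queue bios */ }
	static void ioend(struct buf *bp)    { (void)bp; /* completion work */ }

	/* Async path: the lock and the caller's reference now belong to the IO. */
	static void submit_async(struct buf *bp)
	{
		hold(bp);			/* keep bp alive past the rele below */
		atomic_store(&bp->io_remaining, 1); /* block premature completion */
		start_io(bp);
		if (atomic_fetch_sub(&bp->io_remaining, 1) == 1)
			ioend(bp);		/* IO never started: complete here */
		rele(bp);
		/* bp may already be freed by completion: do not touch it again */
	}

The sync variant differs only in that the hold is released after wait_for_completion(), which is why the hunk above takes the extra reference before applying the IO.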
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c753183900b3..82002c00af90 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -158,6 +158,7 @@ typedef struct xfs_buf {
158 struct list_head b_lru; /* lru list */ 158 struct list_head b_lru; /* lru list */
159 spinlock_t b_lock; /* internal state lock */ 159 spinlock_t b_lock; /* internal state lock */
160 unsigned int b_state; /* internal state flags */ 160 unsigned int b_state; /* internal state flags */
161 int b_io_error; /* internal IO error state */
161 wait_queue_head_t b_waiters; /* unpin waiters */ 162 wait_queue_head_t b_waiters; /* unpin waiters */
162 struct list_head b_list; 163 struct list_head b_list;
163 struct xfs_perag *b_pag; /* contains rbtree root */ 164 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -268,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
268 269
269struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 270struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
270 int flags); 271 int flags);
271struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, 272int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
272 xfs_daddr_t daddr, size_t numblks, int flags, 273 size_t numblks, int flags, struct xfs_buf **bpp,
273 const struct xfs_buf_ops *ops); 274 const struct xfs_buf_ops *ops);
274void xfs_buf_hold(struct xfs_buf *bp); 275void xfs_buf_hold(struct xfs_buf *bp);
275 276
276/* Releasing Buffers */ 277/* Releasing Buffers */
@@ -286,18 +287,16 @@ extern void xfs_buf_unlock(xfs_buf_t *);
286 287
287/* Buffer Read and Write Routines */ 288/* Buffer Read and Write Routines */
288extern int xfs_bwrite(struct xfs_buf *bp); 289extern int xfs_bwrite(struct xfs_buf *bp);
289extern void xfs_buf_ioend(xfs_buf_t *, int); 290extern void xfs_buf_ioend(struct xfs_buf *bp);
290extern void xfs_buf_ioerror(xfs_buf_t *, int); 291extern void xfs_buf_ioerror(xfs_buf_t *, int);
291extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); 292extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
292extern void xfs_buf_iorequest(xfs_buf_t *); 293extern void xfs_buf_submit(struct xfs_buf *bp);
293extern int xfs_buf_iowait(xfs_buf_t *); 294extern int xfs_buf_submit_wait(struct xfs_buf *bp);
294extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 295extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
295 xfs_buf_rw_t); 296 xfs_buf_rw_t);
296#define xfs_buf_zero(bp, off, len) \ 297#define xfs_buf_zero(bp, off, len) \
297 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 298 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
298 299
299extern int xfs_bioerror_relse(struct xfs_buf *);
300
301/* Buffer Utility Routines */ 300/* Buffer Utility Routines */
302extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 301extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
303 302
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 76007deed31f..f15969543326 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -491,7 +491,7 @@ xfs_buf_item_unpin(
491 xfs_buf_ioerror(bp, -EIO); 491 xfs_buf_ioerror(bp, -EIO);
492 XFS_BUF_UNDONE(bp); 492 XFS_BUF_UNDONE(bp);
493 xfs_buf_stale(bp); 493 xfs_buf_stale(bp);
494 xfs_buf_ioend(bp, 0); 494 xfs_buf_ioend(bp);
495 } 495 }
496} 496}
497 497
@@ -501,7 +501,7 @@ xfs_buf_item_unpin(
501 * buffer being bad.. 501 * buffer being bad..
502 */ 502 */
503 503
504DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); 504static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
505 505
506STATIC uint 506STATIC uint
507xfs_buf_item_push( 507xfs_buf_item_push(
@@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks(
1081 * a way to shut the filesystem down if the writes keep failing. 1081 * a way to shut the filesystem down if the writes keep failing.
1082 * 1082 *
1083 * In practice we'll shut the filesystem down soon as non-transient 1083 * In practice we'll shut the filesystem down soon as non-transient
1084 * erorrs tend to affect the whole device and a failing log write 1084 * errors tend to affect the whole device and a failing log write
1085 * will make us give up. But we really ought to do better here. 1085 * will make us give up. But we really ought to do better here.
1086 */ 1086 */
1087 if (XFS_BUF_ISASYNC(bp)) { 1087 if (XFS_BUF_ISASYNC(bp)) {
@@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks(
1094 if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { 1094 if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
1095 bp->b_flags |= XBF_WRITE | XBF_ASYNC | 1095 bp->b_flags |= XBF_WRITE | XBF_ASYNC |
1096 XBF_DONE | XBF_WRITE_FAIL; 1096 XBF_DONE | XBF_WRITE_FAIL;
1097 xfs_buf_iorequest(bp); 1097 xfs_buf_submit(bp);
1098 } else { 1098 } else {
1099 xfs_buf_relse(bp); 1099 xfs_buf_relse(bp);
1100 } 1100 }
@@ -1115,7 +1115,7 @@ do_callbacks:
1115 xfs_buf_do_callbacks(bp); 1115 xfs_buf_do_callbacks(bp);
1116 bp->b_fspriv = NULL; 1116 bp->b_fspriv = NULL;
1117 bp->b_iodone = NULL; 1117 bp->b_iodone = NULL;
1118 xfs_buf_ioend(bp, 0); 1118 xfs_buf_ioend(bp);
1119} 1119}
1120 1120
1121/* 1121/*
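
The retry logic visible in the xfs_buf_item.c hunks is worth spelling out: a failed async metadata write is resubmitted exactly once, tagged XBF_WRITE_FAIL, and only a second failure gives up on the buffer. A condensed sketch of that one-shot retry, with stand-in flag values and types rather than the kernel definitions:

	enum {
		B_WRITE      = 1 << 0,
		B_ASYNC      = 1 << 1,
		B_DONE       = 1 << 2,
		B_WRITE_FAIL = 1 << 3,	/* set on the first failed attempt */
		B_STALE      = 1 << 4,
	};

	struct buf { unsigned int flags; int error; };

	static void submit(struct buf *bp)  { (void)bp; /* requeue the IO */ }
	static void release(struct buf *bp) { (void)bp; /* unlock, drop ref */ }

	static void iodone_write_retry(struct buf *bp)
	{
		if (!(bp->flags & (B_STALE | B_WRITE_FAIL))) {
			/* first failure: mark it and resubmit the write once */
			bp->flags |= B_WRITE | B_ASYNC | B_DONE | B_WRITE_FAIL;
			submit(bp);
		} else {
			/* second failure: leave it for the AIL to deal with */
			release(bp);
		}
	}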
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de5368c803f9..eb596b419942 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -983,7 +983,7 @@ xfs_vm_page_mkwrite(
983 983
984/* 984/*
985 * This type is designed to indicate the type of offset we would like 985 * This type is designed to indicate the type of offset we would like
986 * to search from page cache for either xfs_seek_data() or xfs_seek_hole(). 986 * to search from page cache for xfs_seek_hole_data().
987 */ 987 */
988enum { 988enum {
989 HOLE_OFF = 0, 989 HOLE_OFF = 0,
@@ -1040,7 +1040,7 @@ xfs_lookup_buffer_offset(
1040/* 1040/*
1041 * This routine is called to find out and return a data or hole offset 1041 * This routine is called to find out and return a data or hole offset
1042 * from the page cache for unwritten extents according to the desired 1042 * from the page cache for unwritten extents according to the desired
1043 * type for xfs_seek_data() or xfs_seek_hole(). 1043 * type for xfs_seek_hole_data().
1044 * 1044 *
1045 * The argument offset is used to tell where we start to search from the 1045 * The argument offset is used to tell where we start to search from the
1046 * page cache. Map is used to figure out the end points of the range to 1046 * page cache. Map is used to figure out the end points of the range to
@@ -1200,9 +1200,10 @@ out:
1200} 1200}
1201 1201
1202STATIC loff_t 1202STATIC loff_t
1203xfs_seek_data( 1203xfs_seek_hole_data(
1204 struct file *file, 1204 struct file *file,
1205 loff_t start) 1205 loff_t start,
1206 int whence)
1206{ 1207{
1207 struct inode *inode = file->f_mapping->host; 1208 struct inode *inode = file->f_mapping->host;
1208 struct xfs_inode *ip = XFS_I(inode); 1209 struct xfs_inode *ip = XFS_I(inode);
@@ -1214,6 +1215,9 @@ xfs_seek_data(
1214 uint lock; 1215 uint lock;
1215 int error; 1216 int error;
1216 1217
1218 if (XFS_FORCED_SHUTDOWN(mp))
1219 return -EIO;
1220
1217 lock = xfs_ilock_data_map_shared(ip); 1221 lock = xfs_ilock_data_map_shared(ip);
1218 1222
1219 isize = i_size_read(inode); 1223 isize = i_size_read(inode);
@@ -1228,6 +1232,7 @@ xfs_seek_data(
1228 */ 1232 */
1229 fsbno = XFS_B_TO_FSBT(mp, start); 1233 fsbno = XFS_B_TO_FSBT(mp, start);
1230 end = XFS_B_TO_FSB(mp, isize); 1234 end = XFS_B_TO_FSB(mp, isize);
1235
1231 for (;;) { 1236 for (;;) {
1232 struct xfs_bmbt_irec map[2]; 1237 struct xfs_bmbt_irec map[2];
1233 int nmap = 2; 1238 int nmap = 2;
@@ -1248,29 +1253,48 @@ xfs_seek_data(
1248 offset = max_t(loff_t, start, 1253 offset = max_t(loff_t, start,
1249 XFS_FSB_TO_B(mp, map[i].br_startoff)); 1254 XFS_FSB_TO_B(mp, map[i].br_startoff));
1250 1255
1251 /* Landed in a data extent */ 1256 /* Landed in the hole we wanted? */
1252 if (map[i].br_startblock == DELAYSTARTBLOCK || 1257 if (whence == SEEK_HOLE &&
1253 (map[i].br_state == XFS_EXT_NORM && 1258 map[i].br_startblock == HOLESTARTBLOCK)
1254 !isnullstartblock(map[i].br_startblock))) 1259 goto out;
1260
1261 /* Landed in the data extent we wanted? */
1262 if (whence == SEEK_DATA &&
1263 (map[i].br_startblock == DELAYSTARTBLOCK ||
1264 (map[i].br_state == XFS_EXT_NORM &&
1265 !isnullstartblock(map[i].br_startblock))))
1255 goto out; 1266 goto out;
1256 1267
1257 /* 1268 /*
1258 * Landed in an unwritten extent, try to search data 1269 * Landed in an unwritten extent, try to search
 1259 * from page cache. 1270 * for a hole or data in the page cache.
1260 */ 1271 */
1261 if (map[i].br_state == XFS_EXT_UNWRITTEN) { 1272 if (map[i].br_state == XFS_EXT_UNWRITTEN) {
1262 if (xfs_find_get_desired_pgoff(inode, &map[i], 1273 if (xfs_find_get_desired_pgoff(inode, &map[i],
1263 DATA_OFF, &offset)) 1274 whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
1275 &offset))
1264 goto out; 1276 goto out;
1265 } 1277 }
1266 } 1278 }
1267 1279
1268 /* 1280 /*
1269 * map[0] is hole or its an unwritten extent but 1281 * We only received one extent out of the two requested. This
1270 * without data in page cache. Probably means that 1282 * means we've hit EOF and didn't find what we are looking for.
1271 * we are reading after EOF if nothing in map[1].
1272 */ 1283 */
1273 if (nmap == 1) { 1284 if (nmap == 1) {
1285 /*
1286 * If we were looking for a hole, set offset to
1287 * the end of the file (i.e., there is an implicit
1288 * hole at the end of any file).
1289 */
1290 if (whence == SEEK_HOLE) {
1291 offset = isize;
1292 break;
1293 }
1294 /*
1295 * If we were looking for data, it's nowhere to be found
1296 */
1297 ASSERT(whence == SEEK_DATA);
1274 error = -ENXIO; 1298 error = -ENXIO;
1275 goto out_unlock; 1299 goto out_unlock;
1276 } 1300 }
@@ -1279,125 +1303,30 @@ xfs_seek_data(
1279 1303
1280 /* 1304 /*
1281 * Nothing was found, proceed to the next round of search 1305 * Nothing was found, proceed to the next round of search
1282 * if reading offset not beyond or hit EOF. 1306 * if the next reading offset is not at or beyond EOF.
1283 */ 1307 */
1284 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; 1308 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1285 start = XFS_FSB_TO_B(mp, fsbno); 1309 start = XFS_FSB_TO_B(mp, fsbno);
1286 if (start >= isize) { 1310 if (start >= isize) {
1311 if (whence == SEEK_HOLE) {
1312 offset = isize;
1313 break;
1314 }
1315 ASSERT(whence == SEEK_DATA);
1287 error = -ENXIO; 1316 error = -ENXIO;
1288 goto out_unlock; 1317 goto out_unlock;
1289 } 1318 }
1290 } 1319 }
1291 1320
1292out: 1321out:
1293 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1294
1295out_unlock:
1296 xfs_iunlock(ip, lock);
1297
1298 if (error)
1299 return error;
1300 return offset;
1301}
1302
1303STATIC loff_t
1304xfs_seek_hole(
1305 struct file *file,
1306 loff_t start)
1307{
1308 struct inode *inode = file->f_mapping->host;
1309 struct xfs_inode *ip = XFS_I(inode);
1310 struct xfs_mount *mp = ip->i_mount;
1311 loff_t uninitialized_var(offset);
1312 xfs_fsize_t isize;
1313 xfs_fileoff_t fsbno;
1314 xfs_filblks_t end;
1315 uint lock;
1316 int error;
1317
1318 if (XFS_FORCED_SHUTDOWN(mp))
1319 return -EIO;
1320
1321 lock = xfs_ilock_data_map_shared(ip);
1322
1323 isize = i_size_read(inode);
1324 if (start >= isize) {
1325 error = -ENXIO;
1326 goto out_unlock;
1327 }
1328
1329 fsbno = XFS_B_TO_FSBT(mp, start);
1330 end = XFS_B_TO_FSB(mp, isize);
1331
1332 for (;;) {
1333 struct xfs_bmbt_irec map[2];
1334 int nmap = 2;
1335 unsigned int i;
1336
1337 error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
1338 XFS_BMAPI_ENTIRE);
1339 if (error)
1340 goto out_unlock;
1341
1342 /* No extents at given offset, must be beyond EOF */
1343 if (nmap == 0) {
1344 error = -ENXIO;
1345 goto out_unlock;
1346 }
1347
1348 for (i = 0; i < nmap; i++) {
1349 offset = max_t(loff_t, start,
1350 XFS_FSB_TO_B(mp, map[i].br_startoff));
1351
1352 /* Landed in a hole */
1353 if (map[i].br_startblock == HOLESTARTBLOCK)
1354 goto out;
1355
1356 /*
1357 * Landed in an unwritten extent, try to search hole
1358 * from page cache.
1359 */
1360 if (map[i].br_state == XFS_EXT_UNWRITTEN) {
1361 if (xfs_find_get_desired_pgoff(inode, &map[i],
1362 HOLE_OFF, &offset))
1363 goto out;
1364 }
1365 }
1366
1367 /*
1368 * map[0] contains data or its unwritten but contains
1369 * data in page cache, probably means that we are
1370 * reading after EOF. We should fix offset to point
1371 * to the end of the file(i.e., there is an implicit
1372 * hole at the end of any file).
1373 */
1374 if (nmap == 1) {
1375 offset = isize;
1376 break;
1377 }
1378
1379 ASSERT(i > 1);
1380
1381 /*
1382 * Both mappings contains data, proceed to the next round of
1383 * search if the current reading offset not beyond or hit EOF.
1384 */
1385 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1386 start = XFS_FSB_TO_B(mp, fsbno);
1387 if (start >= isize) {
1388 offset = isize;
1389 break;
1390 }
1391 }
1392
1393out:
1394 /* 1322 /*
1395 * At this point, we must have found a hole. However, the returned 1323 * If at this point we have found the hole we wanted, the returned
1396 * offset may be bigger than the file size as it may be aligned to 1324 * offset may be bigger than the file size as it may be aligned to
 1397 * page boundary for unwritten extents, we need to deal with this 1325 * a page boundary for unwritten extents. We need to deal with this
1398 * situation in particular. 1326 * situation in particular.
1399 */ 1327 */
1400 offset = min_t(loff_t, offset, isize); 1328 if (whence == SEEK_HOLE)
1329 offset = min_t(loff_t, offset, isize);
1401 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1330 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1402 1331
1403out_unlock: 1332out_unlock:
@@ -1412,17 +1341,16 @@ STATIC loff_t
1412xfs_file_llseek( 1341xfs_file_llseek(
1413 struct file *file, 1342 struct file *file,
1414 loff_t offset, 1343 loff_t offset,
1415 int origin) 1344 int whence)
1416{ 1345{
1417 switch (origin) { 1346 switch (whence) {
1418 case SEEK_END: 1347 case SEEK_END:
1419 case SEEK_CUR: 1348 case SEEK_CUR:
1420 case SEEK_SET: 1349 case SEEK_SET:
1421 return generic_file_llseek(file, offset, origin); 1350 return generic_file_llseek(file, offset, whence);
1422 case SEEK_DATA:
1423 return xfs_seek_data(file, offset);
1424 case SEEK_HOLE: 1351 case SEEK_HOLE:
1425 return xfs_seek_hole(file, offset); 1352 case SEEK_DATA:
1353 return xfs_seek_hole_data(file, offset, whence);
1426 default: 1354 default:
1427 return -EINVAL; 1355 return -EINVAL;
1428 } 1356 }
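
Folding xfs_seek_data() and xfs_seek_hole() into one whence-driven routine also means both directions now share the implicit-hole-at-EOF rule. A small userspace program exercising exactly those semantics through lseek(2); it assumes a sparse test file named sparse.dat exists:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("sparse.dat", O_RDONLY);
		if (fd < 0)
			return 1;

		off_t data = lseek(fd, 0, SEEK_DATA);	/* first data extent */
		if (data < 0) {				/* ENXIO: no data at all */
			perror("SEEK_DATA");
			return 1;
		}
		/* every file ends in an implicit hole, so this always succeeds */
		off_t hole = lseek(fd, data, SEEK_HOLE);
		printf("data at %lld, next hole at %lld\n",
		       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}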
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f91de1ef05e1..c05ac8b70fa9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -172,16 +172,11 @@ xfs_growfs_data_private(
172 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 172 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
173 return error; 173 return error;
174 dpct = pct - mp->m_sb.sb_imax_pct; 174 dpct = pct - mp->m_sb.sb_imax_pct;
175 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 175 error = xfs_buf_read_uncached(mp->m_ddev_targp,
176 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 176 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
177 XFS_FSS_TO_BB(mp, 1), 0, NULL); 177 XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
178 if (!bp) 178 if (error)
179 return -EIO;
180 if (bp->b_error) {
181 error = bp->b_error;
182 xfs_buf_relse(bp);
183 return error; 179 return error;
184 }
185 xfs_buf_relse(bp); 180 xfs_buf_relse(bp);
186 181
187 new = nb; /* use new as a temporary here */ 182 new = nb; /* use new as a temporary here */
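
The xfs_buf_read_uncached() conversion shown here swaps a "return the buffer, caller checks b_error" convention for "return an int, set the out parameter only on success", which is what lets the caller above collapse to a single error test. A generic sketch of the pattern, with hypothetical names rather than the kernel helpers:

	#include <errno.h>
	#include <stdlib.h>

	struct buf { int error; /* ... IO state ... */ };

	static int read_uncached(struct buf **bpp)
	{
		struct buf *bp = calloc(1, sizeof(*bp));

		if (!bp)
			return -ENOMEM;
		/* ...issue the read and wait; failures land in bp->error... */
		if (bp->error) {
			int error = bp->error;

			free(bp);	/* caller never sees a half-valid buffer */
			return error;
		}
		*bpp = bp;		/* set the out parameter on success only */
		return 0;
	}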
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5399ef222dd7..4d41b241298f 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -43,3 +43,7 @@ xfs_param_t xfs_params = {
43 .fstrm_timer = { 1, 30*100, 3600*100}, 43 .fstrm_timer = { 1, 30*100, 3600*100},
44 .eofb_timer = { 1, 300, 3600*24}, 44 .eofb_timer = { 1, 300, 3600*24},
45}; 45};
46
47struct xfs_globals xfs_globals = {
48 .log_recovery_delay = 0, /* no delay by default */
49};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 981b2cf51985..b45f7b27b5df 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,7 +33,6 @@
33#include "xfs_trace.h" 33#include "xfs_trace.h"
34#include "xfs_icache.h" 34#include "xfs_icache.h"
35#include "xfs_bmap_util.h" 35#include "xfs_bmap_util.h"
36#include "xfs_quota.h"
37#include "xfs_dquot_item.h" 36#include "xfs_dquot_item.h"
38#include "xfs_dquot.h" 37#include "xfs_dquot.h"
39 38
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fea3c92fb3f0..8ed049d1e332 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -654,7 +654,7 @@ xfs_ialloc(
654 xfs_inode_t *ip; 654 xfs_inode_t *ip;
655 uint flags; 655 uint flags;
656 int error; 656 int error;
657 timespec_t tv; 657 struct timespec tv;
658 658
659 /* 659 /*
660 * Call the space management code to pick 660 * Call the space management code to pick
@@ -720,7 +720,7 @@ xfs_ialloc(
720 ip->i_d.di_nextents = 0; 720 ip->i_d.di_nextents = 0;
721 ASSERT(ip->i_d.di_nblocks == 0); 721 ASSERT(ip->i_d.di_nblocks == 0);
722 722
723 nanotime(&tv); 723 tv = current_fs_time(mp->m_super);
724 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 724 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
725 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; 725 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
726 ip->i_d.di_atime = ip->i_d.di_mtime; 726 ip->i_d.di_atime = ip->i_d.di_mtime;
@@ -769,6 +769,8 @@ xfs_ialloc(
769 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 769 di_flags |= XFS_DIFLAG_EXTSZINHERIT;
770 ip->i_d.di_extsize = pip->i_d.di_extsize; 770 ip->i_d.di_extsize = pip->i_d.di_extsize;
771 } 771 }
772 if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
773 di_flags |= XFS_DIFLAG_PROJINHERIT;
772 } else if (S_ISREG(mode)) { 774 } else if (S_ISREG(mode)) {
773 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 775 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
774 di_flags |= XFS_DIFLAG_REALTIME; 776 di_flags |= XFS_DIFLAG_REALTIME;
@@ -789,8 +791,6 @@ xfs_ialloc(
789 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && 791 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
790 xfs_inherit_nosymlinks) 792 xfs_inherit_nosymlinks)
791 di_flags |= XFS_DIFLAG_NOSYMLINKS; 793 di_flags |= XFS_DIFLAG_NOSYMLINKS;
792 if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
793 di_flags |= XFS_DIFLAG_PROJINHERIT;
794 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && 794 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
795 xfs_inherit_nodefrag) 795 xfs_inherit_nodefrag)
796 di_flags |= XFS_DIFLAG_NODEFRAG; 796 di_flags |= XFS_DIFLAG_NODEFRAG;
@@ -1153,9 +1153,11 @@ xfs_create(
1153 if (error) 1153 if (error)
1154 goto out_trans_cancel; 1154 goto out_trans_cancel;
1155 1155
1156 error = xfs_dir_canenter(tp, dp, name, resblks); 1156 if (!resblks) {
1157 if (error) 1157 error = xfs_dir_canenter(tp, dp, name);
1158 goto out_trans_cancel; 1158 if (error)
1159 goto out_trans_cancel;
1160 }
1159 1161
1160 /* 1162 /*
1161 * A newly created regular or special file just has one directory 1163 * A newly created regular or special file just has one directory
@@ -1421,9 +1423,11 @@ xfs_link(
1421 goto error_return; 1423 goto error_return;
1422 } 1424 }
1423 1425
1424 error = xfs_dir_canenter(tp, tdp, target_name, resblks); 1426 if (!resblks) {
1425 if (error) 1427 error = xfs_dir_canenter(tp, tdp, target_name);
1426 goto error_return; 1428 if (error)
1429 goto error_return;
1430 }
1427 1431
1428 xfs_bmap_init(&free_list, &first_block); 1432 xfs_bmap_init(&free_list, &first_block);
1429 1433
@@ -2759,9 +2763,11 @@ xfs_rename(
2759 * If there's no space reservation, check the entry will 2763 * If there's no space reservation, check the entry will
2760 * fit before actually inserting it. 2764 * fit before actually inserting it.
2761 */ 2765 */
2762 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); 2766 if (!spaceres) {
2763 if (error) 2767 error = xfs_dir_canenter(tp, target_dp, target_name);
2764 goto error_return; 2768 if (error)
2769 goto error_return;
2770 }
2765 /* 2771 /*
2766 * If target does not exist and the rename crosses 2772 * If target does not exist and the rename crosses
2767 * directories, adjust the target directory link count 2773 * directories, adjust the target directory link count
@@ -3056,7 +3062,7 @@ cluster_corrupt_out:
3056 XFS_BUF_UNDONE(bp); 3062 XFS_BUF_UNDONE(bp);
3057 xfs_buf_stale(bp); 3063 xfs_buf_stale(bp);
3058 xfs_buf_ioerror(bp, -EIO); 3064 xfs_buf_ioerror(bp, -EIO);
3059 xfs_buf_ioend(bp, 0); 3065 xfs_buf_ioend(bp);
3060 } else { 3066 } else {
3061 xfs_buf_stale(bp); 3067 xfs_buf_stale(bp);
3062 xfs_buf_relse(bp); 3068 xfs_buf_relse(bp);
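
Three call sites above (create, link, rename) gain the same guard: xfs_dir_canenter() is a free-space probe, and it is only meaningful when the transaction carries no block reservation, because a reservation already guarantees room for the new entry. The shape of that shortcut, with illustrative names:

	static int dir_canenter(void)	/* stand-in for the free-space probe */
	{
		return 0;
	}

	static int add_dir_entry(unsigned int resblks)
	{
		if (!resblks) {
			/* unreserved: fail early, not ENOSPC mid-transaction */
			int error = dir_canenter();

			if (error)
				return error;
		}
		/* ...reserved or probed OK: insert the entry... */
		return 0;
	}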
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c10e3fadd9af..9af2882e1f4c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -102,7 +102,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
102{ 102{
103 xfs_fsize_t i_size = i_size_read(VFS_I(ip)); 103 xfs_fsize_t i_size = i_size_read(VFS_I(ip));
104 104
105 if (new_size > i_size) 105 if (new_size > i_size || new_size < 0)
106 new_size = i_size; 106 new_size = i_size;
107 return new_size > ip->i_d.di_size ? new_size : 0; 107 return new_size > ip->i_d.di_size ? new_size : 0;
108} 108}
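
The extra "new_size < 0" test in xfs_new_eof() guards against a signed overflow: xfs_fsize_t is signed, so an offset-plus-count sum near the type maximum wraps negative and would otherwise slip past the size comparison. A compilable demonstration; the wrap is computed through unsigned arithmetic to keep the example itself well defined:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int64_t i_size = 4096;			/* in-core inode size */
		int64_t offset = INT64_MAX - 10;	/* write near the maximum */
		int64_t count = 100;

		/* offset + count wraps; go via uint64_t to avoid UB here */
		int64_t new_size = (int64_t)((uint64_t)offset + (uint64_t)count);

		printf("new_size = %lld (wrapped negative)\n", (long long)new_size);
		if (new_size > i_size || new_size < 0)	/* the added guard */
			new_size = i_size;		/* overflow clamped */
		printf("clamped  = %lld\n", (long long)new_size);
		return 0;
	}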
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index de5a7be36e60..63de0b0acc32 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -615,7 +615,7 @@ xfs_iflush_done(
615 blip = bp->b_fspriv; 615 blip = bp->b_fspriv;
616 prev = NULL; 616 prev = NULL;
617 while (blip != NULL) { 617 while (blip != NULL) {
618 if (lip->li_cb != xfs_iflush_done) { 618 if (blip->li_cb != xfs_iflush_done) {
619 prev = blip; 619 prev = blip;
620 blip = blip->li_bio_list; 620 blip = blip->li_bio_list;
621 continue; 621 continue;
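
The one-character xfs_iflush_done fix above is the classic cursor-versus-parameter list-walk bug: the filter must test the walk cursor (blip), not the item the caller passed in (lip), or every iteration evaluates the same condition. Reduced to a generic list walk over stand-in types:

	#include <stddef.h>

	struct item {
		void (*cb)(struct item *);
		struct item *next;
	};

	static void done_cb(struct item *ip) { (void)ip; }

	/* return the first list entry whose callback is done_cb */
	static struct item *find_done(struct item *head, struct item *lip)
	{
		struct item *blip;

		(void)lip;	/* testing lip->cb made every pass identical */
		for (blip = head; blip; blip = blip->next) {
			if (blip->cb != done_cb)	/* test the cursor */
				continue;
			return blip;
		}
		return NULL;
	}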
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3799695b9249..24c926b6fe85 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -968,8 +968,6 @@ xfs_set_diflags(
968 di_flags |= XFS_DIFLAG_NOATIME; 968 di_flags |= XFS_DIFLAG_NOATIME;
969 if (xflags & XFS_XFLAG_NODUMP) 969 if (xflags & XFS_XFLAG_NODUMP)
970 di_flags |= XFS_DIFLAG_NODUMP; 970 di_flags |= XFS_DIFLAG_NODUMP;
971 if (xflags & XFS_XFLAG_PROJINHERIT)
972 di_flags |= XFS_DIFLAG_PROJINHERIT;
973 if (xflags & XFS_XFLAG_NODEFRAG) 971 if (xflags & XFS_XFLAG_NODEFRAG)
974 di_flags |= XFS_DIFLAG_NODEFRAG; 972 di_flags |= XFS_DIFLAG_NODEFRAG;
975 if (xflags & XFS_XFLAG_FILESTREAM) 973 if (xflags & XFS_XFLAG_FILESTREAM)
@@ -981,6 +979,8 @@ xfs_set_diflags(
981 di_flags |= XFS_DIFLAG_NOSYMLINKS; 979 di_flags |= XFS_DIFLAG_NOSYMLINKS;
982 if (xflags & XFS_XFLAG_EXTSZINHERIT) 980 if (xflags & XFS_XFLAG_EXTSZINHERIT)
983 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 981 di_flags |= XFS_DIFLAG_EXTSZINHERIT;
982 if (xflags & XFS_XFLAG_PROJINHERIT)
983 di_flags |= XFS_DIFLAG_PROJINHERIT;
984 } else if (S_ISREG(ip->i_d.di_mode)) { 984 } else if (S_ISREG(ip->i_d.di_mode)) {
985 if (xflags & XFS_XFLAG_REALTIME) 985 if (xflags & XFS_XFLAG_REALTIME)
986 di_flags |= XFS_DIFLAG_REALTIME; 986 di_flags |= XFS_DIFLAG_REALTIME;
@@ -1231,13 +1231,25 @@ xfs_ioctl_setattr(
1231 1231
1232 } 1232 }
1233 1233
1234 if (mask & FSX_EXTSIZE)
1235 ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1236 if (mask & FSX_XFLAGS) { 1234 if (mask & FSX_XFLAGS) {
1237 xfs_set_diflags(ip, fa->fsx_xflags); 1235 xfs_set_diflags(ip, fa->fsx_xflags);
1238 xfs_diflags_to_linux(ip); 1236 xfs_diflags_to_linux(ip);
1239 } 1237 }
1240 1238
1239 /*
1240 * Only set the extent size hint if we've already determined that the
1241 * extent size hint should be set on the inode. If no extent size flags
1242 * are set on the inode then unconditionally clear the extent size hint.
1243 */
1244 if (mask & FSX_EXTSIZE) {
1245 int extsize = 0;
1246
1247 if (ip->i_d.di_flags &
1248 (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
1249 extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1250 ip->i_d.di_extsize = extsize;
1251 }
1252
1241 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1253 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1242 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1254 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1243 1255
@@ -1349,7 +1361,7 @@ xfs_ioc_setxflags(
1349STATIC int 1361STATIC int
1350xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) 1362xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1351{ 1363{
1352 struct getbmap __user *base = *ap; 1364 struct getbmap __user *base = (struct getbmap __user *)*ap;
1353 1365
1354 /* copy only getbmap portion (not getbmapx) */ 1366 /* copy only getbmap portion (not getbmapx) */
1355 if (copy_to_user(base, bmv, sizeof(struct getbmap))) 1367 if (copy_to_user(base, bmv, sizeof(struct getbmap)))
@@ -1380,7 +1392,7 @@ xfs_ioc_getbmap(
1380 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; 1392 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1381 1393
1382 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, 1394 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
1383 (struct getbmap *)arg+1); 1395 (__force struct getbmap *)arg+1);
1384 if (error) 1396 if (error)
1385 return error; 1397 return error;
1386 1398
@@ -1393,7 +1405,7 @@ xfs_ioc_getbmap(
1393STATIC int 1405STATIC int
1394xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) 1406xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
1395{ 1407{
1396 struct getbmapx __user *base = *ap; 1408 struct getbmapx __user *base = (struct getbmapx __user *)*ap;
1397 1409
1398 if (copy_to_user(base, bmv, sizeof(struct getbmapx))) 1410 if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
1399 return -EFAULT; 1411 return -EFAULT;
@@ -1420,7 +1432,7 @@ xfs_ioc_getbmapx(
1420 return -EINVAL; 1432 return -EINVAL;
1421 1433
1422 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, 1434 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
1423 (struct getbmapx *)arg+1); 1435 (__force struct getbmapx *)arg+1);
1424 if (error) 1436 if (error)
1425 return error; 1437 return error;
1426 1438
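
The reordered xfs_ioctl_setattr() logic encodes a simple rule: the extent size hint is persisted only when an extent size flag justifies it, and is cleared unconditionally otherwise, instead of being written before the flags were even updated. As a pure function, with illustrative flag values:

	#define DIFLAG_EXTSIZE		(1u << 0)	/* illustrative values */
	#define DIFLAG_EXTSZINHERIT	(1u << 1)

	static unsigned int extsize_to_store(unsigned int di_flags,
					     unsigned int requested_fsbs)
	{
		if (di_flags & (DIFLAG_EXTSIZE | DIFLAG_EXTSZINHERIT))
			return requested_fsbs;	/* a flag justifies the hint */
		return 0;			/* no flag: clear stale hints */
	}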
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index a554646ff141..94ce027e28e3 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -160,6 +160,7 @@ xfs_ioctl32_bstat_copyin(
160 get_user(bstat->bs_gen, &bstat32->bs_gen) || 160 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
161 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || 161 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
162 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || 162 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
163 get_user(bstat->bs_forkoff, &bstat32->bs_forkoff) ||
163 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 164 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
164 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 165 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
165 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 166 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -214,6 +215,7 @@ xfs_bulkstat_one_fmt_compat(
214 put_user(buffer->bs_gen, &p32->bs_gen) || 215 put_user(buffer->bs_gen, &p32->bs_gen) ||
215 put_user(buffer->bs_projid, &p32->bs_projid) || 216 put_user(buffer->bs_projid, &p32->bs_projid) ||
216 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || 217 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
218 put_user(buffer->bs_forkoff, &p32->bs_forkoff) ||
217 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 219 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
218 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 220 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
219 put_user(buffer->bs_aextents, &p32->bs_aextents)) 221 put_user(buffer->bs_aextents, &p32->bs_aextents))
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 80f4060e8970..b1bb45444df8 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -67,8 +67,9 @@ typedef struct compat_xfs_bstat {
67 __u32 bs_gen; /* generation count */ 67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid_lo; /* lower part of project id */ 68 __u16 bs_projid_lo; /* lower part of project id */
69#define bs_projid bs_projid_lo /* (previously just bs_projid) */ 69#define bs_projid bs_projid_lo /* (previously just bs_projid) */
70 __u16 bs_forkoff; /* inode fork offset in bytes */
70 __u16 bs_projid_hi; /* high part of project id */ 71 __u16 bs_projid_hi; /* high part of project id */
71 unsigned char bs_pad[12]; /* pad space, unused */ 72 unsigned char bs_pad[10]; /* pad space, unused */
72 __u32 bs_dmevmask; /* DMIG event mask */ 73 __u32 bs_dmevmask; /* DMIG event mask */
73 __u16 bs_dmstate; /* DMIG state info */ 74 __u16 bs_dmstate; /* DMIG state info */
74 __u16 bs_aextents; /* attribute number of extents */ 75 __u16 bs_aextents; /* attribute number of extents */
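
The compat_xfs_bstat edit is an ABI-preserving field insertion: the two bytes for bs_forkoff come straight out of bs_pad, so the structure size is unchanged and existing 32-bit userspace keeps working. A reduced model of just the affected tail, with a C11 compile-time check:

	#include <assert.h>
	#include <stdint.h>

	struct tail_old {	/* before: forkoff missing, 12 pad bytes */
		uint16_t projid_lo;
		uint16_t projid_hi;
		unsigned char pad[12];
	};

	struct tail_new {	/* after: forkoff carved out of the pad */
		uint16_t projid_lo;
		uint16_t forkoff;
		uint16_t projid_hi;
		unsigned char pad[10];
	};

	static_assert(sizeof(struct tail_old) == sizeof(struct tail_new),
		      "bs_forkoff must not change the compat structure size");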
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e9c47b6f5e5a..afcf3c926565 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -404,8 +404,8 @@ xfs_quota_calc_throttle(
404 int shift = 0; 404 int shift = 0;
405 struct xfs_dquot *dq = xfs_inode_dquot(ip, type); 405 struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
406 406
407 /* over hi wmark, squash the prealloc completely */ 407 /* no dq, or over hi wmark, squash the prealloc completely */
408 if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { 408 if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
409 *qblocks = 0; 409 *qblocks = 0;
410 *qfreesp = 0; 410 *qfreesp = 0;
411 return; 411 return;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 72129493e9d3..ec6dcdc181ee 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -849,6 +849,36 @@ xfs_setattr_size(
849 return error; 849 return error;
850 truncate_setsize(inode, newsize); 850 truncate_setsize(inode, newsize);
851 851
852 /*
853 * The "we can't serialise against page faults" pain gets worse.
854 *
855 * If the file is mapped then we have to clean the page at the old EOF
 856 * when extending the file. Extending the file can expose changes to the
857 * underlying page mapping (e.g. from beyond EOF to a hole or
858 * unwritten), and so on the next attempt to write to that page we need
859 * to remap it for write. i.e. we need .page_mkwrite() to be called.
860 * Hence we need to clean the page to clean the pte and so a new write
861 * fault will be triggered appropriately.
862 *
863 * If we do it before we change the inode size, then we can race with a
864 * page fault that maps the page with exactly the same problem. If we do
865 * it after we change the file size, then a new page fault can come in
866 * and allocate space before we've run the rest of the truncate
 867 * transaction. That's kinda grotesque, but it's better than having data
868 * over a hole, and so that's the lesser evil that has been chosen here.
869 *
870 * The real solution, however, is to have some mechanism for locking out
871 * page faults while a truncate is in progress.
872 */
873 if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
874 error = filemap_write_and_wait_range(
875 VFS_I(ip)->i_mapping,
876 round_down(oldsize, PAGE_CACHE_SIZE),
877 round_up(oldsize, PAGE_CACHE_SIZE) - 1);
878 if (error)
879 return error;
880 }
881
852 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 882 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
853 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 883 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
854 if (error) 884 if (error)
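
The long comment above boils down to one action: before growing i_size on a mapped file, write back the page straddling the old EOF so its pte goes clean and the next store takes a fresh write fault. The rounding is the delicate part; a sketch of just that computation, where write_and_wait_range stands in for filemap_write_and_wait_range():

	#define PAGE_SZ 4096ULL	/* stand-in for PAGE_CACHE_SIZE */

	static int write_and_wait_range(unsigned long long start,
					unsigned long long end)
	{
		(void)start; (void)end;	/* would push dirty pagecache to disk */
		return 0;
	}

	static int clean_old_eof_page(unsigned long long oldsize,
				      unsigned long long newsize, int mapped)
	{
		if (newsize > oldsize && mapped) {
			unsigned long long start = oldsize & ~(PAGE_SZ - 1);
			unsigned long long end =
				((oldsize + PAGE_SZ - 1) & ~(PAGE_SZ - 1)) - 1;

			/* round_down(old)..round_up(old)-1: the EOF page */
			return write_and_wait_range(start, end);
		}
		return 0;
	}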
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f71be9c68017..f1deb961a296 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -639,7 +639,8 @@ next_ag:
639 xfs_buf_relse(agbp); 639 xfs_buf_relse(agbp);
640 agbp = NULL; 640 agbp = NULL;
641 agino = 0; 641 agino = 0;
642 } while (++agno < mp->m_sb.sb_agcount); 642 agno++;
643 } while (agno < mp->m_sb.sb_agcount);
643 644
644 if (!error) { 645 if (!error) {
645 if (bufidx) { 646 if (bufidx) {
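
The xfs_inumbers() hunk moves the allocation-group increment out of the loop condition onto its own line, next to the per-AG state resets it belongs with, which is what made the increment bug visible at all. The resulting control flow, in miniature:

	#include <stdio.h>

	int main(void)
	{
		unsigned int agno = 0, agcount = 4;

		do {
			printf("scan AG %u\n", agno);	/* per-AG work + reset */
			agno++;			/* advance explicitly... */
		} while (agno < agcount);	/* ...keep the test side-effect free */
		return 0;
	}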
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index d10dc8f397c9..6a51619d8690 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -56,7 +56,6 @@ typedef __uint64_t __psunsigned_t;
56 56
57#include "kmem.h" 57#include "kmem.h"
58#include "mrlock.h" 58#include "mrlock.h"
59#include "time.h"
60#include "uuid.h" 59#include "uuid.h"
61 60
62#include <linux/semaphore.h> 61#include <linux/semaphore.h>
@@ -179,6 +178,11 @@ typedef __uint64_t __psunsigned_t;
179#define MAX(a,b) (max(a,b)) 178#define MAX(a,b) (max(a,b))
180#define howmany(x, y) (((x)+((y)-1))/(y)) 179#define howmany(x, y) (((x)+((y)-1))/(y))
181 180
181static inline void delay(long ticks)
182{
183 schedule_timeout_uninterruptible(ticks);
184}
185
182/* 186/*
183 * XFS wrapper structure for sysfs support. It depends on external data 187 * XFS wrapper structure for sysfs support. It depends on external data
184 * structures and is embedded in various internal data structures to implement 188 * structures and is embedded in various internal data structures to implement
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ca4fd5bd8522..fe88ef67f93a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1678,7 +1678,7 @@ xlog_bdstrat(
1678 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1678 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1679 xfs_buf_ioerror(bp, -EIO); 1679 xfs_buf_ioerror(bp, -EIO);
1680 xfs_buf_stale(bp); 1680 xfs_buf_stale(bp);
1681 xfs_buf_ioend(bp, 0); 1681 xfs_buf_ioend(bp);
1682 /* 1682 /*
1683 * It would seem logical to return EIO here, but we rely on 1683 * It would seem logical to return EIO here, but we rely on
1684 * the log state machine to propagate I/O errors instead of 1684 * the log state machine to propagate I/O errors instead of
@@ -1688,7 +1688,7 @@ xlog_bdstrat(
1688 return 0; 1688 return 0;
1689 } 1689 }
1690 1690
1691 xfs_buf_iorequest(bp); 1691 xfs_buf_submit(bp);
1692 return 0; 1692 return 0;
1693} 1693}
1694 1694
@@ -3867,18 +3867,17 @@ xlog_state_ioerror(
3867 * This is called from xfs_force_shutdown, when we're forcibly 3867 * This is called from xfs_force_shutdown, when we're forcibly
3868 * shutting down the filesystem, typically because of an IO error. 3868 * shutting down the filesystem, typically because of an IO error.
3869 * Our main objectives here are to make sure that: 3869 * Our main objectives here are to make sure that:
3870 * a. the filesystem gets marked 'SHUTDOWN' for all interested 3870 * a. if !logerror, flush the logs to disk. Anything modified
3871 * after this is ignored.
3872 * b. the filesystem gets marked 'SHUTDOWN' for all interested
3871 * parties to find out, 'atomically'. 3873 * parties to find out, 'atomically'.
3872 * b. those who're sleeping on log reservations, pinned objects and 3874 * c. those who're sleeping on log reservations, pinned objects and
 3873 * other resources get woken up and told the bad news. 3875 * other resources get woken up and told the bad news.
3874 * c. nothing new gets queued up after (a) and (b) are done. 3876 * d. nothing new gets queued up after (b) and (c) are done.
3875 * d. if !logerror, flush the iclogs to disk, then seal them off
3876 * for business.
3877 * 3877 *
3878 * Note: for delayed logging the !logerror case needs to flush the regions 3878 * Note: for the !logerror case we need to flush the regions held in memory out
3879 * held in memory out to the iclogs before flushing them to disk. This needs 3879 * to disk first. This needs to be done before the log is marked as shutdown,
3880 * to be done before the log is marked as shutdown, otherwise the flush to the 3880 * otherwise the iclog writes will fail.
3881 * iclogs will fail.
3882 */ 3881 */
3883int 3882int
3884xfs_log_force_umount( 3883xfs_log_force_umount(
@@ -3910,16 +3909,16 @@ xfs_log_force_umount(
3910 ASSERT(XLOG_FORCED_SHUTDOWN(log)); 3909 ASSERT(XLOG_FORCED_SHUTDOWN(log));
3911 return 1; 3910 return 1;
3912 } 3911 }
3913 retval = 0;
3914 3912
3915 /* 3913 /*
 3916 * Flush the in memory commit item list before marking the log as 3914 * Flush all the completed transactions to disk before marking the log as
3917 * being shut down. We need to do it in this order to ensure all the 3915 * being shut down. We need to do it in this order to ensure that
3918 * completed transactions are flushed to disk with the xfs_log_force() 3916 * completed operations are safely on disk before we shut down, and that
3919 * call below. 3917 * we don't have to issue any buffer IO after the shutdown flags are set
3918 * to guarantee this.
3920 */ 3919 */
3921 if (!logerror) 3920 if (!logerror)
3922 xlog_cil_force(log); 3921 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3923 3922
3924 /* 3923 /*
 3925 * mark the filesystem and the log as in a shutdown state and wake 3924 * mark the filesystem and the log as in a shutdown state and wake
@@ -3931,18 +3930,11 @@ xfs_log_force_umount(
3931 XFS_BUF_DONE(mp->m_sb_bp); 3930 XFS_BUF_DONE(mp->m_sb_bp);
3932 3931
3933 /* 3932 /*
3934 * This flag is sort of redundant because of the mount flag, but 3933 * Mark the log and the iclogs with IO error flags to prevent any
3935 * it's good to maintain the separation between the log and the rest 3934 * further log IO from being issued or completed.
3936 * of XFS.
3937 */ 3935 */
3938 log->l_flags |= XLOG_IO_ERROR; 3936 log->l_flags |= XLOG_IO_ERROR;
3939 3937 retval = xlog_state_ioerror(log);
3940 /*
3941 * If we hit a log error, we want to mark all the iclogs IOERROR
3942 * while we're still holding the loglock.
3943 */
3944 if (logerror)
3945 retval = xlog_state_ioerror(log);
3946 spin_unlock(&log->l_icloglock); 3938 spin_unlock(&log->l_icloglock);
3947 3939
3948 /* 3940 /*
@@ -3955,19 +3947,6 @@ xfs_log_force_umount(
3955 xlog_grant_head_wake_all(&log->l_reserve_head); 3947 xlog_grant_head_wake_all(&log->l_reserve_head);
3956 xlog_grant_head_wake_all(&log->l_write_head); 3948 xlog_grant_head_wake_all(&log->l_write_head);
3957 3949
3958 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3959 ASSERT(!logerror);
3960 /*
3961 * Force the incore logs to disk before shutting the
3962 * log down completely.
3963 */
3964 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3965
3966 spin_lock(&log->l_icloglock);
3967 retval = xlog_state_ioerror(log);
3968 spin_unlock(&log->l_icloglock);
3969 }
3970
3971 /* 3950 /*
3972 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first 3951 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
3973 * as if the log writes were completed. The abort handling in the log 3952 * as if the log writes were completed. The abort handling in the log
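
Taken together, the xfs_log.c changes serialize shutdown into the fixed order the rewritten a/b/c/d comment describes: flush first while log IO still works, then set the shutdown state, then error out the iclogs, then wake everyone. As a linear sketch with placeholder function names, not the XFS entry points:

	static void flush_log_to_disk(void)  { /* (a) _xfs_log_force stand-in */ }
	static void set_shutdown_flags(void) { /* (b) mark mount + log SHUTDOWN */ }
	static void error_out_iclogs(void)   { /* stop further log IO */ }
	static void wake_all_waiters(void)   { /* (c) reservations, forces, AIL */ }

	static void force_umount(int logerror)
	{
		if (!logerror)
			flush_log_to_disk();	/* only safe before the flags go up */
		set_shutdown_flags();
		error_out_iclogs();
		wake_all_waiters();
		/* (d) nothing new can queue once the flags are set */
	}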
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f6b79e5325dd..f506c457011e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -463,12 +463,40 @@ xlog_cil_push(
463 spin_unlock(&cil->xc_push_lock); 463 spin_unlock(&cil->xc_push_lock);
464 goto out_skip; 464 goto out_skip;
465 } 465 }
466 spin_unlock(&cil->xc_push_lock);
467 466
468 467
 469 /* check for a previously pushed sequence */ 468 /* check for a previously pushed sequence */
470 if (push_seq < cil->xc_ctx->sequence) 469 if (push_seq < cil->xc_ctx->sequence) {
470 spin_unlock(&cil->xc_push_lock);
471 goto out_skip; 471 goto out_skip;
472 }
473
474 /*
475 * We are now going to push this context, so add it to the committing
476 * list before we do anything else. This ensures that anyone waiting on
477 * this push can easily detect the difference between a "push in
478 * progress" and "CIL is empty, nothing to do".
479 *
480 * IOWs, a wait loop can now check for:
481 * the current sequence not being found on the committing list;
482 * an empty CIL; and
483 * an unchanged sequence number
484 * to detect a push that had nothing to do and therefore does not need
485 * waiting on. If the CIL is not empty, we get put on the committing
486 * list before emptying the CIL and bumping the sequence number. Hence
487 * an empty CIL and an unchanged sequence number means we jumped out
488 * above after doing nothing.
489 *
490 * Hence the waiter will either find the commit sequence on the
491 * committing list or the sequence number will be unchanged and the CIL
492 * still dirty. In that latter case, the push has not yet started, and
493 * so the waiter will have to continue trying to check the CIL
494 * committing list until it is found. In extreme cases of delay, the
495 * sequence may fully commit between the attempts the wait makes to wait
496 * on the commit sequence.
497 */
498 list_add(&ctx->committing, &cil->xc_committing);
499 spin_unlock(&cil->xc_push_lock);
472 500
473 /* 501 /*
474 * pull all the log vectors off the items in the CIL, and 502 * pull all the log vectors off the items in the CIL, and
@@ -532,7 +560,6 @@ xlog_cil_push(
532 */ 560 */
533 spin_lock(&cil->xc_push_lock); 561 spin_lock(&cil->xc_push_lock);
534 cil->xc_current_sequence = new_ctx->sequence; 562 cil->xc_current_sequence = new_ctx->sequence;
535 list_add(&ctx->committing, &cil->xc_committing);
536 spin_unlock(&cil->xc_push_lock); 563 spin_unlock(&cil->xc_push_lock);
537 up_write(&cil->xc_ctx_lock); 564 up_write(&cil->xc_ctx_lock);
538 565
@@ -855,13 +882,15 @@ restart:
 855 * Hence by the time we have got here, our sequence may not have been 882 * Hence by the time we have got here, our sequence may not have been
856 * pushed yet. This is true if the current sequence still matches the 883 * pushed yet. This is true if the current sequence still matches the
857 * push sequence after the above wait loop and the CIL still contains 884 * push sequence after the above wait loop and the CIL still contains
858 * dirty objects. 885 * dirty objects. This is guaranteed by the push code first adding the
886 * context to the committing list before emptying the CIL.
859 * 887 *
860 * When the push occurs, it will empty the CIL and atomically increment 888 * Hence if we don't find the context in the committing list and the
861 * the currect sequence past the push sequence and move it into the 889 * current sequence number is unchanged then the CIL contents are
 862 * committing list. Of course, if the CIL is clean at the time of the 890 * significant. If the CIL is empty, it means there was nothing to push
863 * push, it won't have pushed the CIL at all, so in that case we should 891 * and that means there is nothing to wait for. If the CIL is not empty,
864 * try the push for this sequence again from the start just in case. 892 * it means we haven't yet started the push, because if it had started
893 * we would have found the context on the committing list.
865 */ 894 */
866 if (sequence == cil->xc_current_sequence && 895 if (sequence == cil->xc_current_sequence &&
867 !list_empty(&cil->xc_cil)) { 896 !list_empty(&cil->xc_cil)) {
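
The invariant the xfs_log_cil.c reordering establishes is exactly what the waiter relies on: a push sequence is "not yet started" only if it is absent from the committing list and the CIL is still dirty at an unchanged sequence number. That waiter-side test, as a sketch over stand-in types:

	#include <stdbool.h>

	struct ctx { unsigned long long sequence; struct ctx *next; };

	static bool on_committing_list(const struct ctx *head,
				       unsigned long long seq)
	{
		for (const struct ctx *c = head; c; c = c->next)
			if (c->sequence == seq)
				return true;
		return false;
	}

	/* true only when the waiter must keep retrying the push */
	static bool push_not_started(const struct ctx *committing, bool cil_empty,
				     unsigned long long current_seq,
				     unsigned long long wait_seq)
	{
		if (on_committing_list(committing, wait_seq))
			return false;	/* push in progress or already done */
		/* unchanged sequence + dirty CIL: the push has not begun yet */
		return current_seq == wait_seq && !cil_empty;
	}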
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fd5787add99..00cd7f3a8f59 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,12 +193,8 @@ xlog_bread_noalign(
193 bp->b_io_length = nbblks; 193 bp->b_io_length = nbblks;
194 bp->b_error = 0; 194 bp->b_error = 0;
195 195
196 if (XFS_FORCED_SHUTDOWN(log->l_mp)) 196 error = xfs_buf_submit_wait(bp);
197 return -EIO; 197 if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
198
199 xfs_buf_iorequest(bp);
200 error = xfs_buf_iowait(bp);
201 if (error)
202 xfs_buf_ioerror_alert(bp, __func__); 198 xfs_buf_ioerror_alert(bp, __func__);
203 return error; 199 return error;
204} 200}
@@ -378,12 +374,14 @@ xlog_recover_iodone(
378 * We're not going to bother about retrying 374 * We're not going to bother about retrying
379 * this during recovery. One strike! 375 * this during recovery. One strike!
380 */ 376 */
381 xfs_buf_ioerror_alert(bp, __func__); 377 if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
382 xfs_force_shutdown(bp->b_target->bt_mount, 378 xfs_buf_ioerror_alert(bp, __func__);
383 SHUTDOWN_META_IO_ERROR); 379 xfs_force_shutdown(bp->b_target->bt_mount,
380 SHUTDOWN_META_IO_ERROR);
381 }
384 } 382 }
385 bp->b_iodone = NULL; 383 bp->b_iodone = NULL;
386 xfs_buf_ioend(bp, 0); 384 xfs_buf_ioend(bp);
387} 385}
388 386
389/* 387/*
@@ -1445,160 +1443,6 @@ xlog_clear_stale_blocks(
1445 ****************************************************************************** 1443 ******************************************************************************
1446 */ 1444 */
1447 1445
1448STATIC xlog_recover_t *
1449xlog_recover_find_tid(
1450 struct hlist_head *head,
1451 xlog_tid_t tid)
1452{
1453 xlog_recover_t *trans;
1454
1455 hlist_for_each_entry(trans, head, r_list) {
1456 if (trans->r_log_tid == tid)
1457 return trans;
1458 }
1459 return NULL;
1460}
1461
1462STATIC void
1463xlog_recover_new_tid(
1464 struct hlist_head *head,
1465 xlog_tid_t tid,
1466 xfs_lsn_t lsn)
1467{
1468 xlog_recover_t *trans;
1469
1470 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1471 trans->r_log_tid = tid;
1472 trans->r_lsn = lsn;
1473 INIT_LIST_HEAD(&trans->r_itemq);
1474
1475 INIT_HLIST_NODE(&trans->r_list);
1476 hlist_add_head(&trans->r_list, head);
1477}
1478
1479STATIC void
1480xlog_recover_add_item(
1481 struct list_head *head)
1482{
1483 xlog_recover_item_t *item;
1484
1485 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1486 INIT_LIST_HEAD(&item->ri_list);
1487 list_add_tail(&item->ri_list, head);
1488}
1489
1490STATIC int
1491xlog_recover_add_to_cont_trans(
1492 struct xlog *log,
1493 struct xlog_recover *trans,
1494 xfs_caddr_t dp,
1495 int len)
1496{
1497 xlog_recover_item_t *item;
1498 xfs_caddr_t ptr, old_ptr;
1499 int old_len;
1500
1501 if (list_empty(&trans->r_itemq)) {
1502 /* finish copying rest of trans header */
1503 xlog_recover_add_item(&trans->r_itemq);
1504 ptr = (xfs_caddr_t) &trans->r_theader +
1505 sizeof(xfs_trans_header_t) - len;
1506 memcpy(ptr, dp, len); /* d, s, l */
1507 return 0;
1508 }
1509 /* take the tail entry */
1510 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1511
1512 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1513 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1514
1515 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1516 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1517 item->ri_buf[item->ri_cnt-1].i_len += len;
1518 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1519 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1520 return 0;
1521}
1522
1523/*
1524 * The next region to add is the start of a new region. It could be
1525 * a whole region or it could be the first part of a new region. Because
1526 * of this, the assumption here is that the type and size fields of all
1527 * format structures fit into the first 32 bits of the structure.
1528 *
1529 * This works because all regions must be 32 bit aligned. Therefore, we
1530 * either have both fields or we have neither field. In the case we have
1531 * neither field, the data part of the region is zero length. We only have
1532 * a log_op_header and can throw away the header since a new one will appear
1533 * later. If we have at least 4 bytes, then we can determine how many regions
1534 * will appear in the current log item.
1535 */
1536STATIC int
1537xlog_recover_add_to_trans(
1538 struct xlog *log,
1539 struct xlog_recover *trans,
1540 xfs_caddr_t dp,
1541 int len)
1542{
1543 xfs_inode_log_format_t *in_f; /* any will do */
1544 xlog_recover_item_t *item;
1545 xfs_caddr_t ptr;
1546
1547 if (!len)
1548 return 0;
1549 if (list_empty(&trans->r_itemq)) {
1550 /* we need to catch log corruptions here */
1551 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1552 xfs_warn(log->l_mp, "%s: bad header magic number",
1553 __func__);
1554 ASSERT(0);
1555 return -EIO;
1556 }
1557 if (len == sizeof(xfs_trans_header_t))
1558 xlog_recover_add_item(&trans->r_itemq);
1559 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1560 return 0;
1561 }
1562
1563 ptr = kmem_alloc(len, KM_SLEEP);
1564 memcpy(ptr, dp, len);
1565 in_f = (xfs_inode_log_format_t *)ptr;
1566
1567 /* take the tail entry */
1568 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1569 if (item->ri_total != 0 &&
1570 item->ri_total == item->ri_cnt) {
1571 /* tail item is in use, get a new one */
1572 xlog_recover_add_item(&trans->r_itemq);
1573 item = list_entry(trans->r_itemq.prev,
1574 xlog_recover_item_t, ri_list);
1575 }
1576
1577 if (item->ri_total == 0) { /* first region to be added */
1578 if (in_f->ilf_size == 0 ||
1579 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1580 xfs_warn(log->l_mp,
1581 "bad number of regions (%d) in inode log format",
1582 in_f->ilf_size);
1583 ASSERT(0);
1584 kmem_free(ptr);
1585 return -EIO;
1586 }
1587
1588 item->ri_total = in_f->ilf_size;
1589 item->ri_buf =
1590 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1591 KM_SLEEP);
1592 }
1593 ASSERT(item->ri_total > item->ri_cnt);
1594 /* Description region is ri_buf[0] */
1595 item->ri_buf[item->ri_cnt].i_addr = ptr;
1596 item->ri_buf[item->ri_cnt].i_len = len;
1597 item->ri_cnt++;
1598 trace_xfs_log_recover_item_add(log, trans, item, 0);
1599 return 0;
1600}
1601
1602/* 1446/*
1603 * Sort the log items in the transaction. 1447 * Sort the log items in the transaction.
1604 * 1448 *
@@ -3254,31 +3098,6 @@ xlog_recover_do_icreate_pass2(
3254 return 0; 3098 return 0;
3255} 3099}
3256 3100
3257/*
3258 * Free up any resources allocated by the transaction
3259 *
3260 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3261 */
3262STATIC void
3263xlog_recover_free_trans(
3264 struct xlog_recover *trans)
3265{
3266 xlog_recover_item_t *item, *n;
3267 int i;
3268
3269 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3270 /* Free the regions in the item. */
3271 list_del(&item->ri_list);
3272 for (i = 0; i < item->ri_cnt; i++)
3273 kmem_free(item->ri_buf[i].i_addr);
3274 /* Free the item itself */
3275 kmem_free(item->ri_buf);
3276 kmem_free(item);
3277 }
3278 /* Free the transaction recover structure */
3279 kmem_free(trans);
3280}
3281
3282STATIC void 3101STATIC void
3283xlog_recover_buffer_ra_pass2( 3102xlog_recover_buffer_ra_pass2(
3284 struct xlog *log, 3103 struct xlog *log,
@@ -3528,22 +3347,309 @@ out:
3528 if (!list_empty(&done_list)) 3347 if (!list_empty(&done_list))
3529 list_splice_init(&done_list, &trans->r_itemq); 3348 list_splice_init(&done_list, &trans->r_itemq);
3530 3349
3531 xlog_recover_free_trans(trans);
3532
3533 error2 = xfs_buf_delwri_submit(&buffer_list); 3350 error2 = xfs_buf_delwri_submit(&buffer_list);
3534 return error ? error : error2; 3351 return error ? error : error2;
3535} 3352}
3536 3353
3354STATIC void
3355xlog_recover_add_item(
3356 struct list_head *head)
3357{
3358 xlog_recover_item_t *item;
3359
3360 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
3361 INIT_LIST_HEAD(&item->ri_list);
3362 list_add_tail(&item->ri_list, head);
3363}
3364
3537STATIC int 3365STATIC int
3538xlog_recover_unmount_trans( 3366xlog_recover_add_to_cont_trans(
3539 struct xlog *log) 3367 struct xlog *log,
3368 struct xlog_recover *trans,
3369 xfs_caddr_t dp,
3370 int len)
3540{ 3371{
3541 /* Do nothing now */ 3372 xlog_recover_item_t *item;
3542 xfs_warn(log->l_mp, "%s: Unmount LR", __func__); 3373 xfs_caddr_t ptr, old_ptr;
3374 int old_len;
3375
3376 if (list_empty(&trans->r_itemq)) {
3377 /* finish copying rest of trans header */
3378 xlog_recover_add_item(&trans->r_itemq);
3379 ptr = (xfs_caddr_t) &trans->r_theader +
3380 sizeof(xfs_trans_header_t) - len;
3381 memcpy(ptr, dp, len);
3382 return 0;
3383 }
3384 /* take the tail entry */
3385 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3386
3387 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
3388 old_len = item->ri_buf[item->ri_cnt-1].i_len;
3389
3390 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
3391 memcpy(&ptr[old_len], dp, len);
3392 item->ri_buf[item->ri_cnt-1].i_len += len;
3393 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
3394 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
3395 return 0;
3396}
3397
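The continuation path above simply grows the last region and splices the new bytes in at the old length. A minimal userspace model of that stitch, with plain realloc standing in for kmem_realloc and hypothetical names:

#include <stdlib.h>
#include <string.h>

/* append a continuation fragment to the end of an existing region */
static char *append_fragment(char *old_ptr, int old_len,
			     const char *dp, int len)
{
	char *ptr = realloc(old_ptr, old_len + len);

	if (!ptr)
		return NULL;
	memcpy(&ptr[old_len], dp, len);	/* splice at the old end */
	return ptr;
}

int main(void)
{
	/* realloc(NULL, ...) acts as malloc for the first fragment */
	char *region = append_fragment(NULL, 0, "abcd", 4);

	region = append_fragment(region, 4, "efgh", 4);
	/* region now holds the contiguous 8-byte payload "abcdefgh" */
	free(region);
	return 0;
}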
3398/*
3399 * The next region to add is the start of a new region. It could be
3400 * a whole region or only the first part of one. Because of this, the
3401 * assumption here is that the type and size fields of all format
3402 * structures fit into the first 32 bits of the structure.
3403 *
3404 * This works because all regions must be 32-bit aligned. Therefore, we
3405 * either have both fields or neither. If we have neither field, the
3406 * data part of the region is zero length; we only have
3407 * a log_op_header and can throw away the header since a new one will appear
3408 * later. If we have at least 4 bytes, then we can determine how many regions
3409 * will appear in the current log item.
3410 */
3411STATIC int
3412xlog_recover_add_to_trans(
3413 struct xlog *log,
3414 struct xlog_recover *trans,
3415 xfs_caddr_t dp,
3416 int len)
3417{
3418 xfs_inode_log_format_t *in_f; /* any will do */
3419 xlog_recover_item_t *item;
3420 xfs_caddr_t ptr;
3421
3422 if (!len)
3423 return 0;
3424 if (list_empty(&trans->r_itemq)) {
3425 /* we need to catch log corruptions here */
3426 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
3427 xfs_warn(log->l_mp, "%s: bad header magic number",
3428 __func__);
3429 ASSERT(0);
3430 return -EIO;
3431 }
3432 if (len == sizeof(xfs_trans_header_t))
3433 xlog_recover_add_item(&trans->r_itemq);
3434 memcpy(&trans->r_theader, dp, len);
3435 return 0;
3436 }
3437
3438 ptr = kmem_alloc(len, KM_SLEEP);
3439 memcpy(ptr, dp, len);
3440 in_f = (xfs_inode_log_format_t *)ptr;
3441
3442 /* take the tail entry */
3443 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3444 if (item->ri_total != 0 &&
3445 item->ri_total == item->ri_cnt) {
3446 /* tail item is in use, get a new one */
3447 xlog_recover_add_item(&trans->r_itemq);
3448 item = list_entry(trans->r_itemq.prev,
3449 xlog_recover_item_t, ri_list);
3450 }
3451
3452 if (item->ri_total == 0) { /* first region to be added */
3453 if (in_f->ilf_size == 0 ||
3454 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
3455 xfs_warn(log->l_mp,
3456 "bad number of regions (%d) in inode log format",
3457 in_f->ilf_size);
3458 ASSERT(0);
3459 kmem_free(ptr);
3460 return -EIO;
3461 }
3462
3463 item->ri_total = in_f->ilf_size;
3464 item->ri_buf =
3465 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
3466 KM_SLEEP);
3467 }
3468 ASSERT(item->ri_total > item->ri_cnt);
3469 /* Description region is ri_buf[0] */
3470 item->ri_buf[item->ri_cnt].i_addr = ptr;
3471 item->ri_buf[item->ri_cnt].i_len = len;
3472 item->ri_cnt++;
3473 trace_xfs_log_recover_item_add(log, trans, item, 0);
3543 return 0; 3474 return 0;
3544} 3475}
3545 3476
3546/* 3477/*
3478 * Free up any resources allocated by the transaction
3479 *
3480 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3481 */
3482STATIC void
3483xlog_recover_free_trans(
3484 struct xlog_recover *trans)
3485{
3486 xlog_recover_item_t *item, *n;
3487 int i;
3488
3489 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3490 /* Free the regions in the item. */
3491 list_del(&item->ri_list);
3492 for (i = 0; i < item->ri_cnt; i++)
3493 kmem_free(item->ri_buf[i].i_addr);
3494 /* Free the item itself */
3495 kmem_free(item->ri_buf);
3496 kmem_free(item);
3497 }
3498 /* Free the transaction recover structure */
3499 kmem_free(trans);
3500}
3501
3502/*
3503 * On error or completion, trans is freed.
3504 */
3505STATIC int
3506xlog_recovery_process_trans(
3507 struct xlog *log,
3508 struct xlog_recover *trans,
3509 xfs_caddr_t dp,
3510 unsigned int len,
3511 unsigned int flags,
3512 int pass)
3513{
3514 int error = 0;
3515 bool freeit = false;
3516
3517 /* mask off ophdr transaction container flags */
3518 flags &= ~XLOG_END_TRANS;
3519 if (flags & XLOG_WAS_CONT_TRANS)
3520 flags &= ~XLOG_CONTINUE_TRANS;
3521
3522 /*
3523 * Callees must not free the trans structure. We'll decide if we need to
3524 * free it or not based on the operation being done and its result.
3525 */
3526 switch (flags) {
3527 /* expected flag values */
3528 case 0:
3529 case XLOG_CONTINUE_TRANS:
3530 error = xlog_recover_add_to_trans(log, trans, dp, len);
3531 break;
3532 case XLOG_WAS_CONT_TRANS:
3533 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
3534 break;
3535 case XLOG_COMMIT_TRANS:
3536 error = xlog_recover_commit_trans(log, trans, pass);
3537 /* success or fail, we are now done with this transaction. */
3538 freeit = true;
3539 break;
3540
3541 /* unexpected flag values */
3542 case XLOG_UNMOUNT_TRANS:
3543 /* just skip trans */
3544 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
3545 freeit = true;
3546 break;
3547 case XLOG_START_TRANS:
3548 default:
3549 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
3550 ASSERT(0);
3551 error = -EIO;
3552 break;
3553 }
3554 if (error || freeit)
3555 xlog_recover_free_trans(trans);
3556 return error;
3557}
3558
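The two statements at the top of xlog_recovery_process_trans() reduce the ophdr flags to a single dispatchable value: the END bit only marks the record container, and WAS_CONT supersedes CONTINUE. A small sketch of that normalisation with illustrative flag values (not the on-disk XLOG_* constants):

#include <stdio.h>

/* illustrative stand-ins for the XLOG_* ophdr flags */
#define OP_START	0x01
#define OP_COMMIT	0x02
#define OP_CONTINUE	0x04
#define OP_WAS_CONT	0x08
#define OP_END		0x10

static unsigned int normalize(unsigned int flags)
{
	flags &= ~OP_END;		/* END only marks the container */
	if (flags & OP_WAS_CONT)	/* WAS_CONT supersedes CONTINUE */
		flags &= ~OP_CONTINUE;
	return flags;
}

int main(void)
{
	/* a continued op that also ends a log record: only WAS_CONT remains */
	printf("0x%x\n", normalize(OP_WAS_CONT | OP_CONTINUE | OP_END));
	return 0;
}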
3559/*
3560 * Look up the transaction recovery structure associated with the ID in the
3561 * current ophdr. If the transaction doesn't exist and the start flag is set in
3562 * the ophdr, then allocate a new transaction for future ID matches to find.
3563 * Either way, return what we found during the lookup - an existing transaction
3564 * or nothing.
3565 */
3566STATIC struct xlog_recover *
3567xlog_recover_ophdr_to_trans(
3568 struct hlist_head rhash[],
3569 struct xlog_rec_header *rhead,
3570 struct xlog_op_header *ohead)
3571{
3572 struct xlog_recover *trans;
3573 xlog_tid_t tid;
3574 struct hlist_head *rhp;
3575
3576 tid = be32_to_cpu(ohead->oh_tid);
3577 rhp = &rhash[XLOG_RHASH(tid)];
3578 hlist_for_each_entry(trans, rhp, r_list) {
3579 if (trans->r_log_tid == tid)
3580 return trans;
3581 }
3582
3583 /*
3584 * skip over non-start transaction headers - we could be
3585 * processing slack space before the next transaction starts
3586 */
3587 if (!(ohead->oh_flags & XLOG_START_TRANS))
3588 return NULL;
3589
3590 ASSERT(be32_to_cpu(ohead->oh_len) == 0);
3591
3592 /*
3593 * This is a new transaction so allocate a new recovery container to
3594 * hold the recovery ops that will follow.
3595 */
3596 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
3597 trans->r_log_tid = tid;
3598 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
3599 INIT_LIST_HEAD(&trans->r_itemq);
3600 INIT_HLIST_NODE(&trans->r_list);
3601 hlist_add_head(&trans->r_list, rhp);
3602
3603 /*
3604 * Nothing more to do for this ophdr. Items to be added to this new
3605 * transaction will be in subsequent ophdr containers.
3606 */
3607 return NULL;
3608}
3609
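A compact userspace model of the lookup-or-allocate pattern above, using singly linked buckets in place of the kernel hlist and a modulo in place of XLOG_RHASH (all names hypothetical):

#include <stdint.h>
#include <stdlib.h>

#define NBUCKETS 16

struct trans {
	uint32_t	tid;
	struct trans	*next;
};

static struct trans *buckets[NBUCKETS];

/*
 * Return the transaction for tid if one exists.  On a start record,
 * allocate and hash a new one but still return NULL: the start op
 * carries no payload, so the caller has nothing further to do.
 */
static struct trans *lookup_or_create(uint32_t tid, int is_start)
{
	struct trans **head = &buckets[tid % NBUCKETS];
	struct trans *t;

	for (t = *head; t; t = t->next)
		if (t->tid == tid)
			return t;

	if (!is_start)
		return NULL;	/* slack space before the next transaction */

	t = calloc(1, sizeof(*t));
	if (t) {
		t->tid = tid;
		t->next = *head;
		*head = t;
	}
	return NULL;
}

int main(void)
{
	lookup_or_create(42, 1);		/* start: hashed, returns NULL */
	return lookup_or_create(42, 0) ? 0 : 1;	/* found on the next lookup */
}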
3610STATIC int
3611xlog_recover_process_ophdr(
3612 struct xlog *log,
3613 struct hlist_head rhash[],
3614 struct xlog_rec_header *rhead,
3615 struct xlog_op_header *ohead,
3616 xfs_caddr_t dp,
3617 xfs_caddr_t end,
3618 int pass)
3619{
3620 struct xlog_recover *trans;
3621 unsigned int len;
3622
3623 /* Do we understand who wrote this op? */
3624 if (ohead->oh_clientid != XFS_TRANSACTION &&
3625 ohead->oh_clientid != XFS_LOG) {
3626 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
3627 __func__, ohead->oh_clientid);
3628 ASSERT(0);
3629 return -EIO;
3630 }
3631
3632 /*
3633 * Check that the ophdr contains all the data it is supposed to.
3634 */
3635 len = be32_to_cpu(ohead->oh_len);
3636 if (dp + len > end) {
3637 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
3638 WARN_ON(1);
3639 return -EIO;
3640 }
3641
3642 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
3643 if (!trans) {
3644 /* nothing to do, so skip over this ophdr */
3645 return 0;
3646 }
3647
3648 return xlog_recovery_process_trans(log, trans, dp, len,
3649 ohead->oh_flags, pass);
3650}
3651
3652/*
3547 * There are two valid states of the r_state field. 0 indicates that the 3653 * There are two valid states of the r_state field. 0 indicates that the
3548 * transaction structure is in a normal state. We have either seen the 3654 * transaction structure is in a normal state. We have either seen the
3549 * start of the transaction or the last operation we added was not a partial 3655 * start of the transaction or the last operation we added was not a partial
@@ -3560,86 +3666,30 @@ xlog_recover_process_data(
3560 xfs_caddr_t dp, 3666 xfs_caddr_t dp,
3561 int pass) 3667 int pass)
3562{ 3668{
3563 xfs_caddr_t lp; 3669 struct xlog_op_header *ohead;
3670 xfs_caddr_t end;
3564 int num_logops; 3671 int num_logops;
3565 xlog_op_header_t *ohead;
3566 xlog_recover_t *trans;
3567 xlog_tid_t tid;
3568 int error; 3672 int error;
3569 unsigned long hash;
3570 uint flags;
3571 3673
3572 lp = dp + be32_to_cpu(rhead->h_len); 3674 end = dp + be32_to_cpu(rhead->h_len);
3573 num_logops = be32_to_cpu(rhead->h_num_logops); 3675 num_logops = be32_to_cpu(rhead->h_num_logops);
3574 3676
3575 /* check the log format matches our own - else we can't recover */ 3677 /* check the log format matches our own - else we can't recover */
3576 if (xlog_header_check_recover(log->l_mp, rhead)) 3678 if (xlog_header_check_recover(log->l_mp, rhead))
3577 return -EIO; 3679 return -EIO;
3578 3680
3579 while ((dp < lp) && num_logops) { 3681 while ((dp < end) && num_logops) {
3580 ASSERT(dp + sizeof(xlog_op_header_t) <= lp); 3682
3581 ohead = (xlog_op_header_t *)dp; 3683 ohead = (struct xlog_op_header *)dp;
3582 dp += sizeof(xlog_op_header_t); 3684 dp += sizeof(*ohead);
3583 if (ohead->oh_clientid != XFS_TRANSACTION && 3685 ASSERT(dp <= end);
3584 ohead->oh_clientid != XFS_LOG) { 3686
3585 xfs_warn(log->l_mp, "%s: bad clientid 0x%x", 3687 /* errors will abort recovery */
3586 __func__, ohead->oh_clientid); 3688 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
3587 ASSERT(0); 3689 dp, end, pass);
3588 return -EIO; 3690 if (error)
3589 } 3691 return error;
3590 tid = be32_to_cpu(ohead->oh_tid); 3692
3591 hash = XLOG_RHASH(tid);
3592 trans = xlog_recover_find_tid(&rhash[hash], tid);
3593 if (trans == NULL) { /* not found; add new tid */
3594 if (ohead->oh_flags & XLOG_START_TRANS)
3595 xlog_recover_new_tid(&rhash[hash], tid,
3596 be64_to_cpu(rhead->h_lsn));
3597 } else {
3598 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
3599 xfs_warn(log->l_mp, "%s: bad length 0x%x",
3600 __func__, be32_to_cpu(ohead->oh_len));
3601 WARN_ON(1);
3602 return -EIO;
3603 }
3604 flags = ohead->oh_flags & ~XLOG_END_TRANS;
3605 if (flags & XLOG_WAS_CONT_TRANS)
3606 flags &= ~XLOG_CONTINUE_TRANS;
3607 switch (flags) {
3608 case XLOG_COMMIT_TRANS:
3609 error = xlog_recover_commit_trans(log,
3610 trans, pass);
3611 break;
3612 case XLOG_UNMOUNT_TRANS:
3613 error = xlog_recover_unmount_trans(log);
3614 break;
3615 case XLOG_WAS_CONT_TRANS:
3616 error = xlog_recover_add_to_cont_trans(log,
3617 trans, dp,
3618 be32_to_cpu(ohead->oh_len));
3619 break;
3620 case XLOG_START_TRANS:
3621 xfs_warn(log->l_mp, "%s: bad transaction",
3622 __func__);
3623 ASSERT(0);
3624 error = -EIO;
3625 break;
3626 case 0:
3627 case XLOG_CONTINUE_TRANS:
3628 error = xlog_recover_add_to_trans(log, trans,
3629 dp, be32_to_cpu(ohead->oh_len));
3630 break;
3631 default:
3632 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
3633 __func__, flags);
3634 ASSERT(0);
3635 error = -EIO;
3636 break;
3637 }
3638 if (error) {
3639 xlog_recover_free_trans(trans);
3640 return error;
3641 }
3642 }
3643 dp += be32_to_cpu(ohead->oh_len); 3693 dp += be32_to_cpu(ohead->oh_len);
3644 num_logops--; 3694 num_logops--;
3645 } 3695 }
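The rewritten loop is now a plain cursor walk over the record payload: step over each op header, validate the advertised length against the end of the record, process, and advance. A userspace sketch of the same walk (a toy header with a native-endian 32-bit length; the kernel's xlog_op_header and be32 conversions are elided):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct op_head {		/* toy stand-in for xlog_op_header */
	uint32_t len;		/* payload bytes following the header */
};

static int walk_ops(const char *dp, const char *end, int num_ops)
{
	while (dp < end && num_ops) {
		struct op_head oh;

		memcpy(&oh, dp, sizeof(oh));
		dp += sizeof(oh);	/* step over the header... */
		if (dp + oh.len > end)
			return -1;	/* bad length: abort recovery */
		/* ...process oh.len payload bytes here... */
		dp += oh.len;		/* ...then over the payload */
		num_ops--;
	}
	return 0;
}

int main(void)
{
	char rec[sizeof(struct op_head) + 4] = { 0 };
	struct op_head oh = { .len = 4 };

	memcpy(rec, &oh, sizeof(oh));
	printf("%d\n", walk_ops(rec, rec + sizeof(rec), 1)); /* prints 0 */
	return 0;
}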
@@ -4132,41 +4182,13 @@ xlog_do_recovery_pass(
4132 } 4182 }
4133 4183
4134 memset(rhash, 0, sizeof(rhash)); 4184 memset(rhash, 0, sizeof(rhash));
4135 if (tail_blk <= head_blk) { 4185 blk_no = tail_blk;
4136 for (blk_no = tail_blk; blk_no < head_blk; ) { 4186 if (tail_blk > head_blk) {
4137 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
4138 if (error)
4139 goto bread_err2;
4140
4141 rhead = (xlog_rec_header_t *)offset;
4142 error = xlog_valid_rec_header(log, rhead, blk_no);
4143 if (error)
4144 goto bread_err2;
4145
4146 /* blocks in data section */
4147 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4148 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
4149 &offset);
4150 if (error)
4151 goto bread_err2;
4152
4153 error = xlog_unpack_data(rhead, offset, log);
4154 if (error)
4155 goto bread_err2;
4156
4157 error = xlog_recover_process_data(log,
4158 rhash, rhead, offset, pass);
4159 if (error)
4160 goto bread_err2;
4161 blk_no += bblks + hblks;
4162 }
4163 } else {
4164 /* 4187 /*
4165 * Perform recovery around the end of the physical log. 4188 * Perform recovery around the end of the physical log.
4166 * When the head is not on the same cycle number as the tail, 4189 * When the head is not on the same cycle number as the tail,
4167 * we can't do a sequential recovery as above. 4190 * we can't do a sequential recovery.
4168 */ 4191 */
4169 blk_no = tail_blk;
4170 while (blk_no < log->l_logBBsize) { 4192 while (blk_no < log->l_logBBsize) {
4171 /* 4193 /*
4172 * Check for header wrapping around physical end-of-log 4194 * Check for header wrapping around physical end-of-log
@@ -4280,34 +4302,35 @@ xlog_do_recovery_pass(
4280 4302
4281 ASSERT(blk_no >= log->l_logBBsize); 4303 ASSERT(blk_no >= log->l_logBBsize);
4282 blk_no -= log->l_logBBsize; 4304 blk_no -= log->l_logBBsize;
4305 }
4283 4306
4284 /* read first part of physical log */ 4307 /* read first part of physical log */
4285 while (blk_no < head_blk) { 4308 while (blk_no < head_blk) {
4286 error = xlog_bread(log, blk_no, hblks, hbp, &offset); 4309 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
4287 if (error) 4310 if (error)
4288 goto bread_err2; 4311 goto bread_err2;
4289 4312
4290 rhead = (xlog_rec_header_t *)offset; 4313 rhead = (xlog_rec_header_t *)offset;
4291 error = xlog_valid_rec_header(log, rhead, blk_no); 4314 error = xlog_valid_rec_header(log, rhead, blk_no);
4292 if (error) 4315 if (error)
4293 goto bread_err2; 4316 goto bread_err2;
4294 4317
4295 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 4318 /* blocks in data section */
4296 error = xlog_bread(log, blk_no+hblks, bblks, dbp, 4319 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4297 &offset); 4320 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
4298 if (error) 4321 &offset);
4299 goto bread_err2; 4322 if (error)
4323 goto bread_err2;
4300 4324
4301 error = xlog_unpack_data(rhead, offset, log); 4325 error = xlog_unpack_data(rhead, offset, log);
4302 if (error) 4326 if (error)
4303 goto bread_err2; 4327 goto bread_err2;
4304 4328
4305 error = xlog_recover_process_data(log, rhash, 4329 error = xlog_recover_process_data(log, rhash,
4306 rhead, offset, pass); 4330 rhead, offset, pass);
4307 if (error) 4331 if (error)
4308 goto bread_err2; 4332 goto bread_err2;
4309 blk_no += bblks + hblks; 4333 blk_no += bblks + hblks;
4310 }
4311 } 4334 }
4312 4335
4313 bread_err2: 4336 bread_err2:
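The restructured pass folds the wrapped and unwrapped cases together: start at the tail, and only if the tail sits beyond the head run to the physical end of the log, subtract the log size, then fall into the common tail-to-head loop. A sketch of the wrap arithmetic under that reading (hypothetical helper; xfs_daddr_t modelled as int64_t):

#include <stdint.h>
#include <stdio.h>

/* advance a block cursor past one record, wrapping at the physical log end */
static int64_t advance(int64_t blk_no, int hblks, int bblks, int64_t log_size)
{
	blk_no += hblks + bblks;
	if (blk_no >= log_size)
		blk_no -= log_size;	/* wrapped: continue from block 0 */
	return blk_no;
}

int main(void)
{
	/* a record of 1 header + 7 data blocks starting 4 blocks before the
	 * end of a 1024-block log continues at block 4 of the next pass */
	printf("%lld\n", (long long)advance(1020, 1, 7, 1024));
	return 0;
}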
@@ -4427,16 +4450,12 @@ xlog_do_recover(
4427 XFS_BUF_UNASYNC(bp); 4450 XFS_BUF_UNASYNC(bp);
4428 bp->b_ops = &xfs_sb_buf_ops; 4451 bp->b_ops = &xfs_sb_buf_ops;
4429 4452
4430 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 4453 error = xfs_buf_submit_wait(bp);
4431 xfs_buf_relse(bp);
4432 return -EIO;
4433 }
4434
4435 xfs_buf_iorequest(bp);
4436 error = xfs_buf_iowait(bp);
4437 if (error) { 4454 if (error) {
4438 xfs_buf_ioerror_alert(bp, __func__); 4455 if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
4439 ASSERT(0); 4456 xfs_buf_ioerror_alert(bp, __func__);
4457 ASSERT(0);
4458 }
4440 xfs_buf_relse(bp); 4459 xfs_buf_relse(bp);
4441 return error; 4460 return error;
4442 } 4461 }
@@ -4509,6 +4528,18 @@ xlog_recover(
4509 return -EINVAL; 4528 return -EINVAL;
4510 } 4529 }
4511 4530
4531 /*
4532 * Delay log recovery if the debug hook is set. This is debug
4533 * instrumentation to coordinate simulation of I/O failures with
4534 * log recovery.
4535 */
4536 if (xfs_globals.log_recovery_delay) {
4537 xfs_notice(log->l_mp,
4538 "Delaying log recovery for %d seconds.",
4539 xfs_globals.log_recovery_delay);
4540 msleep(xfs_globals.log_recovery_delay * 1000);
4541 }
4542
4512 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", 4543 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
4513 log->l_mp->m_logname ? log->l_mp->m_logname 4544 log->l_mp->m_logname ? log->l_mp->m_logname
4514 : "internal"); 4545 : "internal");
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fbf0384a466f..51435dbce9c4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -61,8 +61,6 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
61static int xfs_uuid_table_size; 61static int xfs_uuid_table_size;
62static uuid_t *xfs_uuid_table; 62static uuid_t *xfs_uuid_table;
63 63
64extern struct kset *xfs_kset;
65
66/* 64/*
67 * See if the UUID is unique among mounted XFS filesystems. 65 * See if the UUID is unique among mounted XFS filesystems.
68 * Mount fails if UUID is nil or a FS with the same UUID is already mounted. 66 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -302,21 +300,15 @@ xfs_readsb(
302 * access to the superblock. 300 * access to the superblock.
303 */ 301 */
304reread: 302reread:
305 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 303 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
306 BTOBB(sector_size), 0, buf_ops); 304 BTOBB(sector_size), 0, &bp, buf_ops);
307 if (!bp) { 305 if (error) {
308 if (loud)
309 xfs_warn(mp, "SB buffer read failed");
310 return -EIO;
311 }
312 if (bp->b_error) {
313 error = bp->b_error;
314 if (loud) 306 if (loud)
315 xfs_warn(mp, "SB validate failed with error %d.", error); 307 xfs_warn(mp, "SB validate failed with error %d.", error);
316 /* bad CRC means corrupted metadata */ 308 /* bad CRC means corrupted metadata */
317 if (error == -EFSBADCRC) 309 if (error == -EFSBADCRC)
318 error = -EFSCORRUPTED; 310 error = -EFSCORRUPTED;
319 goto release_buf; 311 return error;
320 } 312 }
321 313
322 /* 314 /*
@@ -546,40 +538,43 @@ xfs_set_inoalignment(xfs_mount_t *mp)
546 * Check that the data (and log if separate) is an ok size. 538 * Check that the data (and log if separate) is an ok size.
547 */ 539 */
548STATIC int 540STATIC int
549xfs_check_sizes(xfs_mount_t *mp) 541xfs_check_sizes(
542 struct xfs_mount *mp)
550{ 543{
551 xfs_buf_t *bp; 544 struct xfs_buf *bp;
552 xfs_daddr_t d; 545 xfs_daddr_t d;
546 int error;
553 547
554 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 548 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
555 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 549 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
556 xfs_warn(mp, "filesystem size mismatch detected"); 550 xfs_warn(mp, "filesystem size mismatch detected");
557 return -EFBIG; 551 return -EFBIG;
558 } 552 }
559 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 553 error = xfs_buf_read_uncached(mp->m_ddev_targp,
560 d - XFS_FSS_TO_BB(mp, 1), 554 d - XFS_FSS_TO_BB(mp, 1),
561 XFS_FSS_TO_BB(mp, 1), 0, NULL); 555 XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
562 if (!bp) { 556 if (error) {
563 xfs_warn(mp, "last sector read failed"); 557 xfs_warn(mp, "last sector read failed");
564 return -EIO; 558 return error;
565 } 559 }
566 xfs_buf_relse(bp); 560 xfs_buf_relse(bp);
567 561
568 if (mp->m_logdev_targp != mp->m_ddev_targp) { 562 if (mp->m_logdev_targp == mp->m_ddev_targp)
569 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 563 return 0;
570 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 564
571 xfs_warn(mp, "log size mismatch detected"); 565 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
572 return -EFBIG; 566 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
573 } 567 xfs_warn(mp, "log size mismatch detected");
574 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 568 return -EFBIG;
569 }
570 error = xfs_buf_read_uncached(mp->m_logdev_targp,
575 d - XFS_FSB_TO_BB(mp, 1), 571 d - XFS_FSB_TO_BB(mp, 1),
576 XFS_FSB_TO_BB(mp, 1), 0, NULL); 572 XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
577 if (!bp) { 573 if (error) {
578 xfs_warn(mp, "log device read failed"); 574 xfs_warn(mp, "log device read failed");
579 return -EIO; 575 return error;
580 }
581 xfs_buf_relse(bp);
582 } 576 }
577 xfs_buf_relse(bp);
583 return 0; 578 return 0;
584} 579}
585 580
@@ -729,7 +724,6 @@ xfs_mountfs(
729 724
730 xfs_set_maxicount(mp); 725 xfs_set_maxicount(mp);
731 726
732 mp->m_kobj.kobject.kset = xfs_kset;
733 error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); 727 error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
734 if (error) 728 if (error)
735 goto out; 729 goto out;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 1eb6f3df698c..30ecca3037e3 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -304,7 +304,8 @@ _xfs_mru_cache_reap(
304int 304int
305xfs_mru_cache_init(void) 305xfs_mru_cache_init(void)
306{ 306{
307 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); 307 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
308 WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
308 if (!xfs_mru_reap_wq) 309 if (!xfs_mru_reap_wq)
309 return -ENOMEM; 310 return -ENOMEM;
310 return 0; 311 return 0;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 10232102b4a6..d68f23021af3 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -434,6 +434,7 @@ xfs_qm_dquot_isolate(
434 struct list_head *item, 434 struct list_head *item,
435 spinlock_t *lru_lock, 435 spinlock_t *lru_lock,
436 void *arg) 436 void *arg)
437 __releases(lru_lock) __acquires(lru_lock)
437{ 438{
438 struct xfs_dquot *dqp = container_of(item, 439 struct xfs_dquot *dqp = container_of(item,
439 struct xfs_dquot, q_lru); 440 struct xfs_dquot, q_lru);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 909e143b87ae..e1175ea9b551 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -46,7 +46,7 @@
46 * Keeps track of a current summary block, so we don't keep reading 46 * Keeps track of a current summary block, so we don't keep reading
47 * it from the buffer cache. 47 * it from the buffer cache.
48 */ 48 */
49STATIC int /* error */ 49static int
50xfs_rtget_summary( 50xfs_rtget_summary(
51 xfs_mount_t *mp, /* file system mount structure */ 51 xfs_mount_t *mp, /* file system mount structure */
52 xfs_trans_t *tp, /* transaction pointer */ 52 xfs_trans_t *tp, /* transaction pointer */
@@ -56,60 +56,9 @@ xfs_rtget_summary(
56 xfs_fsblock_t *rsb, /* in/out: summary block number */ 56 xfs_fsblock_t *rsb, /* in/out: summary block number */
57 xfs_suminfo_t *sum) /* out: summary info for this block */ 57 xfs_suminfo_t *sum) /* out: summary info for this block */
58{ 58{
59 xfs_buf_t *bp; /* buffer for summary block */ 59 return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
60 int error; /* error value */
61 xfs_fsblock_t sb; /* summary fsblock */
62 int so; /* index into the summary file */
63 xfs_suminfo_t *sp; /* pointer to returned data */
64
65 /*
66 * Compute entry number in the summary file.
67 */
68 so = XFS_SUMOFFS(mp, log, bbno);
69 /*
70 * Compute the block number in the summary file.
71 */
72 sb = XFS_SUMOFFSTOBLOCK(mp, so);
73 /*
74 * If we have an old buffer, and the block number matches, use that.
75 */
76 if (rbpp && *rbpp && *rsb == sb)
77 bp = *rbpp;
78 /*
79 * Otherwise we have to get the buffer.
80 */
81 else {
82 /*
83 * If there was an old one, get rid of it first.
84 */
85 if (rbpp && *rbpp)
86 xfs_trans_brelse(tp, *rbpp);
87 error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
88 if (error) {
89 return error;
90 }
91 /*
92 * Remember this buffer and block for the next call.
93 */
94 if (rbpp) {
95 *rbpp = bp;
96 *rsb = sb;
97 }
98 }
99 /*
100 * Point to the summary information & copy it out.
101 */
102 sp = XFS_SUMPTR(mp, bp, so);
103 *sum = *sp;
104 /*
105 * Drop the buffer if we're not asked to remember it.
106 */
107 if (!rbpp)
108 xfs_trans_brelse(tp, bp);
109 return 0;
110} 60}
111 61
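With this change a summary read is just a modify with a delta of 0, so the summary-buffer caching protocol (reuse *rbpp when the block number matches, release and refetch otherwise) presumably lives only in the shared xfs_rtmodify_summary_int(), newly declared in xfs_rtalloc.h below, instead of being duplicated in separate read and write paths.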
112
113/* 62/*
114 * Return whether there are any free extents in the size range given 63 * Return whether there are any free extents in the size range given
115 * by low and high, for the bitmap block bbno. 64 * by low and high, for the bitmap block bbno.
@@ -972,16 +921,11 @@ xfs_growfs_rt(
972 /* 921 /*
973 * Read in the last block of the device, make sure it exists. 922 * Read in the last block of the device, make sure it exists.
974 */ 923 */
975 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 924 error = xfs_buf_read_uncached(mp->m_rtdev_targp,
976 XFS_FSB_TO_BB(mp, nrblocks - 1), 925 XFS_FSB_TO_BB(mp, nrblocks - 1),
977 XFS_FSB_TO_BB(mp, 1), 0, NULL); 926 XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
978 if (!bp) 927 if (error)
979 return -EIO;
980 if (bp->b_error) {
981 error = bp->b_error;
982 xfs_buf_relse(bp);
983 return error; 928 return error;
984 }
985 xfs_buf_relse(bp); 929 xfs_buf_relse(bp);
986 930
987 /* 931 /*
@@ -1235,11 +1179,12 @@ xfs_rtallocate_extent(
1235 */ 1179 */
1236int /* error */ 1180int /* error */
1237xfs_rtmount_init( 1181xfs_rtmount_init(
1238 xfs_mount_t *mp) /* file system mount structure */ 1182 struct xfs_mount *mp) /* file system mount structure */
1239{ 1183{
1240 xfs_buf_t *bp; /* buffer for last block of subvolume */ 1184 struct xfs_buf *bp; /* buffer for last block of subvolume */
1241 xfs_daddr_t d; /* address of last block of subvolume */ 1185 struct xfs_sb *sbp; /* filesystem superblock copy in mount */
1242 xfs_sb_t *sbp; /* filesystem superblock copy in mount */ 1186 xfs_daddr_t d; /* address of last block of subvolume */
1187 int error;
1243 1188
1244 sbp = &mp->m_sb; 1189 sbp = &mp->m_sb;
1245 if (sbp->sb_rblocks == 0) 1190 if (sbp->sb_rblocks == 0)
@@ -1265,14 +1210,12 @@ xfs_rtmount_init(
1265 (unsigned long long) mp->m_sb.sb_rblocks); 1210 (unsigned long long) mp->m_sb.sb_rblocks);
1266 return -EFBIG; 1211 return -EFBIG;
1267 } 1212 }
1268 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 1213 error = xfs_buf_read_uncached(mp->m_rtdev_targp,
1269 d - XFS_FSB_TO_BB(mp, 1), 1214 d - XFS_FSB_TO_BB(mp, 1),
1270 XFS_FSB_TO_BB(mp, 1), 0, NULL); 1215 XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
1271 if (!bp || bp->b_error) { 1216 if (error) {
1272 xfs_warn(mp, "realtime device size check failed"); 1217 xfs_warn(mp, "realtime device size check failed");
1273 if (bp) 1218 return error;
1274 xfs_buf_relse(bp);
1275 return -EIO;
1276 } 1219 }
1277 xfs_buf_relse(bp); 1220 xfs_buf_relse(bp);
1278 return 0; 1221 return 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index c642795324af..76c0a4a9bb17 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -111,6 +111,10 @@ int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
111 xfs_rtblock_t *rtblock); 111 xfs_rtblock_t *rtblock);
112int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, 112int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
113 xfs_rtblock_t start, xfs_extlen_t len, int val); 113 xfs_rtblock_t start, xfs_extlen_t len, int val);
114int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
115 int log, xfs_rtblock_t bbno, int delta,
116 xfs_buf_t **rbpp, xfs_fsblock_t *rsb,
117 xfs_suminfo_t *sum);
114int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, 118int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
115 xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, 119 xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
116 xfs_fsblock_t *rsb); 120 xfs_fsblock_t *rsb);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b194652033cd..9f622feda6a4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,7 @@
47#include "xfs_dinode.h" 47#include "xfs_dinode.h"
48#include "xfs_filestream.h" 48#include "xfs_filestream.h"
49#include "xfs_quota.h" 49#include "xfs_quota.h"
50#include "xfs_sysfs.h"
50 51
51#include <linux/namei.h> 52#include <linux/namei.h>
52#include <linux/init.h> 53#include <linux/init.h>
@@ -61,7 +62,11 @@
61static const struct super_operations xfs_super_operations; 62static const struct super_operations xfs_super_operations;
62static kmem_zone_t *xfs_ioend_zone; 63static kmem_zone_t *xfs_ioend_zone;
63mempool_t *xfs_ioend_pool; 64mempool_t *xfs_ioend_pool;
64struct kset *xfs_kset; 65
66static struct kset *xfs_kset; /* top-level xfs sysfs dir */
67#ifdef DEBUG
68static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
69#endif
65 70
66#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ 71#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
67#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ 72#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
@@ -838,32 +843,32 @@ xfs_init_mount_workqueues(
838 struct xfs_mount *mp) 843 struct xfs_mount *mp)
839{ 844{
840 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", 845 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
841 WQ_MEM_RECLAIM, 0, mp->m_fsname); 846 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
842 if (!mp->m_data_workqueue) 847 if (!mp->m_data_workqueue)
843 goto out; 848 goto out;
844 849
845 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", 850 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
846 WQ_MEM_RECLAIM, 0, mp->m_fsname); 851 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
847 if (!mp->m_unwritten_workqueue) 852 if (!mp->m_unwritten_workqueue)
848 goto out_destroy_data_iodone_queue; 853 goto out_destroy_data_iodone_queue;
849 854
850 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 855 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
851 WQ_MEM_RECLAIM, 0, mp->m_fsname); 856 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
852 if (!mp->m_cil_workqueue) 857 if (!mp->m_cil_workqueue)
853 goto out_destroy_unwritten; 858 goto out_destroy_unwritten;
854 859
855 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", 860 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
856 0, 0, mp->m_fsname); 861 WQ_FREEZABLE, 0, mp->m_fsname);
857 if (!mp->m_reclaim_workqueue) 862 if (!mp->m_reclaim_workqueue)
858 goto out_destroy_cil; 863 goto out_destroy_cil;
859 864
860 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", 865 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
861 0, 0, mp->m_fsname); 866 WQ_FREEZABLE, 0, mp->m_fsname);
862 if (!mp->m_log_workqueue) 867 if (!mp->m_log_workqueue)
863 goto out_destroy_reclaim; 868 goto out_destroy_reclaim;
864 869
865 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", 870 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
866 0, 0, mp->m_fsname); 871 WQ_FREEZABLE, 0, mp->m_fsname);
867 if (!mp->m_eofblocks_workqueue) 872 if (!mp->m_eofblocks_workqueue)
868 goto out_destroy_log; 873 goto out_destroy_log;
869 874
@@ -1406,6 +1411,7 @@ xfs_fs_fill_super(
1406 atomic_set(&mp->m_active_trans, 0); 1411 atomic_set(&mp->m_active_trans, 0);
1407 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); 1412 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1408 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); 1413 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1414 mp->m_kobj.kobject.kset = xfs_kset;
1409 1415
1410 mp->m_super = sb; 1416 mp->m_super = sb;
1411 sb->s_fs_info = mp; 1417 sb->s_fs_info = mp;
@@ -1715,7 +1721,8 @@ xfs_init_workqueues(void)
1715 * AGs in all the filesystems mounted. Hence use the default large 1721 * AGs in all the filesystems mounted. Hence use the default large
1716 * max_active value for this workqueue. 1722 * max_active value for this workqueue.
1717 */ 1723 */
1718 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); 1724 xfs_alloc_wq = alloc_workqueue("xfsalloc",
1725 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
1719 if (!xfs_alloc_wq) 1726 if (!xfs_alloc_wq)
1720 return -ENOMEM; 1727 return -ENOMEM;
1721 1728
@@ -1768,9 +1775,16 @@ init_xfs_fs(void)
1768 goto out_sysctl_unregister; 1775 }
1769 } 1776 }
1770 1777
1771 error = xfs_qm_init(); 1778#ifdef DEBUG
1779 xfs_dbg_kobj.kobject.kset = xfs_kset;
1780 error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
1772 if (error) 1781 if (error)
1773 goto out_kset_unregister; 1782 goto out_kset_unregister;
1783#endif
1784
1785 error = xfs_qm_init();
1786 if (error)
1787 goto out_remove_kobj;
1774 1788
1775 error = register_filesystem(&xfs_fs_type); 1789 error = register_filesystem(&xfs_fs_type);
1776 if (error) 1790 if (error)
@@ -1779,7 +1793,11 @@ init_xfs_fs(void)
1779 1793
1780 out_qm_exit: 1794 out_qm_exit:
1781 xfs_qm_exit(); 1795 xfs_qm_exit();
1796 out_remove_kobj:
1797#ifdef DEBUG
1798 xfs_sysfs_del(&xfs_dbg_kobj);
1782 out_kset_unregister: 1799 out_kset_unregister:
1800#endif
1783 kset_unregister(xfs_kset); 1801 kset_unregister(xfs_kset);
1784 out_sysctl_unregister: 1802 out_sysctl_unregister:
1785 xfs_sysctl_unregister(); 1803 xfs_sysctl_unregister();
@@ -1802,6 +1820,9 @@ exit_xfs_fs(void)
1802{ 1820{
1803 xfs_qm_exit(); 1821 xfs_qm_exit();
1804 unregister_filesystem(&xfs_fs_type); 1822 unregister_filesystem(&xfs_fs_type);
1823#ifdef DEBUG
1824 xfs_sysfs_del(&xfs_dbg_kobj);
1825#endif
1805 kset_unregister(xfs_kset); 1826 kset_unregister(xfs_kset);
1806 xfs_sysctl_unregister(); 1827 xfs_sysctl_unregister();
1807 xfs_cleanup_procfs(); 1828 xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 6a944a2cd36f..02ae62a998e0 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -269,9 +269,11 @@ xfs_symlink(
269 /* 269 /*
270 * Check for ability to enter directory entry, if no space reserved. 270 * Check for ability to enter directory entry, if no space reserved.
271 */ 271 */
272 error = xfs_dir_canenter(tp, dp, link_name, resblks); 272 if (!resblks) {
273 if (error) 273 error = xfs_dir_canenter(tp, dp, link_name);
274 goto error_return; 274 if (error)
275 goto error_return;
276 }
275 /* 277 /*
276 * Initialize the bmap freelist prior to calling either 278 * Initialize the bmap freelist prior to calling either
277 * bmapi or the directory create code. 279 * bmapi or the directory create code.
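The reservation check moves into the caller here: when blocks were reserved (resblks != 0) the directory entry is already guaranteed to fit, so the xfs_dir_canenter() probe is only needed in the no-reservation case, and its now-redundant resblks argument is dropped.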
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index bd8e157c20ef..ffef45375754 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -92,6 +92,11 @@ enum {
92 92
93extern xfs_param_t xfs_params; 93extern xfs_param_t xfs_params;
94 94
95struct xfs_globals {
96 int log_recovery_delay; /* log recovery delay (secs) */
97};
98extern struct xfs_globals xfs_globals;
99
95#ifdef CONFIG_SYSCTL 100#ifdef CONFIG_SYSCTL
96extern int xfs_sysctl_register(void); 101extern int xfs_sysctl_register(void);
97extern void xfs_sysctl_unregister(void); 102extern void xfs_sysctl_unregister(void);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 9835139ce1ec..aa03670851d8 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -51,6 +51,80 @@ struct kobj_type xfs_mp_ktype = {
51 .release = xfs_sysfs_release, 51 .release = xfs_sysfs_release,
52}; 52};
53 53
54#ifdef DEBUG
55/* debug */
56
57STATIC ssize_t
58log_recovery_delay_store(
59 const char *buf,
60 size_t count,
61 void *data)
62{
63 int ret;
64 int val;
65
66 ret = kstrtoint(buf, 0, &val);
67 if (ret)
68 return ret;
69
70 if (val < 0 || val > 60)
71 return -EINVAL;
72
73 xfs_globals.log_recovery_delay = val;
74
75 return count;
76}
77
78STATIC ssize_t
79log_recovery_delay_show(
80 char *buf,
81 void *data)
82{
83 return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
84}
85XFS_SYSFS_ATTR_RW(log_recovery_delay);
86
87static struct attribute *xfs_dbg_attrs[] = {
88 ATTR_LIST(log_recovery_delay),
89 NULL,
90};
91
92STATIC ssize_t
93xfs_dbg_show(
94 struct kobject *kobject,
95 struct attribute *attr,
96 char *buf)
97{
98 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
99
100 return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
101}
102
103STATIC ssize_t
104xfs_dbg_store(
105 struct kobject *kobject,
106 struct attribute *attr,
107 const char *buf,
108 size_t count)
109{
110 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
111
112 return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
113}
114
115static struct sysfs_ops xfs_dbg_ops = {
116 .show = xfs_dbg_show,
117 .store = xfs_dbg_store,
118};
119
120struct kobj_type xfs_dbg_ktype = {
121 .release = xfs_sysfs_release,
122 .sysfs_ops = &xfs_dbg_ops,
123 .default_attrs = xfs_dbg_attrs,
124};
125
126#endif /* DEBUG */
127
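Assuming xfs_kset is registered under /sys/fs (the registration site is not shown in this hunk), the knob above would be driven from userspace on a DEBUG build with something like echo 10 > /sys/fs/xfs/debug/log_recovery_delay before mounting; the store handler rejects anything outside the 0-60 second range with -EINVAL.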
54/* xlog */ 128/* xlog */
55 129
56STATIC ssize_t 130STATIC ssize_t
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 54a2091183c0..240eee35f342 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -20,6 +20,7 @@
20#define __XFS_SYSFS_H__ 20#define __XFS_SYSFS_H__
21 21
22extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ 22extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
23extern struct kobj_type xfs_dbg_ktype; /* debug */
23extern struct kobj_type xfs_log_ktype; /* xlog */ 24extern struct kobj_type xfs_log_ktype; /* xlog */
24 25
25static inline struct xfs_kobj * 26static inline struct xfs_kobj *
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 152f82782630..51372e34d988 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free);
349DEFINE_BUF_EVENT(xfs_buf_hold); 349DEFINE_BUF_EVENT(xfs_buf_hold);
350DEFINE_BUF_EVENT(xfs_buf_rele); 350DEFINE_BUF_EVENT(xfs_buf_rele);
351DEFINE_BUF_EVENT(xfs_buf_iodone); 351DEFINE_BUF_EVENT(xfs_buf_iodone);
352DEFINE_BUF_EVENT(xfs_buf_iorequest); 352DEFINE_BUF_EVENT(xfs_buf_submit);
353DEFINE_BUF_EVENT(xfs_buf_submit_wait);
353DEFINE_BUF_EVENT(xfs_buf_bawrite); 354DEFINE_BUF_EVENT(xfs_buf_bawrite);
354DEFINE_BUF_EVENT(xfs_buf_lock); 355DEFINE_BUF_EVENT(xfs_buf_lock);
355DEFINE_BUF_EVENT(xfs_buf_lock_done); 356DEFINE_BUF_EVENT(xfs_buf_lock_done);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 96c898e7ac9a..e2b2216b1635 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -318,20 +318,10 @@ xfs_trans_read_buf_map(
318 XFS_BUF_READ(bp); 318 XFS_BUF_READ(bp);
319 bp->b_ops = ops; 319 bp->b_ops = ops;
320 320
321 /* 321 error = xfs_buf_submit_wait(bp);
322 * XXX(hch): clean up the error handling here to be less
323 * of a mess..
324 */
325 if (XFS_FORCED_SHUTDOWN(mp)) {
326 trace_xfs_bdstrat_shut(bp, _RET_IP_);
327 xfs_bioerror_relse(bp);
328 } else {
329 xfs_buf_iorequest(bp);
330 }
331
332 error = xfs_buf_iowait(bp);
333 if (error) { 322 if (error) {
334 xfs_buf_ioerror_alert(bp, __func__); 323 if (!XFS_FORCED_SHUTDOWN(mp))
324 xfs_buf_ioerror_alert(bp, __func__);
335 xfs_buf_relse(bp); 325 xfs_buf_relse(bp);
336 /* 326 /*
337 * We can gracefully recover from most read 327 * We can gracefully recover from most read
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 50c3f5614288..cdb4d86520e1 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -70,7 +70,7 @@ xfs_trans_ichgtime(
70 int flags) 70 int flags)
71{ 71{
72 struct inode *inode = VFS_I(ip); 72 struct inode *inode = VFS_I(ip);
73 timespec_t tv; 73 struct timespec tv;
74 74
75 ASSERT(tp); 75 ASSERT(tp);
76 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 76 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));