author    Dave Chinner <david@fromorbit.com>    2010-02-05 20:39:36 -0500
committer Dave Chinner <david@fromorbit.com>    2010-02-05 20:39:36 -0500
commit    c854363e80b49dd04a4de18ebc379eb8c8806674 (patch)
tree      8c8d0dec26d961631a3cd8b6c402b5d1444336e5
parent    777df5afdb26c71634edd60582be620ff94e87a0 (diff)
xfs: Use delayed write for inodes rather than async V2
We currently do background inode flush asynchronously, resulting in
inodes being written in whatever order the background writeback issues
them. Not only that, there are also blocking and non-blocking
asynchronous inode flushes, depending on where the flush comes from.

This patch completely removes asynchronous inode writeback. It removes
all the strange writeback modes and replaces them with either a
synchronous flush or a non-blocking delayed write flush. That is, inode
flushes will only issue IO directly if they are synchronous, and
background flushing may do nothing if the operation would block (e.g.
on a pinned inode or buffer lock).

Delayed write flushes will now result in the inode buffer sitting in
the delwri queue of the buffer cache to be flushed by either an AIL
push or by the xfsbufd timing out the buffer. This will allow
accumulation of dirty inode buffers in memory and allow optimisation
of inode cluster writeback at the xfsbufd level where we have much
greater queue depths than the block layer elevators. We will also get
adjacent inode cluster buffer IO merging for free when a later patch
in the series allows sorting of the delayed write buffers before
dispatch.

This effectively means that any inode that is written back by
background writeback will be seen as flush locked during AIL pushing,
and will result in the buffers being pushed from there. This writeback
path is currently non-optimal, but the next patch in the series will
fix that problem.

A side effect of this delayed write mechanism is that background inode
reclaim will no longer directly flush inodes, nor can it wait on the
flush lock. The result is that inode reclaim must leave the inode in
the reclaimable state until it is clean. Hence attempts to reclaim a
dirty inode in the background will simply skip the inode until it is
clean and this allows other mechanisms (i.e. xfsbufd) to do more
optimal writeback of the dirty buffers. As a result, the inode reclaim
code has been rewritten so that it no longer relies on the ambiguous
return values of xfs_iflush() to determine whether it is safe to
reclaim an inode.

Portions of this patch are derived from patches by Christoph Hellwig.

Version 2:
- cleanup reclaim code as suggested by Christoph
- log background reclaim inode flush errors
- just pass sync flags to xfs_iflush

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
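In short, the six XFS_IFLUSH_* modes collapse into a two-value
interface. A minimal sketch of the caller-facing contract after this
patch (condensed from the hunks below; not a new API, just the two
surviving call shapes):

	/* Synchronous flush: take the inode cluster buffer lock, issue
	 * the write immediately and return the IO error. */
	error = xfs_iflush(ip, SYNC_WAIT);

	/* Non-blocking delayed write flush: queue the inode buffer on
	 * the delwri queue for xfsbufd/AIL pushing; returns EAGAIN
	 * rather than blocking on a pinned inode or a contended buffer
	 * lock. */
	error = xfs_iflush(ip, 0);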
-rw-r--r--	fs/xfs/linux-2.6/xfs_super.c	  4
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	105
-rw-r--r--	fs/xfs/xfs_inode.c		 75
-rw-r--r--	fs/xfs/xfs_inode.h		 10
-rw-r--r--	fs/xfs/xfs_inode_item.c		 10
-rw-r--r--	fs/xfs/xfs_mount.c		 13
6 files changed, 102 insertions, 115 deletions
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 6ce828e0e17b..3b5b46b8e3b9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1064,7 +1064,7 @@ xfs_fs_write_inode(
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		xfs_iflock(ip);
 
-		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+		error = xfs_iflush(ip, SYNC_WAIT);
 	} else {
 		error = EAGAIN;
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
@@ -1072,7 +1072,7 @@ xfs_fs_write_inode(
 		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
 			goto out_unlock;
 
-		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+		error = xfs_iflush(ip, 0);
 	}
 
  out_unlock:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 525260c7617f..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -270,8 +270,7 @@ xfs_sync_inode_attr(
 		goto out_unlock;
 	}
 
-	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
-			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+	error = xfs_iflush(ip, flags);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -460,16 +459,18 @@ xfs_quiesce_fs(
 {
 	int	count = 0, pincount;
 
+	xfs_reclaim_inodes(mp, 0);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
 	/*
 	 * This loop must run at least twice. The first instance of the loop
 	 * will flush most meta data but that will generate more meta data
 	 * (typically directory updates). Which then must be flushed and
-	 * logged before we can write the unmount record.
+	 * logged before we can write the unmount record. We also do sync
+	 * reclaim of inodes to catch any that the above delwri flush skipped.
 	 */
 	do {
+		xfs_reclaim_inodes(mp, SYNC_WAIT);
 		xfs_sync_attr(mp, SYNC_WAIT);
 		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
 		if (!pincount) {
@@ -585,7 +586,7 @@ xfs_sync_worker(
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag(
  *	shutdown		EIO		unpin and reclaim
  *	clean, unpinned		0		reclaim
  *	stale, unpinned		0		reclaim
- *	clean, pinned(*)	0		unpin and reclaim
- *	stale, pinned		0		unpin and reclaim
- *	dirty, async		0		block on flush lock, reclaim
- *	dirty, sync flush	0		block on flush lock, reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, delwri ok	0		requeue
+ *	dirty, delwri blocked	EAGAIN		requeue
+ *	dirty, sync flush	0		reclaim
  *
  * (*) dgc: I don't think the clean, pinned state is possible but it gets
  * handled anyway given the order of checks implemented.
  *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background reclaim, we just requeue the inode for the next pass.
+ *
  * Hence the order of actions after gaining the locks should be:
  *	bad		=> reclaim
  *	shutdown	=> unpin and reclaim
- *	pinned		=> unpin
+ *	pinned, delwri	=> requeue
+ *	pinned, sync	=> unpin
  *	stale		=> reclaim
  *	clean		=> reclaim
- *	dirty		=> flush, wait and reclaim
+ *	dirty, delwri	=> flush and requeue
+ *	dirty, sync	=> flush, wait and reclaim
  */
 STATIC int
 xfs_reclaim_inode(
@@ -741,7 +763,7 @@ xfs_reclaim_inode(
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
-	int	error;
+	int	error = 0;
 
 	/*
 	 * The radix tree lock here protects a thread in xfs_iget from racing
@@ -761,7 +783,11 @@
 	write_unlock(&pag->pag_ici_lock);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_iflock(ip);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
 
 	if (is_bad_inode(VFS_I(ip)))
 		goto reclaim;
@@ -769,8 +795,13 @@
 		xfs_iunpin_wait(ip);
 		goto reclaim;
 	}
-	if (xfs_ipincount(ip))
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT)) {
+			xfs_ifunlock(ip);
+			goto out;
+		}
 		xfs_iunpin_wait(ip);
+	}
 	if (xfs_iflags_test(ip, XFS_ISTALE))
 		goto reclaim;
 	if (xfs_inode_clean(ip))
@@ -778,27 +809,43 @@ xfs_reclaim_inode(
 
 	/* Now we have an inode that needs flushing */
 	error = xfs_iflush(ip, sync_mode);
-	if (!error) {
-		switch(sync_mode) {
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-		case XFS_IFLUSH_DELWRI:
-		case XFS_IFLUSH_ASYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-		case XFS_IFLUSH_SYNC:
-			/* IO issued, synchronise with IO completion */
-			xfs_iflock(ip);
-			break;
-		default:
-			ASSERT(0);
-			break;
-		}
+	if (sync_mode & SYNC_WAIT) {
+		xfs_iflock(ip);
+		goto reclaim;
 	}
 
+	/*
+	 * When we have to flush an inode but don't have SYNC_WAIT set, we
+	 * flush the inode out using a delwri buffer and wait for the next
+	 * call into reclaim to find it in a clean state instead of waiting for
+	 * it now. We also don't return errors here - if the error is transient
+	 * then the next reclaim pass will flush the inode, and if the error
+	 * is permanent then the next sync reclaim will reclaim the inode and
+	 * pass on the error.
+	 */
+	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"inode 0x%llx background reclaim flush failed with %d",
+			(long long)ip->i_ino, error);
+	}
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
+
 reclaim:
 	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_ireclaim(ip);
-	return 0;
+	return error;
+
 }
 
 int
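The blocking/non-blocking split above gives rise to a two-pass reclaim
pattern, used by xfs_quiesce_fs() here and by xfs_unmountfs() in the
xfs_mount.c hunk further down: queue as many dirty inodes as possible
for delayed write, flush the buffer delwri queue, then make a
synchronous pass to catch the stragglers. A condensed sketch of the
quiesce variant (in quiesce the blocking pass actually sits inside the
metadata-flush loop, and the unmount path uses XFS_bflush() for the
middle step):

	xfs_reclaim_inodes(mp, 0);		/* pass 1: delwri flush,
						 * skip anything that
						 * would block */
	xfs_flush_buftarg(mp->m_ddev_targp, 0);	/* push delwri buffers */
	xfs_reclaim_inodes(mp, SYNC_WAIT);	/* pass 2: block on and
						 * reclaim the rest */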
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8d0666dd170a..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,8 +2835,6 @@ xfs_iflush(
 	xfs_dinode_t	*dip;
 	xfs_mount_t	*mp;
 	int		error;
-	int		noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
-	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -2859,7 +2857,7 @@ xfs_iflush(
 	 * in the same cluster are dirty, they will probably write the inode
 	 * out for us if they occur after the log force completes.
 	 */
-	if (noblock && xfs_ipincount(ip)) {
+	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
 		xfs_iunpin_nowait(ip);
 		xfs_ifunlock(ip);
 		return EAGAIN;
@@ -2893,60 +2891,10 @@ xfs_iflush(
 	}
 
 	/*
-	 * Decide how buffer will be flushed out. This is done before
-	 * the call to xfs_iflush_int because this field is zeroed by it.
-	 */
-	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-		/*
-		 * Flush out the inode buffer according to the directions
-		 * of the caller. In the cases where the caller has given
-		 * us a choice choose the non-delwri case. This is because
-		 * the inode is in the AIL and we need to get it out soon.
-		 */
-		switch (flags) {
-		case XFS_IFLUSH_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-			flags = 0;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	} else {
-		switch (flags) {
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_SYNC:
-			flags = 0;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	}
-
-	/*
 	 * Get the buffer containing the on-disk inode.
 	 */
 	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
-			noblock ? XBF_TRYLOCK : XBF_LOCK);
+				(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
 	if (error || !bp) {
 		xfs_ifunlock(ip);
 		return error;
@@ -2974,13 +2922,10 @@ xfs_iflush(
 	if (error)
 		goto cluster_corrupt_out;
 
-	if (flags & INT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else if (flags & INT_ASYNC) {
-		error = xfs_bawrite(mp, bp);
-	} else {
+	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	}
+	else
+		xfs_bdwrite(mp, bp);
 	return error;
 
 corrupt_out:
@@ -3015,16 +2960,6 @@ xfs_iflush_int(
 	iip = ip->i_itemp;
 	mp = ip->i_mount;
 
-
-	/*
-	 * If the inode isn't dirty, then just release the inode
-	 * flush lock and do nothing.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		return 0;
-	}
-
 	/* set *dip = inode's place in the buffer */
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
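For reference, the call-site conversions in this patch imply the
following mapping from the retired modes onto the new interface. A
hypothetical compat shim, for illustration only - it is not part of
the patch, the XFS_IFLUSH_* macros themselves are deleted in the
xfs_inode.h hunk below, and XFS_IFLUSH_DELWRI_ELSE_SYNC has no caller
left in this diff to convert:

	/* Illustration only: how the retired flush modes line up with
	 * the new two-value interface, inferred from the conversions
	 * in this patch. */
	static inline int
	xfs_iflush_compat_flags(int old_mode)
	{
		switch (old_mode) {
		case XFS_IFLUSH_SYNC:
			/* the only blocking mode: issue IO and wait */
			return SYNC_WAIT;
		default:
			/* async/delwri variants all become
			 * non-blocking delayed writes */
			return 0;
		}
	}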
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8b618ea4d692..6c912b027596 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 
 /*
- * Flags for xfs_iflush()
- */
-#define	XFS_IFLUSH_DELWRI_ELSE_SYNC	1
-#define	XFS_IFLUSH_DELWRI_ELSE_ASYNC	2
-#define	XFS_IFLUSH_SYNC			3
-#define	XFS_IFLUSH_ASYNC		4
-#define	XFS_IFLUSH_DELWRI		5
-#define	XFS_IFLUSH_ASYNC_NOBLOCK	6
-
-/*
  * Flags for xfs_itruncate_start().
  */
 #define	XFS_ITRUNC_DEFINITE	0x1
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 48ec1c0b23ce..207553e82954 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -866,10 +866,14 @@ xfs_inode_item_push(
 	       iip->ili_format.ilf_fields != 0);
 
 	/*
-	 * Write out the inode. The completion routine ('iflush_done') will
-	 * pull it from the AIL, mark it clean, unlock the flush lock.
+	 * Push the inode to its backing buffer. This will not remove the
+	 * inode from the AIL - a further push will be required to trigger a
+	 * buffer push. However, this allows all the dirty inodes to be pushed
+	 * to the buffer before it is pushed to disk. The buffer IO completion
+	 * will pull the inode from the AIL, mark it clean and unlock the flush
+	 * lock.
 	 */
-	(void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
+	(void) xfs_iflush(ip, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	return;
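The two-stage writeback this comment describes can be summarised as
the following flow (a simplified sketch; xfs_iflush_done() is the
completion routine named in the old comment):

	/*
	 * AIL push #1: xfs_inode_item_push() -> xfs_iflush(ip, 0)
	 *	The inode is copied into its cluster buffer and the
	 *	buffer is queued delwri; the inode stays flush locked
	 *	in the AIL.
	 * AIL push #2 (or xfsbufd timeout):
	 *	The buffer itself is pushed and its IO is issued.
	 * Buffer IO completion:
	 *	xfs_iflush_done() pulls the inode from the AIL, marks
	 *	it clean and drops the flush lock.
	 */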
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5061149b2cc4..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1468,7 +1468,18 @@ xfs_unmountfs(
 	 * need to force the log first.
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+
+	/*
+	 * Do a delwri reclaim pass first so that as many dirty inodes are
+	 * queued up for IO as possible. Then flush the buffers before making
+	 * a synchronous pass to catch all the remaining inodes and reclaim
+	 * them. This makes the reclaim process as quick as possible by
+	 * avoiding synchronous writeout and blocking on inodes already in the
+	 * delwri state as much as possible.
+	 */
+	xfs_reclaim_inodes(mp, 0);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
 