6 files changed, 102 insertions, 115 deletions
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 6ce828e0e17b..3b5b46b8e3b9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1064,7 +1064,7 @@ xfs_fs_write_inode(
                xfs_ilock(ip, XFS_ILOCK_SHARED);
                xfs_iflock(ip);
-                error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+                error = xfs_iflush(ip, SYNC_WAIT);
        } else {
                error = EAGAIN;
                if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
@@ -1072,7 +1072,7 @@ xfs_fs_write_inode(
                if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
                        goto out_unlock;
-                error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+                error = xfs_iflush(ip, 0);
        }
 out_unlock:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 525260c7617f..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -270,8 +270,7 @@ xfs_sync_inode_attr(
                goto out_unlock;
        }
-        error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
+        error = xfs_iflush(ip, flags);
-                           XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
 out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -460,16 +459,18 @@ xfs_quiesce_fs(
 {
        int     count = 0, pincount;
+        xfs_reclaim_inodes(mp, 0);
        xfs_flush_buftarg(mp->m_ddev_targp, 0);
-        xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
        /*
         * This loop must run at least twice.  The first instance of the loop
         * will flush most meta data but that will generate more meta data
         * (typically directory updates).  Which then must be flushed and
-         * logged before we can write the unmount record.
+         * logged before we can write the unmount record. We also do sync
+         * reclaim of inodes to catch any that the above delwri flush skipped.
         */
        do {
+                xfs_reclaim_inodes(mp, SYNC_WAIT);
                xfs_sync_attr(mp, SYNC_WAIT);
                pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
                if (!pincount) {
@@ -585,7 +586,7 @@ xfs_sync_worker(
        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
                xfs_log_force(mp, 0);
-                xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+                xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
                error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag(
 *      shutdown                EIO             unpin and reclaim
 *      clean, unpinned         0               reclaim
 *      stale, unpinned         0               reclaim
- *      clean, pinned(*)        0               unpin and reclaim
+ *      clean, pinned(*)        0               requeue
- *      stale, pinned           0               unpin and reclaim
+ *      stale, pinned           EAGAIN          requeue
- *      dirty, async            0               block on flush lock, reclaim
+ *      dirty, delwri ok        0               requeue
- *      dirty, sync flush       0               block on flush lock, reclaim
+ *      dirty, delwri blocked   EAGAIN          requeue
+ *      dirty, sync flush       0               reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background reclaim, we just requeue the inode for the next pass.
+ *
 * Hence the order of actions after gaining the locks should be:
 *      bad             => reclaim
 *      shutdown        => unpin and reclaim
- *      pinned          => unpin
+ *      pinned, delwri  => requeue
+ *      pinned, sync    => unpin
 *      stale           => reclaim
 *      clean           => reclaim
- *      dirty           => flush, wait and reclaim
+ *      dirty, delwri   => flush and requeue
+ *      dirty, sync     => flush, wait and reclaim
 */
 STATIC int
 xfs_reclaim_inode(
@@ -741,7 +763,7 @@ xfs_reclaim_inode(
        struct xfs_perag        *pag,
        int                     sync_mode)
 {
-        int     error;
+        int     error = 0;
        /*
         * The radix tree lock here protects a thread in xfs_iget from racing
@@ -761,7 +783,11 @@ xfs_reclaim_inode(
        write_unlock(&pag->pag_ici_lock);
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_iflock(ip);
+        if (!xfs_iflock_nowait(ip)) {
+                if (!(sync_mode & SYNC_WAIT))
+                        goto out;
+                xfs_iflock(ip);
+        }
        if (is_bad_inode(VFS_I(ip)))
                goto reclaim;
@@ -769,8 +795,13 @@ xfs_reclaim_inode(
                xfs_iunpin_wait(ip);
                goto reclaim;
        }
-        if (xfs_ipincount(ip))
+        if (xfs_ipincount(ip)) {
+                if (!(sync_mode & SYNC_WAIT)) {
+                        xfs_ifunlock(ip);
+                        goto out;
+                }
                xfs_iunpin_wait(ip);
+        }
        if (xfs_iflags_test(ip, XFS_ISTALE))
                goto reclaim;
        if (xfs_inode_clean(ip))
@@ -778,27 +809,43 @@ xfs_reclaim_inode(
        /* Now we have an inode that needs flushing */
        error = xfs_iflush(ip, sync_mode);
-        if (!error) {
+        if (sync_mode & SYNC_WAIT) {
-                switch(sync_mode) {
+                xfs_iflock(ip);
-                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
+                goto reclaim;
-                case XFS_IFLUSH_DELWRI:
-                case XFS_IFLUSH_ASYNC:
-                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-                case XFS_IFLUSH_SYNC:
-                        /* IO issued, synchronise with IO completion */
-                        xfs_iflock(ip);
-                        break;
-                default:
-                        ASSERT(0);
-                        break;
-                }
        }
+        /*
+         * When we have to flush an inode but don't have SYNC_WAIT set, we
+         * flush the inode out using a delwri buffer and wait for the next
+         * call into reclaim to find it in a clean state instead of waiting for
+         * it now. We also don't return errors here - if the error is transient
+         * then the next reclaim pass will flush the inode, and if the error
+         * is permanent then the next sync reclaim will reclaim the inode and
+         * pass on the error.
+         */
+        if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+                        "inode 0x%llx background reclaim flush failed with %d",
+                        (long long)ip->i_ino, error);
+        }
+out:
+        xfs_iflags_clear(ip, XFS_IRECLAIM);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        /*
+         * We could return EAGAIN here to make reclaim rescan the inode tree in
+         * a short while. However, this just burns CPU time scanning the tree
+         * waiting for IO to complete and xfssyncd never goes back to the idle
+         * state. Instead, return 0 to let the next scheduled background reclaim
+         * attempt to reclaim the inode again.
+         */
+        return 0;
 reclaim:
        xfs_ifunlock(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        xfs_ireclaim(ip);
-        return 0;
+        return error;
 }
 int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8d0666dd170a..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,8 +2835,6 @@ xfs_iflush(
        xfs_dinode_t            *dip;
        xfs_mount_t             *mp;
        int                     error;
-        int                     noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
-        enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
        XFS_STATS_INC(xs_iflush_count);
@@ -2859,7 +2857,7 @@ xfs_iflush(
         * in the same cluster are dirty, they will probably write the inode
         * out for us if they occur after the log force completes.
         */
-        if (noblock && xfs_ipincount(ip)) {
+        if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
                xfs_iunpin_nowait(ip);
                xfs_ifunlock(ip);
                return EAGAIN;
@@ -2893,60 +2891,10 @@ xfs_iflush(
        }
        /*
-         * Decide how buffer will be flushed out.  This is done before
-         * the call to xfs_iflush_int because this field is zeroed by it.
-         */
-        if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-                /*
-                 * Flush out the inode buffer according to the directions
-                 * of the caller.  In the cases where the caller has given
-                 * us a choice choose the non-delwri case.  This is because
-                 * the inode is in the AIL and we need to get it out soon.
-                 */
-                switch (flags) {
-                case XFS_IFLUSH_SYNC:
-                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-                        flags = 0;
-                        break;
-                case XFS_IFLUSH_ASYNC_NOBLOCK:
-                case XFS_IFLUSH_ASYNC:
-                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-                        flags = INT_ASYNC;
-                        break;
-                case XFS_IFLUSH_DELWRI:
-                        flags = INT_DELWRI;
-                        break;
-                default:
-                        ASSERT(0);
-                        flags = 0;
-                        break;
-                }
-        } else {
-                switch (flags) {
-                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-                case XFS_IFLUSH_DELWRI:
-                        flags = INT_DELWRI;
-                        break;
-                case XFS_IFLUSH_ASYNC_NOBLOCK:
-                case XFS_IFLUSH_ASYNC:
-                        flags = INT_ASYNC;
-                        break;
-                case XFS_IFLUSH_SYNC:
-                        flags = 0;
-                        break;
-                default:
-                        ASSERT(0);
-                        flags = 0;
-                        break;
-                }
-        }
-        /*
         * Get the buffer containing the on-disk inode.
         */
        error = xfs_itobp(mp, NULL, ip, &dip, &bp,
-                                noblock ? XBF_TRYLOCK : XBF_LOCK);
+                                (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
        if (error || !bp) {
                xfs_ifunlock(ip);
                return error;
@@ -2974,13 +2922,10 @@ xfs_iflush(
        if (error)
                goto cluster_corrupt_out;
-        if (flags & INT_DELWRI) {
+        if (flags & SYNC_WAIT)
-                xfs_bdwrite(mp, bp);
-        } else if (flags & INT_ASYNC) {
-                error = xfs_bawrite(mp, bp);
-        } else {
                error = xfs_bwrite(mp, bp);
-        }
+        else
+                xfs_bdwrite(mp, bp);
        return error;
 corrupt_out:
@@ -3015,16 +2960,6 @@ xfs_iflush_int(
        iip = ip->i_itemp;
        mp = ip->i_mount;
-        /*
-         * If the inode isn't dirty, then just release the inode
-         * flush lock and do nothing.
-         */
-        if (xfs_inode_clean(ip)) {
-                xfs_ifunlock(ip);
-                return 0;
-        }
        /* set *dip = inode's place in the buffer */
        dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8b618ea4d692..6c912b027596 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_ILOCK_DEP(flags)    (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 /*
- * Flags for xfs_iflush()
- */
-#define XFS_IFLUSH_DELWRI_ELSE_SYNC     1
-#define XFS_IFLUSH_DELWRI_ELSE_ASYNC    2
-#define XFS_IFLUSH_SYNC                 3
-#define XFS_IFLUSH_ASYNC                4
-#define XFS_IFLUSH_DELWRI               5
-#define XFS_IFLUSH_ASYNC_NOBLOCK        6
-/*
 * Flags for xfs_itruncate_start().
 */
 #define XFS_ITRUNC_DEFINITE     0x1
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 48ec1c0b23ce..207553e82954 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -866,10 +866,14 @@ xfs_inode_item_push(
               iip->ili_format.ilf_fields != 0);
        /*
-         * Write out the inode.  The completion routine ('iflush_done') will
+         * Push the inode to its backing buffer. This will not remove the
-         * pull it from the AIL, mark it clean, unlock the flush lock.
+         * inode from the AIL - a further push will be required to trigger a
+         * buffer push. However, this allows all the dirty inodes to be pushed
+         * to the buffer before it is pushed to disk. The buffer IO completion
+         * will pull the inode from the AIL, mark it clean and unlock the flush
+         * lock.
         */
-        (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
+        (void) xfs_iflush(ip, 0);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5061149b2cc4..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1468,7 +1468,18 @@ xfs_unmountfs(
         * need to force the log first.
         */
        xfs_log_force(mp, XFS_LOG_SYNC);
-        xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+        /*
+         * Do a delwri reclaim pass first so that as many dirty inodes are
+         * queued up for IO as possible. Then flush the buffers before making
+         * a synchronous pass to ensure all the remaining inodes are reclaimed.
+         * This makes the reclaim process as quick as possible by avoiding
+         * synchronous writeout and blocking on inodes already in the delwri
+         * state as much as possible.
+         */
+        xfs_reclaim_inodes(mp, 0);
+        XFS_bflush(mp->m_ddev_targp);
+        xfs_reclaim_inodes(mp, SYNC_WAIT);
        xfs_qm_unmount(mp);

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 6ce828e0e17b..3b5b46b8e3b9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1064,7 +1064,7 @@ xfs_fs_write_inode(
1064	xfs_ilock(ip, XFS_ILOCK_SHARED);	1064	xfs_ilock(ip, XFS_ILOCK_SHARED);
1065	xfs_iflock(ip);	1065	xfs_iflock(ip);
1066		1066
1067	error = xfs_iflush(ip, XFS_IFLUSH_SYNC);	1067	error = xfs_iflush(ip, SYNC_WAIT);
1068	} else {	1068	} else {
1069	error = EAGAIN;	1069	error = EAGAIN;
1070	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))	1070	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
@@ -1072,7 +1072,7 @@ xfs_fs_write_inode(
1072	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))	1072	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1073	goto out_unlock;	1073	goto out_unlock;
1074		1074
1075	error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);	1075	error = xfs_iflush(ip, 0);
1076	}	1076	}
1077		1077
1078	out_unlock:	1078	out_unlock:


diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 525260c7617f..a9f6d20aff41 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -270,8 +270,7 @@ xfs_sync_inode_attr(
270	goto out_unlock;	270	goto out_unlock;
271	}	271	}
272		272
273	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?	273	error = xfs_iflush(ip, flags);
274	XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
275		274
276	out_unlock:	275	out_unlock:
277	xfs_iunlock(ip, XFS_ILOCK_SHARED);	276	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -460,16 +459,18 @@ xfs_quiesce_fs(
460	{	459	{
461	int count = 0, pincount;	460	int count = 0, pincount;
462		461
		462	xfs_reclaim_inodes(mp, 0);
463	xfs_flush_buftarg(mp->m_ddev_targp, 0);	463	xfs_flush_buftarg(mp->m_ddev_targp, 0);
464	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
465		464
466	/*	465	/*
467	* This loop must run at least twice. The first instance of the loop	466	* This loop must run at least twice. The first instance of the loop
468	* will flush most meta data but that will generate more meta data	467	* will flush most meta data but that will generate more meta data
469	* (typically directory updates). Which then must be flushed and	468	* (typically directory updates). Which then must be flushed and
469	* logged before we can write the unmount record.	468	* logged before we can write the unmount record. We also do sync
		470	* reclaim of inodes to catch any that the above delwri flush skipped.
471	*/	471	*/
472	do {	472	do {
		473	xfs_reclaim_inodes(mp, SYNC_WAIT);
473	xfs_sync_attr(mp, SYNC_WAIT);	474	xfs_sync_attr(mp, SYNC_WAIT);
474	pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);	475	pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
475	if (!pincount) {	476	if (!pincount) {
@@ -585,7 +586,7 @@ xfs_sync_worker(
585		586
586	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {	587	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
587	xfs_log_force(mp, 0);	588	xfs_log_force(mp, 0);
588	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);	589	xfs_reclaim_inodes(mp, 0);
589	/* dgc: errors ignored here */	590	/* dgc: errors ignored here */
590	error = xfs_qm_sync(mp, SYNC_TRYLOCK);	591	error = xfs_qm_sync(mp, SYNC_TRYLOCK);
591	error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);	592	error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag(
719	* shutdown EIO unpin and reclaim	720	* shutdown EIO unpin and reclaim
720	* clean, unpinned 0 reclaim	721	* clean, unpinned 0 reclaim
721	* stale, unpinned 0 reclaim	722	* stale, unpinned 0 reclaim
722	* clean, pinned(*) 0 unpin and reclaim	723	* clean, pinned(*) 0 requeue
723	* stale, pinned 0 unpin and reclaim	724	* stale, pinned EAGAIN requeue
724	* dirty, async 0 block on flush lock, reclaim	725	* dirty, delwri ok 0 requeue
725	* dirty, sync flush 0 block on flush lock, reclaim	726	* dirty, delwri blocked EAGAIN requeue
		727	* dirty, sync flush 0 reclaim
726	*	728	*
727	* (*) dgc: I don't think the clean, pinned state is possible but it gets	729	* (*) dgc: I don't think the clean, pinned state is possible but it gets
728	* handled anyway given the order of checks implemented.	730	* handled anyway given the order of checks implemented.
729	*	731	*
		732	* As can be seen from the table, the return value of xfs_iflush() is not
		733	* sufficient to correctly decide the reclaim action here. The checks in
		734	* xfs_iflush() might look like duplicates, but they are not.
		735	*
		736	* Also, because we get the flush lock first, we know that any inode that has
		737	* been flushed delwri has had the flush completed by the time we check that
		738	* the inode is clean. The clean inode check needs to be done before flushing
		739	* the inode delwri otherwise we would loop forever requeuing clean inodes as
		740	* we cannot tell apart a successful delwri flush and a clean inode from the
		741	* return value of xfs_iflush().
		742	*
		743	* Note that because the inode is flushed delayed write by background
		744	* writeback, the flush lock may already be held here and waiting on it can
		745	* result in very long latencies. Hence for sync reclaims, where we wait on the
		746	* flush lock, the caller should push out delayed write inodes first before
		747	* trying to reclaim them to minimise the amount of time spent waiting. For
		748	* background reclaim, we just requeue the inode for the next pass.
		749	*
730	* Hence the order of actions after gaining the locks should be:	750	* Hence the order of actions after gaining the locks should be:
731	* bad => reclaim	751	* bad => reclaim
732	* shutdown => unpin and reclaim	752	* shutdown => unpin and reclaim
733	* pinned => unpin	753	* pinned, delwri => requeue
		754	* pinned, sync => unpin
734	* stale => reclaim	755	* stale => reclaim
735	* clean => reclaim	756	* clean => reclaim
736	* dirty => flush, wait and reclaim	757	* dirty, delwri => flush and requeue
		758	* dirty, sync => flush, wait and reclaim
737	*/	759	*/
738	STATIC int	760	STATIC int
739	xfs_reclaim_inode(	761	xfs_reclaim_inode(
@@ -741,7 +763,7 @@ xfs_reclaim_inode(
741	struct xfs_perag *pag,	763	struct xfs_perag *pag,
742	int sync_mode)	764	int sync_mode)
743	{	765	{
744	int error;	766	int error = 0;
745		767
746	/*	768	/*
747	* The radix tree lock here protects a thread in xfs_iget from racing	769	* The radix tree lock here protects a thread in xfs_iget from racing
@@ -761,7 +783,11 @@ xfs_reclaim_inode(
761	write_unlock(&pag->pag_ici_lock);	783	write_unlock(&pag->pag_ici_lock);
762		784
763	xfs_ilock(ip, XFS_ILOCK_EXCL);	785	xfs_ilock(ip, XFS_ILOCK_EXCL);
764	xfs_iflock(ip);	786	if (!xfs_iflock_nowait(ip)) {
		787	if (!(sync_mode & SYNC_WAIT))
		788	goto out;
		789	xfs_iflock(ip);
		790	}
765		791
766	if (is_bad_inode(VFS_I(ip)))	792	if (is_bad_inode(VFS_I(ip)))
767	goto reclaim;	793	goto reclaim;
@@ -769,8 +795,13 @@ xfs_reclaim_inode(
769	xfs_iunpin_wait(ip);	795	xfs_iunpin_wait(ip);
770	goto reclaim;	796	goto reclaim;
771	}	797	}
772	if (xfs_ipincount(ip))	798	if (xfs_ipincount(ip)) {
		799	if (!(sync_mode & SYNC_WAIT)) {
		800	xfs_ifunlock(ip);
		801	goto out;
		802	}
773	xfs_iunpin_wait(ip);	803	xfs_iunpin_wait(ip);
		804	}
774	if (xfs_iflags_test(ip, XFS_ISTALE))	805	if (xfs_iflags_test(ip, XFS_ISTALE))
775	goto reclaim;	806	goto reclaim;
776	if (xfs_inode_clean(ip))	807	if (xfs_inode_clean(ip))
@@ -778,27 +809,43 @@ xfs_reclaim_inode(
778		809
779	/* Now we have an inode that needs flushing */	810	/* Now we have an inode that needs flushing */
780	error = xfs_iflush(ip, sync_mode);	811	error = xfs_iflush(ip, sync_mode);
781	if (!error) {	812	if (sync_mode & SYNC_WAIT) {
782	switch(sync_mode) {	813	xfs_iflock(ip);
783	case XFS_IFLUSH_DELWRI_ELSE_ASYNC:	814	goto reclaim;
784	case XFS_IFLUSH_DELWRI:
785	case XFS_IFLUSH_ASYNC:
786	case XFS_IFLUSH_DELWRI_ELSE_SYNC:
787	case XFS_IFLUSH_SYNC:
788	/* IO issued, synchronise with IO completion */
789	xfs_iflock(ip);
790	break;
791	default:
792	ASSERT(0);
793	break;
794	}
795	}	815	}
796		816
		817	/*
		818	* When we have to flush an inode but don't have SYNC_WAIT set, we
		819	* flush the inode out using a delwri buffer and wait for the next
		820	* call into reclaim to find it in a clean state instead of waiting for
		821	* it now. We also don't return errors here - if the error is transient
		822	* then the next reclaim pass will flush the inode, and if the error
		823	* is permanent then the next sync reclaim will reclaim the inode and
		824	* pass on the error.
		825	*/
		826	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		827	xfs_fs_cmn_err(CE_WARN, ip->i_mount,
		828	"inode 0x%llx background reclaim flush failed with %d",
		829	(long long)ip->i_ino, error);
		830	}
		831	out:
		832	xfs_iflags_clear(ip, XFS_IRECLAIM);
		833	xfs_iunlock(ip, XFS_ILOCK_EXCL);
		834	/*
		835	* We could return EAGAIN here to make reclaim rescan the inode tree in
		836	* a short while. However, this just burns CPU time scanning the tree
		837	* waiting for IO to complete and xfssyncd never goes back to the idle
		838	* state. Instead, return 0 to let the next scheduled background reclaim
		839	* attempt to reclaim the inode again.
		840	*/
		841	return 0;
		842
797	reclaim:	843	reclaim:
798	xfs_ifunlock(ip);	844	xfs_ifunlock(ip);
799	xfs_iunlock(ip, XFS_ILOCK_EXCL);	845	xfs_iunlock(ip, XFS_ILOCK_EXCL);
800	xfs_ireclaim(ip);	846	xfs_ireclaim(ip);
801	return 0;	847	return error;
		848
802	}	849	}
803		850
804	int	851	int


diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8d0666dd170a..fa31360046d4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c
@@ -2835,8 +2835,6 @@ xfs_iflush(
2835	xfs_dinode_t *dip;	2835	xfs_dinode_t *dip;
2836	xfs_mount_t *mp;	2836	xfs_mount_t *mp;
2837	int error;	2837	int error;
2838	int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2839	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2840		2838
2841	XFS_STATS_INC(xs_iflush_count);	2839	XFS_STATS_INC(xs_iflush_count);
2842		2840
@@ -2859,7 +2857,7 @@ xfs_iflush(
2859	* in the same cluster are dirty, they will probably write the inode	2857	* in the same cluster are dirty, they will probably write the inode
2860	* out for us if they occur after the log force completes.	2858	* out for us if they occur after the log force completes.
2861	*/	2859	*/
2862	if (noblock && xfs_ipincount(ip)) {	2860	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2863	xfs_iunpin_nowait(ip);	2861	xfs_iunpin_nowait(ip);
2864	xfs_ifunlock(ip);	2862	xfs_ifunlock(ip);
2865	return EAGAIN;	2863	return EAGAIN;
@@ -2893,60 +2891,10 @@ xfs_iflush(
2893	}	2891	}
2894		2892
2895	/*	2893	/*
2896	* Decide how buffer will be flushed out. This is done before
2897	* the call to xfs_iflush_int because this field is zeroed by it.
2898	*/
2899	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2900	/*
2901	* Flush out the inode buffer according to the directions
2902	* of the caller. In the cases where the caller has given
2903	* us a choice choose the non-delwri case. This is because
2904	* the inode is in the AIL and we need to get it out soon.
2905	*/
2906	switch (flags) {
2907	case XFS_IFLUSH_SYNC:
2908	case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2909	flags = 0;
2910	break;
2911	case XFS_IFLUSH_ASYNC_NOBLOCK:
2912	case XFS_IFLUSH_ASYNC:
2913	case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2914	flags = INT_ASYNC;
2915	break;
2916	case XFS_IFLUSH_DELWRI:
2917	flags = INT_DELWRI;
2918	break;
2919	default:
2920	ASSERT(0);
2921	flags = 0;
2922	break;
2923	}
2924	} else {
2925	switch (flags) {
2926	case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2927	case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2928	case XFS_IFLUSH_DELWRI:
2929	flags = INT_DELWRI;
2930	break;
2931	case XFS_IFLUSH_ASYNC_NOBLOCK:
2932	case XFS_IFLUSH_ASYNC:
2933	flags = INT_ASYNC;
2934	break;
2935	case XFS_IFLUSH_SYNC:
2936	flags = 0;
2937	break;
2938	default:
2939	ASSERT(0);
2940	flags = 0;
2941	break;
2942	}
2943	}
2944
2945	/*
2946	* Get the buffer containing the on-disk inode.	2894	* Get the buffer containing the on-disk inode.
2947	*/	2895	*/
2948	error = xfs_itobp(mp, NULL, ip, &dip, &bp,	2896	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2949	noblock ? XBF_TRYLOCK : XBF_LOCK);	2897	(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2950	if (error || !bp) {	2898	if (error || !bp) {
2951	xfs_ifunlock(ip);	2899	xfs_ifunlock(ip);
2952	return error;	2900	return error;
@@ -2974,13 +2922,10 @@ xfs_iflush(
2974	if (error)	2922	if (error)
2975	goto cluster_corrupt_out;	2923	goto cluster_corrupt_out;
2976		2924
2977	if (flags & INT_DELWRI) {	2925	if (flags & SYNC_WAIT)
2978	xfs_bdwrite(mp, bp);
2979	} else if (flags & INT_ASYNC) {
2980	error = xfs_bawrite(mp, bp);
2981	} else {
2982	error = xfs_bwrite(mp, bp);	2926	error = xfs_bwrite(mp, bp);
2983	}	2927	else
		2928	xfs_bdwrite(mp, bp);
2984	return error;	2929	return error;
2985		2930
2986	corrupt_out:	2931	corrupt_out:
@@ -3015,16 +2960,6 @@ xfs_iflush_int(
3015	iip = ip->i_itemp;	2960	iip = ip->i_itemp;
3016	mp = ip->i_mount;	2961	mp = ip->i_mount;
3017		2962
3018
3019	/*
3020	* If the inode isn't dirty, then just release the inode
3021	* flush lock and do nothing.
3022	*/
3023	if (xfs_inode_clean(ip)) {
3024	xfs_ifunlock(ip);
3025	return 0;
3026	}
3027
3028	/* set *dip = inode's place in the buffer */	2963	/* set *dip = inode's place in the buffer */
3029	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);	2964	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3030		2965


diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8b618ea4d692..6c912b027596 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
420	#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)	420	#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421		421
422	/*	422	/*
423	* Flags for xfs_iflush()
424	*/
425	#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
426	#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
427	#define XFS_IFLUSH_SYNC 3
428	#define XFS_IFLUSH_ASYNC 4
429	#define XFS_IFLUSH_DELWRI 5
430	#define XFS_IFLUSH_ASYNC_NOBLOCK 6
431
432	/*
433	* Flags for xfs_itruncate_start().	423	* Flags for xfs_itruncate_start().
434	*/	424	*/
435	#define XFS_ITRUNC_DEFINITE 0x1	425	#define XFS_ITRUNC_DEFINITE 0x1


diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 48ec1c0b23ce..207553e82954 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c
@@ -866,10 +866,14 @@ xfs_inode_item_push(
866	iip->ili_format.ilf_fields != 0);	866	iip->ili_format.ilf_fields != 0);
867		867
868	/*	868	/*
869	* Write out the inode. The completion routine ('iflush_done') will	869	* Push the inode to its backing buffer. This will not remove the
870	* pull it from the AIL, mark it clean, unlock the flush lock.	870	* inode from the AIL - a further push will be required to trigger a
		871	* buffer push. However, this allows all the dirty inodes to be pushed
		872	* to the buffer before it is pushed to disk. The buffer IO completion
		873	* will pull the inode from the AIL, mark it clean and unlock the flush
		874	* lock.
871	*/	875	*/
872	(void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);	876	(void) xfs_iflush(ip, 0);
873	xfs_iunlock(ip, XFS_ILOCK_SHARED);	877	xfs_iunlock(ip, XFS_ILOCK_SHARED);
874		878
875	return;	879	return;


diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5061149b2cc4..6afaaeb2950a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c
@@ -1468,7 +1468,18 @@ xfs_unmountfs(
1468	* need to force the log first.	1468	* need to force the log first.
1469	*/	1469	*/
1470	xfs_log_force(mp, XFS_LOG_SYNC);	1470	xfs_log_force(mp, XFS_LOG_SYNC);
1471	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);	1471
		1472	/*
		1473	* Do a delwri reclaim pass first so that as many dirty inodes are
		1474	* queued up for IO as possible. Then flush the buffers before making
		1475	* a synchronous pass to ensure all the remaining inodes are reclaimed.
		1476	* This makes the reclaim process as quick as possible by avoiding
		1477	* synchronous writeout and blocking on inodes already in the delwri
		1478	* state as much as possible.
		1479	*/
		1480	xfs_reclaim_inodes(mp, 0);
		1481	XFS_bflush(mp->m_ddev_targp);
		1482	xfs_reclaim_inodes(mp, SYNC_WAIT);
1472		1483
1473	xfs_qm_unmount(mp);	1484	xfs_qm_unmount(mp);
1474		1485