xfs: Don't issue buffer IO direct from AIL push V2

All buffers logged into the AIL are marked as delayed write. When the AIL needs to push the buffer out, it issues an async write of the buffer. This means that IO patterns are dependent on the order of buffers in the AIL.

Instead of flushing the buffer, promote the buffer in the delayed write list so that the next time the xfsbufd is run the buffer will be flushed by the xfsbufd. Return the state to the xfsaild that the buffer was promoted so that the xfsaild knows that it needs to cause the xfsbufd to run to flush the buffers that were promoted.

Using the xfsbufd for issuing the IO allows us to dispatch all buffer IO from the one queue. This means that we can make much more enlightened decisions on what order to flush buffers to disk as we don't have multiple places issuing IO. Optimisations to xfsbufd will be in a future patch.

Version 2
- kill XFS_ITEM_FLUSHING as it is now unused.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
author: Dave Chinner <david@fromorbit.com> 2010-02-01 18:13:42 -0500
committer: Dave Chinner <david@fromorbit.com> 2010-02-01 18:13:42 -0500
commit: d808f617ad00a413585b806de340feda5ad9a2da (patch)
tree: ed03d4d019a9d8b566ffd454e112e9fbce70bad8 /fs/xfs/xfs_inode_item.c
parent: c854363e80b49dd04a4de18ebc379eb8c8806674 (diff)
1 files changed, 15 insertions, 83 deletions
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 207553e8295..d4dc063111f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -602,33 +602,20 @@ xfs_inode_item_trylock(
        if (!xfs_iflock_nowait(ip)) {
                /*
-                 * If someone else isn't already trying to push the inode
+                 * inode has already been flushed to the backing buffer,
-                 * buffer, we get to do it.
+                 * leave it locked in shared mode, pushbuf routine will
+                 * unlock it.
                 */
-                if (iip->ili_pushbuf_flag == 0) {
+                return XFS_ITEM_PUSHBUF;
-                        iip->ili_pushbuf_flag = 1;
-#ifdef DEBUG
-                        iip->ili_push_owner = current_pid();
-#endif
-                        /*
-                         * Inode is left locked in shared mode.
-                         * Pushbuf routine gets to unlock it.
-                         */
-                        return XFS_ITEM_PUSHBUF;
-                } else {
-                        /*
-                         * We hold the AIL lock, so we must specify the
-                         * NONOTIFY flag so that we won't double trip.
-                         */
-                        xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
-                        return XFS_ITEM_FLUSHING;
-                }
-                /* NOTREACHED */
        }
        /* Stale items should force out the iclog */
        if (ip->i_flags & XFS_ISTALE) {
                xfs_ifunlock(ip);
+                /*
+                 * we hold the AIL lock - notify the unlock routine of this
+                 * so it doesn't try to get the lock again.
+                 */
                xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
                return XFS_ITEM_PINNED;
        }
@@ -746,11 +733,8 @@ xfs_inode_item_committed(
 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
 * failed to get the inode flush lock but did get the inode locked SHARED.
 * Here we're trying to see if the inode buffer is incore, and if so whether it's
- * marked delayed write. If that's the case, we'll initiate a bawrite on that
+ * marked delayed write. If that's the case, we'll promote it and that will
- * buffer to expedite the process.
+ * allow the caller to write the buffer by triggering the xfsbufd to run.
- *
- * We aren't holding the AIL lock (or the flush lock) when this gets called,
- * so it is inherently race-y.
 */
 STATIC void
 xfs_inode_item_pushbuf(
@@ -759,26 +743,16 @@ xfs_inode_item_pushbuf(
        xfs_inode_t     *ip;
        xfs_mount_t     *mp;
        xfs_buf_t       *bp;
-        uint            dopush;
        ip = iip->ili_inode;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
        /*
-         * The ili_pushbuf_flag keeps others from
-         * trying to duplicate our effort.
-         */
-        ASSERT(iip->ili_pushbuf_flag != 0);
-        ASSERT(iip->ili_push_owner == current_pid());
-        /*
         * If a flush is not in progress anymore, chances are that the
         * inode was taken off the AIL. So, just get out.
         */
        if (completion_done(&ip->i_flush) ||
            ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-                iip->ili_pushbuf_flag = 0;
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return;
        }
@@ -787,53 +761,12 @@ xfs_inode_item_pushbuf(
        bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
                    iip->ili_format.ilf_len, XBF_TRYLOCK);
-        if (bp != NULL) {
-                if (XFS_BUF_ISDELAYWRITE(bp)) {
-                        /*
-                         * We were racing with iflush because we don't hold
-                         * the AIL lock or the flush lock. However, at this point,
-                         * we have the buffer, and we know that it's dirty.
-                         * So, it's possible that iflush raced with us, and
-                         * this item is already taken off the AIL.
-                         * If not, we can flush it async.
-                         */
-                        dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-                                  !completion_done(&ip->i_flush));
-                        iip->ili_pushbuf_flag = 0;
-                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                        trace_xfs_inode_item_push(bp, _RET_IP_);
-                        if (XFS_BUF_ISPINNED(bp))
-                                xfs_log_force(mp, 0);
-                        if (dopush) {
-                                int     error;
-                                error = xfs_bawrite(mp, bp);
-                                if (error)
-                                        xfs_fs_cmn_err(CE_WARN, mp,
-                "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
-                                                        error, iip, bp);
-                        } else {
-                                xfs_buf_relse(bp);
-                        }
-                } else {
-                        iip->ili_pushbuf_flag = 0;
-                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                        xfs_buf_relse(bp);
-                }
-                return;
-        }
-        /*
-         * We have to be careful about resetting pushbuf flag too early (above).
-         * Even though in theory we can do it as soon as we have the buflock,
-         * we don't want others to be doing work needlessly. They'll come to
-         * this function thinking that pushing the buffer is their
-         * responsibility only to find that the buffer is still locked by
-         * another doing the same thing
-         */
-        iip->ili_pushbuf_flag = 0;
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
+        if (!bp)
+                return;
+        if (XFS_BUF_ISDELAYWRITE(bp))
+                xfs_buf_delwri_promote(bp);
+        xfs_buf_relse(bp);
        return;
 }
@@ -937,7 +870,6 @@ xfs_inode_item_init(
        /*
           We have zeroed memory. No need ...
           iip->ili_extents_buf = NULL;
-           iip->ili_pushbuf_flag = 0;
         */
        iip->ili_format.ilf_type = XFS_LI_INODE;
author	Dave Chinner <david@fromorbit.com>	2010-02-01 18:13:42 -0500
committer	Dave Chinner <david@fromorbit.com>	2010-02-01 18:13:42 -0500
commit	d808f617ad00a413585b806de340feda5ad9a2da (patch)
tree	ed03d4d019a9d8b566ffd454e112e9fbce70bad8 /fs/xfs/xfs_inode_item.c
parent	c854363e80b49dd04a4de18ebc379eb8c8806674 (diff)

diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 207553e8295..d4dc063111f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -602,33 +602,20 @@ xfs_inode_item_trylock(
602		602
603	if (!xfs_iflock_nowait(ip)) {	603	if (!xfs_iflock_nowait(ip)) {
604	/*	604	/*
605	* If someone else isn't already trying to push the inode	605	* inode has already been flushed to the backing buffer,
606	* buffer, we get to do it.	606	* leave it locked in shared mode, pushbuf routine will
		607	* unlock it.
607	*/	608	*/
608	if (iip->ili_pushbuf_flag == 0) {	609	return XFS_ITEM_PUSHBUF;
609	iip->ili_pushbuf_flag = 1;
610	#ifdef DEBUG
611	iip->ili_push_owner = current_pid();
612	#endif
613	/*
614	* Inode is left locked in shared mode.
615	* Pushbuf routine gets to unlock it.
616	*/
617	return XFS_ITEM_PUSHBUF;
618	} else {
619	/*
620	* We hold the AIL lock, so we must specify the
621	* NONOTIFY flag so that we won't double trip.
622	*/
623	xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
624	return XFS_ITEM_FLUSHING;
625	}
626	/* NOTREACHED */
627	}	610	}
628		611
629	/* Stale items should force out the iclog */	612	/* Stale items should force out the iclog */
630	if (ip->i_flags & XFS_ISTALE) {	613	if (ip->i_flags & XFS_ISTALE) {
631	xfs_ifunlock(ip);	614	xfs_ifunlock(ip);
		615	/*
		616	* we hold the AIL lock - notify the unlock routine of this
		617	* so it doesn't try to get the lock again.
		618	*/
632	xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);	619	xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
633	return XFS_ITEM_PINNED;	620	return XFS_ITEM_PINNED;
634	}	621	}
@@ -746,11 +733,8 @@ xfs_inode_item_committed(
746	* This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK	733	* This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
747	* failed to get the inode flush lock but did get the inode locked SHARED.	734	* failed to get the inode flush lock but did get the inode locked SHARED.
748	* Here we're trying to see if the inode buffer is incore, and if so whether it's	735	* Here we're trying to see if the inode buffer is incore, and if so whether it's
749	* marked delayed write. If that's the case, we'll initiate a bawrite on that	736	* marked delayed write. If that's the case, we'll promote it and that will
750	* buffer to expedite the process.	737	* allow the caller to write the buffer by triggering the xfsbufd to run.
751	*
752	* We aren't holding the AIL lock (or the flush lock) when this gets called,
753	* so it is inherently race-y.
754	*/	738	*/
755	STATIC void	739	STATIC void
756	xfs_inode_item_pushbuf(	740	xfs_inode_item_pushbuf(
@@ -759,26 +743,16 @@ xfs_inode_item_pushbuf(
759	xfs_inode_t *ip;	743	xfs_inode_t *ip;
760	xfs_mount_t *mp;	744	xfs_mount_t *mp;
761	xfs_buf_t *bp;	745	xfs_buf_t *bp;
762	uint dopush;
763		746
764	ip = iip->ili_inode;	747	ip = iip->ili_inode;
765
766	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));	748	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
767		749
768	/*	750	/*
769	* The ili_pushbuf_flag keeps others from
770	* trying to duplicate our effort.
771	*/
772	ASSERT(iip->ili_pushbuf_flag != 0);
773	ASSERT(iip->ili_push_owner == current_pid());
774
775	/*
776	* If a flush is not in progress anymore, chances are that the	751	* If a flush is not in progress anymore, chances are that the
777	* inode was taken off the AIL. So, just get out.	752	* inode was taken off the AIL. So, just get out.
778	*/	753	*/
779	if (completion_done(&ip->i_flush) ||	754	if (completion_done(&ip->i_flush) ||
780	((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {	755	((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
781	iip->ili_pushbuf_flag = 0;
782	xfs_iunlock(ip, XFS_ILOCK_SHARED);	756	xfs_iunlock(ip, XFS_ILOCK_SHARED);
783	return;	757	return;
784	}	758	}
@@ -787,53 +761,12 @@ xfs_inode_item_pushbuf(
787	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,	761	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
788	iip->ili_format.ilf_len, XBF_TRYLOCK);	762	iip->ili_format.ilf_len, XBF_TRYLOCK);
789		763
790	if (bp != NULL) {
791	if (XFS_BUF_ISDELAYWRITE(bp)) {
792	/*
793	* We were racing with iflush because we don't hold
794	* the AIL lock or the flush lock. However, at this point,
795	* we have the buffer, and we know that it's dirty.
796	* So, it's possible that iflush raced with us, and
797	* this item is already taken off the AIL.
798	* If not, we can flush it async.
799	*/
800	dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
801	!completion_done(&ip->i_flush));
802	iip->ili_pushbuf_flag = 0;
803	xfs_iunlock(ip, XFS_ILOCK_SHARED);
804
805	trace_xfs_inode_item_push(bp, _RET_IP_);
806
807	if (XFS_BUF_ISPINNED(bp))
808	xfs_log_force(mp, 0);
809
810	if (dopush) {
811	int error;
812	error = xfs_bawrite(mp, bp);
813	if (error)
814	xfs_fs_cmn_err(CE_WARN, mp,
815	"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
816	error, iip, bp);
817	} else {
818	xfs_buf_relse(bp);
819	}
820	} else {
821	iip->ili_pushbuf_flag = 0;
822	xfs_iunlock(ip, XFS_ILOCK_SHARED);
823	xfs_buf_relse(bp);
824	}
825	return;
826	}
827	/*
828	* We have to be careful about resetting pushbuf flag too early (above).
829	* Even though in theory we can do it as soon as we have the buflock,
830	* we don't want others to be doing work needlessly. They'll come to
831	* this function thinking that pushing the buffer is their
832	* responsibility only to find that the buffer is still locked by
833	* another doing the same thing
834	*/
835	iip->ili_pushbuf_flag = 0;
836	xfs_iunlock(ip, XFS_ILOCK_SHARED);	764	xfs_iunlock(ip, XFS_ILOCK_SHARED);
		765	if (!bp)
		766	return;
		767	if (XFS_BUF_ISDELAYWRITE(bp))
		768	xfs_buf_delwri_promote(bp);
		769	xfs_buf_relse(bp);
837	return;	770	return;
838	}	771	}
839		772
@@ -937,7 +870,6 @@ xfs_inode_item_init(
937	/*	870	/*
938	We have zeroed memory. No need ...	871	We have zeroed memory. No need ...
939	iip->ili_extents_buf = NULL;	872	iip->ili_extents_buf = NULL;
940	iip->ili_pushbuf_flag = 0;
941	*/	873	*/
942		874
943	iip->ili_format.ilf_type = XFS_LI_INODE;	875	iip->ili_format.ilf_type = XFS_LI_INODE;