xfs: introduce inode cluster buffer trylocks for xfs_iflush

There is an ABBA deadlock between synchronous inode flushing in xfs_reclaim_inode and xfs_ifree_cluster. xfs_ifree_cluster locks the buffer, then takes inode ilocks, whilst synchronous reclaim takes the ilock followed by the buffer lock in xfs_iflush().

To avoid this deadlock, separate the inode cluster buffer locking semantics from the synchronous inode flush semantics, allowing callers to attempt to lock the buffer but still issue synchronous IO if they can get the buffer. This requires xfs_iflush() calls that currently use non-blocking semantics to pass SYNC_TRYLOCK rather than 0 as the flags parameter.

This allows xfs_reclaim_inode to avoid the deadlock on the buffer lock and detect the failure so that it can drop the inode ilock and restart the reclaim attempt on the inode. This allows xfs_ifree_cluster to obtain the inode lock, mark the inode stale and release it and hence defuse the deadlock situation. It also has the pleasant side effect of avoiding IO in xfs_reclaim_inode when it next tries to reclaim the inode as it is now marked stale.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
author: Dave Chinner <dchinner@redhat.com> 2011-03-25 18:13:55 -0400
committer: Dave Chinner <david@fromorbit.com> 2011-03-25 18:13:55 -0400
commit: 1bfd8d04190c615bb8d1d98188dead0c09702208 (patch)
tree: f2eb5d4e9b730b7ecef25bcc1cc26caa2b306339 /fs/xfs/linux-2.6/xfs_sync.c
parent: a19fb380961f209a3a406443686647bcd01bb9a6 (diff)
1 files changed, 27 insertions, 3 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6c10f1d2e3d3..594cd822d84d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -761,8 +761,10 @@ xfs_reclaim_inode(
        struct xfs_perag        *pag,
        int                     sync_mode)
 {
-        int     error = 0;
+        int     error;
+restart:
+        error = 0;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (!xfs_iflock_nowait(ip)) {
                if (!(sync_mode & SYNC_WAIT))
@@ -788,9 +790,31 @@ xfs_reclaim_inode(
        if (xfs_inode_clean(ip))
                goto reclaim;
-        /* Now we have an inode that needs flushing */
+        /*
-        error = xfs_iflush(ip, sync_mode);
+         * Now we have an inode that needs flushing.
+         *
+         * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+         * reclaim as we can deadlock with inode cluster removal.
+         * xfs_ifree_cluster() can lock the inode buffer before it locks the
+         * ip->i_lock, and we are doing the exact opposite here. As a result,
+         * doing a blocking xfs_itobp() to get the cluster buffer will result
+         * in an ABBA deadlock with xfs_ifree_cluster().
+         *
+         * As xfs_ifree_cluster() must gather all inodes that are active in the
+         * cache to mark them stale, if we hit this case we don't actually want
+         * to do IO here - we want the inode marked stale so we can simply
+         * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+         * just unlock the inode, back off and try again. Hopefully the next
+         * pass through will see the stale flag set on the inode.
+         */
+        error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
        if (sync_mode & SYNC_WAIT) {
+                if (error == EAGAIN) {
+                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                        /* backoff longer than in xfs_ifree_cluster */
+                        delay(2);
+                        goto restart;
+                }
                xfs_iflock(ip);
                goto reclaim;
        }
author	Dave Chinner <dchinner@redhat.com>	2011-03-25 18:13:55 -0400
committer	Dave Chinner <david@fromorbit.com>	2011-03-25 18:13:55 -0400
commit	1bfd8d04190c615bb8d1d98188dead0c09702208 (patch)
tree	f2eb5d4e9b730b7ecef25bcc1cc26caa2b306339 /fs/xfs/linux-2.6/xfs_sync.c
parent	a19fb380961f209a3a406443686647bcd01bb9a6 (diff)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6c10f1d2e3d3..594cd822d84d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -761,8 +761,10 @@ xfs_reclaim_inode(
761	struct xfs_perag *pag,	761	struct xfs_perag *pag,
762	int sync_mode)	762	int sync_mode)
763	{	763	{
764	int error = 0;	764	int error;
765		765
		766	restart:
		767	error = 0;
766	xfs_ilock(ip, XFS_ILOCK_EXCL);	768	xfs_ilock(ip, XFS_ILOCK_EXCL);
767	if (!xfs_iflock_nowait(ip)) {	769	if (!xfs_iflock_nowait(ip)) {
768	if (!(sync_mode & SYNC_WAIT))	770	if (!(sync_mode & SYNC_WAIT))
@@ -788,9 +790,31 @@ xfs_reclaim_inode(
788	if (xfs_inode_clean(ip))	790	if (xfs_inode_clean(ip))
789	goto reclaim;	791	goto reclaim;
790		792
791	/* Now we have an inode that needs flushing */	793	/*
792	error = xfs_iflush(ip, sync_mode);	794	* Now we have an inode that needs flushing.
		795	*
		796	* We do a nonblocking flush here even if we are doing a SYNC_WAIT
		797	* reclaim as we can deadlock with inode cluster removal.
		798	* xfs_ifree_cluster() can lock the inode buffer before it locks the
		799	* ip->i_lock, and we are doing the exact opposite here. As a result,
		800	* doing a blocking xfs_itobp() to get the cluster buffer will result
		801	* in an ABBA deadlock with xfs_ifree_cluster().
		802	*
		803	* As xfs_ifree_cluster() must gather all inodes that are active in the
		804	* cache to mark them stale, if we hit this case we don't actually want
		805	* to do IO here - we want the inode marked stale so we can simply
		806	* reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
		807	* just unlock the inode, back off and try again. Hopefully the next
		808	* pass through will see the stale flag set on the inode.
		809	*/
		810	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
793	if (sync_mode & SYNC_WAIT) {	811	if (sync_mode & SYNC_WAIT) {
		812	if (error == EAGAIN) {
		813	xfs_iunlock(ip, XFS_ILOCK_EXCL);
		814	/* backoff longer than in xfs_ifree_cluster */
		815	delay(2);
		816	goto restart;
		817	}
794	xfs_iflock(ip);	818	xfs_iflock(ip);
795	goto reclaim;	819	goto reclaim;
796	}	820	}