aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2011-03-25 18:13:55 -0400
committerDave Chinner <david@fromorbit.com>2011-03-25 18:13:55 -0400
commit1bfd8d04190c615bb8d1d98188dead0c09702208 (patch)
treef2eb5d4e9b730b7ecef25bcc1cc26caa2b306339
parenta19fb380961f209a3a406443686647bcd01bb9a6 (diff)
xfs: introduce inode cluster buffer trylocks for xfs_iflush
There is an ABBA deadlock between synchronous inode flushing in xfs_reclaim_inode and xfs_icluster_free. xfs_icluster_free locks the buffer, then takes inode ilocks, whilst synchronous reclaim takes the ilock followed by the buffer lock in xfs_iflush(). To avoid this deadlock, separate the inode cluster buffer locking semantics from the synchronous inode flush semantics, allowing callers to attempt to lock the buffer but still issue synchronous IO if it can get the buffer. This requires xfs_iflush() calls that currently use non-blocking semantics to pass SYNC_TRYLOCK rather than 0 as the flags parameter. This allows xfs_reclaim_inode to avoid the deadlock on the buffer lock and detect the failure so that it can drop the inode ilock and restart the reclaim attempt on the inode. This allows xfs_ifree_cluster to obtain the inode lock, mark the inode stale and release it and hence defuse the deadlock situation. It also has the pleasant side effect of avoiding IO in xfs_reclaim_inode when it tries to next reclaim the inode as it is now marked stale. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Alex Elder <aelder@sgi.com>
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c30
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_inode_item.c6
4 files changed, 32 insertions, 8 deletions
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 818c4cf2de86..8a70b2a17d6f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1078,7 +1078,7 @@ xfs_fs_write_inode(
1078 error = 0; 1078 error = 0;
1079 goto out_unlock; 1079 goto out_unlock;
1080 } 1080 }
1081 error = xfs_iflush(ip, 0); 1081 error = xfs_iflush(ip, SYNC_TRYLOCK);
1082 } 1082 }
1083 1083
1084 out_unlock: 1084 out_unlock:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6c10f1d2e3d3..594cd822d84d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -761,8 +761,10 @@ xfs_reclaim_inode(
761 struct xfs_perag *pag, 761 struct xfs_perag *pag,
762 int sync_mode) 762 int sync_mode)
763{ 763{
764 int error = 0; 764 int error;
765 765
766restart:
767 error = 0;
766 xfs_ilock(ip, XFS_ILOCK_EXCL); 768 xfs_ilock(ip, XFS_ILOCK_EXCL);
767 if (!xfs_iflock_nowait(ip)) { 769 if (!xfs_iflock_nowait(ip)) {
768 if (!(sync_mode & SYNC_WAIT)) 770 if (!(sync_mode & SYNC_WAIT))
@@ -788,9 +790,31 @@ xfs_reclaim_inode(
788 if (xfs_inode_clean(ip)) 790 if (xfs_inode_clean(ip))
789 goto reclaim; 791 goto reclaim;
790 792
791 /* Now we have an inode that needs flushing */ 793 /*
792 error = xfs_iflush(ip, sync_mode); 794 * Now we have an inode that needs flushing.
795 *
796 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
797 * reclaim as we can deadlock with inode cluster removal.
798 * xfs_ifree_cluster() can lock the inode buffer before it locks the
799 * ip->i_lock, and we are doing the exact opposite here. As a result,
800 * doing a blocking xfs_itobp() to get the cluster buffer will result
801 * in an ABBA deadlock with xfs_ifree_cluster().
802 *
803 * As xfs_ifree_cluser() must gather all inodes that are active in the
804 * cache to mark them stale, if we hit this case we don't actually want
805 * to do IO here - we want the inode marked stale so we can simply
806 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
807 * just unlock the inode, back off and try again. Hopefully the next
808 * pass through will see the stale flag set on the inode.
809 */
810 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
793 if (sync_mode & SYNC_WAIT) { 811 if (sync_mode & SYNC_WAIT) {
812 if (error == EAGAIN) {
813 xfs_iunlock(ip, XFS_ILOCK_EXCL);
814 /* backoff longer than in xfs_ifree_cluster */
815 delay(2);
816 goto restart;
817 }
794 xfs_iflock(ip); 818 xfs_iflock(ip);
795 goto reclaim; 819 goto reclaim;
796 } 820 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da871f532236..742c8330994a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,7 +2835,7 @@ xfs_iflush(
2835 * Get the buffer containing the on-disk inode. 2835 * Get the buffer containing the on-disk inode.
2836 */ 2836 */
2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2838 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); 2838 (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2839 if (error || !bp) { 2839 if (error || !bp) {
2840 xfs_ifunlock(ip); 2840 xfs_ifunlock(ip);
2841 return error; 2841 return error;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fd4f398bd6f1..46cc40131d4a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -760,11 +760,11 @@ xfs_inode_item_push(
760 * Push the inode to it's backing buffer. This will not remove the 760 * Push the inode to it's backing buffer. This will not remove the
761 * inode from the AIL - a further push will be required to trigger a 761 * inode from the AIL - a further push will be required to trigger a
762 * buffer push. However, this allows all the dirty inodes to be pushed 762 * buffer push. However, this allows all the dirty inodes to be pushed
763 * to the buffer before it is pushed to disk. THe buffer IO completion 763 * to the buffer before it is pushed to disk. The buffer IO completion
764 * will pull th einode from the AIL, mark it clean and unlock the flush 764 * will pull the inode from the AIL, mark it clean and unlock the flush
765 * lock. 765 * lock.
766 */ 766 */
767 (void) xfs_iflush(ip, 0); 767 (void) xfs_iflush(ip, SYNC_TRYLOCK);
768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 768 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769} 769}
770 770