aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2010-02-05 20:37:26 -0500
committerDave Chinner <david@fromorbit.com>2010-02-05 20:37:26 -0500
commit777df5afdb26c71634edd60582be620ff94e87a0 (patch)
tree26ed86f1ec114250230e4e090be18980c94ce73f
parentd5db0f97fbbeff11c88dec1aaf1536a975afbaeb (diff)
xfs: Make inode reclaim states explicit
A.K.A.: don't rely on xfs_iflush() return value in reclaim We have gradually been moving checks out of the reclaim code because they are duplicated in xfs_iflush(). We've had a history of problems in this area, and many of them stem from the overloading of the return values from xfs_iflush() and interaction with inode flush locking to determine if the inode is safe to reclaim. With the desire to move to delayed write flushing of inodes and non-blocking inode tree reclaim walks, the overloading of the return value of xfs_iflush makes it very difficult to determine the correct thing to do next. This patch explicitly re-adds the checks to the inode reclaim code, removing the reliance on the return value of xfs_iflush() to determine what to do next. It also means that we can clearly document all the inode states that reclaim must handle and hence we can easily see that we handled all the necessary cases. This also removes the need for the xfs_inode_clean() check in xfs_iflush() as all callers now check this first (safely). Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c81
-rw-r--r--fs/xfs/xfs_inode.c11
-rw-r--r--fs/xfs/xfs_inode.h1
3 files changed, 64 insertions, 29 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index c9b863eacab7..525260c7617f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -706,12 +706,43 @@ __xfs_inode_clear_reclaim_tag(
706 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 706 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
707} 707}
708 708
709/*
710 * Inodes in different states need to be treated differently, and the return
711 * value of xfs_iflush is not sufficient to get this right. The following table
712 * lists the inode states and the reclaim actions necessary for non-blocking
713 * reclaim:
714 *
715 *
716 * inode state iflush ret required action
717 * --------------- ---------- ---------------
718 * bad - reclaim
719 * shutdown EIO unpin and reclaim
720 * clean, unpinned 0 reclaim
721 * stale, unpinned 0 reclaim
722 * clean, pinned(*) 0 unpin and reclaim
723 * stale, pinned 0 unpin and reclaim
724 * dirty, async 0 block on flush lock, reclaim
725 * dirty, sync flush 0 block on flush lock, reclaim
726 *
727 * (*) dgc: I don't think the clean, pinned state is possible but it gets
728 * handled anyway given the order of checks implemented.
729 *
730 * Hence the order of actions after gaining the locks should be:
731 * bad => reclaim
732 * shutdown => unpin and reclaim
733 * pinned => unpin
734 * stale => reclaim
735 * clean => reclaim
736 * dirty => flush, wait and reclaim
737 */
709STATIC int 738STATIC int
710xfs_reclaim_inode( 739xfs_reclaim_inode(
711 struct xfs_inode *ip, 740 struct xfs_inode *ip,
712 struct xfs_perag *pag, 741 struct xfs_perag *pag,
713 int sync_mode) 742 int sync_mode)
714{ 743{
744 int error;
745
715 /* 746 /*
716 * The radix tree lock here protects a thread in xfs_iget from racing 747 * The radix tree lock here protects a thread in xfs_iget from racing
717 * with us starting reclaim on the inode. Once we have the 748 * with us starting reclaim on the inode. Once we have the
@@ -729,30 +760,42 @@ xfs_reclaim_inode(
729 spin_unlock(&ip->i_flags_lock); 760 spin_unlock(&ip->i_flags_lock);
730 write_unlock(&pag->pag_ici_lock); 761 write_unlock(&pag->pag_ici_lock);
731 762
732 /*
733 * If the inode is still dirty, then flush it out. If the inode
734 * is not in the AIL, then it will be OK to flush it delwri as
735 * long as xfs_iflush() does not keep any references to the inode.
736 * We leave that decision up to xfs_iflush() since it has the
737 * knowledge of whether it's OK to simply do a delwri flush of
738 * the inode or whether we need to wait until the inode is
739 * pulled from the AIL.
740 * We get the flush lock regardless, though, just to make sure
741 * we don't free it while it is being flushed.
742 */
743 xfs_ilock(ip, XFS_ILOCK_EXCL); 763 xfs_ilock(ip, XFS_ILOCK_EXCL);
744 xfs_iflock(ip); 764 xfs_iflock(ip);
745 765
746 /* 766 if (is_bad_inode(VFS_I(ip)))
747 * In the case of a forced shutdown we rely on xfs_iflush() to 767 goto reclaim;
748 * wait for the inode to be unpinned before returning an error. 768 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
749 */ 769 xfs_iunpin_wait(ip);
750 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { 770 goto reclaim;
751 /* synchronize with xfs_iflush_done */ 771 }
752 xfs_iflock(ip); 772 if (xfs_ipincount(ip))
753 xfs_ifunlock(ip); 773 xfs_iunpin_wait(ip);
774 if (xfs_iflags_test(ip, XFS_ISTALE))
775 goto reclaim;
776 if (xfs_inode_clean(ip))
777 goto reclaim;
778
779 /* Now we have an inode that needs flushing */
780 error = xfs_iflush(ip, sync_mode);
781 if (!error) {
782 switch(sync_mode) {
783 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
784 case XFS_IFLUSH_DELWRI:
785 case XFS_IFLUSH_ASYNC:
786 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
787 case XFS_IFLUSH_SYNC:
788 /* IO issued, synchronise with IO completion */
789 xfs_iflock(ip);
790 break;
791 default:
792 ASSERT(0);
793 break;
794 }
754 } 795 }
755 796
797reclaim:
798 xfs_ifunlock(ip);
756 xfs_iunlock(ip, XFS_ILOCK_EXCL); 799 xfs_iunlock(ip, XFS_ILOCK_EXCL);
757 xfs_ireclaim(ip); 800 xfs_ireclaim(ip);
758 return 0; 801 return 0;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d0d1b5a05183..8d0666dd170a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2493,7 +2493,7 @@ __xfs_iunpin_wait(
2493 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); 2493 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2494} 2494}
2495 2495
2496static inline void 2496void
2497xfs_iunpin_wait( 2497xfs_iunpin_wait(
2498 xfs_inode_t *ip) 2498 xfs_inode_t *ip)
2499{ 2499{
@@ -2849,15 +2849,6 @@ xfs_iflush(
2849 mp = ip->i_mount; 2849 mp = ip->i_mount;
2850 2850
2851 /* 2851 /*
2852 * If the inode isn't dirty, then just release the inode flush lock and
2853 * do nothing.
2854 */
2855 if (xfs_inode_clean(ip)) {
2856 xfs_ifunlock(ip);
2857 return 0;
2858 }
2859
2860 /*
2861 * We can't flush the inode until it is unpinned, so wait for it if we 2852 * We can't flush the inode until it is unpinned, so wait for it if we
2862 * are allowed to block. We know noone new can pin it, because we are 2853 * are allowed to block. We know noone new can pin it, because we are
2863 * holding the inode lock shared and you need to hold it exclusively to 2854 * holding the inode lock shared and you need to hold it exclusively to
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ec1f28c4fc4f..8b618ea4d692 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -483,6 +483,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
483void xfs_iext_realloc(xfs_inode_t *, int, int); 483void xfs_iext_realloc(xfs_inode_t *, int, int);
484void xfs_ipin(xfs_inode_t *); 484void xfs_ipin(xfs_inode_t *);
485void xfs_iunpin(xfs_inode_t *); 485void xfs_iunpin(xfs_inode_t *);
486void xfs_iunpin_wait(xfs_inode_t *);
486int xfs_iflush(xfs_inode_t *, uint); 487int xfs_iflush(xfs_inode_t *, uint);
487void xfs_ichgtime(xfs_inode_t *, int); 488void xfs_ichgtime(xfs_inode_t *, int);
488void xfs_lock_inodes(xfs_inode_t **, int, uint); 489void xfs_lock_inodes(xfs_inode_t **, int, uint);