author    Dave Chinner <david@fromorbit.com>	2010-06-03 02:22:29 -0400
committer Dave Chinner <david@fromorbit.com>	2010-06-03 02:22:29 -0400
commit    5b257b4a1f9239624c6b5e669763de04e482c2b3
tree      4896f326cc9e30c091ad2a370244837613f998ee
parent    fb3b504adeee942e55393396fea8fdf406acf037
xfs: fix race in inode cluster freeing failing to stale inodes
When an inode cluster is freed, it needs to mark all inodes in memory as
XFS_ISTALE before marking the buffer as stale. This is needed because the inodes
have a different life cycle to the buffer, and once the buffer is torn down
during transaction completion, we must ensure none of the inodes get written
back (which is what XFS_ISTALE does).
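To make that contract concrete, here is a minimal userspace sketch of it. Every
name here (fake_inode, fake_iflush, I_STALE) is invented for illustration; this
is not the kernel's code, just the invariant the kernel relies on:

	/* Sketch only: models the XFS_ISTALE contract, not the real xfs_iflush(). */
	#include <stdbool.h>
	#include <stdio.h>

	#define I_STALE	(1u << 0)

	struct fake_inode {
		unsigned int	flags;
		bool		dirty;
	};

	/* Writeback must become a no-op once the inode is marked stale. */
	static void fake_iflush(struct fake_inode *ip)
	{
		if (ip->flags & I_STALE)
			return;		/* cluster freed; the buffer may be gone */
		if (ip->dirty)
			printf("writing inode back through its cluster buffer\n");
	}

	int main(void)
	{
		struct fake_inode ip = { .flags = 0, .dirty = true };

		ip.flags |= I_STALE;	/* set before the buffer is torn down */
		fake_iflush(&ip);	/* correctly does nothing */
		return 0;
	}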
Unfortunately, xfs_ifree_cluster() has some bugs that lead to inodes not being
marked with XFS_ISTALE. This shows up when xfs_iflush() is called on these
inodes either during inode reclaim or tail pushing on the AIL. The buffer is
read back, but no longer contains inodes and so triggers assert failures and
shutdowns. This was reproducible with a run.dbench10 invocation from xfstests.
There are two main causes of xfs_ifree_cluster() failing. The first is simple -
it checks in-memory inodes it finds in the per-ag icache to see if they are
clean without holding the flush lock. If they are clean, it skips them
completely. However, if an inode is flushed delwri, it will
appear clean, but is not guaranteed to be written back until the flush lock has
been dropped. Hence we may have raced on the clean check and the inode may
actually be dirty. Hence always mark inodes found in memory stale before we
check properly if they are clean.
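A minimal sketch of that corrected ordering, in plain userspace C with invented
names (a pthread mutex stands in for the inode flush lock; this is not the XFS
API):

	/* Sketch only: "stale first, then a properly locked clean check". */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct fake_inode {
		pthread_mutex_t	flush_lock;	/* stands in for the flush lock */
		bool		clean;		/* true once a delwri flush is queued */
		bool		stale;
	};

	static void stale_cluster_inode(struct fake_inode *ip)
	{
		/*
		 * The buggy version tested ip->clean here, without the flush
		 * lock, and skipped the inode entirely: a delwri-flushed inode
		 * looks clean while its write is still in flight.
		 */
		if (pthread_mutex_trylock(&ip->flush_lock) != 0) {
			/*
			 * Flush in progress: in the real fix, such inodes are
			 * guaranteed to be attached to the locked cluster
			 * buffer and get staled there instead.
			 */
			return;
		}

		ip->stale = true;		/* mark stale first... */
		if (ip->clean)			/* ...then check under the lock */
			printf("genuinely clean, nothing to write back\n");
		pthread_mutex_unlock(&ip->flush_lock);
	}

	int main(void)
	{
		struct fake_inode ip = {
			.flush_lock = PTHREAD_MUTEX_INITIALIZER,
			.clean = true,	/* delwri flush queued: looks clean */
		};

		stale_cluster_inode(&ip);
		return 0;
	}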
The second is more complex, and makes the first problem easier to hit.
Basically the in-memory inode scan is done with full knowledge it can be racing
with inode flushing and AIL tail pushing, which means that inodes it can't
get the flush lock on might not be attached to the buffer after the in-memory
inode scan due to IO completion occurring. This is actually documented in the
code as "needs better interlocking"; i.e. this is a zero-day bug.
Effectively, the in-memory scan must be done while the inode buffer is locked
and IO cannot be issued on it while we do the in-memory inode scan. This
ensures that inodes we couldn't get the flush lock on are guaranteed to be
attached to the cluster buffer, so we can then catch all in-memory inodes and
mark them stale.
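The interlock itself can be modelled in a few lines, again with invented names
rather than the XFS buffer API. The point is simply that IO completion needs
the buffer lock to detach inodes, so taking that lock before the scan freezes
the set of attached inodes:

	/* Sketch only: why locking the buffer first closes the race. */
	#include <pthread.h>
	#include <stdio.h>

	#define NINODES	4

	struct fake_buf {
		pthread_mutex_t	lock;			/* IO completion needs this too */
		int		attached[NINODES];	/* 1 = inode attached to buffer */
	};

	/* IO completion detaches inodes, but only under the buffer lock. */
	static void fake_io_complete(struct fake_buf *bp, int i)
	{
		pthread_mutex_lock(&bp->lock);
		bp->attached[i] = 0;
		pthread_mutex_unlock(&bp->lock);
	}

	static void fake_free_cluster(struct fake_buf *bp)
	{
		int	i;

		/*
		 * Buffer locked before the in-memory scan: fake_io_complete()
		 * now blocks, so every inode we cannot flush-lock is still
		 * attached here and is guaranteed to be marked stale.
		 */
		pthread_mutex_lock(&bp->lock);
		for (i = 0; i < NINODES; i++)
			if (bp->attached[i])
				printf("staling attached inode %d\n", i);
		/* ... the in-memory inode scan runs here, buffer still locked ... */
		pthread_mutex_unlock(&bp->lock);
	}

	int main(void)
	{
		struct fake_buf bp = {
			.lock = PTHREAD_MUTEX_INITIALIZER,
			.attached = { 1, 1, 0, 1 },
		};

		fake_free_cluster(&bp);
		fake_io_complete(&bp, 0);	/* completion happens after, not during */
		return 0;
	}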
Now that the inode cluster buffer is locked before the in-memory scan is done,
there is no need for the two-phase update of the in-memory inodes, so simplify
the code into two loops and remove the allocation of the temporary buffer used
to hold locked inodes across the phases.
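In outline, the new loop body has the shape sketched below (all state here is
invented; the authoritative version is the diff below): one pass over the log
items already attached to the locked buffer, then one pass over the in-memory
inodes, attaching dirty inodes directly instead of staging them in ip_found[]:

	/* Sketch only: the two-loop shape of the new code, on fake state. */
	#include <stdbool.h>
	#include <stdio.h>

	#define NINODES	4

	struct fake_inode {
		bool	in_cache;	/* lives in the per-AG inode cache */
		bool	attached;	/* already attached to the cluster buffer */
		bool	stale;
	};

	int main(void)
	{
		struct fake_inode inodes[NINODES] = {
			{ true,  true,  false },	/* mid-flush, attached */
			{ true,  false, false },	/* dirty, lockable */
			{ false, false, false },	/* not in memory */
			{ true,  false, false },	/* dirty, lockable */
		};
		int	i;

		/* loop 1: stale everything already attached to the locked buffer */
		for (i = 0; i < NINODES; i++)
			if (inodes[i].attached)
				inodes[i].stale = true;

		/* loop 2: stale cached inodes and attach them directly --
		 * no ip_found[] staging array, no second pass */
		for (i = 0; i < NINODES; i++) {
			if (!inodes[i].in_cache || inodes[i].stale)
				continue;
			inodes[i].stale = true;
			inodes[i].attached = true;
		}

		for (i = 0; i < NINODES; i++)
			printf("inode %d: stale=%d attached=%d\n",
			       i, inodes[i].stale, inodes[i].attached);
		return 0;
	}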
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
fs/xfs/xfs_inode.c | 142
1 file changed, 62 insertions(+), 80 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6ba7765026f6..d53c39de7d05 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
 	int			blks_per_cluster;
 	int			nbufs;
 	int			ninodes;
-	int			i, j, found, pre_flushed;
+	int			i, j;
 	xfs_daddr_t		blkno;
 	xfs_buf_t		*bp;
-	xfs_inode_t		*ip, **ip_found;
+	xfs_inode_t		*ip;
 	xfs_inode_log_item_t	*iip;
 	xfs_log_item_t		*lip;
 	struct xfs_perag	*pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
 		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
 	}
 
-	ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
-
 	for (j = 0; j < nbufs; j++, inum += ninodes) {
+		int	found = 0;
+
 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
 					 XFS_INO_TO_AGBNO(mp, inum));
 
+		/*
+		 * We obtain and lock the backing buffer first in the process
+		 * here, as we have to ensure that any dirty inode that we
+		 * can't get the flush lock on is attached to the buffer.
+		 * If we scan the in-memory inodes first, then buffer IO can
+		 * complete before we get a lock on it, and hence we may fail
+		 * to mark all the active inodes on the buffer stale.
+		 */
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+					mp->m_bsize * blks_per_cluster,
+					XBF_LOCK);
+
+		/*
+		 * Walk the inodes already attached to the buffer and mark them
+		 * stale. These will all have the flush locks held, so an
+		 * in-memory inode walk can't lock them.
+		 */
+		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		while (lip) {
+			if (lip->li_type == XFS_LI_INODE) {
+				iip = (xfs_inode_log_item_t *)lip;
+				ASSERT(iip->ili_logged == 1);
+				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+				xfs_trans_ail_copy_lsn(mp->m_ail,
+							&iip->ili_flush_lsn,
+							&iip->ili_item.li_lsn);
+				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+				found++;
+			}
+			lip = lip->li_bio_list;
+		}
 
 		/*
-		 * Look for each inode in memory and attempt to lock it,
-		 * we can be racing with flush and tail pushing here.
-		 * any inode we get the locks on, add to an array of
-		 * inode items to process later.
+		 * For each inode in memory attempt to add it to the inode
+		 * buffer and set it up for being staled on buffer IO
+		 * completion. This is safe as we've locked out tail pushing
+		 * and flushing by locking the buffer.
 		 *
-		 * The get the buffer lock, we could beat a flush
-		 * or tail pushing thread to the lock here, in which
-		 * case they will go looking for the inode buffer
-		 * and fail, we need some other form of interlock
-		 * here.
+		 * We have already marked every inode that was part of a
+		 * transaction stale above, which means there is no point in
+		 * even trying to lock them.
 		 */
-		found = 0;
 		for (i = 0; i < ninodes; i++) {
 			read_lock(&pag->pag_ici_lock);
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
 
-			/* Inode not in memory or we found it already,
-			 * nothing to do
-			 */
+			/* Inode not in memory or stale, nothing to do */
 			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
 				read_unlock(&pag->pag_ici_lock);
 				continue;
 			}
 
-			if (xfs_inode_clean(ip)) {
-				read_unlock(&pag->pag_ici_lock);
-				continue;
-			}
-
-			/* If we can get the locks then add it to the
-			 * list, otherwise by the time we get the bp lock
-			 * below it will already be attached to the
-			 * inode buffer.
-			 */
-
-			/* This inode will already be locked - by us, lets
-			 * keep it that way.
-			 */
-
-			if (ip == free_ip) {
-				if (xfs_iflock_nowait(ip)) {
-					xfs_iflags_set(ip, XFS_ISTALE);
-					if (xfs_inode_clean(ip)) {
-						xfs_ifunlock(ip);
-					} else {
-						ip_found[found++] = ip;
-					}
-				}
+			/* don't try to lock/unlock the current inode */
+			if (ip != free_ip &&
+			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 				read_unlock(&pag->pag_ici_lock);
 				continue;
 			}
+			read_unlock(&pag->pag_ici_lock);
 
-			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				if (xfs_iflock_nowait(ip)) {
-					xfs_iflags_set(ip, XFS_ISTALE);
-
-					if (xfs_inode_clean(ip)) {
-						xfs_ifunlock(ip);
-						xfs_iunlock(ip, XFS_ILOCK_EXCL);
-					} else {
-						ip_found[found++] = ip;
-					}
-				} else {
+			if (!xfs_iflock_nowait(ip)) {
+				if (ip != free_ip)
 					xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				}
+				continue;
 			}
-			read_unlock(&pag->pag_ici_lock);
-		}
 
-		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
-					mp->m_bsize * blks_per_cluster,
-					XBF_LOCK);
-
-		pre_flushed = 0;
-		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-		while (lip) {
-			if (lip->li_type == XFS_LI_INODE) {
-				iip = (xfs_inode_log_item_t *)lip;
-				ASSERT(iip->ili_logged == 1);
-				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
-				xfs_trans_ail_copy_lsn(mp->m_ail,
-							&iip->ili_flush_lsn,
-							&iip->ili_item.li_lsn);
-				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-				pre_flushed++;
+			xfs_iflags_set(ip, XFS_ISTALE);
+			if (xfs_inode_clean(ip)) {
+				ASSERT(ip != free_ip);
+				xfs_ifunlock(ip);
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+				continue;
 			}
-			lip = lip->li_bio_list;
-		}
 
-		for (i = 0; i < found; i++) {
-			ip = ip_found[i];
 			iip = ip->i_itemp;
-
 			if (!iip) {
+				/* inode with unlogged changes only */
+				ASSERT(ip != free_ip);
 				ip->i_update_core = 0;
 				xfs_ifunlock(ip);
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 				continue;
 			}
+			found++;
 
 			iip->ili_last_fields = iip->ili_format.ilf_fields;
 			iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
 			xfs_buf_attach_iodone(bp,
 				(void(*)(xfs_buf_t*,xfs_log_item_t*))
 				xfs_istale_done, (xfs_log_item_t *)iip);
-			if (ip != free_ip) {
+
+			if (ip != free_ip)
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			}
 		}
 
-		if (found || pre_flushed)
+		if (found)
 			xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
 
-	kmem_free(ip_found);
 	xfs_perag_put(pag);
 }
 