aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2010-06-03 02:22:29 -0400
committerDave Chinner <david@fromorbit.com>2010-06-03 02:22:29 -0400
commit5b257b4a1f9239624c6b5e669763de04e482c2b3 (patch)
tree4896f326cc9e30c091ad2a370244837613f998ee /fs/xfs
parentfb3b504adeee942e55393396fea8fdf406acf037 (diff)
xfs: fix race in inode cluster freeing failing to stale inodes
When an inode cluster is freed, it needs to mark all inodes in memory as XFS_ISTALE before marking the buffer as stale. This is eeded because the inodes have a different life cycle to the buffer, and once the buffer is torn down during transaction completion, we must ensure none of the inodes get written back (which is what XFS_ISTALE does). Unfortunately, xfs_ifree_cluster() has some bugs that lead to inodes not being marked with XFS_ISTALE. This shows up when xfs_iflush() is called on these inodes either during inode reclaim or tail pushing on the AIL. The buffer is read back, but no longer contains inodes and so triggers assert failures and shutdowns. This was reproducable with at run.dbench10 invocation from xfstests. There are two main causes of xfs_ifree_cluster() failing. The first is simple - it checks in-memory inodes it finds in the per-ag icache to see if they are clean without holding the flush lock. if they are clean it skips them completely. However, If an inode is flushed delwri, it will appear clean, but is not guaranteed to be written back until the flush lock has been dropped. Hence we may have raced on the clean check and the inode may actually be dirty. Hence always mark inodes found in memory stale before we check properly if they are clean. The second is more complex, and makes the first problem easier to hit. Basically the in-memory inode scan is done with full knowledge it can be racing with inode flushing and AIl tail pushing, which means that inodes that it can't get the flush lock on might not be attached to the buffer after then in-memory inode scan due to IO completion occurring. This is actually documented in the code as "needs better interlocking". i.e. this is a zero-day bug. Effectively, the in-memory scan must be done while the inode buffer is locked and Io cannot be issued on it while we do the in-memory inode scan. This ensures that inodes we couldn't get the flush lock on are guaranteed to be attached to the cluster buffer, so we can then catch all in-memory inodes and mark them stale. Now that the inode cluster buffer is locked before the in-memory scan is done, there is no need for the two-phase update of the in-memory inodes, so simplify the code into two loops and remove the allocation of the temporary buffer used to hold locked inodes across the phases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_inode.c142
1 files changed, 62 insertions, 80 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6ba7765026f6..d53c39de7d05 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
1940 int blks_per_cluster; 1940 int blks_per_cluster;
1941 int nbufs; 1941 int nbufs;
1942 int ninodes; 1942 int ninodes;
1943 int i, j, found, pre_flushed; 1943 int i, j;
1944 xfs_daddr_t blkno; 1944 xfs_daddr_t blkno;
1945 xfs_buf_t *bp; 1945 xfs_buf_t *bp;
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 struct xfs_perag *pag; 1949 struct xfs_perag *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1961 } 1961 }
1962 1962
1963 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
1964
1965 for (j = 0; j < nbufs; j++, inum += ninodes) { 1963 for (j = 0; j < nbufs; j++, inum += ninodes) {
1964 int found = 0;
1965
1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1967 XFS_INO_TO_AGBNO(mp, inum)); 1967 XFS_INO_TO_AGBNO(mp, inum));
1968 1968
1969 /*
1970 * We obtain and lock the backing buffer first in the process
1971 * here, as we have to ensure that any dirty inode that we
1972 * can't get the flush lock on is attached to the buffer.
1973 * If we scan the in-memory inodes first, then buffer IO can
1974 * complete before we get a lock on it, and hence we may fail
1975 * to mark all the active inodes on the buffer stale.
1976 */
1977 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1978 mp->m_bsize * blks_per_cluster,
1979 XBF_LOCK);
1980
1981 /*
1982 * Walk the inodes already attached to the buffer and mark them
1983 * stale. These will all have the flush locks held, so an
1984 * in-memory inode walk can't lock them.
1985 */
1986 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1987 while (lip) {
1988 if (lip->li_type == XFS_LI_INODE) {
1989 iip = (xfs_inode_log_item_t *)lip;
1990 ASSERT(iip->ili_logged == 1);
1991 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
1992 xfs_trans_ail_copy_lsn(mp->m_ail,
1993 &iip->ili_flush_lsn,
1994 &iip->ili_item.li_lsn);
1995 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1996 found++;
1997 }
1998 lip = lip->li_bio_list;
1999 }
1969 2000
1970 /* 2001 /*
1971 * Look for each inode in memory and attempt to lock it, 2002 * For each inode in memory attempt to add it to the inode
1972 * we can be racing with flush and tail pushing here. 2003 * buffer and set it up for being staled on buffer IO
1973 * any inode we get the locks on, add to an array of 2004 * completion. This is safe as we've locked out tail pushing
1974 * inode items to process later. 2005 * and flushing by locking the buffer.
1975 * 2006 *
1976 * The get the buffer lock, we could beat a flush 2007 * We have already marked every inode that was part of a
1977 * or tail pushing thread to the lock here, in which 2008 * transaction stale above, which means there is no point in
1978 * case they will go looking for the inode buffer 2009 * even trying to lock them.
1979 * and fail, we need some other form of interlock
1980 * here.
1981 */ 2010 */
1982 found = 0;
1983 for (i = 0; i < ninodes; i++) { 2011 for (i = 0; i < ninodes; i++) {
1984 read_lock(&pag->pag_ici_lock); 2012 read_lock(&pag->pag_ici_lock);
1985 ip = radix_tree_lookup(&pag->pag_ici_root, 2013 ip = radix_tree_lookup(&pag->pag_ici_root,
1986 XFS_INO_TO_AGINO(mp, (inum + i))); 2014 XFS_INO_TO_AGINO(mp, (inum + i)));
1987 2015
1988 /* Inode not in memory or we found it already, 2016 /* Inode not in memory or stale, nothing to do */
1989 * nothing to do
1990 */
1991 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2017 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
1992 read_unlock(&pag->pag_ici_lock); 2018 read_unlock(&pag->pag_ici_lock);
1993 continue; 2019 continue;
1994 } 2020 }
1995 2021
1996 if (xfs_inode_clean(ip)) { 2022 /* don't try to lock/unlock the current inode */
1997 read_unlock(&pag->pag_ici_lock); 2023 if (ip != free_ip &&
1998 continue; 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1999 }
2000
2001 /* If we can get the locks then add it to the
2002 * list, otherwise by the time we get the bp lock
2003 * below it will already be attached to the
2004 * inode buffer.
2005 */
2006
2007 /* This inode will already be locked - by us, lets
2008 * keep it that way.
2009 */
2010
2011 if (ip == free_ip) {
2012 if (xfs_iflock_nowait(ip)) {
2013 xfs_iflags_set(ip, XFS_ISTALE);
2014 if (xfs_inode_clean(ip)) {
2015 xfs_ifunlock(ip);
2016 } else {
2017 ip_found[found++] = ip;
2018 }
2019 }
2020 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2021 continue; 2026 continue;
2022 } 2027 }
2028 read_unlock(&pag->pag_ici_lock);
2023 2029
2024 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2030 if (!xfs_iflock_nowait(ip)) {
2025 if (xfs_iflock_nowait(ip)) { 2031 if (ip != free_ip)
2026 xfs_iflags_set(ip, XFS_ISTALE);
2027
2028 if (xfs_inode_clean(ip)) {
2029 xfs_ifunlock(ip);
2030 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2031 } else {
2032 ip_found[found++] = ip;
2033 }
2034 } else {
2035 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2036 } 2033 continue;
2037 } 2034 }
2038 read_unlock(&pag->pag_ici_lock);
2039 }
2040 2035
2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2036 xfs_iflags_set(ip, XFS_ISTALE);
2042 mp->m_bsize * blks_per_cluster, 2037 if (xfs_inode_clean(ip)) {
2043 XBF_LOCK); 2038 ASSERT(ip != free_ip);
2044 2039 xfs_ifunlock(ip);
2045 pre_flushed = 0; 2040 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2041 continue;
2047 while (lip) {
2048 if (lip->li_type == XFS_LI_INODE) {
2049 iip = (xfs_inode_log_item_t *)lip;
2050 ASSERT(iip->ili_logged == 1);
2051 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2052 xfs_trans_ail_copy_lsn(mp->m_ail,
2053 &iip->ili_flush_lsn,
2054 &iip->ili_item.li_lsn);
2055 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2056 pre_flushed++;
2057 } 2042 }
2058 lip = lip->li_bio_list;
2059 }
2060 2043
2061 for (i = 0; i < found; i++) {
2062 ip = ip_found[i];
2063 iip = ip->i_itemp; 2044 iip = ip->i_itemp;
2064
2065 if (!iip) { 2045 if (!iip) {
2046 /* inode with unlogged changes only */
2047 ASSERT(ip != free_ip);
2066 ip->i_update_core = 0; 2048 ip->i_update_core = 0;
2067 xfs_ifunlock(ip); 2049 xfs_ifunlock(ip);
2068 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2050 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2069 continue; 2051 continue;
2070 } 2052 }
2053 found++;
2071 2054
2072 iip->ili_last_fields = iip->ili_format.ilf_fields; 2055 iip->ili_last_fields = iip->ili_format.ilf_fields;
2073 iip->ili_format.ilf_fields = 0; 2056 iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
2078 xfs_buf_attach_iodone(bp, 2061 xfs_buf_attach_iodone(bp,
2079 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2062 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2080 xfs_istale_done, (xfs_log_item_t *)iip); 2063 xfs_istale_done, (xfs_log_item_t *)iip);
2081 if (ip != free_ip) { 2064
2065 if (ip != free_ip)
2082 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2066 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2083 }
2084 } 2067 }
2085 2068
2086 if (found || pre_flushed) 2069 if (found)
2087 xfs_trans_stale_inode_buf(tp, bp); 2070 xfs_trans_stale_inode_buf(tp, bp);
2088 xfs_trans_binval(tp, bp); 2071 xfs_trans_binval(tp, bp);
2089 } 2072 }
2090 2073
2091 kmem_free(ip_found);
2092 xfs_perag_put(pag); 2074 xfs_perag_put(pag);
2093} 2075}
2094 2076