aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vlad Apostolov <vapo@sgi.com> 2007-10-11 03:44:18 -0400
committer Tim Shimmin <tes@chook.melbourne.sgi.com> 2007-10-15 22:21:15 -0400
commit 859d718279b6e1d6bc27a701db47c1be720b5907 (patch)
tree a3b867865c5fcd44fdf5f1f1a0c960a2deb173d6
parent ba532a980b7dcccf5eebd2cd409a9cb37faa2bb4 (diff)
[XFS] get_bulkall() could return incorrect inode state
In the following scenario xfs_bulkstat() returns incorrect, stale inode state:

1. File_A is created and its inode synced to disk.
2. File_A is unlinked and doesn't exist anymore.
3. Filesystem sync is invoked.
4. File_B is created. File_B happens to reclaim File_A's inode.
5. xfs_bulkstat() is called and detects File_B but reports the incorrect File_A inode state.

The explanation for the incorrect inode state is that inodes are not immediately synced on file create for performance reasons. This leaves the on-disk inode buffer uninitialized (or with old state from a previous generation inode) and this is what xfs_bulkstat() would report.

The patch marks the on-disk inode buffer "dirty" on unlink. When the inode is reclaimed (by a new file create), xfs_bulkstat() would filter this inode by the "dirty" mark. Once the inode is flushed to disk, the on-disk buffer "dirty" mark is automatically removed and a following xfs_bulkstat() would return the correct inode state.

Marking the on-disk inode buffer "dirty" on unlink is achieved by setting the on-disk di_nlink field to 0. Note that the in-core di_nlink has already been set to 0 and a corresponding transaction logged by xfs_droplink(). This is an exception from the rule that any on-disk inode buffer change has to be followed by a disk write (inode flush). Synchronizing the in-core to on-disk di_nlink values in advance (before the actual inode flush to disk) should be fine in this case because the inode is already unlinked and it would never change its di_nlink again for this inode generation.

SGI-PV: 970842
SGI-Modid: xfs-linux-melb:xfs-kern:29757a
Signed-off-by: Vlad Apostolov <vapo@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Mark Goodwin <markgw@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
-rw-r--r-- fs/xfs/xfs_inode.c 26
-rw-r--r-- fs/xfs/xfs_itable.c 10
2 files changed, 29 insertions, 7 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3d8ba8fec191..abf509a88915 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1931,9 +1931,9 @@ xfs_iunlink(
1931 */ 1931 */
1932 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1932 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1933 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1933 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1934 if (error) { 1934 if (error)
1935 return error; 1935 return error;
1936 } 1936
1937 /* 1937 /*
1938 * Validate the magic number of the agi block. 1938 * Validate the magic number of the agi block.
1939 */ 1939 */
@@ -1957,6 +1957,24 @@ xfs_iunlink(
1957 ASSERT(agi->agi_unlinked[bucket_index]); 1957 ASSERT(agi->agi_unlinked[bucket_index]);
1958 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); 1958 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1959 1959
1960 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
1961 if (error)
1962 return error;
1963
1964 /*
1965 * Clear the on-disk di_nlink. This is to prevent xfs_bulkstat
1966 * from picking up this inode when it is reclaimed (its incore state
1967 * initialized but not flushed to disk yet). The in-core di_nlink is
1968 * already cleared in xfs_droplink() and a corresponding transaction
1969 * logged. The hack here just synchronizes the in-core to on-disk
1970 * di_nlink value in advance before the actual inode sync to disk.
1971 * This is OK because the inode is already unlinked and would never
1972 * change its di_nlink again for this inode generation.
1973 * This is a temporary hack that would require a proper fix
1974 * in the future.
1975 */
1976 dip->di_core.di_nlink = 0;
1977
1960 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { 1978 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) {
1961 /* 1979 /*
1962 * There is already another inode in the bucket we need 1980 * There is already another inode in the bucket we need
@@ -1964,10 +1982,6 @@ xfs_iunlink(
1964 * Here we put the head pointer into our next pointer, 1982 * Here we put the head pointer into our next pointer,
1965 * and then we fall through to point the head at us. 1983 * and then we fall through to point the head at us.
1966 */ 1984 */
1967 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
1968 if (error) {
1969 return error;
1970 }
1971 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1985 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
1972 /* both on-disk, don't endian flip twice */ 1986 /* both on-disk, don't endian flip twice */
1973 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1987 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index efeeafe275b9..1edd9afb664b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -290,8 +290,16 @@ xfs_bulkstat_use_dinode(
290 return 1; 290 return 1;
291 dip = (xfs_dinode_t *) 291 dip = (xfs_dinode_t *)
292 xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog); 292 xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
293 /*
294 * Check the buffer containing the on-disk inode for di_nlink == 0.
295 * This is to prevent xfs_bulkstat from picking up just reclaimed
296 * inodes that have their in-core state initialized but not flushed
297 * to disk yet. This is a temporary hack that would require a proper
298 * fix in the future.
299 */
293 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || 300 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC ||
294 !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version)) 301 !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) ||
302 !dip->di_core.di_nlink)
295 return 0; 303 return 0;
296 if (flags & BULKSTAT_FG_QUICK) { 304 if (flags & BULKSTAT_FG_QUICK) {
297 *dipp = dip; 305 *dipp = dip;