diff options
author | Vlad Apostolov <vapo@sgi.com> | 2007-10-11 03:44:18 -0400 |
---|---|---|
committer | Tim Shimmin <tes@chook.melbourne.sgi.com> | 2007-10-15 22:21:15 -0400 |
commit | 859d718279b6e1d6bc27a701db47c1be720b5907 (patch) | |
tree | a3b867865c5fcd44fdf5f1f1a0c960a2deb173d6 | |
parent | ba532a980b7dcccf5eebd2cd409a9cb37faa2bb4 (diff) |
[XFS] get_bulkall() could return incorrect inode state
In the following scenario xfs_bulkstat() returns incorrect stale inode
state:
1. File_A is created and its inode is synced to disk.
2. File_A is unlinked and doesn't exist anymore.
3. Filesystem sync is invoked.
4. File_B is created. File_B happens to reclaim File_A's inode.
5. xfs_bulkstat() is called and detects File_B but reports the
   incorrect File_A inode state.
The explanation for the incorrect inode state is that, for performance
reasons, inodes are not immediately synced on file create. This leaves the
on-disk inode buffer uninitialized (or with old state from a previous
generation inode) and this is what xfs_bulkstat() would report.
The patch marks the on-disk inode buffer "dirty" on unlink. When the inode
is reclaimed (by a new file create), xfs_bulkstat() would filter this
inode by the "dirty" mark. Once the inode is flushed to disk, the on-disk
buffer "dirty" mark is automatically removed and a following
xfs_bulkstat() would return the correct inode state.
Marking the on-disk inode buffer "dirty" on unlink is achieved by setting
the on-disk di_nlink field to 0. Note that the in-core di_nlink has
already been set to 0 and a corresponding transaction logged by
xfs_droplink(). This is an exception to the rule that any on-disk inode
buffer change has to be followed by a disk write (inode flush).
Synchronizing the in-core to on-disk di_nlink values in advance (before
the actual inode flush to disk) should be fine in this case because the
inode is already unlinked and it would never change its di_nlink again for
this inode generation.
SGI-PV: 970842
SGI-Modid: xfs-linux-melb:xfs-kern:29757a
Signed-off-by: Vlad Apostolov <vapo@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Mark Goodwin <markgw@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
-rw-r--r-- | fs/xfs/xfs_inode.c | 26 | ||||
-rw-r--r-- | fs/xfs/xfs_itable.c | 10 |
2 files changed, 29 insertions, 7 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3d8ba8fec191..abf509a88915 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -1931,9 +1931,9 @@ xfs_iunlink( | |||
1931 | */ | 1931 | */ |
1932 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, | 1932 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, |
1933 | XFS_FSS_TO_BB(mp, 1), 0, &agibp); | 1933 | XFS_FSS_TO_BB(mp, 1), 0, &agibp); |
1934 | if (error) { | 1934 | if (error) |
1935 | return error; | 1935 | return error; |
1936 | } | 1936 | |
1937 | /* | 1937 | /* |
1938 | * Validate the magic number of the agi block. | 1938 | * Validate the magic number of the agi block. |
1939 | */ | 1939 | */ |
@@ -1957,6 +1957,24 @@ xfs_iunlink( | |||
1957 | ASSERT(agi->agi_unlinked[bucket_index]); | 1957 | ASSERT(agi->agi_unlinked[bucket_index]); |
1958 | ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); | 1958 | ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); |
1959 | 1959 | ||
1960 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | ||
1961 | if (error) | ||
1962 | return error; | ||
1963 | |||
1964 | /* | ||
1965 | * Clear the on-disk di_nlink. This is to prevent xfs_bulkstat | ||
1966 | * from picking up this inode when it is reclaimed (its incore state | ||
1967 | * initialzed but not flushed to disk yet). The in-core di_nlink is | ||
1968 | * already cleared in xfs_droplink() and a corresponding transaction | ||
1969 | * logged. The hack here just synchronizes the in-core to on-disk | ||
1970 | * di_nlink value in advance before the actual inode sync to disk. | ||
1971 | * This is OK because the inode is already unlinked and would never | ||
1972 | * change its di_nlink again for this inode generation. | ||
1973 | * This is a temporary hack that would require a proper fix | ||
1974 | * in the future. | ||
1975 | */ | ||
1976 | dip->di_core.di_nlink = 0; | ||
1977 | |||
1960 | if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { | 1978 | if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { |
1961 | /* | 1979 | /* |
1962 | * There is already another inode in the bucket we need | 1980 | * There is already another inode in the bucket we need |
@@ -1964,10 +1982,6 @@ xfs_iunlink( | |||
1964 | * Here we put the head pointer into our next pointer, | 1982 | * Here we put the head pointer into our next pointer, |
1965 | * and then we fall through to point the head at us. | 1983 | * and then we fall through to point the head at us. |
1966 | */ | 1984 | */ |
1967 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | ||
1968 | if (error) { | ||
1969 | return error; | ||
1970 | } | ||
1971 | ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); | 1985 | ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); |
1972 | /* both on-disk, don't endian flip twice */ | 1986 | /* both on-disk, don't endian flip twice */ |
1973 | dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; | 1987 | dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; |
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index efeeafe275b9..1edd9afb664b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
@@ -290,8 +290,16 @@ xfs_bulkstat_use_dinode( | |||
290 | return 1; | 290 | return 1; |
291 | dip = (xfs_dinode_t *) | 291 | dip = (xfs_dinode_t *) |
292 | xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog); | 292 | xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog); |
293 | /* | ||
294 | * Check the buffer containing the on-disk inode for di_nlink == 0. | ||
295 | * This is to prevent xfs_bulkstat from picking up just reclaimed | ||
296 | * inodes that have their in-core state initialized but not flushed | ||
297 | * to disk yet. This is a temporary hack that would require a proper | ||
298 | * fix in the future. | ||
299 | */ | ||
293 | if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || | 300 | if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || |
294 | !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version)) | 301 | !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) || |
302 | !dip->di_core.di_nlink) | ||
295 | return 0; | 303 | return 0; |
296 | if (flags & BULKSTAT_FG_QUICK) { | 304 | if (flags & BULKSTAT_FG_QUICK) { |
297 | *dipp = dip; | 305 | *dipp = dip; |