aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-07-24 01:47:30 -0400
committerBen Myers <bpm@sgi.com>2013-07-25 11:41:42 -0400
commite1b4271ac261b290fdab51446996fb13e68a57be (patch)
treefd9e9172816960380e8ce7ef46b26ed44fc1c97a /fs
parentad81f0545ef01ea651886dddac4bef6cec930092 (diff)
xfs: di_flushiter considered harmful
When we made all inode updates transactional, we no longer needed the log recovery detection for inodes being newer on disk than the transaction being replayed - it was redundant as replay of the log would always result in the latest version of the inode would be on disk. It was redundant, but left in place because it wasn't considered to be a problem. However, with the new "don't read inodes on create" optimisation, flushiter has come back to bite us. Essentially, the optimisation made always initialises flushiter to zero in the create transaction, and so if we then crash and run recovery and the inode already on disk has a non-zero flushiter it will skip recovery of that inode. As a result, log recovery does the wrong thing and we end up with a corrupt filesystem. Because we have to support old kernel to new kernel upgrades, we can't just get rid of the flushiter support in log recovery as we might be upgrading from a kernel that doesn't have fully transactional inode updates. Unfortunately, for v4 superblocks there is no way to guarantee that log recovery knows about this fact. We cannot add a new inode format flag to say it's a "special inode create" because it won't be understood by older kernels and so recovery could do the wrong thing on downgrade. We cannot specially detect the combination of zero mode/non-zero flushiter on disk to non-zero mode, zero flushiter in the log item during recovery because wrapping of the flushiter can result in false detection. Hence that makes this "don't use flushiter" optimisation limited to a disk format that guarantees that we don't need it. And that means the only fix here is to limit the "no read IO on create" optimisation to version 5 superblocks.... Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com> (cherry picked from commit e60896d8f2b81412421953e14d3feb14177edb56)
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/xfs_dinode.h3
-rw-r--r--fs/xfs/xfs_inode.c31
-rw-r--r--fs/xfs/xfs_log_recover.c13
3 files changed, 36 insertions, 11 deletions
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 07d735a80a0f..e5869b50dc41 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -39,6 +39,9 @@ typedef struct xfs_timestamp {
39 * There is a very similar struct icdinode in xfs_inode which matches the 39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native 40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian. 41 * format instead of big endian.
42 *
43 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
44 * padding field for v3 inodes.
42 */ 45 */
43typedef struct xfs_dinode { 46typedef struct xfs_dinode {
44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b78481f99d9d..bb262c25c8de 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -896,7 +896,6 @@ xfs_dinode_to_disk(
896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
899 to->di_flushiter = cpu_to_be16(from->di_flushiter);
900 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 899 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
901 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 900 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
902 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 901 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
@@ -924,6 +923,9 @@ xfs_dinode_to_disk(
924 to->di_lsn = cpu_to_be64(from->di_lsn); 923 to->di_lsn = cpu_to_be64(from->di_lsn);
925 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); 924 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
926 uuid_copy(&to->di_uuid, &from->di_uuid); 925 uuid_copy(&to->di_uuid, &from->di_uuid);
926 to->di_flushiter = 0;
927 } else {
928 to->di_flushiter = cpu_to_be16(from->di_flushiter);
927 } 929 }
928} 930}
929 931
@@ -1029,10 +1031,14 @@ xfs_dinode_calc_crc(
1029/* 1031/*
1030 * Read the disk inode attributes into the in-core inode structure. 1032 * Read the disk inode attributes into the in-core inode structure.
1031 * 1033 *
1032 * If we are initialising a new inode and we are not utilising the 1034 * For version 5 superblocks, if we are initialising a new inode and we are not
1033 * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core 1035 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
1034 * with a random generation number. If we are keeping inodes around, we need to 1036 * inode core with a random generation number. If we are keeping inodes around,
1035 * read the inode cluster to get the existing generation number off disk. 1037 * we need to read the inode cluster to get the existing generation number off
1038 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
1039 * format) then log recovery is dependent on the di_flushiter field being
1040 * initialised from the current on-disk value and hence we must also read the
1041 * inode off disk.
1036 */ 1042 */
1037int 1043int
1038xfs_iread( 1044xfs_iread(
@@ -1054,6 +1060,7 @@ xfs_iread(
1054 1060
1055 /* shortcut IO on inode allocation if possible */ 1061 /* shortcut IO on inode allocation if possible */
1056 if ((iget_flags & XFS_IGET_CREATE) && 1062 if ((iget_flags & XFS_IGET_CREATE) &&
1063 xfs_sb_version_hascrc(&mp->m_sb) &&
1057 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 1064 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1058 /* initialise the on-disk inode core */ 1065 /* initialise the on-disk inode core */
1059 memset(&ip->i_d, 0, sizeof(ip->i_d)); 1066 memset(&ip->i_d, 0, sizeof(ip->i_d));
@@ -2882,12 +2889,18 @@ xfs_iflush_int(
2882 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2889 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2883 goto corrupt_out; 2890 goto corrupt_out;
2884 } 2891 }
2892
2885 /* 2893 /*
2886 * bump the flush iteration count, used to detect flushes which 2894 * Inode item log recovery for v1/v2 inodes are dependent on the
2887 * postdate a log record during recovery. This is redundant as we now 2895 * di_flushiter count for correct sequencing. We bump the flush
2888 * log every change and hence this can't happen. Still, it doesn't hurt. 2896 * iteration count so we can detect flushes which postdate a log record
2897 * during recovery. This is redundant as we now log every change and
2898 * hence this can't happen but we need to still do it to ensure
2899 * backwards compatibility with old kernels that predate logging all
2900 * inode changes.
2889 */ 2901 */
2890 ip->i_d.di_flushiter++; 2902 if (ip->i_d.di_version < 3)
2903 ip->i_d.di_flushiter++;
2891 2904
2892 /* 2905 /*
2893 * Copy the dirty parts of the inode into the on-disk 2906 * Copy the dirty parts of the inode into the on-disk
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6fcc910a50b9..7681b19aa5dc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2592,8 +2592,16 @@ xlog_recover_inode_pass2(
2592 goto error; 2592 goto error;
2593 } 2593 }
2594 2594
2595 /* Skip replay when the on disk inode is newer than the log one */ 2595 /*
2596 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2596 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2597 * are transactional and if ordering is necessary we can determine that
2598 * more accurately by the LSN field in the V3 inode core. Don't trust
2599 * the inode versions we might be changing them here - use the
2600 * superblock flag to determine whether we need to look at di_flushiter
2601 * to skip replay when the on disk inode is newer than the log one
2602 */
2603 if (!xfs_sb_version_hascrc(&mp->m_sb) &&
2604 dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2597 /* 2605 /*
2598 * Deal with the wrap case, DI_MAX_FLUSH is less 2606 * Deal with the wrap case, DI_MAX_FLUSH is less
2599 * than smaller numbers 2607 * than smaller numbers
@@ -2608,6 +2616,7 @@ xlog_recover_inode_pass2(
2608 goto error; 2616 goto error;
2609 } 2617 }
2610 } 2618 }
2619
2611 /* Take the opportunity to reset the flush iteration count */ 2620 /* Take the opportunity to reset the flush iteration count */
2612 dicp->di_flushiter = 0; 2621 dicp->di_flushiter = 0;
2613 2622