aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorBrian Foster <bfoster@redhat.com>2016-01-04 15:40:16 -0500
committerDave Chinner <david@fromorbit.com>2016-01-04 15:40:16 -0500
commit7088c4136fa1cba26531fde40bdcfcf3d2ccd533 (patch)
treefad3a925305250a3628c0c7dc7fedac65843a2fe /fs/xfs
parenteed6b462fb2a2661a416c227be6498b0ea2a7aab (diff)
xfs: detect and trim torn writes during log recovery
Certain types of storage, such as persistent memory, do not provide sector atomicity for writes. This means that if a crash occurs while XFS is writing log records, only part of those records might make it to the storage. This is problematic because log recovery uses the cycle value packed at the top of each log block to locate the head/tail of the log. This can lead to CRC verification failures during log recovery and an unmountable fs for a filesystem that is otherwise consistent. Update log recovery to incorporate log record CRC verification as part of the head/tail discovery process. Once the head is located via the traditional algorithm, run a CRC-only pass over the records up to the head of the log. If CRC verification fails, assume that the records are torn as a matter of policy and trim the head block back to the start of the first bad record. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_log_recover.c309
1 files changed, 289 insertions, 20 deletions
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 423c36dbcdea..26e67b4450cc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -61,6 +61,9 @@ xlog_recover_check_summary(
61#else 61#else
62#define xlog_recover_check_summary(log) 62#define xlog_recover_check_summary(log)
63#endif 63#endif
64STATIC int
65xlog_do_recovery_pass(
66 struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
64 67
65/* 68/*
66 * This structure is used during recovery to record the buf log items which 69 * This structure is used during recovery to record the buf log items which
@@ -941,6 +944,278 @@ out_error:
941} 944}
942 945
943/* 946/*
947 * Seek forward in the log for log record headers.
948 *
949 * Given head and tail blocks, walk forward from the tail block until we find
950 * the provided number of records or hit the head block. The return value is the
951 * number of records encountered or a negative error code. The log block and
952 * buffer pointer of the last record seen are returned in rblk and rhead
953 * respectively.
954 */
955STATIC int
956xlog_seek_logrec_hdr(
957 struct xlog *log,
958 xfs_daddr_t head_blk,
959 xfs_daddr_t tail_blk,
960 int count,
961 struct xfs_buf *bp,
962 xfs_daddr_t *rblk,
963 struct xlog_rec_header **rhead,
964 bool *wrapped)
965{
966 int i;
967 int error;
968 int found = 0;
969 char *offset = NULL;
970 xfs_daddr_t end_blk;
971
972 *wrapped = false;
973
974 /*
975 * Walk forward from the tail block until we hit the head or the last
976 * block in the log.
977 */
978 end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
979 for (i = (int) tail_blk; i <= end_blk; i++) {
980 error = xlog_bread(log, i, 1, bp, &offset);
981 if (error)
982 goto out_error;
983
984 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
985 *rblk = i;
986 *rhead = (struct xlog_rec_header *) offset;
987 if (++found == count)
988 break;
989 }
990 }
991
992 /*
993 * If we haven't hit the head block or the log record header count,
994 * start looking again from the start of the physical log.
995 */
996 if (tail_blk > head_blk && found != count) {
997 for (i = 0; i < (int) head_blk; i++) {
998 error = xlog_bread(log, i, 1, bp, &offset);
999 if (error)
1000 goto out_error;
1001
1002 if (*(__be32 *)offset ==
1003 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
1004 *wrapped = true;
1005 *rblk = i;
1006 *rhead = (struct xlog_rec_header *) offset;
1007 if (++found == count)
1008 break;
1009 }
1010 }
1011 }
1012
1013 return found;
1014
1015out_error:
1016 return error;
1017}
1018
1019/*
1020 * Check the log tail for torn writes. This is required when torn writes are
1021 * detected at the head and the head had to be walked back to a previous record.
1022 * The tail of the previous record must now be verified to ensure the torn
1023 * writes didn't corrupt the previous tail.
1024 *
1025 * Return an error if CRC verification fails as recovery cannot proceed.
1026 */
1027STATIC int
1028xlog_verify_tail(
1029 struct xlog *log,
1030 xfs_daddr_t head_blk,
1031 xfs_daddr_t tail_blk)
1032{
1033 struct xlog_rec_header *thead;
1034 struct xfs_buf *bp;
1035 xfs_daddr_t first_bad;
1036 int count;
1037 int error = 0;
1038 bool wrapped;
1039 xfs_daddr_t tmp_head;
1040
1041 bp = xlog_get_bp(log, 1);
1042 if (!bp)
1043 return -ENOMEM;
1044
1045 /*
1046 * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
1047 * a temporary head block that points after the last possible
1048 * concurrently written record of the tail.
1049 */
1050 count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
1051 XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
1052 &wrapped);
1053 if (count < 0) {
1054 error = count;
1055 goto out;
1056 }
1057
1058 /*
1059 * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
1060 * into the actual log head. tmp_head points to the start of the record
1061 * so update it to the actual head block.
1062 */
1063 if (count < XLOG_MAX_ICLOGS + 1)
1064 tmp_head = head_blk;
1065
1066 /*
1067 * We now have a tail and temporary head block that covers at least
1068 * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
1069 * records were completely written. Run a CRC verification pass from
1070 * tail to head and return the result.
1071 */
1072 error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
1073 XLOG_RECOVER_CRCPASS, &first_bad);
1074
1075out:
1076 xlog_put_bp(bp);
1077 return error;
1078}
1079
1080/*
1081 * Detect and trim torn writes from the head of the log.
1082 *
1083 * Storage without sector atomicity guarantees can result in torn writes in the
1084 * log in the event of a crash. Our only means to detect this scenario is via
1085 * CRC verification. While we can't always be certain that CRC verification
1086 * failure is due to a torn write vs. an unrelated corruption, we do know that
1087 * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1088 * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1089 * the log and treat failures in this range as torn writes as a matter of
1090 * policy. In the event of CRC failure, the head is walked back to the last good
1091 * record in the log and the tail is updated from that record and verified.
1092 */
1093STATIC int
1094xlog_verify_head(
1095 struct xlog *log,
1096 xfs_daddr_t *head_blk, /* in/out: unverified head */
1097 xfs_daddr_t *tail_blk, /* out: tail block */
1098 struct xfs_buf *bp,
1099 xfs_daddr_t *rhead_blk, /* start blk of last record */
1100 struct xlog_rec_header **rhead, /* ptr to last record */
1101 bool *wrapped) /* last rec. wraps phys. log */
1102{
1103 struct xlog_rec_header *tmp_rhead;
1104 struct xfs_buf *tmp_bp;
1105 xfs_daddr_t first_bad;
1106 xfs_daddr_t tmp_rhead_blk;
1107 int found;
1108 int error;
1109 bool tmp_wrapped;
1110
1111 /*
1112 * Search backwards through the log looking for the log record header
1113 * block. This wraps all the way back around to the head so something is
1114 * seriously wrong if we can't find it.
1115 */
1116 found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
1117 rhead, wrapped);
1118 if (found < 0)
1119 return found;
1120 if (!found) {
1121 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1122 return -EIO;
1123 }
1124
1125 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1126
1127 /*
1128 * Now that we have a tail block, check the head of the log for torn
1129 * writes. Search again until we hit the tail or the maximum number of
1130 * log record I/Os that could have been in flight at one time. Use a
1131 * temporary buffer so we don't trash the rhead/bp pointer from the
1132 * call above.
1133 */
1134 tmp_bp = xlog_get_bp(log, 1);
1135 if (!tmp_bp)
1136 return -ENOMEM;
1137 error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1138 XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
1139 &tmp_rhead, &tmp_wrapped);
1140 xlog_put_bp(tmp_bp);
1141 if (error < 0)
1142 return error;
1143
1144 /*
1145 * Now run a CRC verification pass over the records starting at the
1146 * block found above to the current head. If a CRC failure occurs, the
1147 * log block of the first bad record is saved in first_bad.
1148 */
1149 error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1150 XLOG_RECOVER_CRCPASS, &first_bad);
1151 if (error == -EFSBADCRC) {
1152 /*
1153 * We've hit a potential torn write. Reset the error and warn
1154 * about it.
1155 */
1156 error = 0;
1157 xfs_warn(log->l_mp,
1158"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1159 first_bad, *head_blk);
1160
1161 /*
1162 * Get the header block and buffer pointer for the last good
1163 * record before the bad record.
1164 *
1165 * Note that xlog_find_tail() clears the blocks at the new head
1166 * (i.e., the records with invalid CRC) if the cycle number
 1167 * matches the current cycle.
1168 */
1169 found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
1170 rhead_blk, rhead, wrapped);
1171 if (found < 0)
1172 return found;
1173 if (found == 0) /* XXX: right thing to do here? */
1174 return -EIO;
1175
1176 /*
1177 * Reset the head block to the starting block of the first bad
1178 * log record and set the tail block based on the last good
1179 * record.
1180 *
1181 * Bail out if the updated head/tail match as this indicates
1182 * possible corruption outside of the acceptable
1183 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1184 */
1185 *head_blk = first_bad;
1186 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1187 if (*head_blk == *tail_blk) {
1188 ASSERT(0);
1189 return 0;
1190 }
1191
1192 /*
1193 * Now verify the tail based on the updated head. This is
1194 * required because the torn writes trimmed from the head could
1195 * have been written over the tail of a previous record. Return
1196 * any errors since recovery cannot proceed if the tail is
1197 * corrupt.
1198 *
1199 * XXX: This leaves a gap in truly robust protection from torn
1200 * writes in the log. If the head is behind the tail, the tail
1201 * pushes forward to create some space and then a crash occurs
1202 * causing the writes into the previous record's tail region to
 1203 * tear, then log recovery isn't able to recover.
1204 *
1205 * How likely is this to occur? If possible, can we do something
1206 * more intelligent here? Is it safe to push the tail forward if
1207 * we can determine that the tail is within the range of the
1208 * torn write (e.g., the kernel can only overwrite the tail if
1209 * it has actually been pushed forward)? Alternatively, could we
1210 * somehow prevent this condition at runtime?
1211 */
1212 error = xlog_verify_tail(log, *head_blk, *tail_blk);
1213 }
1214
1215 return error;
1216}
1217
1218/*
944 * Find the sync block number or the tail of the log. 1219 * Find the sync block number or the tail of the log.
945 * 1220 *
946 * This will be the block number of the last record to have its 1221 * This will be the block number of the last record to have its
@@ -966,9 +1241,10 @@ xlog_find_tail(
966 xlog_op_header_t *op_head; 1241 xlog_op_header_t *op_head;
967 char *offset = NULL; 1242 char *offset = NULL;
968 xfs_buf_t *bp; 1243 xfs_buf_t *bp;
969 int error, i, found; 1244 int error;
970 xfs_daddr_t umount_data_blk; 1245 xfs_daddr_t umount_data_blk;
971 xfs_daddr_t after_umount_blk; 1246 xfs_daddr_t after_umount_blk;
1247 xfs_daddr_t rhead_blk;
972 xfs_lsn_t tail_lsn; 1248 xfs_lsn_t tail_lsn;
973 int hblks; 1249 int hblks;
974 bool wrapped = false; 1250 bool wrapped = false;
@@ -995,24 +1271,16 @@ xlog_find_tail(
995 } 1271 }
996 1272
997 /* 1273 /*
998 * Search backwards through the log looking for the log record header 1274 * Trim the head block back to skip over torn records. We can have
999 * block. This wraps all the way back around to the head so something is 1275 * multiple log I/Os in flight at any time, so we assume CRC failures
1000 * seriously wrong if we can't find it. 1276 * back through the previous several records are torn writes and skip
1277 * them.
1001 */ 1278 */
1002 ASSERT(*head_blk < INT_MAX); 1279 ASSERT(*head_blk < INT_MAX);
1003 found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, &i, 1280 error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
1004 &rhead, &wrapped); 1281 &rhead, &wrapped);
1005 if (found < 0) { 1282 if (error)
1006 error = found;
1007 goto done; 1283 goto done;
1008 }
1009 if (!found) {
1010 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1011 xlog_put_bp(bp);
1012 ASSERT(0);
1013 return -EIO;
1014 }
1015 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1016 1284
1017 /* 1285 /*
1018 * Reset log values according to the state of the log when we 1286 * Reset log values according to the state of the log when we
@@ -1024,7 +1292,7 @@ xlog_find_tail(
1024 * written was complete and ended exactly on the end boundary 1292 * written was complete and ended exactly on the end boundary
1025 * of the physical log. 1293 * of the physical log.
1026 */ 1294 */
1027 log->l_prev_block = i; 1295 log->l_prev_block = rhead_blk;
1028 log->l_curr_block = (int)*head_blk; 1296 log->l_curr_block = (int)*head_blk;
1029 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 1297 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1030 if (wrapped) 1298 if (wrapped)
@@ -1062,12 +1330,13 @@ xlog_find_tail(
1062 } else { 1330 } else {
1063 hblks = 1; 1331 hblks = 1;
1064 } 1332 }
1065 after_umount_blk = (i + hblks + (int) 1333 after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
1066 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 1334 after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
1067 tail_lsn = atomic64_read(&log->l_tail_lsn); 1335 tail_lsn = atomic64_read(&log->l_tail_lsn);
1068 if (*head_blk == after_umount_blk && 1336 if (*head_blk == after_umount_blk &&
1069 be32_to_cpu(rhead->h_num_logops) == 1) { 1337 be32_to_cpu(rhead->h_num_logops) == 1) {
1070 umount_data_blk = (i + hblks) % log->l_logBBsize; 1338 umount_data_blk = rhead_blk + hblks;
1339 umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
1071 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 1340 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1072 if (error) 1341 if (error)
1073 goto done; 1342 goto done;