author	Dave Chinner <dchinner@redhat.com>	2013-08-28 07:22:47 -0400
committer	Ben Myers <bpm@sgi.com>	2013-08-30 14:44:53 -0400
commit	50d5c8d8e938e3c4c0d21db9fc7d64282dc7be20
tree	f3695befa5404b0abd5f1e18ddfcb59d97943401
parent	b58fa554e9b940083a0691f7234c13240fc09377
xfs: check LSN ordering for v5 superblocks during recovery
Log recovery has some strict ordering requirements which unordered or
reordered metadata writeback can defeat. This can occur when an item is
logged in a transaction, written back to disk, and then logged in a new
transaction before the tail of the log is moved past the original
modification.

The result of this is that when we read an object off disk for recovery
purposes, the buffer that we read may not contain the object type that
recovery is expecting, and hence at the end of the checkpoint being
recovered we have an invalid object in memory. This isn't usually a
problem, as recovery will then replay all the other checkpoints and that
brings the object back to a valid and correct state, but the issue is
that while the object is in the invalid state it can be flushed to disk.
This results in the object verifier failing and triggering a corruption
shutdown of log recovery. This is correct behaviour for the verifiers -
the problem is that we are not detecting that the object we've read off
disk is newer than the transaction we are replaying.

All metadata in v5 filesystems has the LSN of its last modification
stamped in it. This enables log recovery to read that field and determine
the age of the object on disk correctly. If the LSN of the object on disk
is older than the transaction being replayed, then we replay the
modification. If the LSN of the object matches or is more recent than the
transaction's LSN, then we should avoid overwriting the object as that is
what leads to the transient corrupt state.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
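The decision the patch makes at each replay site reduces to a single ordering test against the LSN stamped in the on-disk object. The stand-alone sketch below restates that test; the xfs_lsn_t typedef mirrors the kernel's 64-bit cycle/block encoding, but the helper name should_replay() and the plain signed comparison standing in for the kernel's XFS_LSN_CMP() macro are illustrative assumptions, not code from this patch.

#include <stdbool.h>
#include <stdint.h>

typedef int64_t xfs_lsn_t;	/* high 32 bits: log cycle, low 32 bits: block */

/*
 * Illustrative sketch only: decide whether a logged change should be
 * replayed over an on-disk object. disk_lsn is the LSN stamped in the
 * object's header; current_lsn is the LSN of the transaction being
 * replayed. A plain signed comparison stands in for XFS_LSN_CMP().
 */
static bool
should_replay(
	xfs_lsn_t	disk_lsn,
	xfs_lsn_t	current_lsn)
{
	/* No stamped LSN (0) or unrecognised block (-1): always replay. */
	if (disk_lsn == 0 || disk_lsn == (xfs_lsn_t)-1)
		return true;

	/* Replay only when the on-disk object predates this transaction. */
	return disk_lsn < current_lsn;
}

The patch applies this test three times - once each for buffers, inodes and dquots - with a zero or -1 LSN always falling through to replay.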
-rw-r--r--	fs/xfs/xfs_log_recover.c	169
1 file changed, 156 insertions(+), 13 deletions(-)
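The new xlog_recover_get_buf_lsn() helper in the diff below keys off the magic number at the start of the buffer and reads the stamped LSN out of the matching on-disk header, returning -1 for anything it does not recognise so that the caller recovers the buffer unconditionally. The fragment below sketches that dispatch pattern as a self-contained user-space routine; the two magic values spell the on-disk "XAGF"/"XAGI" strings, but the reduced case list, helper names and LSN offsets are hypothetical and do not reflect the real XFS structure layouts.

#include <stdint.h>

/* Portable big-endian loads, standing in for be32_to_cpu()/be64_to_cpu(). */
static uint32_t
load_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static uint64_t
load_be64(const unsigned char *p)
{
	uint64_t	v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

/* ASCII "XAGF" and "XAGI"; the LSN offsets below are made up for the demo. */
#define DEMO_AGF_MAGIC	0x58414746u
#define DEMO_AGI_MAGIC	0x58414749u

/*
 * Return the LSN stamped in the block, or (uint64_t)-1 when the magic
 * number is unrecognised, so the caller recovers the block unconditionally.
 */
static uint64_t
demo_get_buf_lsn(const unsigned char *blk)
{
	switch (load_be32(blk)) {
	case DEMO_AGF_MAGIC:
		return load_be64(blk + 0x50);	/* hypothetical agf_lsn offset */
	case DEMO_AGI_MAGIC:
		return load_be64(blk + 0x48);	/* hypothetical agi_lsn offset */
	default:
		return (uint64_t)-1;	/* unknown contents, recover immediately */
	}
}

Dispatching on what is currently in the block, rather than on the logged buffer type, is what lets recovery cope with blocks that have been reallocated to a different use since the transaction was written.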
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 006ee288246d..dc100fed1973 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1959,6 +1959,104 @@ xlog_recover_do_inode_buffer(
 }
 
 /*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
+ * extract the LSN of the existing object in the buffer based on its current
+ * magic number. If we don't recognise the magic number in the buffer, then
+ * return an LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	__uint32_t		magic32;
+	__uint16_t		magic16;
+	__uint16_t		magicda;
+	void			*blk = bp->b_addr;
+
+	/* v4 filesystems always recover immediately */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		goto recover_immediately;
+
+	magic32 = be32_to_cpu(*(__be32 *)blk);
+	switch (magic32) {
+	case XFS_ABTB_CRC_MAGIC:
+	case XFS_ABTC_CRC_MAGIC:
+	case XFS_ABTB_MAGIC:
+	case XFS_ABTC_MAGIC:
+	case XFS_IBT_CRC_MAGIC:
+	case XFS_IBT_MAGIC:
+		return be64_to_cpu(
+				((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
+	case XFS_BMAP_CRC_MAGIC:
+	case XFS_BMAP_MAGIC:
+		return be64_to_cpu(
+				((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
+	case XFS_AGF_MAGIC:
+		return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+	case XFS_AGFL_MAGIC:
+		return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+	case XFS_AGI_MAGIC:
+		return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+	case XFS_SYMLINK_MAGIC:
+		return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+	case XFS_DIR3_BLOCK_MAGIC:
+	case XFS_DIR3_DATA_MAGIC:
+	case XFS_DIR3_FREE_MAGIC:
+		return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+	case XFS_ATTR3_RMT_MAGIC:
+		return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+	case XFS_SB_MAGIC:
+		return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
+	default:
+		break;
+	}
+
+	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+	switch (magicda) {
+	case XFS_DIR3_LEAF1_MAGIC:
+	case XFS_DIR3_LEAFN_MAGIC:
+	case XFS_DA3_NODE_MAGIC:
+		return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+	default:
+		break;
+	}
+
+	/*
+	 * We do individual object checks on dquot and inode buffers as they
+	 * have their own individual LSN records. Also, we could have a stale
+	 * buffer here, so we have to at least recognise these buffer types.
+	 *
+	 * A notable complexity here is inode unlinked list processing - it logs
+	 * the inode directly in the buffer, but we don't know which inodes have
+	 * been modified, and there is no global buffer LSN. Hence we need to
+	 * recover all inode buffer types immediately. This problem will be
+	 * fixed by logical logging of the unlinked list modifications.
+	 */
+	magic16 = be16_to_cpu(*(__be16 *)blk);
+	switch (magic16) {
+	case XFS_DQUOT_MAGIC:
+	case XFS_DINODE_MAGIC:
+		goto recover_immediately;
+	default:
+		break;
+	}
+
+	/* unknown buffer contents, recover immediately */
+
+recover_immediately:
+	return (xfs_lsn_t)-1;
+
+}
+
+/*
  * Validate the recovered buffer is of the correct type and attach the
  * appropriate buffer operations to them for writeback. Magic numbers are in a
  * few places:
@@ -1967,7 +2065,7 @@ xlog_recover_do_inode_buffer(
  * inside a struct xfs_da_blkinfo at the start of the buffer.
  */
 static void
-xlog_recovery_validate_buf_type(
+xlog_recover_validate_buf_type(
 	struct xfs_mount	*mp,
 	struct xfs_buf		*bp,
 	xfs_buf_log_format_t	*buf_f)
@@ -2246,7 +2344,7 @@ xlog_recover_do_reg_buffer(
 	 * just avoid the verification stage for non-crc filesystems
 	 */
 	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xlog_recovery_validate_buf_type(mp, bp, buf_f);
+		xlog_recover_validate_buf_type(mp, bp, buf_f);
 }
 
 /*
@@ -2444,13 +2542,15 @@ STATIC int
 xlog_recover_buffer_pass2(
 	struct xlog			*log,
 	struct list_head		*buffer_list,
-	struct xlog_recover_item	*item)
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			current_lsn)
 {
 	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
 	xfs_mount_t		*mp = log->l_mp;
 	xfs_buf_t		*bp;
 	int			error;
 	uint			buf_flags;
+	xfs_lsn_t		lsn;
 
 	/*
 	 * In this pass we only want to recover all the buffers which have
@@ -2475,10 +2575,17 @@ xlog_recover_buffer_pass2(
 	error = bp->b_error;
 	if (error) {
 		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
-		xfs_buf_relse(bp);
-		return error;
+		goto out_release;
 	}
 
+	/*
+	 * recover the buffer only if we get an LSN from it and it's less than
+	 * the lsn of the transaction we are replaying.
+	 */
+	lsn = xlog_recover_get_buf_lsn(mp, bp);
+	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+		goto out_release;
+
 	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
 	} else if (buf_f->blf_flags &
@@ -2488,7 +2595,7 @@ xlog_recover_buffer_pass2(
 		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 	}
 	if (error)
-		return XFS_ERROR(error);
+		goto out_release;
 
 	/*
 	 * Perform delayed write on the buffer. Asynchronous writes will be
@@ -2517,6 +2624,7 @@ xlog_recover_buffer_pass2(
 		xfs_buf_delwri_queue(bp, buffer_list);
 	}
 
+out_release:
 	xfs_buf_relse(bp);
 	return error;
 }
@@ -2525,7 +2633,8 @@ STATIC int
 xlog_recover_inode_pass2(
 	struct xlog			*log,
 	struct list_head		*buffer_list,
-	struct xlog_recover_item	*item)
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			current_lsn)
 {
 	xfs_inode_log_format_t	*in_f;
 	xfs_mount_t		*mp = log->l_mp;
@@ -2605,6 +2714,20 @@ xlog_recover_inode_pass2(
 	}
 
 	/*
+	 * If the inode has an LSN in it, recover the inode only if it's less
+	 * than the lsn of the transaction we are replaying.
+	 */
+	if (dip->di_version >= 3) {
+		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);
+
+		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+			trace_xfs_log_recover_inode_skip(log, in_f);
+			error = 0;
+			goto out_release;
+		}
+	}
+
+	/*
 	 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
 	 * are transactional and if ordering is necessary we can determine that
 	 * more accurately by the LSN field in the V3 inode core. Don't trust
@@ -2793,6 +2916,8 @@ write_inode_buffer:
 	ASSERT(bp->b_target->bt_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
 	xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
 	xfs_buf_relse(bp);
 error:
 	if (need_free)
@@ -2834,7 +2959,8 @@ STATIC int
 xlog_recover_dquot_pass2(
 	struct xlog			*log,
 	struct list_head		*buffer_list,
-	struct xlog_recover_item	*item)
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			current_lsn)
 {
 	xfs_mount_t		*mp = log->l_mp;
 	xfs_buf_t		*bp;
@@ -2908,6 +3034,19 @@ xlog_recover_dquot_pass2(
 		return XFS_ERROR(EIO);
 	}
 
+	/*
+	 * If the dquot has an LSN in it, recover the dquot only if it's less
+	 * than the lsn of the transaction we are replaying.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+		xfs_lsn_t	lsn = be64_to_cpu(dqb->dd_lsn);
+
+		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+			goto out_release;
+		}
+	}
+
 	memcpy(ddq, recddq, item->ri_buf[1].i_len);
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
@@ -2918,9 +3057,10 @@ xlog_recover_dquot_pass2(
 	ASSERT(bp->b_target->bt_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
 	xfs_buf_delwri_queue(bp, buffer_list);
-	xfs_buf_relse(bp);
 
-	return (0);
+out_release:
+	xfs_buf_relse(bp);
+	return 0;
 }
 
 /*
@@ -3267,15 +3407,18 @@ xlog_recover_commit_pass2(
 
 	switch (ITEM_TYPE(item)) {
 	case XFS_LI_BUF:
-		return xlog_recover_buffer_pass2(log, buffer_list, item);
+		return xlog_recover_buffer_pass2(log, buffer_list, item,
+						 trans->r_lsn);
 	case XFS_LI_INODE:
-		return xlog_recover_inode_pass2(log, buffer_list, item);
+		return xlog_recover_inode_pass2(log, buffer_list, item,
+						trans->r_lsn);
 	case XFS_LI_EFI:
 		return xlog_recover_efi_pass2(log, item, trans->r_lsn);
 	case XFS_LI_EFD:
 		return xlog_recover_efd_pass2(log, item);
 	case XFS_LI_DQUOT:
-		return xlog_recover_dquot_pass2(log, buffer_list, item);
+		return xlog_recover_dquot_pass2(log, buffer_list, item,
+						trans->r_lsn);
 	case XFS_LI_ICREATE:
 		return xlog_recover_do_icreate_pass2(log, buffer_list, item);
 	case XFS_LI_QUOTAOFF: