aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_log_recover.c
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-08-29 20:23:45 -0400
committerBen Myers <bpm@sgi.com>2013-09-10 13:49:57 -0400
commit638f44163d57f87d0905fbed7d54202beff916fc (patch)
treebecdb2c6ee54e318bd1cb27bd72f3438194674dc /fs/xfs/xfs_log_recover.c
parent21b5c9784bceb8b8e0095f87355f3b138ebac2d0 (diff)
xfs: recovery of swap extents operations for CRC filesystems
This is the recovery side of the btree block owner change operation performed by swapext on CRC enabled filesystems. We detect that an owner change is needed by the flag that has been placed on the inode log format flag field. Because the inode recovery is being replayed after the buffers that make up the BMBT in the given checkpoint, we can walk all the buffers and directly modify them when we see the flag set on an inode. Because the inode can be relogged and hence present in multiple checkpoints with the "change owner" flag set, we could do multiple passes across the inode to do this change. While this isn't optimal, we can't simply ignore the flag, as there may be multiple independent swap extent operations being replayed on the same inode in different checkpoints. Further, because the owner change operation uses ordered buffers, we might have buffers that are newer on disk than the current checkpoint and so already have the owner changed in them. Hence we cannot just peek at a buffer in the tree and check that it has the correct owner and assume that the change was completed. So, for the moment just brute force the owner change every time we see an inode with the flag set. Note that we have to be careful here because the owner of the buffers may point to either the old owner or the new owner. Currently the verifier can't verify the owner directly, so there is no failure case here right now. If we verify the owner exactly in future, then we'll have to take this into account. This was tested in terms of normal operation via xfstests - all of the fsr tests now pass without failure. However, we really need to modify xfs/227 to stress v3 inodes correctly to ensure we fully cover this case for v5 filesystems.
In terms of recovery testing, I used a hacked version of xfs_fsr that held the temp inode open for a few seconds before exiting so that the filesystem could be shut down with the owner change recovery flag still set on at least the temp inode. fsr leaves the temp inode unlinked and in btree format, so this was necessary for the owner change to be reliably replayed. logprint confirmed the tmp inode in the log had the correct flag set: INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88 INODE: #regs:3 ino:0x44 flags:0x209 dsize:88 ^^^^^ 0x200 is set, indicating a data fork owner change needed to be replayed on inode 0x44. A printk in the recovery code confirmed that the inode change was recovered: XFS (vdc): Mounting Filesystem XFS (vdc): Starting recovery (logdev: internal) recovering owner change ino 0x44 XFS (vdc): Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled! Use of these features in this kernel is at your own risk! XFS (vdc): Ending recovery (logdev: internal) The script used to test this was: $ cat ./recovery-fsr.sh #!/bin/bash dev=/dev/vdc mntpt=/mnt/scratch testfile=$mntpt/testfile umount $mntpt mkfs.xfs -f -m crc=1 $dev mount $dev $mntpt chmod 777 $mntpt for i in `seq 10000 -1 0`; do xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1 done xfs_bmap -vp $testfile |head -20 xfs_fsr -d -v $testfile & sleep 10 /home/dave/src/xfstests-dev/src/godown -f $mntpt wait umount $mntpt xfs_logprint -t $dev |tail -20 time mount $dev $mntpt xfs_bmap -vp $testfile umount $mntpt $ Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_log_recover.c')
-rw-r--r--fs/xfs/xfs_log_recover.c123
1 files changed, 99 insertions, 24 deletions
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1728c7c016a6..1c3b0c9c9aac 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2629,6 +2629,82 @@ out_release:
2629 return error; 2629 return error;
2630} 2630}
2631 2631
2632/*
2633 * Inode fork owner changes
2634 *
2635 * If we have been told that we have to reparent the inode fork, it's because an
2636 * extent swap operation on a CRC enabled filesystem has been done and we are
2637 * replaying it. We need to walk the BMBT of the appropriate fork and change the
2638 * owners of it.
2639 *
2640 * The complexity here is that we don't have an inode context to work with, so
2641 * after we've replayed the inode we need to instantiate one. This is where the
2642 * fun begins.
2643 *
2644 * We are in the middle of log recovery, so we can't run transactions. That
2645 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2646 * that will result in the corresponding iput() running the inode through
2647 * xfs_inactive(). If we've just replayed an inode core that changes the link
2648 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2649 * transactions (bad!).
2650 *
2651 * So, to avoid this, we instantiate an inode directly from the inode core we've
2652 * just recovered. We have the buffer still locked, and all we really need to
2653 * instantiate is the inode core and the forks being modified. We can do this
2654 * manually, then run the inode btree owner change, and then tear down the
2655 * xfs_inode without having to run any transactions at all.
2656 *
2657 * Also, we don't have a transaction context available here but still need to
2658 * gather all the buffers we modify for writeback, so we pass the buffer_list
2659 * instead for the operation to use.
2660 */
2661
2662STATIC int
2663xfs_recover_inode_owner_change(
2664 struct xfs_mount *mp,
2665 struct xfs_dinode *dip, /* inode as it sits in the buffer; source for the temporary in-core inode */
2666 struct xfs_inode_log_format *in_f, /* supplies the inode number and the DOWNER/AOWNER flags */
2667 struct list_head *buffer_list) /* handed through to xfs_bmbt_change_owner() */
2668{
2669 struct xfs_inode *ip;
2670 int error;
2671
2672 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2673
2674 ip = xfs_inode_alloc(mp, in_f->ilf_ino); /* throwaway inode; freed at out_free_ip */
2675 if (!ip)
2676 return ENOMEM; /* nothing allocated yet, so no cleanup is needed */
2677
2678 /* instantiate the inode */
2679 xfs_dinode_from_disk(&ip->i_d, dip); /* populate ip->i_d from dip */
2680 ASSERT(ip->i_d.di_version >= 3);
2681
2682 error = xfs_iformat_fork(ip, dip); /* presumably builds the in-core forks from dip - TODO confirm */
2683 if (error)
2684 goto out_free_ip;
2685
2686
2687 if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2688 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2689 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, /* NULL: presumably the transaction argument; none is available here */
2690 ip->i_ino, buffer_list); /* ip->i_ino: presumably the owner value to write - TODO confirm */
2691 if (error)
2692 goto out_free_ip;
2693 }
2694
2695 if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2696 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2697 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, /* same call as above, for the attr fork */
2698 ip->i_ino, buffer_list);
2699 if (error)
2700 goto out_free_ip;
2701 }
2702
2703out_free_ip: /* also reached by fall-through when no call reported an error */
2704 xfs_inode_free(ip);
2705 return error; /* last value assigned above; zero when no call reported an error */
2706}
2707
2632STATIC int 2708STATIC int
2633xlog_recover_inode_pass2( 2709xlog_recover_inode_pass2(
2634 struct xlog *log, 2710 struct xlog *log,
@@ -2681,8 +2757,7 @@ xlog_recover_inode_pass2(
2681 error = bp->b_error; 2757 error = bp->b_error;
2682 if (error) { 2758 if (error) {
2683 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); 2759 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2684 xfs_buf_relse(bp); 2760 goto out_release;
2685 goto error;
2686 } 2761 }
2687 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2762 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2688 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2763 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2692,30 +2767,31 @@ xlog_recover_inode_pass2(
2692 * like an inode! 2767 * like an inode!
2693 */ 2768 */
2694 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { 2769 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2695 xfs_buf_relse(bp);
2696 xfs_alert(mp, 2770 xfs_alert(mp,
2697 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", 2771 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2698 __func__, dip, bp, in_f->ilf_ino); 2772 __func__, dip, bp, in_f->ilf_ino);
2699 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2773 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2700 XFS_ERRLEVEL_LOW, mp); 2774 XFS_ERRLEVEL_LOW, mp);
2701 error = EFSCORRUPTED; 2775 error = EFSCORRUPTED;
2702 goto error; 2776 goto out_release;
2703 } 2777 }
2704 dicp = item->ri_buf[1].i_addr; 2778 dicp = item->ri_buf[1].i_addr;
2705 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2779 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2706 xfs_buf_relse(bp);
2707 xfs_alert(mp, 2780 xfs_alert(mp,
2708 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", 2781 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2709 __func__, item, in_f->ilf_ino); 2782 __func__, item, in_f->ilf_ino);
2710 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2783 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2711 XFS_ERRLEVEL_LOW, mp); 2784 XFS_ERRLEVEL_LOW, mp);
2712 error = EFSCORRUPTED; 2785 error = EFSCORRUPTED;
2713 goto error; 2786 goto out_release;
2714 } 2787 }
2715 2788
2716 /* 2789 /*
2717 * If the inode has an LSN in it, recover the inode only if it's less 2790 * If the inode has an LSN in it, recover the inode only if it's less
2718 * than the lsn of the transaction we are replaying. 2791 * than the lsn of the transaction we are replaying. Note: we still
2792 * need to replay an owner change even though the inode is more recent
2793 * than the transaction as there is no guarantee that all the btree
2794 * blocks are more recent than this transaction, too.
2719 */ 2795 */
2720 if (dip->di_version >= 3) { 2796 if (dip->di_version >= 3) {
2721 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 2797 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
@@ -2723,7 +2799,7 @@ xlog_recover_inode_pass2(
2723 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2799 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2724 trace_xfs_log_recover_inode_skip(log, in_f); 2800 trace_xfs_log_recover_inode_skip(log, in_f);
2725 error = 0; 2801 error = 0;
2726 goto out_release; 2802 goto out_owner_change;
2727 } 2803 }
2728 } 2804 }
2729 2805
@@ -2745,10 +2821,9 @@ xlog_recover_inode_pass2(
2745 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2821 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2746 /* do nothing */ 2822 /* do nothing */
2747 } else { 2823 } else {
2748 xfs_buf_relse(bp);
2749 trace_xfs_log_recover_inode_skip(log, in_f); 2824 trace_xfs_log_recover_inode_skip(log, in_f);
2750 error = 0; 2825 error = 0;
2751 goto error; 2826 goto out_release;
2752 } 2827 }
2753 } 2828 }
2754 2829
@@ -2760,13 +2835,12 @@ xlog_recover_inode_pass2(
2760 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2835 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2761 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2836 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2762 XFS_ERRLEVEL_LOW, mp, dicp); 2837 XFS_ERRLEVEL_LOW, mp, dicp);
2763 xfs_buf_relse(bp);
2764 xfs_alert(mp, 2838 xfs_alert(mp,
2765 "%s: Bad regular inode log record, rec ptr 0x%p, " 2839 "%s: Bad regular inode log record, rec ptr 0x%p, "
2766 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2840 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2767 __func__, item, dip, bp, in_f->ilf_ino); 2841 __func__, item, dip, bp, in_f->ilf_ino);
2768 error = EFSCORRUPTED; 2842 error = EFSCORRUPTED;
2769 goto error; 2843 goto out_release;
2770 } 2844 }
2771 } else if (unlikely(S_ISDIR(dicp->di_mode))) { 2845 } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2772 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2846 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2774,19 +2848,17 @@ xlog_recover_inode_pass2(
2774 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2848 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2775 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2849 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2776 XFS_ERRLEVEL_LOW, mp, dicp); 2850 XFS_ERRLEVEL_LOW, mp, dicp);
2777 xfs_buf_relse(bp);
2778 xfs_alert(mp, 2851 xfs_alert(mp,
2779 "%s: Bad dir inode log record, rec ptr 0x%p, " 2852 "%s: Bad dir inode log record, rec ptr 0x%p, "
2780 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2853 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2781 __func__, item, dip, bp, in_f->ilf_ino); 2854 __func__, item, dip, bp, in_f->ilf_ino);
2782 error = EFSCORRUPTED; 2855 error = EFSCORRUPTED;
2783 goto error; 2856 goto out_release;
2784 } 2857 }
2785 } 2858 }
2786 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2859 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2787 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2860 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2788 XFS_ERRLEVEL_LOW, mp, dicp); 2861 XFS_ERRLEVEL_LOW, mp, dicp);
2789 xfs_buf_relse(bp);
2790 xfs_alert(mp, 2862 xfs_alert(mp,
2791 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2863 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2792 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2864 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2794,29 +2866,27 @@ xlog_recover_inode_pass2(
2794 dicp->di_nextents + dicp->di_anextents, 2866 dicp->di_nextents + dicp->di_anextents,
2795 dicp->di_nblocks); 2867 dicp->di_nblocks);
2796 error = EFSCORRUPTED; 2868 error = EFSCORRUPTED;
2797 goto error; 2869 goto out_release;
2798 } 2870 }
2799 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2871 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2800 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2872 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2801 XFS_ERRLEVEL_LOW, mp, dicp); 2873 XFS_ERRLEVEL_LOW, mp, dicp);
2802 xfs_buf_relse(bp);
2803 xfs_alert(mp, 2874 xfs_alert(mp,
2804 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2875 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2805 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 2876 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2806 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2877 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2807 error = EFSCORRUPTED; 2878 error = EFSCORRUPTED;
2808 goto error; 2879 goto out_release;
2809 } 2880 }
2810 isize = xfs_icdinode_size(dicp->di_version); 2881 isize = xfs_icdinode_size(dicp->di_version);
2811 if (unlikely(item->ri_buf[1].i_len > isize)) { 2882 if (unlikely(item->ri_buf[1].i_len > isize)) {
2812 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2883 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2813 XFS_ERRLEVEL_LOW, mp, dicp); 2884 XFS_ERRLEVEL_LOW, mp, dicp);
2814 xfs_buf_relse(bp);
2815 xfs_alert(mp, 2885 xfs_alert(mp,
2816 "%s: Bad inode log record length %d, rec ptr 0x%p", 2886 "%s: Bad inode log record length %d, rec ptr 0x%p",
2817 __func__, item->ri_buf[1].i_len, item); 2887 __func__, item->ri_buf[1].i_len, item);
2818 error = EFSCORRUPTED; 2888 error = EFSCORRUPTED;
2819 goto error; 2889 goto out_release;
2820 } 2890 }
2821 2891
2822 /* The core is in in-core format */ 2892 /* The core is in in-core format */
@@ -2842,7 +2912,7 @@ xlog_recover_inode_pass2(
2842 } 2912 }
2843 2913
2844 if (in_f->ilf_size == 2) 2914 if (in_f->ilf_size == 2)
2845 goto write_inode_buffer; 2915 goto out_owner_change;
2846 len = item->ri_buf[2].i_len; 2916 len = item->ri_buf[2].i_len;
2847 src = item->ri_buf[2].i_addr; 2917 src = item->ri_buf[2].i_addr;
2848 ASSERT(in_f->ilf_size <= 4); 2918 ASSERT(in_f->ilf_size <= 4);
@@ -2903,13 +2973,15 @@ xlog_recover_inode_pass2(
2903 default: 2973 default:
2904 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 2974 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2905 ASSERT(0); 2975 ASSERT(0);
2906 xfs_buf_relse(bp);
2907 error = EIO; 2976 error = EIO;
2908 goto error; 2977 goto out_release;
2909 } 2978 }
2910 } 2979 }
2911 2980
2912write_inode_buffer: 2981out_owner_change:
2982 if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
2983 error = xfs_recover_inode_owner_change(mp, dip, in_f,
2984 buffer_list);
2913 /* re-generate the checksum. */ 2985 /* re-generate the checksum. */
2914 xfs_dinode_calc_crc(log->l_mp, dip); 2986 xfs_dinode_calc_crc(log->l_mp, dip);
2915 2987
@@ -2923,6 +2995,9 @@ error:
2923 if (need_free) 2995 if (need_free)
2924 kmem_free(in_f); 2996 kmem_free(in_f);
2925 return XFS_ERROR(error); 2997 return XFS_ERROR(error);
2998
2999 xfs_buf_relse(bp);
3000 goto error;
2926} 3001}
2927 3002
2928/* 3003/*