aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-08-29 20:23:45 -0400
committerBen Myers <bpm@sgi.com>2013-09-10 13:49:57 -0400
commit638f44163d57f87d0905fbed7d54202beff916fc (patch)
treebecdb2c6ee54e318bd1cb27bd72f3438194674dc
parent21b5c9784bceb8b8e0095f87355f3b138ebac2d0 (diff)
xfs: recovery of swap extents operations for CRC filesystems
This is the recovery side of the btree block owner change operation performed by swapext on CRC enabled filesystems. We detect that an owner change is needed by the flag that has been placed on the inode log format flag field. Because the inode recovery is being replayed after the buffers that make up the BMBT in the given checkpoint, we can walk all the buffers and directly modify them when we see the flag set on an inode. Because the inode can be relogged and hence present in multiple checkpoints with the "change owner" flag set, we could do multiple passes across the inode to do this change. While this isn't optimal, we can't simply ignore the flag, as there may be multiple independent swap extent operations being replayed on the same inode in different checkpoints. Further, because the owner change operation uses ordered buffers, we might have buffers that are newer on disk than the current checkpoint and so already have the owner changed in them. Hence we cannot just peek at a buffer in the tree and check that it has the correct owner and assume that the change was completed. So, for the moment just brute force the owner change every time we see an inode with the flag set. Note that we have to be careful here because the owner of the buffers may point to either the old owner or the new owner. Currently the verifier can't verify the owner directly, so there is no failure case here right now. If we verify the owner exactly in future, then we'll have to take this into account. This was tested in terms of normal operation via xfstests - all of the fsr tests now pass without failure. However, we really need to modify xfs/227 to stress v3 inodes correctly to ensure we fully cover this case for v5 filesystems. 
In terms of recovery testing, I used a hacked version of xfs_fsr that held the temp inode open for a few seconds before exiting so that the filesystem could be shut down with an open owner change recovery flag set on at least the temp inode. fsr leaves the temp inode unlinked and in btree format, so this was necessary for the owner change to be reliably replayed. logprint confirmed the tmp inode in the log had the correct flag set: INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88 INODE: #regs:3 ino:0x44 flags:0x209 dsize:88 ^^^^^ 0x200 is set, indicating a data fork owner change needed to be replayed on inode 0x44. A printk in the recovery code confirmed that the inode change was recovered: XFS (vdc): Mounting Filesystem XFS (vdc): Starting recovery (logdev: internal) recovering owner change ino 0x44 XFS (vdc): Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled! Use of these features in this kernel is at your own risk! XFS (vdc): Ending recovery (logdev: internal) The script used to test this was: $ cat ./recovery-fsr.sh #!/bin/bash dev=/dev/vdc mntpt=/mnt/scratch testfile=$mntpt/testfile umount $mntpt mkfs.xfs -f -m crc=1 $dev mount $dev $mntpt chmod 777 $mntpt for i in `seq 10000 -1 0`; do xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1 done xfs_bmap -vp $testfile |head -20 xfs_fsr -d -v $testfile & sleep 10 /home/dave/src/xfstests-dev/src/godown -f $mntpt wait umount $mntpt xfs_logprint -t $dev |tail -20 time mount $dev $mntpt xfs_bmap -vp $testfile umount $mntpt $ Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
-rw-r--r--fs/xfs/xfs_bmap_btree.c26
-rw-r--r--fs/xfs/xfs_bmap_btree.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c14
-rw-r--r--fs/xfs/xfs_btree.c32
-rw-r--r--fs/xfs/xfs_btree.h3
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_icache.h4
-rw-r--r--fs/xfs/xfs_inode_buf.c2
-rw-r--r--fs/xfs/xfs_inode_buf.h18
-rw-r--r--fs/xfs/xfs_log_format.h9
-rw-r--r--fs/xfs/xfs_log_recover.c123
11 files changed, 171 insertions, 67 deletions
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index aa2eadd41bab..531b0206cce6 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -932,30 +932,40 @@ xfs_bmdr_maxrecs(
932 * we switch forks between inodes. The operation that the caller is doing will 932 * we switch forks between inodes. The operation that the caller is doing will
933 * determine whether is needs to change owner before or after the switch. 933 * determine whether is needs to change owner before or after the switch.
934 * 934 *
935 * For demand paged modification, the fork switch should be done after reading 935 * For demand paged transactional modification, the fork switch should be done
936 * in all the blocks, modifying them and pinning them in the transaction. For 936 * after reading in all the blocks, modifying them and pinning them in the
937 * modification when the buffers are already pinned in memory, the fork switch 937 * transaction. For modification when the buffers are already pinned in memory,
938 * can be done before changing the owner as we won't need to validate the owner 938 * the fork switch can be done before changing the owner as we won't need to
939 * until the btree buffers are unpinned and writes can occur again. 939 * validate the owner until the btree buffers are unpinned and writes can occur
940 * again.
941 *
942 * For recovery based ownership change, there is no transactional context and
943 * so a buffer list must be supplied so that we can record the buffers that we
944 * modified for the caller to issue IO on.
940 */ 945 */
941int 946int
942xfs_bmbt_change_owner( 947xfs_bmbt_change_owner(
943 struct xfs_trans *tp, 948 struct xfs_trans *tp,
944 struct xfs_inode *ip, 949 struct xfs_inode *ip,
945 int whichfork, 950 int whichfork,
946 xfs_ino_t new_owner) 951 xfs_ino_t new_owner,
952 struct list_head *buffer_list)
947{ 953{
948 struct xfs_btree_cur *cur; 954 struct xfs_btree_cur *cur;
949 int error; 955 int error;
950 956
957 ASSERT(tp || buffer_list);
958 ASSERT(!(tp && buffer_list));
951 if (whichfork == XFS_DATA_FORK) 959 if (whichfork == XFS_DATA_FORK)
952 ASSERT(ip->i_d.di_format = XFS_DINODE_FMT_BTREE); 960 ASSERT(ip->i_d.di_format = XFS_DINODE_FMT_BTREE);
953 else 961 else
954 ASSERT(ip->i_d.di_aformat = XFS_DINODE_FMT_BTREE); 962 ASSERT(ip->i_d.di_aformat = XFS_DINODE_FMT_BTREE);
955 963
956 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); 964 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
957 error = xfs_btree_change_owner(cur, new_owner); 965 if (!cur)
966 return ENOMEM;
967
968 error = xfs_btree_change_owner(cur, new_owner, buffer_list);
958 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); 969 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
959 return error; 970 return error;
960} 971}
961
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index bceac7affa27..e367461a638e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -237,7 +237,8 @@ extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); 237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
238 238
239extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, 239extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
240 int whichfork, xfs_ino_t new_owner); 240 int whichfork, xfs_ino_t new_owner,
241 struct list_head *buffer_list);
241 242
242extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 243extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
243 struct xfs_trans *, struct xfs_inode *, int); 244 struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index ad8a91d2e011..c6dc55142cbe 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1932,16 +1932,18 @@ xfs_swap_extents(
1932 target_log_flags = XFS_ILOG_CORE; 1932 target_log_flags = XFS_ILOG_CORE;
1933 if (ip->i_d.di_version == 3 && 1933 if (ip->i_d.di_version == 3 &&
1934 ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 1934 ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1935 target_log_flags |= XFS_ILOG_OWNER; 1935 target_log_flags |= XFS_ILOG_DOWNER;
1936 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino); 1936 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1937 tip->i_ino, NULL);
1937 if (error) 1938 if (error)
1938 goto out_trans_cancel; 1939 goto out_trans_cancel;
1939 } 1940 }
1940 1941
1941 if (tip->i_d.di_version == 3 && 1942 if (tip->i_d.di_version == 3 &&
1942 tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 1943 tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1943 src_log_flags |= XFS_ILOG_OWNER; 1944 src_log_flags |= XFS_ILOG_DOWNER;
1944 error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino); 1945 error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1946 ip->i_ino, NULL);
1945 if (error) 1947 if (error)
1946 goto out_trans_cancel; 1948 goto out_trans_cancel;
1947 } 1949 }
@@ -1997,7 +1999,7 @@ xfs_swap_extents(
1997 break; 1999 break;
1998 case XFS_DINODE_FMT_BTREE: 2000 case XFS_DINODE_FMT_BTREE:
1999 ASSERT(ip->i_d.di_version < 3 || 2001 ASSERT(ip->i_d.di_version < 3 ||
2000 (src_log_flags & XFS_ILOG_OWNER)); 2002 (src_log_flags & XFS_ILOG_DOWNER));
2001 src_log_flags |= XFS_ILOG_DBROOT; 2003 src_log_flags |= XFS_ILOG_DBROOT;
2002 break; 2004 break;
2003 } 2005 }
@@ -2017,7 +2019,7 @@ xfs_swap_extents(
2017 case XFS_DINODE_FMT_BTREE: 2019 case XFS_DINODE_FMT_BTREE:
2018 target_log_flags |= XFS_ILOG_DBROOT; 2020 target_log_flags |= XFS_ILOG_DBROOT;
2019 ASSERT(tip->i_d.di_version < 3 || 2021 ASSERT(tip->i_d.di_version < 3 ||
2020 (target_log_flags & XFS_ILOG_OWNER)); 2022 (target_log_flags & XFS_ILOG_DOWNER));
2021 break; 2023 break;
2022 } 2024 }
2023 2025
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 047573f02702..5690e102243d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -3907,13 +3907,16 @@ xfs_btree_get_rec(
3907 * buffer as an ordered buffer and log it appropriately. We need to ensure that 3907 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3908 * we mark the region we change dirty so that if the buffer is relogged in 3908 * we mark the region we change dirty so that if the buffer is relogged in
3909 * a subsequent transaction the changes we make here as an ordered buffer are 3909 * a subsequent transaction the changes we make here as an ordered buffer are
3910 * correctly relogged in that transaction. 3910 * correctly relogged in that transaction. If we are in recovery context, then
3911 * just queue the modified buffer as delayed write buffer so the transaction
3912 * recovery completion writes the changes to disk.
3911 */ 3913 */
3912static int 3914static int
3913xfs_btree_block_change_owner( 3915xfs_btree_block_change_owner(
3914 struct xfs_btree_cur *cur, 3916 struct xfs_btree_cur *cur,
3915 int level, 3917 int level,
3916 __uint64_t new_owner) 3918 __uint64_t new_owner,
3919 struct list_head *buffer_list)
3917{ 3920{
3918 struct xfs_btree_block *block; 3921 struct xfs_btree_block *block;
3919 struct xfs_buf *bp; 3922 struct xfs_buf *bp;
@@ -3930,16 +3933,19 @@ xfs_btree_block_change_owner(
3930 block->bb_u.s.bb_owner = cpu_to_be32(new_owner); 3933 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3931 3934
3932 /* 3935 /*
3933 * Log owner change as an ordered buffer. If the block is a root block 3936 * If the block is a root block hosted in an inode, we might not have a
3934 * hosted in an inode, we might not have a buffer pointer here and we 3937 * buffer pointer here and we shouldn't attempt to log the change as the
3935 * shouldn't attempt to log the change as the information is already 3938 * information is already held in the inode and discarded when the root
3936 * held in the inode and discarded when the root block is formatted into 3939 * block is formatted into the on-disk inode fork. We still change it,
3937 * the on-disk inode fork. We still change it, though, so everything is 3940 * though, so everything is consistent in memory.
3938 * consistent in memory.
3939 */ 3941 */
3940 if (bp) { 3942 if (bp) {
3941 xfs_trans_ordered_buf(cur->bc_tp, bp); 3943 if (cur->bc_tp) {
3942 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 3944 xfs_trans_ordered_buf(cur->bc_tp, bp);
3945 xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
3946 } else {
3947 xfs_buf_delwri_queue(bp, buffer_list);
3948 }
3943 } else { 3949 } else {
3944 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 3950 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3945 ASSERT(level == cur->bc_nlevels - 1); 3951 ASSERT(level == cur->bc_nlevels - 1);
@@ -3956,7 +3962,8 @@ xfs_btree_block_change_owner(
3956int 3962int
3957xfs_btree_change_owner( 3963xfs_btree_change_owner(
3958 struct xfs_btree_cur *cur, 3964 struct xfs_btree_cur *cur,
3959 __uint64_t new_owner) 3965 __uint64_t new_owner,
3966 struct list_head *buffer_list)
3960{ 3967{
3961 union xfs_btree_ptr lptr; 3968 union xfs_btree_ptr lptr;
3962 int level; 3969 int level;
@@ -3986,7 +3993,8 @@ xfs_btree_change_owner(
3986 /* for each buffer in the level */ 3993 /* for each buffer in the level */
3987 do { 3994 do {
3988 error = xfs_btree_block_change_owner(cur, level, 3995 error = xfs_btree_block_change_owner(cur, level,
3989 new_owner); 3996 new_owner,
3997 buffer_list);
3990 } while (!error); 3998 } while (!error);
3991 3999
3992 if (error != ENOENT) 4000 if (error != ENOENT)
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 544b209e0256..06729b67ad58 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -445,7 +445,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
445int xfs_btree_insert(struct xfs_btree_cur *, int *); 445int xfs_btree_insert(struct xfs_btree_cur *, int *);
446int xfs_btree_delete(struct xfs_btree_cur *, int *); 446int xfs_btree_delete(struct xfs_btree_cur *, int *);
447int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 447int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
448int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner); 448int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
449 struct list_head *buffer_list);
449 450
450/* 451/*
451 * btree block CRC helpers 452 * btree block CRC helpers
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 16219b9c6790..7942432d9f77 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -48,7 +48,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
48/* 48/*
49 * Allocate and initialise an xfs_inode. 49 * Allocate and initialise an xfs_inode.
50 */ 50 */
51STATIC struct xfs_inode * 51struct xfs_inode *
52xfs_inode_alloc( 52xfs_inode_alloc(
53 struct xfs_mount *mp, 53 struct xfs_mount *mp,
54 xfs_ino_t ino) 54 xfs_ino_t ino)
@@ -98,7 +98,7 @@ xfs_inode_free_callback(
98 kmem_zone_free(xfs_inode_zone, ip); 98 kmem_zone_free(xfs_inode_zone, ip);
99} 99}
100 100
101STATIC void 101void
102xfs_inode_free( 102xfs_inode_free(
103 struct xfs_inode *ip) 103 struct xfs_inode *ip)
104{ 104{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 8a89f7d791bd..458e6bc22cc4 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -42,6 +42,10 @@ struct xfs_eofblocks {
42int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, 42int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
43 uint flags, uint lock_flags, xfs_inode_t **ipp); 43 uint flags, uint lock_flags, xfs_inode_t **ipp);
44 44
45/* recovery needs direct inode allocation capability */
46struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
47void xfs_inode_free(struct xfs_inode *ip);
48
45void xfs_reclaim_worker(struct work_struct *work); 49void xfs_reclaim_worker(struct work_struct *work);
46 50
47int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 51int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index e011d597f12f..3d25c9a5f6bc 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -196,7 +196,7 @@ xfs_imap_to_bp(
196 return 0; 196 return 0;
197} 197}
198 198
199STATIC void 199void
200xfs_dinode_from_disk( 200xfs_dinode_from_disk(
201 xfs_icdinode_t *to, 201 xfs_icdinode_t *to,
202 xfs_dinode_t *from) 202 xfs_dinode_t *from)
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
index 599e6c0ca2a9..abba0ae8cf2d 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/xfs_inode_buf.h
@@ -32,17 +32,17 @@ struct xfs_imap {
32 ushort im_boffset; /* inode offset in block in bytes */ 32 ushort im_boffset; /* inode offset in block in bytes */
33}; 33};
34 34
35int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, 35int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
36 struct xfs_imap *, struct xfs_dinode **, 36 struct xfs_imap *, struct xfs_dinode **,
37 struct xfs_buf **, uint, uint); 37 struct xfs_buf **, uint, uint);
38int xfs_iread(struct xfs_mount *, struct xfs_trans *, 38int xfs_iread(struct xfs_mount *, struct xfs_trans *,
39 struct xfs_inode *, uint); 39 struct xfs_inode *, uint);
40void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); 40void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
41void xfs_dinode_to_disk(struct xfs_dinode *, 41void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
42 struct xfs_icdinode *); 42void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
43 43
44#if defined(DEBUG) 44#if defined(DEBUG)
45void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); 45void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
46#else 46#else
47#define xfs_inobp_check(mp, bp) 47#define xfs_inobp_check(mp, bp)
48#endif /* DEBUG */ 48#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
index 08a6fbe03bb6..ca7e28a8ed31 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/xfs_log_format.h
@@ -474,7 +474,8 @@ typedef struct xfs_inode_log_format_64 {
474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ 474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ 475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ 476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
477#define XFS_ILOG_OWNER 0x200 /* change the extent tree owner on replay */ 477#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
478#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
478 479
479 480
480/* 481/*
@@ -488,7 +489,8 @@ typedef struct xfs_inode_log_format_64 {
488#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ 489#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
489 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ 490 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
490 XFS_ILOG_UUID | XFS_ILOG_ADATA | \ 491 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
491 XFS_ILOG_AEXT | XFS_ILOG_ABROOT) 492 XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
493 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
492 494
493#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ 495#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
494 XFS_ILOG_DBROOT) 496 XFS_ILOG_DBROOT)
@@ -500,7 +502,8 @@ typedef struct xfs_inode_log_format_64 {
500 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ 502 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
501 XFS_ILOG_DEV | XFS_ILOG_UUID | \ 503 XFS_ILOG_DEV | XFS_ILOG_UUID | \
502 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ 504 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
503 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) 505 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
506 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
504 507
505static inline int xfs_ilog_fbroot(int w) 508static inline int xfs_ilog_fbroot(int w)
506{ 509{
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1728c7c016a6..1c3b0c9c9aac 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2629,6 +2629,82 @@ out_release:
2629 return error; 2629 return error;
2630} 2630}
2631 2631
2632/*
2633 * Inode fork owner changes
2634 *
2635 * If we have been told that we have to reparent the inode fork, it's because an
2636 * extent swap operation on a CRC enabled filesystem has been done and we are
2637 * replaying it. We need to walk the BMBT of the appropriate fork and change the
2638 * owners of it.
2639 *
2640 * The complexity here is that we don't have an inode context to work with, so
2641 * after we've replayed the inode we need to instantiate one. This is where the
2642 * fun begins.
2643 *
2644 * We are in the middle of log recovery, so we can't run transactions. That
2645 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2646 * that will result in the corresponding iput() running the inode through
2647 * xfs_inactive(). If we've just replayed an inode core that changes the link
2648 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2649 * transactions (bad!).
2650 *
2651 * So, to avoid this, we instantiate an inode directly from the inode core we've
2652 * just recovered. We have the buffer still locked, and all we really need to
2653 * instantiate is the inode core and the forks being modified. We can do this
2654 * manually, then run the inode btree owner change, and then tear down the
2655 * xfs_inode without having to run any transactions at all.
2656 *
2657 * Also, because we don't have a transaction context available here but need to
2658 * gather all the buffers we modify for writeback so we pass the buffer_list
2659 * instead for the operation to use.
2660 */
2661
2662STATIC int
2663xfs_recover_inode_owner_change(
2664 struct xfs_mount *mp,
2665 struct xfs_dinode *dip,
2666 struct xfs_inode_log_format *in_f,
2667 struct list_head *buffer_list)
2668{
2669 struct xfs_inode *ip;
2670 int error;
2671
2672 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2673
2674 ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2675 if (!ip)
2676 return ENOMEM;
2677
2678 /* instantiate the inode */
2679 xfs_dinode_from_disk(&ip->i_d, dip);
2680 ASSERT(ip->i_d.di_version >= 3);
2681
2682 error = xfs_iformat_fork(ip, dip);
2683 if (error)
2684 goto out_free_ip;
2685
2686
2687 if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2688 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2689 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2690 ip->i_ino, buffer_list);
2691 if (error)
2692 goto out_free_ip;
2693 }
2694
2695 if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2696 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2697 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2698 ip->i_ino, buffer_list);
2699 if (error)
2700 goto out_free_ip;
2701 }
2702
2703out_free_ip:
2704 xfs_inode_free(ip);
2705 return error;
2706}
2707
2632STATIC int 2708STATIC int
2633xlog_recover_inode_pass2( 2709xlog_recover_inode_pass2(
2634 struct xlog *log, 2710 struct xlog *log,
@@ -2681,8 +2757,7 @@ xlog_recover_inode_pass2(
2681 error = bp->b_error; 2757 error = bp->b_error;
2682 if (error) { 2758 if (error) {
2683 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); 2759 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2684 xfs_buf_relse(bp); 2760 goto out_release;
2685 goto error;
2686 } 2761 }
2687 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2762 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2688 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2763 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2692,30 +2767,31 @@ xlog_recover_inode_pass2(
2692 * like an inode! 2767 * like an inode!
2693 */ 2768 */
2694 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { 2769 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2695 xfs_buf_relse(bp);
2696 xfs_alert(mp, 2770 xfs_alert(mp,
2697 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", 2771 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2698 __func__, dip, bp, in_f->ilf_ino); 2772 __func__, dip, bp, in_f->ilf_ino);
2699 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2773 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2700 XFS_ERRLEVEL_LOW, mp); 2774 XFS_ERRLEVEL_LOW, mp);
2701 error = EFSCORRUPTED; 2775 error = EFSCORRUPTED;
2702 goto error; 2776 goto out_release;
2703 } 2777 }
2704 dicp = item->ri_buf[1].i_addr; 2778 dicp = item->ri_buf[1].i_addr;
2705 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2779 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2706 xfs_buf_relse(bp);
2707 xfs_alert(mp, 2780 xfs_alert(mp,
2708 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", 2781 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2709 __func__, item, in_f->ilf_ino); 2782 __func__, item, in_f->ilf_ino);
2710 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2783 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2711 XFS_ERRLEVEL_LOW, mp); 2784 XFS_ERRLEVEL_LOW, mp);
2712 error = EFSCORRUPTED; 2785 error = EFSCORRUPTED;
2713 goto error; 2786 goto out_release;
2714 } 2787 }
2715 2788
2716 /* 2789 /*
2717 * If the inode has an LSN in it, recover the inode only if it's less 2790 * If the inode has an LSN in it, recover the inode only if it's less
2718 * than the lsn of the transaction we are replaying. 2791 * than the lsn of the transaction we are replaying. Note: we still
2792 * need to replay an owner change even though the inode is more recent
2793 * than the transaction as there is no guarantee that all the btree
2794 * blocks are more recent than this transaction, too.
2719 */ 2795 */
2720 if (dip->di_version >= 3) { 2796 if (dip->di_version >= 3) {
2721 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 2797 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
@@ -2723,7 +2799,7 @@ xlog_recover_inode_pass2(
2723 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2799 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2724 trace_xfs_log_recover_inode_skip(log, in_f); 2800 trace_xfs_log_recover_inode_skip(log, in_f);
2725 error = 0; 2801 error = 0;
2726 goto out_release; 2802 goto out_owner_change;
2727 } 2803 }
2728 } 2804 }
2729 2805
@@ -2745,10 +2821,9 @@ xlog_recover_inode_pass2(
2745 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2821 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2746 /* do nothing */ 2822 /* do nothing */
2747 } else { 2823 } else {
2748 xfs_buf_relse(bp);
2749 trace_xfs_log_recover_inode_skip(log, in_f); 2824 trace_xfs_log_recover_inode_skip(log, in_f);
2750 error = 0; 2825 error = 0;
2751 goto error; 2826 goto out_release;
2752 } 2827 }
2753 } 2828 }
2754 2829
@@ -2760,13 +2835,12 @@ xlog_recover_inode_pass2(
2760 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2835 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2761 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2836 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2762 XFS_ERRLEVEL_LOW, mp, dicp); 2837 XFS_ERRLEVEL_LOW, mp, dicp);
2763 xfs_buf_relse(bp);
2764 xfs_alert(mp, 2838 xfs_alert(mp,
2765 "%s: Bad regular inode log record, rec ptr 0x%p, " 2839 "%s: Bad regular inode log record, rec ptr 0x%p, "
2766 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2840 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2767 __func__, item, dip, bp, in_f->ilf_ino); 2841 __func__, item, dip, bp, in_f->ilf_ino);
2768 error = EFSCORRUPTED; 2842 error = EFSCORRUPTED;
2769 goto error; 2843 goto out_release;
2770 } 2844 }
2771 } else if (unlikely(S_ISDIR(dicp->di_mode))) { 2845 } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2772 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2846 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2774,19 +2848,17 @@ xlog_recover_inode_pass2(
2774 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2848 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2775 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2849 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2776 XFS_ERRLEVEL_LOW, mp, dicp); 2850 XFS_ERRLEVEL_LOW, mp, dicp);
2777 xfs_buf_relse(bp);
2778 xfs_alert(mp, 2851 xfs_alert(mp,
2779 "%s: Bad dir inode log record, rec ptr 0x%p, " 2852 "%s: Bad dir inode log record, rec ptr 0x%p, "
2780 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2853 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2781 __func__, item, dip, bp, in_f->ilf_ino); 2854 __func__, item, dip, bp, in_f->ilf_ino);
2782 error = EFSCORRUPTED; 2855 error = EFSCORRUPTED;
2783 goto error; 2856 goto out_release;
2784 } 2857 }
2785 } 2858 }
2786 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2859 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2787 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2860 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2788 XFS_ERRLEVEL_LOW, mp, dicp); 2861 XFS_ERRLEVEL_LOW, mp, dicp);
2789 xfs_buf_relse(bp);
2790 xfs_alert(mp, 2862 xfs_alert(mp,
2791 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2863 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2792 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2864 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2794,29 +2866,27 @@ xlog_recover_inode_pass2(
2794 dicp->di_nextents + dicp->di_anextents, 2866 dicp->di_nextents + dicp->di_anextents,
2795 dicp->di_nblocks); 2867 dicp->di_nblocks);
2796 error = EFSCORRUPTED; 2868 error = EFSCORRUPTED;
2797 goto error; 2869 goto out_release;
2798 } 2870 }
2799 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2871 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2800 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2872 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2801 XFS_ERRLEVEL_LOW, mp, dicp); 2873 XFS_ERRLEVEL_LOW, mp, dicp);
2802 xfs_buf_relse(bp);
2803 xfs_alert(mp, 2874 xfs_alert(mp,
2804 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2875 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2805 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 2876 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2806 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2877 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2807 error = EFSCORRUPTED; 2878 error = EFSCORRUPTED;
2808 goto error; 2879 goto out_release;
2809 } 2880 }
2810 isize = xfs_icdinode_size(dicp->di_version); 2881 isize = xfs_icdinode_size(dicp->di_version);
2811 if (unlikely(item->ri_buf[1].i_len > isize)) { 2882 if (unlikely(item->ri_buf[1].i_len > isize)) {
2812 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2883 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2813 XFS_ERRLEVEL_LOW, mp, dicp); 2884 XFS_ERRLEVEL_LOW, mp, dicp);
2814 xfs_buf_relse(bp);
2815 xfs_alert(mp, 2885 xfs_alert(mp,
2816 "%s: Bad inode log record length %d, rec ptr 0x%p", 2886 "%s: Bad inode log record length %d, rec ptr 0x%p",
2817 __func__, item->ri_buf[1].i_len, item); 2887 __func__, item->ri_buf[1].i_len, item);
2818 error = EFSCORRUPTED; 2888 error = EFSCORRUPTED;
2819 goto error; 2889 goto out_release;
2820 } 2890 }
2821 2891
2822 /* The core is in in-core format */ 2892 /* The core is in in-core format */
@@ -2842,7 +2912,7 @@ xlog_recover_inode_pass2(
2842 } 2912 }
2843 2913
2844 if (in_f->ilf_size == 2) 2914 if (in_f->ilf_size == 2)
2845 goto write_inode_buffer; 2915 goto out_owner_change;
2846 len = item->ri_buf[2].i_len; 2916 len = item->ri_buf[2].i_len;
2847 src = item->ri_buf[2].i_addr; 2917 src = item->ri_buf[2].i_addr;
2848 ASSERT(in_f->ilf_size <= 4); 2918 ASSERT(in_f->ilf_size <= 4);
@@ -2903,13 +2973,15 @@ xlog_recover_inode_pass2(
2903 default: 2973 default:
2904 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 2974 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2905 ASSERT(0); 2975 ASSERT(0);
2906 xfs_buf_relse(bp);
2907 error = EIO; 2976 error = EIO;
2908 goto error; 2977 goto out_release;
2909 } 2978 }
2910 } 2979 }
2911 2980
2912write_inode_buffer: 2981out_owner_change:
2982 if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
2983 error = xfs_recover_inode_owner_change(mp, dip, in_f,
2984 buffer_list);
2913 /* re-generate the checksum. */ 2985 /* re-generate the checksum. */
2914 xfs_dinode_calc_crc(log->l_mp, dip); 2986 xfs_dinode_calc_crc(log->l_mp, dip);
2915 2987
@@ -2923,6 +2995,9 @@ error:
2923 if (need_free) 2995 if (need_free)
2924 kmem_free(in_f); 2996 kmem_free(in_f);
2925 return XFS_ERROR(error); 2997 return XFS_ERROR(error);
2998
2999 xfs_buf_relse(bp);
3000 goto error;
2926} 3001}
2927 3002
2928/* 3003/*