aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-08-29 20:23:44 -0400
committerBen Myers <bpm@sgi.com>2013-09-10 11:26:47 -0400
commit21b5c9784bceb8b8e0095f87355f3b138ebac2d0 (patch)
tree52b179280cb81bba1d7304a41676ed64d847067a /fs/xfs
parent0f295a214bb7658ca37bd61a8a1f0cd4a9d86c1f (diff)
xfs: swap extents operations for CRC filesystems
For CRC enabled filesystems, we can't just swap inode forks from one inode to another when defragmenting a file - the blocks in the inode fork bmap btree contain pointers back to the owner inode. Hence if we are to swap the inode forks we have to atomically modify every block in the btree during the transaction. We are doing an entire fork swap here, so we could create a new transaction item type that indicates we are changing the owner of a certain structure from one value to another. If we combine this with ordered buffer logging to modify all the buffers in the tree, then we can change the buffers in the tree without needing log space for the operation. However, this then requires log recovery to perform the modification of the owner information of the objects/structures in question. This does introduce some interesting ordering details into recovery: we have to make sure that the owner change replay occurs after the change that moves the objects is made, not before. Hence we can't use a separate log item for this as we have no guarantee of strict ordering between multiple items in the log due to the relogging action of asynchronous transaction commits. Hence there is no "generic" method we can use for changing the ownership of arbitrary metadata structures. For inode forks, however, there is a simple method of communicating that the fork contents need the owner rewritten - we can pass an inode log format flag for the fork for the transaction that does a fork swap. This flag will then follow the inode fork through relogging actions so when the swap actually gets replayed the ownership can be changed immediately by log recovery. So that gives us a simple method of "whole fork" exchange between two inodes. This is relatively simple to implement, so it makes sense to do this as an initial implementation to support xfs_fsr on CRC enabled filesystems in the same manner as we do on existing filesystems. 
This commit introduces the swapext driven functionality, the recovery functionality will be in a separate patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_bmap_btree.c34
-rw-r--r--fs/xfs/xfs_bmap_btree.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c52
-rw-r--r--fs/xfs/xfs_btree.c162
-rw-r--r--fs/xfs/xfs_btree.h18
-rw-r--r--fs/xfs/xfs_log_format.h1
6 files changed, 231 insertions, 39 deletions
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index cf3bc76710c3..aa2eadd41bab 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -925,3 +925,37 @@ xfs_bmdr_maxrecs(
925 return blocklen / sizeof(xfs_bmdr_rec_t); 925 return blocklen / sizeof(xfs_bmdr_rec_t);
926 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); 926 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
927} 927}
928
929/*
930 * Change the owner of a btree format fork fo the inode passed in. Change it to
931 * the owner of that is passed in so that we can change owners before or after
932 * we switch forks between inodes. The operation that the caller is doing will
933 * determine whether is needs to change owner before or after the switch.
934 *
935 * For demand paged modification, the fork switch should be done after reading
936 * in all the blocks, modifying them and pinning them in the transaction. For
937 * modification when the buffers are already pinned in memory, the fork switch
938 * can be done before changing the owner as we won't need to validate the owner
939 * until the btree buffers are unpinned and writes can occur again.
940 */
941int
942xfs_bmbt_change_owner(
943 struct xfs_trans *tp,
944 struct xfs_inode *ip,
945 int whichfork,
946 xfs_ino_t new_owner)
947{
948 struct xfs_btree_cur *cur;
949 int error;
950
951 if (whichfork == XFS_DATA_FORK)
952 ASSERT(ip->i_d.di_format = XFS_DINODE_FMT_BTREE);
953 else
954 ASSERT(ip->i_d.di_aformat = XFS_DINODE_FMT_BTREE);
955
956 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
957 error = xfs_btree_change_owner(cur, new_owner);
958 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
959 return error;
960}
961
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 1b726d626941..bceac7affa27 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,6 +236,9 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
236extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); 236extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); 237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
238 238
239extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
240 int whichfork, xfs_ino_t new_owner);
241
239extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 242extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
240 struct xfs_trans *, struct xfs_inode *, int); 243 struct xfs_trans *, struct xfs_inode *, int);
241 244
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 541d59f5e658..ad8a91d2e011 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1789,14 +1789,6 @@ xfs_swap_extents(
1789 int taforkblks = 0; 1789 int taforkblks = 0;
1790 __uint64_t tmp; 1790 __uint64_t tmp;
1791 1791
1792 /*
1793 * We have no way of updating owner information in the BMBT blocks for
1794 * each inode on CRC enabled filesystems, so to avoid corrupting the
1795 * this metadata we simply don't allow extent swaps to occur.
1796 */
1797 if (xfs_sb_version_hascrc(&mp->m_sb))
1798 return XFS_ERROR(EINVAL);
1799
1800 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 1792 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1801 if (!tempifp) { 1793 if (!tempifp) {
1802 error = XFS_ERROR(ENOMEM); 1794 error = XFS_ERROR(ENOMEM);
@@ -1920,6 +1912,40 @@ xfs_swap_extents(
1920 goto out_trans_cancel; 1912 goto out_trans_cancel;
1921 } 1913 }
1922 1914
1915 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1916 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1917
1918 /*
1919 * Before we've swapped the forks, lets set the owners of the forks
1920 * appropriately. We have to do this as we are demand paging the btree
1921 * buffers, and so the validation done on read will expect the owner
1922 * field to be correctly set. Once we change the owners, we can swap the
1923 * inode forks.
1924 *
1925 * Note the trickiness in setting the log flags - we set the owner log
1926 * flag on the opposite inode (i.e. the inode we are setting the new
1927 * owner to be) because once we swap the forks and log that, log
1928 * recovery is going to see the fork as owned by the swapped inode,
1929 * not the pre-swapped inodes.
1930 */
1931 src_log_flags = XFS_ILOG_CORE;
1932 target_log_flags = XFS_ILOG_CORE;
1933 if (ip->i_d.di_version == 3 &&
1934 ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1935 target_log_flags |= XFS_ILOG_OWNER;
1936 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino);
1937 if (error)
1938 goto out_trans_cancel;
1939 }
1940
1941 if (tip->i_d.di_version == 3 &&
1942 tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1943 src_log_flags |= XFS_ILOG_OWNER;
1944 error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino);
1945 if (error)
1946 goto out_trans_cancel;
1947 }
1948
1923 /* 1949 /*
1924 * Swap the data forks of the inodes 1950 * Swap the data forks of the inodes
1925 */ 1951 */
@@ -1957,7 +1983,6 @@ xfs_swap_extents(
1957 tip->i_delayed_blks = ip->i_delayed_blks; 1983 tip->i_delayed_blks = ip->i_delayed_blks;
1958 ip->i_delayed_blks = 0; 1984 ip->i_delayed_blks = 0;
1959 1985
1960 src_log_flags = XFS_ILOG_CORE;
1961 switch (ip->i_d.di_format) { 1986 switch (ip->i_d.di_format) {
1962 case XFS_DINODE_FMT_EXTENTS: 1987 case XFS_DINODE_FMT_EXTENTS:
1963 /* If the extents fit in the inode, fix the 1988 /* If the extents fit in the inode, fix the
@@ -1971,11 +1996,12 @@ xfs_swap_extents(
1971 src_log_flags |= XFS_ILOG_DEXT; 1996 src_log_flags |= XFS_ILOG_DEXT;
1972 break; 1997 break;
1973 case XFS_DINODE_FMT_BTREE: 1998 case XFS_DINODE_FMT_BTREE:
1999 ASSERT(ip->i_d.di_version < 3 ||
2000 (src_log_flags & XFS_ILOG_OWNER));
1974 src_log_flags |= XFS_ILOG_DBROOT; 2001 src_log_flags |= XFS_ILOG_DBROOT;
1975 break; 2002 break;
1976 } 2003 }
1977 2004
1978 target_log_flags = XFS_ILOG_CORE;
1979 switch (tip->i_d.di_format) { 2005 switch (tip->i_d.di_format) {
1980 case XFS_DINODE_FMT_EXTENTS: 2006 case XFS_DINODE_FMT_EXTENTS:
1981 /* If the extents fit in the inode, fix the 2007 /* If the extents fit in the inode, fix the
@@ -1990,13 +2016,11 @@ xfs_swap_extents(
1990 break; 2016 break;
1991 case XFS_DINODE_FMT_BTREE: 2017 case XFS_DINODE_FMT_BTREE:
1992 target_log_flags |= XFS_ILOG_DBROOT; 2018 target_log_flags |= XFS_ILOG_DBROOT;
2019 ASSERT(tip->i_d.di_version < 3 ||
2020 (target_log_flags & XFS_ILOG_OWNER));
1993 break; 2021 break;
1994 } 2022 }
1995 2023
1996
1997 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1998 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1999
2000 xfs_trans_log_inode(tp, ip, src_log_flags); 2024 xfs_trans_log_inode(tp, ip, src_log_flags);
2001 xfs_trans_log_inode(tp, tip, target_log_flags); 2025 xfs_trans_log_inode(tp, tip, target_log_flags);
2002 2026
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7a2b4da3c0db..047573f02702 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -855,6 +855,41 @@ xfs_btree_readahead(
855 return xfs_btree_readahead_sblock(cur, lr, block); 855 return xfs_btree_readahead_sblock(cur, lr, block);
856} 856}
857 857
858STATIC xfs_daddr_t
859xfs_btree_ptr_to_daddr(
860 struct xfs_btree_cur *cur,
861 union xfs_btree_ptr *ptr)
862{
863 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
864 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
865
866 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
867 } else {
868 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
869 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
870
871 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
872 be32_to_cpu(ptr->s));
873 }
874}
875
876/*
877 * Readahead @count btree blocks at the given @ptr location.
878 *
879 * We don't need to care about long or short form btrees here as we have a
880 * method of converting the ptr directly to a daddr available to us.
881 */
882STATIC void
883xfs_btree_readahead_ptr(
884 struct xfs_btree_cur *cur,
885 union xfs_btree_ptr *ptr,
886 xfs_extlen_t count)
887{
888 xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
889 xfs_btree_ptr_to_daddr(cur, ptr),
890 cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
891}
892
858/* 893/*
859 * Set the buffer for level "lev" in the cursor to bp, releasing 894 * Set the buffer for level "lev" in the cursor to bp, releasing
860 * any previous buffer. 895 * any previous buffer.
@@ -1073,24 +1108,6 @@ xfs_btree_buf_to_ptr(
1073 } 1108 }
1074} 1109}
1075 1110
1076STATIC xfs_daddr_t
1077xfs_btree_ptr_to_daddr(
1078 struct xfs_btree_cur *cur,
1079 union xfs_btree_ptr *ptr)
1080{
1081 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1082 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
1083
1084 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
1085 } else {
1086 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
1087 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
1088
1089 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1090 be32_to_cpu(ptr->s));
1091 }
1092}
1093
1094STATIC void 1111STATIC void
1095xfs_btree_set_refs( 1112xfs_btree_set_refs(
1096 struct xfs_btree_cur *cur, 1113 struct xfs_btree_cur *cur,
@@ -3869,3 +3886,112 @@ xfs_btree_get_rec(
3869 *stat = 1; 3886 *stat = 1;
3870 return 0; 3887 return 0;
3871} 3888}
3889
3890/*
3891 * Change the owner of a btree.
3892 *
3893 * The mechanism we use here is ordered buffer logging. Because we don't know
3894 * how many buffers were are going to need to modify, we don't really want to
3895 * have to make transaction reservations for the worst case of every buffer in a
3896 * full size btree as that may be more space that we can fit in the log....
3897 *
3898 * We do the btree walk in the most optimal manner possible - we have sibling
3899 * pointers so we can just walk all the blocks on each level from left to right
3900 * in a single pass, and then move to the next level and do the same. We can
3901 * also do readahead on the sibling pointers to get IO moving more quickly,
3902 * though for slow disks this is unlikely to make much difference to performance
3903 * as the amount of CPU work we have to do before moving to the next block is
3904 * relatively small.
3905 *
3906 * For each btree block that we load, modify the owner appropriately, set the
3907 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3908 * we mark the region we change dirty so that if the buffer is relogged in
3909 * a subsequent transaction the changes we make here as an ordered buffer are
3910 * correctly relogged in that transaction.
3911 */
3912static int
3913xfs_btree_block_change_owner(
3914 struct xfs_btree_cur *cur,
3915 int level,
3916 __uint64_t new_owner)
3917{
3918 struct xfs_btree_block *block;
3919 struct xfs_buf *bp;
3920 union xfs_btree_ptr rptr;
3921
3922 /* do right sibling readahead */
3923 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3924
3925 /* modify the owner */
3926 block = xfs_btree_get_block(cur, level, &bp);
3927 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3928 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3929 else
3930 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3931
3932 /*
3933 * Log owner change as an ordered buffer. If the block is a root block
3934 * hosted in an inode, we might not have a buffer pointer here and we
3935 * shouldn't attempt to log the change as the information is already
3936 * held in the inode and discarded when the root block is formatted into
3937 * the on-disk inode fork. We still change it, though, so everything is
3938 * consistent in memory.
3939 */
3940 if (bp) {
3941 xfs_trans_ordered_buf(cur->bc_tp, bp);
3942 xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
3943 } else {
3944 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3945 ASSERT(level == cur->bc_nlevels - 1);
3946 }
3947
3948 /* now read rh sibling block for next iteration */
3949 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3950 if (xfs_btree_ptr_is_null(cur, &rptr))
3951 return ENOENT;
3952
3953 return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
3954}
3955
3956int
3957xfs_btree_change_owner(
3958 struct xfs_btree_cur *cur,
3959 __uint64_t new_owner)
3960{
3961 union xfs_btree_ptr lptr;
3962 int level;
3963 struct xfs_btree_block *block = NULL;
3964 int error = 0;
3965
3966 cur->bc_ops->init_ptr_from_cur(cur, &lptr);
3967
3968 /* for each level */
3969 for (level = cur->bc_nlevels - 1; level >= 0; level--) {
3970 /* grab the left hand block */
3971 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
3972 if (error)
3973 return error;
3974
3975 /* readahead the left most block for the next level down */
3976 if (level > 0) {
3977 union xfs_btree_ptr *ptr;
3978
3979 ptr = xfs_btree_ptr_addr(cur, 1, block);
3980 xfs_btree_readahead_ptr(cur, ptr, 1);
3981
3982 /* save for the next iteration of the loop */
3983 lptr = *ptr;
3984 }
3985
3986 /* for each buffer in the level */
3987 do {
3988 error = xfs_btree_block_change_owner(cur, level,
3989 new_owner);
3990 } while (!error);
3991
3992 if (error != ENOENT)
3993 return error;
3994 }
3995
3996 return 0;
3997}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index c8473c7ef45e..544b209e0256 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -121,15 +121,18 @@ union xfs_btree_rec {
121/* 121/*
122 * For logging record fields. 122 * For logging record fields.
123 */ 123 */
124#define XFS_BB_MAGIC 0x01 124#define XFS_BB_MAGIC (1 << 0)
125#define XFS_BB_LEVEL 0x02 125#define XFS_BB_LEVEL (1 << 1)
126#define XFS_BB_NUMRECS 0x04 126#define XFS_BB_NUMRECS (1 << 2)
127#define XFS_BB_LEFTSIB 0x08 127#define XFS_BB_LEFTSIB (1 << 3)
128#define XFS_BB_RIGHTSIB 0x10 128#define XFS_BB_RIGHTSIB (1 << 4)
129#define XFS_BB_BLKNO 0x20 129#define XFS_BB_BLKNO (1 << 5)
130#define XFS_BB_LSN (1 << 6)
131#define XFS_BB_UUID (1 << 7)
132#define XFS_BB_OWNER (1 << 8)
130#define XFS_BB_NUM_BITS 5 133#define XFS_BB_NUM_BITS 5
131#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) 134#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
132#define XFS_BB_NUM_BITS_CRC 8 135#define XFS_BB_NUM_BITS_CRC 9
133#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) 136#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
134 137
135/* 138/*
@@ -442,6 +445,7 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
442int xfs_btree_insert(struct xfs_btree_cur *, int *); 445int xfs_btree_insert(struct xfs_btree_cur *, int *);
443int xfs_btree_delete(struct xfs_btree_cur *, int *); 446int xfs_btree_delete(struct xfs_btree_cur *, int *);
444int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 447int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
448int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner);
445 449
446/* 450/*
447 * btree block CRC helpers 451 * btree block CRC helpers
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
index 31e3a06c4644..08a6fbe03bb6 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/xfs_log_format.h
@@ -474,6 +474,7 @@ typedef struct xfs_inode_log_format_64 {
474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ 474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ 475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ 476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
477#define XFS_ILOG_OWNER 0x200 /* change the extent tree owner on replay */
477 478
478 479
479/* 480/*