aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_bmap_btree.c34
-rw-r--r--fs/xfs/xfs_bmap_btree.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c52
-rw-r--r--fs/xfs/xfs_btree.c162
-rw-r--r--fs/xfs/xfs_btree.h18
-rw-r--r--fs/xfs/xfs_log_format.h1
6 files changed, 231 insertions, 39 deletions
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index cf3bc76710c3..aa2eadd41bab 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -925,3 +925,37 @@ xfs_bmdr_maxrecs(
925 return blocklen / sizeof(xfs_bmdr_rec_t); 925 return blocklen / sizeof(xfs_bmdr_rec_t);
926 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); 926 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
927} 927}
928
929/*
930 * Change the owner of a btree format fork of the inode passed in. Change it to
931 * the owner that is passed in so that we can change owners before or after
932 * we switch forks between inodes. The operation that the caller is doing will
933 * determine whether it needs to change owner before or after the switch.
934 *
935 * For demand paged modification, the fork switch should be done after reading
936 * in all the blocks, modifying them and pinning them in the transaction. For
937 * modification when the buffers are already pinned in memory, the fork switch
938 * can be done before changing the owner as we won't need to validate the owner
939 * until the btree buffers are unpinned and writes can occur again.
940 */
941int
942xfs_bmbt_change_owner(
943 struct xfs_trans *tp,
944 struct xfs_inode *ip,
945 int whichfork,
946 xfs_ino_t new_owner)
947{
948 struct xfs_btree_cur *cur;
949 int error;
950 /* the fork being re-owned must be in btree format: compare, never assign */
951 if (whichfork == XFS_DATA_FORK)
952 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
953 else
954 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
955 /* walk the tree with a cursor; the walk's status is returned to the caller */
956 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
957 error = xfs_btree_change_owner(cur, new_owner);
958 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
959 return error;
960}
961
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 1b726d626941..bceac7affa27 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,6 +236,9 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
236extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); 236extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); 237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
238 238
239extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
240 int whichfork, xfs_ino_t new_owner);
241
239extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 242extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
240 struct xfs_trans *, struct xfs_inode *, int); 243 struct xfs_trans *, struct xfs_inode *, int);
241 244
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 541d59f5e658..ad8a91d2e011 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1789,14 +1789,6 @@ xfs_swap_extents(
1789 int taforkblks = 0; 1789 int taforkblks = 0;
1790 __uint64_t tmp; 1790 __uint64_t tmp;
1791 1791
1792 /*
1793 * We have no way of updating owner information in the BMBT blocks for
1794 * each inode on CRC enabled filesystems, so to avoid corrupting the
1795 * this metadata we simply don't allow extent swaps to occur.
1796 */
1797 if (xfs_sb_version_hascrc(&mp->m_sb))
1798 return XFS_ERROR(EINVAL);
1799
1800 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 1792 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1801 if (!tempifp) { 1793 if (!tempifp) {
1802 error = XFS_ERROR(ENOMEM); 1794 error = XFS_ERROR(ENOMEM);
@@ -1920,6 +1912,40 @@ xfs_swap_extents(
1920 goto out_trans_cancel; 1912 goto out_trans_cancel;
1921 } 1913 }
1922 1914
1915 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1916 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1917
1918 /*
1919 * Before we've swapped the forks, let's set the owners of the forks
1920 * appropriately. We have to do this as we are demand paging the btree
1921 * buffers, and so the validation done on read will expect the owner
1922 * field to be correctly set. Once we change the owners, we can swap the
1923 * inode forks.
1924 *
1925 * Note the trickiness in setting the log flags - we set the owner log
1926 * flag on the opposite inode (i.e. the inode we are setting the new
1927 * owner to be) because once we swap the forks and log that, log
1928 * recovery is going to see the fork as owned by the swapped inode,
1929 * not the pre-swapped inodes.
1930 */
1931 src_log_flags = XFS_ILOG_CORE;
1932 target_log_flags = XFS_ILOG_CORE;
1933 if (ip->i_d.di_version == 3 &&
1934 ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1935 target_log_flags |= XFS_ILOG_OWNER;
1936 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino);
1937 if (error)
1938 goto out_trans_cancel;
1939 }
1940
1941 if (tip->i_d.di_version == 3 &&
1942 tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1943 src_log_flags |= XFS_ILOG_OWNER;
1944 error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino);
1945 if (error)
1946 goto out_trans_cancel;
1947 }
1948
1923 /* 1949 /*
1924 * Swap the data forks of the inodes 1950 * Swap the data forks of the inodes
1925 */ 1951 */
@@ -1957,7 +1983,6 @@ xfs_swap_extents(
1957 tip->i_delayed_blks = ip->i_delayed_blks; 1983 tip->i_delayed_blks = ip->i_delayed_blks;
1958 ip->i_delayed_blks = 0; 1984 ip->i_delayed_blks = 0;
1959 1985
1960 src_log_flags = XFS_ILOG_CORE;
1961 switch (ip->i_d.di_format) { 1986 switch (ip->i_d.di_format) {
1962 case XFS_DINODE_FMT_EXTENTS: 1987 case XFS_DINODE_FMT_EXTENTS:
1963 /* If the extents fit in the inode, fix the 1988 /* If the extents fit in the inode, fix the
@@ -1971,11 +1996,12 @@ xfs_swap_extents(
1971 src_log_flags |= XFS_ILOG_DEXT; 1996 src_log_flags |= XFS_ILOG_DEXT;
1972 break; 1997 break;
1973 case XFS_DINODE_FMT_BTREE: 1998 case XFS_DINODE_FMT_BTREE:
1999 ASSERT(ip->i_d.di_version < 3 ||
2000 (src_log_flags & XFS_ILOG_OWNER));
1974 src_log_flags |= XFS_ILOG_DBROOT; 2001 src_log_flags |= XFS_ILOG_DBROOT;
1975 break; 2002 break;
1976 } 2003 }
1977 2004
1978 target_log_flags = XFS_ILOG_CORE;
1979 switch (tip->i_d.di_format) { 2005 switch (tip->i_d.di_format) {
1980 case XFS_DINODE_FMT_EXTENTS: 2006 case XFS_DINODE_FMT_EXTENTS:
1981 /* If the extents fit in the inode, fix the 2007 /* If the extents fit in the inode, fix the
@@ -1990,13 +2016,11 @@ xfs_swap_extents(
1990 break; 2016 break;
1991 case XFS_DINODE_FMT_BTREE: 2017 case XFS_DINODE_FMT_BTREE:
1992 target_log_flags |= XFS_ILOG_DBROOT; 2018 target_log_flags |= XFS_ILOG_DBROOT;
2019 ASSERT(tip->i_d.di_version < 3 ||
2020 (target_log_flags & XFS_ILOG_OWNER));
1993 break; 2021 break;
1994 } 2022 }
1995 2023
1996
1997 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1998 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1999
2000 xfs_trans_log_inode(tp, ip, src_log_flags); 2024 xfs_trans_log_inode(tp, ip, src_log_flags);
2001 xfs_trans_log_inode(tp, tip, target_log_flags); 2025 xfs_trans_log_inode(tp, tip, target_log_flags);
2002 2026
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7a2b4da3c0db..047573f02702 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -855,6 +855,41 @@ xfs_btree_readahead(
855 return xfs_btree_readahead_sblock(cur, lr, block); 855 return xfs_btree_readahead_sblock(cur, lr, block);
856} 856}
857 857
858STATIC xfs_daddr_t /* disk address of the block that @ptr refers to */
859xfs_btree_ptr_to_daddr(
860 struct xfs_btree_cur *cur,
861 union xfs_btree_ptr *ptr)
862{
863 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
864 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
865 /* long form: the big-endian 64-bit ptr->l is fed to XFS_FSB_TO_DADDR */
866 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
867 } else {
868 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
869 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
870 /* short form: the big-endian 32-bit ptr->s is paired with the cursor's agno */
871 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
872 be32_to_cpu(ptr->s));
873 }
874}
875
876/*
877 * Readahead @count btree blocks at the given @ptr location.
878 * The readahead length is m_bsize * @count and uses the cursor's buf_ops.
879 * We don't need to care about long or short form btrees here as
880 * xfs_btree_ptr_to_daddr() converts the ptr directly to a daddr for us.
881 */
882STATIC void
883xfs_btree_readahead_ptr(
884 struct xfs_btree_cur *cur,
885 union xfs_btree_ptr *ptr,
886 xfs_extlen_t count)
887{
888 xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
889 xfs_btree_ptr_to_daddr(cur, ptr),
890 cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
891}
892
858/* 893/*
859 * Set the buffer for level "lev" in the cursor to bp, releasing 894 * Set the buffer for level "lev" in the cursor to bp, releasing
860 * any previous buffer. 895 * any previous buffer.
@@ -1073,24 +1108,6 @@ xfs_btree_buf_to_ptr(
1073 } 1108 }
1074} 1109}
1075 1110
1076STATIC xfs_daddr_t
1077xfs_btree_ptr_to_daddr(
1078 struct xfs_btree_cur *cur,
1079 union xfs_btree_ptr *ptr)
1080{
1081 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1082 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
1083
1084 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
1085 } else {
1086 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
1087 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
1088
1089 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1090 be32_to_cpu(ptr->s));
1091 }
1092}
1093
1094STATIC void 1111STATIC void
1095xfs_btree_set_refs( 1112xfs_btree_set_refs(
1096 struct xfs_btree_cur *cur, 1113 struct xfs_btree_cur *cur,
@@ -3869,3 +3886,112 @@ xfs_btree_get_rec(
3869 *stat = 1; 3886 *stat = 1;
3870 return 0; 3887 return 0;
3871} 3888}
3889
3890/*
3891 * Change the owner of a btree.
3892 *
3893 * The mechanism we use here is ordered buffer logging. Because we don't know
3894 * how many buffers we are going to need to modify, we don't really want to
3895 * have to make transaction reservations for the worst case of every buffer in a
3896 * full size btree as that may be more space than we can fit in the log....
3897 *
3898 * We do the btree walk in the most optimal manner possible - we have sibling
3899 * pointers so we can just walk all the blocks on each level from left to right
3900 * in a single pass, and then move to the next level and do the same. We can
3901 * also do readahead on the sibling pointers to get IO moving more quickly,
3902 * though for slow disks this is unlikely to make much difference to performance
3903 * as the amount of CPU work we have to do before moving to the next block is
3904 * relatively small.
3905 *
3906 * For each btree block that we load, modify the owner appropriately, set the
3907 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3908 * we mark the region we change dirty so that if the buffer is relogged in
3909 * a subsequent transaction the changes we make here as an ordered buffer are
3910 * correctly relogged in that transaction.
3911 */
3912static int
3913xfs_btree_block_change_owner(
3914 struct xfs_btree_cur *cur,
3915 int level,
3916 __uint64_t new_owner)
3917{
3918 struct xfs_btree_block *block;
3919 struct xfs_buf *bp;
3920 union xfs_btree_ptr rptr;
3921 /* returns ENOENT once the block just updated has no right sibling */
3922 /* do right sibling readahead */
3923 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3924
3925 /* modify the owner: 64-bit field for long pointers, 32-bit for short */
3926 block = xfs_btree_get_block(cur, level, &bp);
3927 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3928 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3929 else
3930 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3931
3932 /*
3933 * Log owner change as an ordered buffer. If the block is a root block
3934 * hosted in an inode, we might not have a buffer pointer here and we
3935 * shouldn't attempt to log the change as the information is already
3936 * held in the inode and discarded when the root block is formatted into
3937 * the on-disk inode fork. We still change it, though, so everything is
3938 * consistent in memory.
3939 */
3940 if (bp) {
3941 xfs_trans_ordered_buf(cur->bc_tp, bp);
3942 xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
3943 } else {
3944 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3945 ASSERT(level == cur->bc_nlevels - 1);
3946 }
3947
3948 /* now read rh sibling block for next iteration */
3949 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3950 if (xfs_btree_ptr_is_null(cur, &rptr))
3951 return ENOENT;
3952 /* a non-zero result here is a read failure, not the end-of-level marker */
3953 return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
3954}
3955/* Set the owner of every block reachable through @cur to @new_owner; returns 0 or the first error hit. */
3956int
3957xfs_btree_change_owner(
3958 struct xfs_btree_cur *cur,
3959 __uint64_t new_owner)
3960{
3961 union xfs_btree_ptr lptr;
3962 int level;
3963 struct xfs_btree_block *block = NULL;
3964 int error = 0;
3965 /* let the btree type fill in the starting pointer (presumably the root - confirm in bc_ops) */
3966 cur->bc_ops->init_ptr_from_cur(cur, &lptr);
3967
3968 /* for each level, from the top (bc_nlevels - 1) down to the leaves (0) */
3969 for (level = cur->bc_nlevels - 1; level >= 0; level--) {
3970 /* grab the left hand block */
3971 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
3972 if (error)
3973 return error;
3974
3975 /* readahead the left most block for the next level down */
3976 if (level > 0) {
3977 union xfs_btree_ptr *ptr;
3978
3979 ptr = xfs_btree_ptr_addr(cur, 1, block);
3980 xfs_btree_readahead_ptr(cur, ptr, 1);
3981
3982 /* save for the next iteration of the loop */
3983 lptr = *ptr;
3984 }
3985
3986 /* for each buffer in the level */
3987 do {
3988 error = xfs_btree_block_change_owner(cur, level,
3989 new_owner);
3990 } while (!error);
3991 /* ENOENT only marks the end of the level; anything else is a real failure */
3992 if (error != ENOENT)
3993 return error;
3994 }
3995
3996 return 0;
3997}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index c8473c7ef45e..544b209e0256 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -121,15 +121,18 @@ union xfs_btree_rec {
121/* 121/*
122 * For logging record fields. 122 * For logging record fields.
123 */ 123 */
124#define XFS_BB_MAGIC 0x01 124#define XFS_BB_MAGIC (1 << 0)
125#define XFS_BB_LEVEL 0x02 125#define XFS_BB_LEVEL (1 << 1)
126#define XFS_BB_NUMRECS 0x04 126#define XFS_BB_NUMRECS (1 << 2)
127#define XFS_BB_LEFTSIB 0x08 127#define XFS_BB_LEFTSIB (1 << 3)
128#define XFS_BB_RIGHTSIB 0x10 128#define XFS_BB_RIGHTSIB (1 << 4)
129#define XFS_BB_BLKNO 0x20 129#define XFS_BB_BLKNO (1 << 5)
130#define XFS_BB_LSN (1 << 6)
131#define XFS_BB_UUID (1 << 7)
132#define XFS_BB_OWNER (1 << 8)
130#define XFS_BB_NUM_BITS 5 133#define XFS_BB_NUM_BITS 5
131#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) 134#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
132#define XFS_BB_NUM_BITS_CRC 8 135#define XFS_BB_NUM_BITS_CRC 9
133#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) 136#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
134 137
135/* 138/*
@@ -442,6 +445,7 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
442int xfs_btree_insert(struct xfs_btree_cur *, int *); 445int xfs_btree_insert(struct xfs_btree_cur *, int *);
443int xfs_btree_delete(struct xfs_btree_cur *, int *); 446int xfs_btree_delete(struct xfs_btree_cur *, int *);
444int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 447int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
448int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner);
445 449
446/* 450/*
447 * btree block CRC helpers 451 * btree block CRC helpers
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
index 31e3a06c4644..08a6fbe03bb6 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/xfs_log_format.h
@@ -474,6 +474,7 @@ typedef struct xfs_inode_log_format_64 {
474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ 474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ 475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ 476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
477#define XFS_ILOG_OWNER 0x200 /* change the extent tree owner on replay */
477 478
478 479
479/* 480/*