aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Chinner <dgc@sgi.com>2008-03-27 03:00:45 -0400
committerLachlan McIlroy <lachlan@redback.melbourne.sgi.com>2008-04-17 21:42:21 -0400
commit59a33f9f776b051018ec98af95bd9fe8ba9d0f3e (patch)
treed8d93fcd6ef6a77a7efe722ed61febd3b4051bf7
parent75de2a91c98a6f486f261c1367fe59f5583e15a3 (diff)
[XFS] Ensure a btree insert returns a valid cursor.
When writing into preallocated regions there is a case where XFS can oops or hang doing the unwritten extent conversion on I/O completion. It turns out that the problem is related to the btree cursor being invalid. When we do an insert into the tree, we may need to split blocks in the tree. When we only split at the leaf level (i.e. level 0), everything works just fine. However, if we have a multi-level split in the btreee, the cursor passed to the insert function is no longer valid once the insert is complete. The leaf level split is handled correctly because all the operations at level 0 are done using the original cursor, hence it is updated correctly. However, when we need to update the next level up the tree, we don't use that cursor - we use a cloned cursor that points to the index in the next level up where we need to do the insert. Hence if we need to split a second level, the changes to the tree are reflected in the cloned cursor and not the original cursor. This clone-and-move-up-a-level-on-split behaviour recurses all the way to the top of the tree. The complexity here is that these cloned cursors do not point to the original index that was inserted - they point to the newly allocated block (the right block) and the original cursor pointer to that level may still point to the left block. Hence, without deep examination of the cloned cursor and buffers, we cannot update the original cursor with the new path from the cloned cursor. In these cases the original cursor could be pointing to the wrong block(s) and hence a subsequent modification to the tree using that cursor will lead to corruption of the tree. The crash case occurs when the tree changes height - we insert a new level in the tree, and the cursor does not have a buffer in it's path for that level. Hence any attempt to walk back up the cursor to the root block will result in a null pointer dereference. To make matters even more complex, the BMAP BT is rooted in an inode, so we can have a change of height in the btree *without a root split*. That is, if the root block in the inode is full when we split a leaf node, we cannot fit the pointer to the new block in the root, so we allocate a new block, migrate all the ptrs out of the inode into the new block and point the inode root block at the newly allocated block. This changes the height of the tree without a root split having occurred and hence invalidates the path in the original cursor. The patch below prevents xfs_bmbt_insert() from returning with an invalid cursor by detecting the cases that invalidate the original cursor and refresh it by do a lookup into the btree for the original index we were inserting at. Note that the INOBT, AGFBNO and AGFCNT btree implementations also have this bug, but the cursor is currently always destroyed or revalidated after an insert for those trees. Hence this patch only address the problem in the BMBT code. SGI-PV: 979339 SGI-Modid: xfs-linux-melb:xfs-kern:30701a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
-rw-r--r--fs/xfs/xfs_bmap_btree.c38
1 files changed, 36 insertions, 2 deletions
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bd18987326a3..93470b728dd0 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -2027,6 +2027,24 @@ xfs_bmbt_increment(
2027 2027
2028/* 2028/*
2029 * Insert the current record at the point referenced by cur. 2029 * Insert the current record at the point referenced by cur.
2030 *
2031 * A multi-level split of the tree on insert will invalidate the original
2032 * cursor. It appears, however, that some callers assume that the cursor is
2033 * always valid. Hence if we do a multi-level split we need to revalidate the
2034 * cursor.
2035 *
2036 * When a split occurs, we will see a new cursor returned. Use that as a
2037 * trigger to determine if we need to revalidate the original cursor. If we get
2038 * a split, then use the original irec to lookup up the path of the record we
2039 * just inserted.
2040 *
2041 * Note that the fact that the btree root is in the inode means that we can
2042 * have the level of the tree change without a "split" occurring at the root
2043 * level. What happens is that the root is migrated to an allocated block and
2044 * the inode root is pointed to it. This means a single split can change the
2045 * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
2046 * the level change should be accounted as a split so as to correctly trigger a
2047 * revalidation of the old cursor.
2030 */ 2048 */
2031int /* error */ 2049int /* error */
2032xfs_bmbt_insert( 2050xfs_bmbt_insert(
@@ -2039,11 +2057,14 @@ xfs_bmbt_insert(
2039 xfs_fsblock_t nbno; 2057 xfs_fsblock_t nbno;
2040 xfs_btree_cur_t *ncur; 2058 xfs_btree_cur_t *ncur;
2041 xfs_bmbt_rec_t nrec; 2059 xfs_bmbt_rec_t nrec;
2060 xfs_bmbt_irec_t oirec; /* original irec */
2042 xfs_btree_cur_t *pcur; 2061 xfs_btree_cur_t *pcur;
2062 int splits = 0;
2043 2063
2044 XFS_BMBT_TRACE_CURSOR(cur, ENTRY); 2064 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2045 level = 0; 2065 level = 0;
2046 nbno = NULLFSBLOCK; 2066 nbno = NULLFSBLOCK;
2067 oirec = cur->bc_rec.b;
2047 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); 2068 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2048 ncur = NULL; 2069 ncur = NULL;
2049 pcur = cur; 2070 pcur = cur;
@@ -2052,11 +2073,13 @@ xfs_bmbt_insert(
2052 &i))) { 2073 &i))) {
2053 if (pcur != cur) 2074 if (pcur != cur)
2054 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 2075 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2055 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 2076 goto error0;
2056 return error;
2057 } 2077 }
2058 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 2078 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2059 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { 2079 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
2080 /* allocating a new root is effectively a split */
2081 if (cur->bc_nlevels != pcur->bc_nlevels)
2082 splits++;
2060 cur->bc_nlevels = pcur->bc_nlevels; 2083 cur->bc_nlevels = pcur->bc_nlevels;
2061 cur->bc_private.b.allocated += 2084 cur->bc_private.b.allocated +=
2062 pcur->bc_private.b.allocated; 2085 pcur->bc_private.b.allocated;
@@ -2070,10 +2093,21 @@ xfs_bmbt_insert(
2070 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 2093 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2071 } 2094 }
2072 if (ncur) { 2095 if (ncur) {
2096 splits++;
2073 pcur = ncur; 2097 pcur = ncur;
2074 ncur = NULL; 2098 ncur = NULL;
2075 } 2099 }
2076 } while (nbno != NULLFSBLOCK); 2100 } while (nbno != NULLFSBLOCK);
2101
2102 if (splits > 1) {
2103 /* revalidate the old cursor as we had a multi-level split */
2104 error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
2105 oirec.br_startblock, oirec.br_blockcount, &i);
2106 if (error)
2107 goto error0;
2108 ASSERT(i == 1);
2109 }
2110
2077 XFS_BMBT_TRACE_CURSOR(cur, EXIT); 2111 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2078 *stat = i; 2112 *stat = i;
2079 return 0; 2113 return 0;