aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-12-07 16:31:47 -0500
committerChris Mason <clm@fb.com>2014-12-10 15:22:31 -0500
commit678886bdc6378c1cbd5072da2c5a3035000214e3 (patch)
tree889ed90fc7f07d2bdbdb6929f60be4c6580f384a /fs
parent01eacb277909de942abbdff37584ad331e3893fe (diff)
Btrfs: fix fs corruption on transaction abort if device supports discard
When we abort a transaction we iterate over all the ranges marked as dirty in fs_info->freed_extents[0] and fs_info->freed_extents[1], clear them from those trees, add them back (unpin) to the free space caches and, if the fs was mounted with "-o discard", perform a discard on those regions. Also, after adding the regions to the free space caches, a fitrim ioctl call can see those ranges in a block group's free space cache and perform a discard on the ranges, so the same issue can happen without "-o discard" as well. This causes corruption, affecting one or multiple btree nodes (in the worst case leaving the fs unmountable) because some of those ranges (the ones in the fs_info->pinned_extents tree) correspond to btree nodes/leafs that are referred by the last committed super block - breaking the rule that anything that was committed by a transaction is untouched until the next transaction commits successfully. I ran into this while running in a loop (for several hours) the fstest that I recently submitted: [PATCH] fstests: add btrfs test to stress chunk allocation/removal and fstrim The corruption always happened when a transaction aborted and then fsck complained like this: _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent *** fsck.btrfs output *** Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 read block failed check_tree_block Couldn't open file system In this case 94945280 corresponded to the root of a tree. Using frace what I observed was the following sequence of steps happened: 1) transaction N started, fs_info->pinned_extents pointed to fs_info->freed_extents[0]; 2) node/eb 94945280 is created; 3) eb is persisted to disk; 4) transaction N commit starts, fs_info->pinned_extents now points to fs_info->freed_extents[1], and transaction N completes; 5) transaction N + 1 starts; 6) eb is COWed, and btrfs_free_tree_block() called for this eb; 7) eb range (94945280 to 94945280 + 16Kb) is added to fs_info->pinned_extents (fs_info->freed_extents[1]); 8) Something goes wrong in transaction N + 1, like hitting ENOSPC for example, and the transaction is aborted, turning the fs into readonly mode. The stack trace I got for example: [112065.253935] [<ffffffff8140c7b6>] dump_stack+0x4d/0x66 [112065.254271] [<ffffffff81042984>] warn_slowpath_common+0x7f/0x98 [112065.254567] [<ffffffffa0325990>] ? __btrfs_abort_transaction+0x50/0x10b [btrfs] [112065.261674] [<ffffffff810429e5>] warn_slowpath_fmt+0x48/0x50 [112065.261922] [<ffffffffa032949e>] ? btrfs_free_path+0x26/0x29 [btrfs] [112065.262211] [<ffffffffa0325990>] __btrfs_abort_transaction+0x50/0x10b [btrfs] [112065.262545] [<ffffffffa036b1d6>] btrfs_remove_chunk+0x537/0x58b [btrfs] [112065.262771] [<ffffffffa033840f>] btrfs_delete_unused_bgs+0x1de/0x21b [btrfs] [112065.263105] [<ffffffffa0343106>] cleaner_kthread+0x100/0x12f [btrfs] (...) [112065.264493] ---[ end trace dd7903a975a31a08 ]--- [112065.264673] BTRFS: error (device sdc) in btrfs_remove_chunk:2625: errno=-28 No space left [112065.264997] BTRFS info (device sdc): forced readonly 9) The clear kthread sees that the BTRFS_FS_STATE_ERROR bit is set in fs_info->fs_state and calls btrfs_cleanup_transaction(), which in turn calls btrfs_destroy_pinned_extent(); 10) Then btrfs_destroy_pinned_extent() iterates over all the ranges marked as dirty in fs_info->freed_extents[], and for each one it calls discard, if the fs was mounted with "-o discard", and adds the range to the free space cache of the respective block group; 11) btrfs_trim_block_group(), invoked from the fitrim ioctl code path, sees the free space entries and performs a discard; 12) After an umount and mount (or fsck), our eb's location on disk was full of zeroes, and it should have been untouched, because it was marked as dirty in the fs_info->pinned_extents tree, and therefore used by the trees that the last committed superblock points to. Fix this by not performing a discard and not adding the ranges to the free space caches - it's useless from this point since the fs is now in readonly mode and we won't write free space caches to disk anymore (otherwise we would leak space) nor any new superblock. By not adding the ranges to the free space caches, it prevents other code paths from allocating that space and write to it as well, therefore being safer and simpler. This isn't a new problem, as it's been present since 2011 (git commit acce952b0263825da32cf10489413dec78053347). Cc: stable@vger.kernel.org # any kernel released after 2011-01-06 Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/extent-tree.c10
2 files changed, 6 insertions, 10 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 30965120772b..8c63419a7f70 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4121,12 +4121,6 @@ again:
4121 if (ret) 4121 if (ret)
4122 break; 4122 break;
4123 4123
4124 /* opt_discard */
4125 if (btrfs_test_opt(root, DISCARD))
4126 ret = btrfs_error_discard_extent(root, start,
4127 end + 1 - start,
4128 NULL);
4129
4130 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4124 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4131 btrfs_error_unpin_extent_range(root, start, end); 4125 btrfs_error_unpin_extent_range(root, start, end);
4132 cond_resched(); 4126 cond_resched();
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 974b3edf69c7..4f3c03d9a575 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5727,7 +5727,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5727 update_global_block_rsv(fs_info); 5727 update_global_block_rsv(fs_info);
5728} 5728}
5729 5729
5730static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5730static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
5731 const bool return_free_space)
5731{ 5732{
5732 struct btrfs_fs_info *fs_info = root->fs_info; 5733 struct btrfs_fs_info *fs_info = root->fs_info;
5733 struct btrfs_block_group_cache *cache = NULL; 5734 struct btrfs_block_group_cache *cache = NULL;
@@ -5751,7 +5752,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5751 5752
5752 if (start < cache->last_byte_to_unpin) { 5753 if (start < cache->last_byte_to_unpin) {
5753 len = min(len, cache->last_byte_to_unpin - start); 5754 len = min(len, cache->last_byte_to_unpin - start);
5754 btrfs_add_free_space(cache, start, len); 5755 if (return_free_space)
5756 btrfs_add_free_space(cache, start, len);
5755 } 5757 }
5756 5758
5757 start += len; 5759 start += len;
@@ -5815,7 +5817,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5815 end + 1 - start, NULL); 5817 end + 1 - start, NULL);
5816 5818
5817 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5819 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5818 unpin_extent_range(root, start, end); 5820 unpin_extent_range(root, start, end, true);
5819 cond_resched(); 5821 cond_resched();
5820 } 5822 }
5821 5823
@@ -9693,7 +9695,7 @@ out:
9693 9695
9694int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 9696int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
9695{ 9697{
9696 return unpin_extent_range(root, start, end); 9698 return unpin_extent_range(root, start, end, false);
9697} 9699}
9698 9700
9699int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 9701int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,