aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-11-13 18:57:17 -0500
committerChris Mason <clm@fb.com>2015-11-25 08:19:50 -0500
commit7fd01182d1a1412cd44a5474b9aa93548d4a73ae (patch)
treebf5b688ce933106b192480ecbcccb38398e3a8cd /fs
parent8eab77ff167b62760d878f1d19312eb9f7d4c176 (diff)
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused block group but in reality we need 3 + N units, where N corresponds to the number of stripes. We were accounting only for the addition of the orphan item (for the block group's free space cache inode) but we were not accounting that we need to delete one block group item from the extent tree, one free space item from the tree of tree roots and N device extent items from the device tree. While one unit is not enough, it worked most of the time because for each single unit we are too pessimistic and assume an entire tree path, with the highest possible height (8), needs to be COWed with eventual node splits at every possible level in the tree, so there was usually enough reserved space for removing all the items and adding the orphan item. However after adding the orphan item, writepages() can be called by the VM subsystem against the btree inode when we are under memory pressure, which causes writeback to start for the nodes we COWed before; this forces the operation to remove the free space item to COW again some (or all of) the same nodes (in the tree of tree roots). Even without writepages() being called, we could fail with ENOSPC because these items are located in multiple trees and one of them might have a higher height and require node/leaf splits at many levels, exhausting all the reserved space before removing all the items and adding the orphan. In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group"), we attempted to fix a BUG_ON due to ENOSPC when trying to add the orphan item by making the cleaner kthread reserve one transaction unit before attempting to remove the block group, but this was not enough.
We had a couple of user reports still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on a 4.2-rc6 kernel for example: http://www.spinics.net/lists/linux-btrfs/msg46070.html So fix this by reserving all the necessary units of metadata. Reported-by: Stefan Priebe <s.priebe@profihost.ag> Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/ctree.h3
-rw-r--r--fs/btrfs/extent-tree.c37
-rw-r--r--fs/btrfs/volumes.c3
3 files changed, 38 insertions, 5 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1573be6f9518..d88994f71eae 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3480,7 +3480,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3480 u64 type, u64 chunk_objectid, u64 chunk_offset, 3480 u64 type, u64 chunk_objectid, u64 chunk_offset,
3481 u64 size); 3481 u64 size);
3482struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( 3482struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
3483 struct btrfs_fs_info *fs_info); 3483 struct btrfs_fs_info *fs_info,
3484 const u64 chunk_offset);
3484int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3485int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3485 struct btrfs_root *root, u64 group_start, 3486 struct btrfs_root *root, u64 group_start,
3486 struct extent_map *em); 3487 struct extent_map *em);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 78200932c1cf..e97d6d61cd42 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10257,14 +10257,44 @@ out:
10257} 10257}
10258 10258
10259struct btrfs_trans_handle * 10259struct btrfs_trans_handle *
10260btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info) 10260btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10261 const u64 chunk_offset)
10261{ 10262{
10263 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10264 struct extent_map *em;
10265 struct map_lookup *map;
10266 unsigned int num_items;
10267
10268 read_lock(&em_tree->lock);
10269 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10270 read_unlock(&em_tree->lock);
10271 ASSERT(em && em->start == chunk_offset);
10272
10262 /* 10273 /*
10274 * We need to reserve 3 + N units from the metadata space info in order
10275 * to remove a block group (done at btrfs_remove_chunk() and at
10276 * btrfs_remove_block_group()), which are used for:
10277 *
10263 * 1 unit for adding the free space inode's orphan (located in the tree 10278 * 1 unit for adding the free space inode's orphan (located in the tree
10264 * of tree roots). 10279 * of tree roots).
10280 * 1 unit for deleting the block group item (located in the extent
10281 * tree).
10282 * 1 unit for deleting the free space item (located in tree of tree
10283 * roots).
10284 * N units for deleting N device extent items corresponding to each
10285 * stripe (located in the device tree).
10286 *
10287 * In order to remove a block group we also need to reserve units in the
10288 * system space info in order to update the chunk tree (update one or
10289 * more device items and remove one chunk item), but this is done at
10290 * btrfs_remove_chunk() through a call to check_system_chunk().
10265 */ 10291 */
10292 map = (struct map_lookup *)em->bdev;
10293 num_items = 3 + map->num_stripes;
10294 free_extent_map(em);
10295
10266 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10296 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10267 1, 1); 10297 num_items, 1);
10268} 10298}
10269 10299
10270/* 10300/*
@@ -10333,7 +10363,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10333 * Want to do this before we do anything else so we can recover 10363 * Want to do this before we do anything else so we can recover
10334 * properly if we fail to join the transaction. 10364 * properly if we fail to join the transaction.
10335 */ 10365 */
10336 trans = btrfs_start_trans_remove_block_group(fs_info); 10366 trans = btrfs_start_trans_remove_block_group(fs_info,
10367 block_group->key.objectid);
10337 if (IS_ERR(trans)) { 10368 if (IS_ERR(trans)) {
10338 btrfs_dec_block_group_ro(root, block_group); 10369 btrfs_dec_block_group_ro(root, block_group);
10339 ret = PTR_ERR(trans); 10370 ret = PTR_ERR(trans);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0bd364e958d..45f20252efed 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2853,7 +2853,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
2853 if (ret) 2853 if (ret)
2854 return ret; 2854 return ret;
2855 2855
2856 trans = btrfs_start_trans_remove_block_group(root->fs_info); 2856 trans = btrfs_start_trans_remove_block_group(root->fs_info,
2857 chunk_offset);
2857 if (IS_ERR(trans)) { 2858 if (IS_ERR(trans)) {
2858 ret = PTR_ERR(trans); 2859 ret = PTR_ERR(trans);
2859 btrfs_std_error(root->fs_info, ret, NULL); 2860 btrfs_std_error(root->fs_info, ret, NULL);