aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-11-27 16:14:15 -0500
committerChris Mason <clm@fb.com>2014-12-02 21:35:09 -0500
commit04216820fe83d5e27322065ba989de27dbfc104d (patch)
treefebfcea25770f1298cbb7470dbeba5d473751530
parentae0ab003f2840e7ed66ed762ec0d4e298ad1e725 (diff)
Btrfs: fix race between fs trimming and block group remove/allocation
Our fs trim operation, which is completely transactionless (doesn't start or joins an existing transaction) consists of visiting all block groups and then for each one to iterate its free space entries and perform a discard operation against the space range represented by the free space entries. However before performing a discard, the corresponding free space entry is removed from the free space rbtree, and when the discard completes it is added back to the free space rbtree. If a block group remove operation happens while the discard is ongoing (or before it starts and after a free space entry is hidden), we end up not waiting for the discard to complete, remove the extent map that maps logical address to physical addresses and the corresponding chunk metadata from the the chunk and device trees. After that and before the discard completes, the current running transaction can finish and a new one start, allowing for new block groups that map to the same physical addresses to be allocated and written to. So fix this by keeping the extent map in memory until the discard completes so that the same physical addresses aren't reused before it completes. If the physical locations that are under a discard operation end up being used for a new metadata block group for example, and dirty metadata extents are written before the discard finishes (the VM might call writepages() of our btree inode's i_mapping for example, or an fsync log commit happens) we end up overwriting metadata with zeroes, which leads to errors from fsck like the following: checking extents Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block owner ref check failed [833912832 16384] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block root 5 root dir 256 error root 5 inode 260 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref root 5 inode 262 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref root 5 inode 263 errors 2001, no inode item, link count wrong (...) Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/ctree.h12
-rw-r--r--fs/btrfs/disk-io.c13
-rw-r--r--fs/btrfs/extent-tree.c60
-rw-r--r--fs/btrfs/free-space-cache.c38
-rw-r--r--fs/btrfs/volumes.c26
-rw-r--r--fs/btrfs/volumes.h12
6 files changed, 140 insertions, 21 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 302f37c56546..d71915e04e92 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1279,6 +1279,7 @@ struct btrfs_block_group_cache {
1279 unsigned int dirty:1; 1279 unsigned int dirty:1;
1280 unsigned int iref:1; 1280 unsigned int iref:1;
1281 unsigned int has_caching_ctl:1; 1281 unsigned int has_caching_ctl:1;
1282 unsigned int removed:1;
1282 1283
1283 int disk_cache_state; 1284 int disk_cache_state;
1284 1285
@@ -1311,6 +1312,8 @@ struct btrfs_block_group_cache {
1311 1312
1312 /* For read-only block groups */ 1313 /* For read-only block groups */
1313 struct list_head ro_list; 1314 struct list_head ro_list;
1315
1316 atomic_t trimming;
1314}; 1317};
1315 1318
1316/* delayed seq elem */ 1319/* delayed seq elem */
@@ -1740,6 +1743,12 @@ struct btrfs_fs_info {
1740 1743
1741 /* For btrfs to record security options */ 1744 /* For btrfs to record security options */
1742 struct security_mnt_opts security_opts; 1745 struct security_mnt_opts security_opts;
1746
1747 /*
1748 * Chunks that can't be freed yet (under a trim/discard operation)
1749 * and will be latter freed. Protected by fs_info->chunk_mutex.
1750 */
1751 struct list_head pinned_chunks;
1743}; 1752};
1744 1753
1745struct btrfs_subvolume_writers { 1754struct btrfs_subvolume_writers {
@@ -3405,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3405 u64 type, u64 chunk_objectid, u64 chunk_offset, 3414 u64 type, u64 chunk_objectid, u64 chunk_offset,
3406 u64 size); 3415 u64 size);
3407int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3416int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3408 struct btrfs_root *root, u64 group_start); 3417 struct btrfs_root *root, u64 group_start,
3418 struct extent_map *em);
3409void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); 3419void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
3410void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 3420void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3411 struct btrfs_root *root); 3421 struct btrfs_root *root);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e3e414c8501..30965120772b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
2384 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2384 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2385 init_waitqueue_head(&fs_info->async_submit_wait); 2385 init_waitqueue_head(&fs_info->async_submit_wait);
2386 2386
2387 INIT_LIST_HEAD(&fs_info->pinned_chunks);
2388
2387 ret = btrfs_alloc_stripe_hash_table(fs_info); 2389 ret = btrfs_alloc_stripe_hash_table(fs_info);
2388 if (ret) { 2390 if (ret) {
2389 err = ret; 2391 err = ret;
@@ -3715,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
3715 3717
3716 btrfs_free_block_rsv(root, root->orphan_block_rsv); 3718 btrfs_free_block_rsv(root, root->orphan_block_rsv);
3717 root->orphan_block_rsv = NULL; 3719 root->orphan_block_rsv = NULL;
3720
3721 lock_chunks(root);
3722 while (!list_empty(&fs_info->pinned_chunks)) {
3723 struct extent_map *em;
3724
3725 em = list_first_entry(&fs_info->pinned_chunks,
3726 struct extent_map, list);
3727 list_del_init(&em->list);
3728 free_extent_map(em);
3729 }
3730 unlock_chunks(root);
3718} 3731}
3719 3732
3720int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 3733int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5e64dda2db9..dbc115a25798 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9005,6 +9005,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9005 INIT_LIST_HEAD(&cache->bg_list); 9005 INIT_LIST_HEAD(&cache->bg_list);
9006 INIT_LIST_HEAD(&cache->ro_list); 9006 INIT_LIST_HEAD(&cache->ro_list);
9007 btrfs_init_free_space_ctl(cache); 9007 btrfs_init_free_space_ctl(cache);
9008 atomic_set(&cache->trimming, 0);
9008 9009
9009 return cache; 9010 return cache;
9010} 9011}
@@ -9306,7 +9307,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9306} 9307}
9307 9308
9308int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 9309int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9309 struct btrfs_root *root, u64 group_start) 9310 struct btrfs_root *root, u64 group_start,
9311 struct extent_map *em)
9310{ 9312{
9311 struct btrfs_path *path; 9313 struct btrfs_path *path;
9312 struct btrfs_block_group_cache *block_group; 9314 struct btrfs_block_group_cache *block_group;
@@ -9319,6 +9321,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9319 int index; 9321 int index;
9320 int factor; 9322 int factor;
9321 struct btrfs_caching_control *caching_ctl = NULL; 9323 struct btrfs_caching_control *caching_ctl = NULL;
9324 bool remove_em;
9322 9325
9323 root = root->fs_info->extent_root; 9326 root = root->fs_info->extent_root;
9324 9327
@@ -9464,6 +9467,61 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9464 9467
9465 memcpy(&key, &block_group->key, sizeof(key)); 9468 memcpy(&key, &block_group->key, sizeof(key));
9466 9469
9470 lock_chunks(root);
9471 spin_lock(&block_group->lock);
9472 block_group->removed = 1;
9473 /*
9474 * At this point trimming can't start on this block group, because we
9475 * removed the block group from the tree fs_info->block_group_cache_tree
9476 * so no one can't find it anymore and even if someone already got this
9477 * block group before we removed it from the rbtree, they have already
9478 * incremented block_group->trimming - if they didn't, they won't find
9479 * any free space entries because we already removed them all when we
9480 * called btrfs_remove_free_space_cache().
9481 *
9482 * And we must not remove the extent map from the fs_info->mapping_tree
9483 * to prevent the same logical address range and physical device space
9484 * ranges from being reused for a new block group. This is because our
9485 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9486 * completely transactionless, so while it is trimming a range the
9487 * currently running transaction might finish and a new one start,
9488 * allowing for new block groups to be created that can reuse the same
9489 * physical device locations unless we take this special care.
9490 */
9491 remove_em = (atomic_read(&block_group->trimming) == 0);
9492 /*
9493 * Make sure a trimmer task always sees the em in the pinned_chunks list
9494 * if it sees block_group->removed == 1 (needs to lock block_group->lock
9495 * before checking block_group->removed).
9496 */
9497 if (!remove_em) {
9498 /*
9499 * Our em might be in trans->transaction->pending_chunks which
9500 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
9501 * and so is the fs_info->pinned_chunks list.
9502 *
9503 * So at this point we must be holding the chunk_mutex to avoid
9504 * any races with chunk allocation (more specifically at
9505 * volumes.c:contains_pending_extent()), to ensure it always
9506 * sees the em, either in the pending_chunks list or in the
9507 * pinned_chunks list.
9508 */
9509 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
9510 }
9511 spin_unlock(&block_group->lock);
9512 unlock_chunks(root);
9513
9514 if (remove_em) {
9515 struct extent_map_tree *em_tree;
9516
9517 em_tree = &root->fs_info->mapping_tree.map_tree;
9518 write_lock(&em_tree->lock);
9519 remove_extent_mapping(em_tree, em);
9520 write_unlock(&em_tree->lock);
9521 /* once for the tree */
9522 free_extent_map(em);
9523 }
9524
9467 btrfs_put_block_group(block_group); 9525 btrfs_put_block_group(block_group);
9468 btrfs_put_block_group(block_group); 9526 btrfs_put_block_group(block_group);
9469 9527
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 33848196550e..0ddc114e2aed 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,6 +27,7 @@
27#include "disk-io.h" 27#include "disk-io.h"
28#include "extent_io.h" 28#include "extent_io.h"
29#include "inode-map.h" 29#include "inode-map.h"
30#include "volumes.h"
30 31
31#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 32#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
32#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 33#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
@@ -3101,11 +3102,46 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3101 3102
3102 *trimmed = 0; 3103 *trimmed = 0;
3103 3104
3105 spin_lock(&block_group->lock);
3106 if (block_group->removed) {
3107 spin_unlock(&block_group->lock);
3108 return 0;
3109 }
3110 atomic_inc(&block_group->trimming);
3111 spin_unlock(&block_group->lock);
3112
3104 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); 3113 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3105 if (ret) 3114 if (ret)
3106 return ret; 3115 goto out;
3107 3116
3108 ret = trim_bitmaps(block_group, trimmed, start, end, minlen); 3117 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3118out:
3119 spin_lock(&block_group->lock);
3120 if (atomic_dec_and_test(&block_group->trimming) &&
3121 block_group->removed) {
3122 struct extent_map_tree *em_tree;
3123 struct extent_map *em;
3124
3125 spin_unlock(&block_group->lock);
3126
3127 em_tree = &block_group->fs_info->mapping_tree.map_tree;
3128 write_lock(&em_tree->lock);
3129 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3130 1);
3131 BUG_ON(!em); /* logic error, can't happen */
3132 remove_extent_mapping(em_tree, em);
3133 write_unlock(&em_tree->lock);
3134
3135 lock_chunks(block_group->fs_info->chunk_root);
3136 list_del_init(&em->list);
3137 unlock_chunks(block_group->fs_info->chunk_root);
3138
3139 /* once for us and once for the tree */
3140 free_extent_map(em);
3141 free_extent_map(em);
3142 } else {
3143 spin_unlock(&block_group->lock);
3144 }
3109 3145
3110 return ret; 3146 return ret;
3111} 3147}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 01920515f90d..588f37e0a564 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
53DEFINE_MUTEX(uuid_mutex); 53DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids); 54static LIST_HEAD(fs_uuids);
55 55
56static void lock_chunks(struct btrfs_root *root)
57{
58 mutex_lock(&root->fs_info->chunk_mutex);
59}
60
61static void unlock_chunks(struct btrfs_root *root)
62{
63 mutex_unlock(&root->fs_info->chunk_mutex);
64}
65
66static struct btrfs_fs_devices *__alloc_fs_devices(void) 56static struct btrfs_fs_devices *__alloc_fs_devices(void)
67{ 57{
68 struct btrfs_fs_devices *fs_devs; 58 struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1068 u64 *start, u64 len) 1058 u64 *start, u64 len)
1069{ 1059{
1070 struct extent_map *em; 1060 struct extent_map *em;
1061 struct list_head *search_list = &trans->transaction->pending_chunks;
1071 int ret = 0; 1062 int ret = 0;
1072 1063
1073 list_for_each_entry(em, &trans->transaction->pending_chunks, list) { 1064again:
1065 list_for_each_entry(em, search_list, list) {
1074 struct map_lookup *map; 1066 struct map_lookup *map;
1075 int i; 1067 int i;
1076 1068
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1087 ret = 1; 1079 ret = 1;
1088 } 1080 }
1089 } 1081 }
1082 if (search_list == &trans->transaction->pending_chunks) {
1083 search_list = &trans->root->fs_info->pinned_chunks;
1084 goto again;
1085 }
1090 1086
1091 return ret; 1087 return ret;
1092} 1088}
@@ -2653,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2653 } 2649 }
2654 } 2650 }
2655 2651
2656 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2652 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
2657 if (ret) { 2653 if (ret) {
2658 btrfs_abort_transaction(trans, extent_root, ret); 2654 btrfs_abort_transaction(trans, extent_root, ret);
2659 goto out; 2655 goto out;
2660 } 2656 }
2661 2657
2662 write_lock(&em_tree->lock);
2663 remove_extent_mapping(em_tree, em);
2664 write_unlock(&em_tree->lock);
2665
2666 /* once for the tree */
2667 free_extent_map(em);
2668out: 2658out:
2669 /* once for us */ 2659 /* once for us */
2670 free_extent_map(em); 2660 free_extent_map(em);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4cc00e64427e..637bcfadadb2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -515,4 +515,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
515void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); 515void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
516void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, 516void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
517 struct btrfs_transaction *transaction); 517 struct btrfs_transaction *transaction);
518
519static inline void lock_chunks(struct btrfs_root *root)
520{
521 mutex_lock(&root->fs_info->chunk_mutex);
522}
523
524static inline void unlock_chunks(struct btrfs_root *root)
525{
526 mutex_unlock(&root->fs_info->chunk_mutex);
527}
528
529
518#endif 530#endif