summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jeff Mahoney <jeffm@suse.com> 2019-03-27 08:24:12 -0400
committer: David Sterba <dsterba@suse.com> 2019-04-29 13:02:37 -0400
commit: 1c11b63eff2a67906cb9137bc6b2ee27767f313b (patch)
tree: b68008df55c8d10a68832dacd19fa0f254b86f09
parent: 68c94e55e1502868813a3cac2febc021d01edb75 (diff)
btrfs: replace pending/pinned chunks lists with io tree
The pending chunks list contains chunks that are allocated in the current transaction but haven't been created yet. The pinned chunks list contains chunks that are being released in the current transaction. Both describe chunks that are not reflected on disk as in use but are unavailable just the same.

The pending chunks list is anchored by the transaction handle, which means that we need to hold a reference to a transaction when working with the list.

The way we use them is by iterating over both lists to perform comparisons on the stripes they describe for each device. This is backwards and requires that we keep a transaction handle open while we're trimming.

This patchset adds an extent_io_tree to btrfs_device that maintains the allocation state of the device. Extents are set dirty when chunks are first allocated -- when the extent maps are added to the mapping tree. They're cleared when last removed -- when the extent maps are removed from the mapping tree. This matches the lifespan of the pending and pinned chunks lists and allows us to do trims on unallocated space safely without pinning the transaction for what may be a lengthy operation. We can also use this io tree to mark which chunks have already been trimmed so we don't repeat the operation.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r-- fs/btrfs/ctree.h 6
-rw-r--r-- fs/btrfs/disk-io.c 11
-rw-r--r-- fs/btrfs/extent-tree.c 28
-rw-r--r-- fs/btrfs/extent_map.c 36
-rw-r--r-- fs/btrfs/free-space-cache.c 4
-rw-r--r-- fs/btrfs/transaction.c 9
-rw-r--r-- fs/btrfs/transaction.h 1
-rw-r--r-- fs/btrfs/volumes.c 85
-rw-r--r-- fs/btrfs/volumes.h 2
9 files changed, 63 insertions, 119 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 19833b4af630..93270e20a8e7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1149,12 +1149,6 @@ struct btrfs_fs_info {
1149 struct mutex unused_bg_unpin_mutex; 1149 struct mutex unused_bg_unpin_mutex;
1150 struct mutex delete_unused_bgs_mutex; 1150 struct mutex delete_unused_bgs_mutex;
1151 1151
1152 /*
1153 * Chunks that can't be freed yet (under a trim/discard operation)
1154 * and will be latter freed. Protected by fs_info->chunk_mutex.
1155 */
1156 struct list_head pinned_chunks;
1157
1158 /* Cached block sizes */ 1152 /* Cached block sizes */
1159 u32 nodesize; 1153 u32 nodesize;
1160 u32 sectorsize; 1154 u32 sectorsize;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c10702a3f83..0b2b75a7efbd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2774,8 +2774,6 @@ int open_ctree(struct super_block *sb,
2774 init_waitqueue_head(&fs_info->async_submit_wait); 2774 init_waitqueue_head(&fs_info->async_submit_wait);
2775 init_waitqueue_head(&fs_info->delayed_iputs_wait); 2775 init_waitqueue_head(&fs_info->delayed_iputs_wait);
2776 2776
2777 INIT_LIST_HEAD(&fs_info->pinned_chunks);
2778
2779 /* Usable values until the real ones are cached from the superblock */ 2777 /* Usable values until the real ones are cached from the superblock */
2780 fs_info->nodesize = 4096; 2778 fs_info->nodesize = 4096;
2781 fs_info->sectorsize = 4096; 2779 fs_info->sectorsize = 4096;
@@ -4050,15 +4048,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
4050 4048
4051 btrfs_free_stripe_hash_table(fs_info); 4049 btrfs_free_stripe_hash_table(fs_info);
4052 btrfs_free_ref_cache(fs_info); 4050 btrfs_free_ref_cache(fs_info);
4053
4054 while (!list_empty(&fs_info->pinned_chunks)) {
4055 struct extent_map *em;
4056
4057 em = list_first_entry(&fs_info->pinned_chunks,
4058 struct extent_map, list);
4059 list_del_init(&em->list);
4060 free_extent_map(em);
4061 }
4062} 4051}
4063 4052
4064int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 4053int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c5f9e8359c6f..a9f504e7be33 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10946,10 +10946,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10946 memcpy(&key, &block_group->key, sizeof(key)); 10946 memcpy(&key, &block_group->key, sizeof(key));
10947 10947
10948 mutex_lock(&fs_info->chunk_mutex); 10948 mutex_lock(&fs_info->chunk_mutex);
10949 if (!list_empty(&em->list)) {
10950 /* We're in the transaction->pending_chunks list. */
10951 free_extent_map(em);
10952 }
10953 spin_lock(&block_group->lock); 10949 spin_lock(&block_group->lock);
10954 block_group->removed = 1; 10950 block_group->removed = 1;
10955 /* 10951 /*
@@ -10976,25 +10972,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10976 * the transaction commit has completed. 10972 * the transaction commit has completed.
10977 */ 10973 */
10978 remove_em = (atomic_read(&block_group->trimming) == 0); 10974 remove_em = (atomic_read(&block_group->trimming) == 0);
10979 /*
10980 * Make sure a trimmer task always sees the em in the pinned_chunks list
10981 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10982 * before checking block_group->removed).
10983 */
10984 if (!remove_em) {
10985 /*
10986 * Our em might be in trans->transaction->pending_chunks which
10987 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10988 * and so is the fs_info->pinned_chunks list.
10989 *
10990 * So at this point we must be holding the chunk_mutex to avoid
10991 * any races with chunk allocation (more specifically at
10992 * volumes.c:contains_pending_extent()), to ensure it always
10993 * sees the em, either in the pending_chunks list or in the
10994 * pinned_chunks list.
10995 */
10996 list_move_tail(&em->list, &fs_info->pinned_chunks);
10997 }
10998 spin_unlock(&block_group->lock); 10975 spin_unlock(&block_group->lock);
10999 10976
11000 if (remove_em) { 10977 if (remove_em) {
@@ -11002,11 +10979,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
11002 10979
11003 em_tree = &fs_info->mapping_tree.map_tree; 10980 em_tree = &fs_info->mapping_tree.map_tree;
11004 write_lock(&em_tree->lock); 10981 write_lock(&em_tree->lock);
11005 /*
11006 * The em might be in the pending_chunks list, so make sure the
11007 * chunk mutex is locked, since remove_extent_mapping() will
11008 * delete us from that list.
11009 */
11010 remove_extent_mapping(em_tree, em); 10982 remove_extent_mapping(em_tree, em);
11011 write_unlock(&em_tree->lock); 10983 write_unlock(&em_tree->lock);
11012 /* once for the tree */ 10984 /* once for the tree */
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 928f729c55ba..5a79a656dfa6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include "ctree.h" 6#include "ctree.h"
7#include "volumes.h"
7#include "extent_map.h" 8#include "extent_map.h"
8#include "compression.h" 9#include "compression.h"
9 10
@@ -337,6 +338,37 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
337 try_merge_map(tree, em); 338 try_merge_map(tree, em);
338} 339}
339 340
341static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
342{
343 struct map_lookup *map = em->map_lookup;
344 u64 stripe_size = em->orig_block_len;
345 int i;
346
347 for (i = 0; i < map->num_stripes; i++) {
348 struct btrfs_bio_stripe *stripe = &map->stripes[i];
349 struct btrfs_device *device = stripe->dev;
350
351 set_extent_bits_nowait(&device->alloc_state, stripe->physical,
352 stripe->physical + stripe_size - 1, bits);
353 }
354}
355
356static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
357{
358 struct map_lookup *map = em->map_lookup;
359 u64 stripe_size = em->orig_block_len;
360 int i;
361
362 for (i = 0; i < map->num_stripes; i++) {
363 struct btrfs_bio_stripe *stripe = &map->stripes[i];
364 struct btrfs_device *device = stripe->dev;
365
366 __clear_extent_bit(&device->alloc_state, stripe->physical,
367 stripe->physical + stripe_size - 1, bits,
368 0, 0, NULL, GFP_NOWAIT, NULL);
369 }
370}
371
340/** 372/**
341 * add_extent_mapping - add new extent map to the extent tree 373 * add_extent_mapping - add new extent map to the extent tree
342 * @tree: tree to insert new map in 374 * @tree: tree to insert new map in
@@ -357,6 +389,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
357 goto out; 389 goto out;
358 390
359 setup_extent_mapping(tree, em, modified); 391 setup_extent_mapping(tree, em, modified);
392 if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
393 extent_map_device_set_bits(em, CHUNK_ALLOCATED);
360out: 394out:
361 return ret; 395 return ret;
362} 396}
@@ -438,6 +472,8 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
438 rb_erase_cached(&em->rb_node, &tree->map); 472 rb_erase_cached(&em->rb_node, &tree->map);
439 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 473 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
440 list_del_init(&em->list); 474 list_del_init(&em->list);
475 if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
476 extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
441 RB_CLEAR_NODE(&em->rb_node); 477 RB_CLEAR_NODE(&em->rb_node);
442} 478}
443 479
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 74aa552f4793..207fb50dcc7a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3366,10 +3366,6 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
3366 em = lookup_extent_mapping(em_tree, block_group->key.objectid, 3366 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3367 1); 3367 1);
3368 BUG_ON(!em); /* logic error, can't happen */ 3368 BUG_ON(!em); /* logic error, can't happen */
3369 /*
3370 * remove_extent_mapping() will delete us from the pinned_chunks
3371 * list, which is protected by the chunk mutex.
3372 */
3373 remove_extent_mapping(em_tree, em); 3369 remove_extent_mapping(em_tree, em);
3374 write_unlock(&em_tree->lock); 3370 write_unlock(&em_tree->lock);
3375 mutex_unlock(&fs_info->chunk_mutex); 3371 mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b32769998bbb..e5404326fc55 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -50,14 +50,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
50 btrfs_err(transaction->fs_info, 50 btrfs_err(transaction->fs_info,
51 "pending csums is %llu", 51 "pending csums is %llu",
52 transaction->delayed_refs.pending_csums); 52 transaction->delayed_refs.pending_csums);
53 while (!list_empty(&transaction->pending_chunks)) {
54 struct extent_map *em;
55
56 em = list_first_entry(&transaction->pending_chunks,
57 struct extent_map, list);
58 list_del_init(&em->list);
59 free_extent_map(em);
60 }
61 /* 53 /*
62 * If any block groups are found in ->deleted_bgs then it's 54 * If any block groups are found in ->deleted_bgs then it's
63 * because the transaction was aborted and a commit did not 55 * because the transaction was aborted and a commit did not
@@ -235,7 +227,6 @@ loop:
235 spin_lock_init(&cur_trans->delayed_refs.lock); 227 spin_lock_init(&cur_trans->delayed_refs.lock);
236 228
237 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 229 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
238 INIT_LIST_HEAD(&cur_trans->pending_chunks);
239 INIT_LIST_HEAD(&cur_trans->dev_update_list); 230 INIT_LIST_HEAD(&cur_trans->dev_update_list);
240 INIT_LIST_HEAD(&cur_trans->switch_commits); 231 INIT_LIST_HEAD(&cur_trans->switch_commits);
241 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 232 INIT_LIST_HEAD(&cur_trans->dirty_bgs);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 2bd76f681520..4419a4a0294b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -51,7 +51,6 @@ struct btrfs_transaction {
51 wait_queue_head_t writer_wait; 51 wait_queue_head_t writer_wait;
52 wait_queue_head_t commit_wait; 52 wait_queue_head_t commit_wait;
53 struct list_head pending_snapshots; 53 struct list_head pending_snapshots;
54 struct list_head pending_chunks;
55 struct list_head dev_update_list; 54 struct list_head dev_update_list;
56 struct list_head switch_commits; 55 struct list_head switch_commits;
57 struct list_head dirty_bgs; 56 struct list_head dirty_bgs;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2bde9e9c188e..2e5e48d8dd2f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -336,6 +336,7 @@ void btrfs_free_device(struct btrfs_device *device)
336{ 336{
337 WARN_ON(!list_empty(&device->post_commit_list)); 337 WARN_ON(!list_empty(&device->post_commit_list));
338 rcu_string_free(device->name); 338 rcu_string_free(device->name);
339 extent_io_tree_release(&device->alloc_state);
339 bio_put(device->flush_bio); 340 bio_put(device->flush_bio);
340 kfree(device); 341 kfree(device);
341} 342}
@@ -412,6 +413,7 @@ static struct btrfs_device *__alloc_device(void)
412 btrfs_device_data_ordered_init(dev); 413 btrfs_device_data_ordered_init(dev);
413 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 414 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
414 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 415 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
416 extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
415 417
416 return dev; 418 return dev;
417} 419}
@@ -1499,58 +1501,30 @@ error_bdev_put:
1499 return device; 1501 return device;
1500} 1502}
1501 1503
1502static int contains_pending_extent(struct btrfs_transaction *transaction, 1504/*
1503 struct btrfs_device *device, 1505 * Try to find a chunk that intersects [start, start + len] range and when one
1504 u64 *start, u64 len) 1506 * such is found, record the end of it in *start
1507 */
1508#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
1509static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1510 u64 len)
1505{ 1511{
1506 struct btrfs_fs_info *fs_info = device->fs_info; 1512 u64 physical_start, physical_end;
1507 struct extent_map *em;
1508 struct list_head *search_list = &fs_info->pinned_chunks;
1509 int ret = 0;
1510 u64 physical_start = *start;
1511 1513
1512 if (transaction) 1514 lockdep_assert_held(&device->fs_info->chunk_mutex);
1513 search_list = &transaction->pending_chunks;
1514again:
1515 list_for_each_entry(em, search_list, list) {
1516 struct map_lookup *map;
1517 int i;
1518 1515
1519 map = em->map_lookup; 1516 if (!find_first_extent_bit(&device->alloc_state, *start,
1520 for (i = 0; i < map->num_stripes; i++) { 1517 &physical_start, &physical_end,
1521 u64 end; 1518 CHUNK_ALLOCATED, NULL)) {
1522 1519
1523 if (map->stripes[i].dev != device) 1520 if (in_range(physical_start, *start, len) ||
1524 continue; 1521 in_range(*start, physical_start,
1525 if (map->stripes[i].physical >= physical_start + len || 1522 physical_end - physical_start)) {
1526 map->stripes[i].physical + em->orig_block_len <= 1523 *start = physical_end + 1;
1527 physical_start) 1524 return true;
1528 continue;
1529 /*
1530 * Make sure that while processing the pinned list we do
1531 * not override our *start with a lower value, because
1532 * we can have pinned chunks that fall within this
1533 * device hole and that have lower physical addresses
1534 * than the pending chunks we processed before. If we
1535 * do not take this special care we can end up getting
1536 * 2 pending chunks that start at the same physical
1537 * device offsets because the end offset of a pinned
1538 * chunk can be equal to the start offset of some
1539 * pending chunk.
1540 */
1541 end = map->stripes[i].physical + em->orig_block_len;
1542 if (end > *start) {
1543 *start = end;
1544 ret = 1;
1545 }
1546 } 1525 }
1547 } 1526 }
1548 if (search_list != &fs_info->pinned_chunks) { 1527 return false;
1549 search_list = &fs_info->pinned_chunks;
1550 goto again;
1551 }
1552
1553 return ret;
1554} 1528}
1555 1529
1556 1530
@@ -1661,15 +1635,12 @@ again:
1661 * Have to check before we set max_hole_start, otherwise 1635 * Have to check before we set max_hole_start, otherwise
1662 * we could end up sending back this offset anyway. 1636 * we could end up sending back this offset anyway.
1663 */ 1637 */
1664 if (contains_pending_extent(transaction, device, 1638 if (contains_pending_extent(device, &search_start,
1665 &search_start,
1666 hole_size)) { 1639 hole_size)) {
1667 if (key.offset >= search_start) { 1640 if (key.offset >= search_start)
1668 hole_size = key.offset - search_start; 1641 hole_size = key.offset - search_start;
1669 } else { 1642 else
1670 WARN_ON_ONCE(1);
1671 hole_size = 0; 1643 hole_size = 0;
1672 }
1673 } 1644 }
1674 1645
1675 if (hole_size > max_hole_size) { 1646 if (hole_size > max_hole_size) {
@@ -1710,8 +1681,7 @@ next:
1710 if (search_end > search_start) { 1681 if (search_end > search_start) {
1711 hole_size = search_end - search_start; 1682 hole_size = search_end - search_start;
1712 1683
1713 if (contains_pending_extent(transaction, device, &search_start, 1684 if (contains_pending_extent(device, &search_start, hole_size)) {
1714 hole_size)) {
1715 btrfs_release_path(path); 1685 btrfs_release_path(path);
1716 goto again; 1686 goto again;
1717 } 1687 }
@@ -4756,7 +4726,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4756 * in-memory chunks are synced to disk so that the loop below sees them 4726 * in-memory chunks are synced to disk so that the loop below sees them
4757 * and relocates them accordingly. 4727 * and relocates them accordingly.
4758 */ 4728 */
4759 if (contains_pending_extent(trans->transaction, device, &start, diff)) { 4729 if (contains_pending_extent(device, &start, diff)) {
4760 mutex_unlock(&fs_info->chunk_mutex); 4730 mutex_unlock(&fs_info->chunk_mutex);
4761 ret = btrfs_commit_transaction(trans); 4731 ret = btrfs_commit_transaction(trans);
4762 if (ret) 4732 if (ret)
@@ -5189,9 +5159,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
5189 free_extent_map(em); 5159 free_extent_map(em);
5190 goto error; 5160 goto error;
5191 } 5161 }
5192
5193 list_add_tail(&em->list, &trans->transaction->pending_chunks);
5194 refcount_inc(&em->refs);
5195 write_unlock(&em_tree->lock); 5162 write_unlock(&em_tree->lock);
5196 5163
5197 ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); 5164 ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
@@ -5224,8 +5191,6 @@ error_del_extent:
5224 free_extent_map(em); 5191 free_extent_map(em);
5225 /* One for the tree reference */ 5192 /* One for the tree reference */
5226 free_extent_map(em); 5193 free_extent_map(em);
5227 /* One for the pending_chunks list reference */
5228 free_extent_map(em);
5229error: 5194error:
5230 kfree(devices_info); 5195 kfree(devices_info);
5231 return ret; 5196 return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3658a4e65db..6c466ac27c2e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -133,6 +133,8 @@ struct btrfs_device {
133 /* Counter to record the change of device stats */ 133 /* Counter to record the change of device stats */
134 atomic_t dev_stats_ccnt; 134 atomic_t dev_stats_ccnt;
135 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; 135 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
136
137 struct extent_io_tree alloc_state;
136}; 138};
137 139
138/* 140/*