author     Chris Mason <clm@fb.com>   2015-04-06 15:46:08 -0400
committer  Chris Mason <clm@fb.com>   2015-04-10 17:07:22 -0400
commit     1bbc621ef28462456131c035eaeb5567a1a2a2fe (patch)
tree       d2c9e87e9cef8884a440bc9b6a5bf6574eff9fc7
parent     2b108268006e06d57ec9810f4ccf5d99d7e5b598 (diff)
Btrfs: allow block group cache writeout outside critical section in commit
We loop through all of the dirty block groups during commit and write
the free space cache. In order to make sure the cache is correct, we do
this while no other writers are allowed in the commit.

If a large number of block groups are dirty, this can introduce long
stalls during the final stages of the commit, which can block new procs
trying to change the filesystem.

This commit changes the block group cache writeout to take appropriate
locks and allow it to run earlier in the commit. We'll still have to
redo some of the block groups, but it means we can get most of the work
out of the way without blocking the entire FS.

Signed-off-by: Chris Mason <clm@fb.com>
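For readers skimming the change, the sketch below is a rough, standalone userspace
model of the ordering described above; it is not btrfs code, and block_group,
start_dirty_pass and critical_section_pass are invented stand-ins for the
transaction's dirty_bgs list, btrfs_start_dirty_block_groups and
btrfs_write_dirty_block_groups. The first pass runs while other writers are still
allowed into the commit and may leave a few re-dirtied block groups behind; the
pass inside the commit critical section only has to redo those leftovers.

/* Illustrative model only -- not btrfs code. */
#include <stdbool.h>
#include <stdio.h>

struct block_group {
        int id;
        bool dirty;             /* free space cache still needs writing */
};

/* write one cache; outside the critical section the group can be re-dirtied */
static void write_cache(struct block_group *bg, bool writers_active)
{
        printf("writing cache of block group %d\n", bg->id);
        bg->dirty = false;
        if (writers_active && bg->id == 1)
                bg->dirty = true;       /* simulate a concurrent allocation */
}

/* pass 1: before the critical section, other writers may still run */
static void start_dirty_pass(struct block_group *bgs, int nr)
{
        for (int i = 0; i < nr; i++)
                if (bgs[i].dirty)
                        write_cache(&bgs[i], true);
}

/* pass 2: inside the critical section, nothing can dirty a group anymore */
static void critical_section_pass(struct block_group *bgs, int nr)
{
        for (int i = 0; i < nr; i++)
                if (bgs[i].dirty)
                        write_cache(&bgs[i], false);
}

int main(void)
{
        struct block_group bgs[] = { {0, true}, {1, true}, {2, true} };
        int nr = sizeof(bgs) / sizeof(bgs[0]);

        start_dirty_pass(bgs, nr);      /* bulk of the IO, commit still open */
        /* ... the commit's critical section would begin here ... */
        critical_section_pass(bgs, nr); /* only the re-dirtied group is redone */
        return 0;
}

In the actual patch the first pass works off the transaction's dirty_bgs list and
is serialized against block group deletion and read-only flips with the new
cache_write_mutex and ro_block_group_mutex, as the hunks below show.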
-rw-r--r--   fs/btrfs/ctree.h              8
-rw-r--r--   fs/btrfs/disk-io.c            1
-rw-r--r--   fs/btrfs/extent-tree.c      241
-rw-r--r--   fs/btrfs/free-space-cache.c  69
-rw-r--r--   fs/btrfs/free-space-cache.h   1
-rw-r--r--   fs/btrfs/inode-map.c          2
-rw-r--r--   fs/btrfs/relocation.c         9
-rw-r--r--   fs/btrfs/transaction.c       38
-rw-r--r--   fs/btrfs/transaction.h        9
9 files changed, 341 insertions, 37 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1df0d9db5332..83051fae9467 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1491,6 +1491,12 @@ struct btrfs_fs_info {
         struct mutex chunk_mutex;
         struct mutex volume_mutex;
 
+        /*
+         * this is taken to make sure we don't set block groups ro after
+         * the free space cache has been allocated on them
+         */
+        struct mutex ro_block_group_mutex;
+
         /* this is used during read/modify/write to make sure
          * no two ios are trying to mod the same stripe at the same
          * time
@@ -3407,6 +3413,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                          u64 bytenr, u64 num_bytes, u64 parent,
                          u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 568cc4e3d80e..b5e3d5f6400a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2572,6 +2572,7 @@ int open_ctree(struct super_block *sb,
         mutex_init(&fs_info->transaction_kthread_mutex);
         mutex_init(&fs_info->cleaner_mutex);
         mutex_init(&fs_info->volume_mutex);
+        mutex_init(&fs_info->ro_block_group_mutex);
         init_rwsem(&fs_info->commit_root_sem);
         init_rwsem(&fs_info->cleanup_work_sem);
         init_rwsem(&fs_info->subvol_sem);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 40c95135d037..02c2b29a0840 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3298,7 +3298,7 @@ again:
                 if (ret)
                         goto out_put;
 
-                ret = btrfs_truncate_free_space_cache(root, trans, inode);
+                ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
                 if (ret)
                         goto out_put;
         }
@@ -3382,20 +3382,156 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
         return 0;
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS. This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO. There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root)
 {
         struct btrfs_block_group_cache *cache;
         struct btrfs_transaction *cur_trans = trans->transaction;
         int ret = 0;
         int should_put;
-        struct btrfs_path *path;
-        LIST_HEAD(io);
+        struct btrfs_path *path = NULL;
+        LIST_HEAD(dirty);
+        struct list_head *io = &cur_trans->io_bgs;
         int num_started = 0;
-        int num_waited = 0;
+        int loops = 0;
+
+        spin_lock(&cur_trans->dirty_bgs_lock);
+        if (!list_empty(&cur_trans->dirty_bgs)) {
+                list_splice_init(&cur_trans->dirty_bgs, &dirty);
+        }
+        spin_unlock(&cur_trans->dirty_bgs_lock);
 
-        if (list_empty(&cur_trans->dirty_bgs))
+again:
+        if (list_empty(&dirty)) {
+                btrfs_free_path(path);
                 return 0;
+        }
+
+        /*
+         * make sure all the block groups on our dirty list actually
+         * exist
+         */
+        btrfs_create_pending_block_groups(trans, root);
+
+        if (!path) {
+                path = btrfs_alloc_path();
+                if (!path)
+                        return -ENOMEM;
+        }
+
+        while (!list_empty(&dirty)) {
+                cache = list_first_entry(&dirty,
+                                         struct btrfs_block_group_cache,
+                                         dirty_list);
+
+                /*
+                 * cache_write_mutex is here only to save us from balance
+                 * deleting this block group while we are writing out the
+                 * cache
+                 */
+                mutex_lock(&trans->transaction->cache_write_mutex);
+
+                /*
+                 * this can happen if something re-dirties a block
+                 * group that is already under IO. Just wait for it to
+                 * finish and then do it all again
+                 */
+                if (!list_empty(&cache->io_list)) {
+                        list_del_init(&cache->io_list);
+                        btrfs_wait_cache_io(root, trans, cache,
+                                            &cache->io_ctl, path,
+                                            cache->key.objectid);
+                        btrfs_put_block_group(cache);
+                }
+
+
+                /*
+                 * btrfs_wait_cache_io uses the cache->dirty_list to decide
+                 * if it should update the cache_state. Don't delete
+                 * until after we wait.
+                 *
+                 * Since we're not running in the commit critical section
+                 * we need the dirty_bgs_lock to protect from update_block_group
+                 */
+                spin_lock(&cur_trans->dirty_bgs_lock);
+                list_del_init(&cache->dirty_list);
+                spin_unlock(&cur_trans->dirty_bgs_lock);
+
+                should_put = 1;
+
+                cache_save_setup(cache, trans, path);
+
+                if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+                        cache->io_ctl.inode = NULL;
+                        ret = btrfs_write_out_cache(root, trans, cache, path);
+                        if (ret == 0 && cache->io_ctl.inode) {
+                                num_started++;
+                                should_put = 0;
+
+                                /*
+                                 * the cache_write_mutex is protecting
+                                 * the io_list
+                                 */
+                                list_add_tail(&cache->io_list, io);
+                        } else {
+                                /*
+                                 * if we failed to write the cache, the
+                                 * generation will be bad and life goes on
+                                 */
+                                ret = 0;
+                        }
+                }
+                if (!ret)
+                        ret = write_one_cache_group(trans, root, path, cache);
+                mutex_unlock(&trans->transaction->cache_write_mutex);
+
+                /* if its not on the io list, we need to put the block group */
+                if (should_put)
+                        btrfs_put_block_group(cache);
+
+                if (ret)
+                        break;
+        }
+
+        /*
+         * go through delayed refs for all the stuff we've just kicked off
+         * and then loop back (just once)
+         */
+        ret = btrfs_run_delayed_refs(trans, root, 0);
+        if (!ret && loops == 0) {
+                loops++;
+                spin_lock(&cur_trans->dirty_bgs_lock);
+                list_splice_init(&cur_trans->dirty_bgs, &dirty);
+                spin_unlock(&cur_trans->dirty_bgs_lock);
+                goto again;
+        }
+
+        btrfs_free_path(path);
+        return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root)
+{
+        struct btrfs_block_group_cache *cache;
+        struct btrfs_transaction *cur_trans = trans->transaction;
+        int ret = 0;
+        int should_put;
+        struct btrfs_path *path;
+        struct list_head *io = &cur_trans->io_bgs;
+        int num_started = 0;
 
         path = btrfs_alloc_path();
         if (!path)
@@ -3423,14 +3559,16 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                             &cache->io_ctl, path,
                                             cache->key.objectid);
                         btrfs_put_block_group(cache);
-                        num_waited++;
                 }
 
+                /*
+                 * don't remove from the dirty list until after we've waited
+                 * on any pending IO
+                 */
                 list_del_init(&cache->dirty_list);
                 should_put = 1;
 
-                if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-                        cache_save_setup(cache, trans, path);
+                cache_save_setup(cache, trans, path);
 
                 if (!ret)
                         ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
@@ -3441,7 +3579,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                         if (ret == 0 && cache->io_ctl.inode) {
                                 num_started++;
                                 should_put = 0;
-                                list_add_tail(&cache->io_list, &io);
+                                list_add_tail(&cache->io_list, io);
                         } else {
                                 /*
                                  * if we failed to write the cache, the
@@ -3458,11 +3596,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                 btrfs_put_block_group(cache);
         }
 
-        while (!list_empty(&io)) {
-                cache = list_first_entry(&io, struct btrfs_block_group_cache,
+        while (!list_empty(io)) {
+                cache = list_first_entry(io, struct btrfs_block_group_cache,
                                          io_list);
                 list_del_init(&cache->io_list);
-                num_waited++;
                 btrfs_wait_cache_io(root, trans, cache,
                                     &cache->io_ctl, path, cache->key.objectid);
                 btrfs_put_block_group(cache);
@@ -5459,15 +5596,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
                         cache_block_group(cache, 1);
 
-                spin_lock(&trans->transaction->dirty_bgs_lock);
-                if (list_empty(&cache->dirty_list)) {
-                        list_add_tail(&cache->dirty_list,
-                                      &trans->transaction->dirty_bgs);
-                        trans->transaction->num_dirty_bgs++;
-                        btrfs_get_block_group(cache);
-                }
-                spin_unlock(&trans->transaction->dirty_bgs_lock);
-
                 byte_in_group = bytenr - cache->key.objectid;
                 WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5516,6 +5644,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                                 spin_unlock(&info->unused_bgs_lock);
                         }
                 }
+
+                spin_lock(&trans->transaction->dirty_bgs_lock);
+                if (list_empty(&cache->dirty_list)) {
+                        list_add_tail(&cache->dirty_list,
+                                      &trans->transaction->dirty_bgs);
+                        trans->transaction->num_dirty_bgs++;
+                        btrfs_get_block_group(cache);
+                }
+                spin_unlock(&trans->transaction->dirty_bgs_lock);
+
                 btrfs_put_block_group(cache);
                 total -= num_bytes;
                 bytenr += num_bytes;
@@ -8602,10 +8740,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
         BUG_ON(cache->ro);
 
+again:
         trans = btrfs_join_transaction(root);
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
 
+        /*
+         * we're not allowed to set block groups readonly after the dirty
+         * block groups cache has started writing. If it already started,
+         * back off and let this transaction commit
+         */
+        mutex_lock(&root->fs_info->ro_block_group_mutex);
+        if (trans->transaction->dirty_bg_run) {
+                u64 transid = trans->transid;
+
+                mutex_unlock(&root->fs_info->ro_block_group_mutex);
+                btrfs_end_transaction(trans, root);
+
+                ret = btrfs_wait_for_commit(root, transid);
+                if (ret)
+                        return ret;
+                goto again;
+        }
+
+
         ret = set_block_group_ro(cache, 0);
         if (!ret)
                 goto out;
@@ -8620,6 +8778,7 @@ out:
                 alloc_flags = update_block_group_flags(root, cache->flags);
                 check_system_chunk(trans, root, alloc_flags);
         }
+        mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
         btrfs_end_transaction(trans, root);
         return ret;
@@ -9425,7 +9584,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                 goto out;
         }
 
+        /*
+         * get the inode first so any iput calls done for the io_list
+         * aren't the final iput (no unlinks allowed now)
+         */
         inode = lookup_free_space_inode(tree_root, block_group, path);
+
+        mutex_lock(&trans->transaction->cache_write_mutex);
+        /*
+         * make sure our free spache cache IO is done before remove the
+         * free space inode
+         */
+        spin_lock(&trans->transaction->dirty_bgs_lock);
+        if (!list_empty(&block_group->io_list)) {
+                list_del_init(&block_group->io_list);
+
+                WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+                spin_unlock(&trans->transaction->dirty_bgs_lock);
+                btrfs_wait_cache_io(root, trans, block_group,
+                                    &block_group->io_ctl, path,
+                                    block_group->key.objectid);
+                btrfs_put_block_group(block_group);
+                spin_lock(&trans->transaction->dirty_bgs_lock);
+        }
+
+        if (!list_empty(&block_group->dirty_list)) {
+                list_del_init(&block_group->dirty_list);
+                btrfs_put_block_group(block_group);
+        }
+        spin_unlock(&trans->transaction->dirty_bgs_lock);
+        mutex_unlock(&trans->transaction->cache_write_mutex);
+
         if (!IS_ERR(inode)) {
                 ret = btrfs_orphan_add(trans, inode);
                 if (ret) {
@@ -9518,11 +9708,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
         spin_lock(&trans->transaction->dirty_bgs_lock);
         if (!list_empty(&block_group->dirty_list)) {
-                list_del_init(&block_group->dirty_list);
-                btrfs_put_block_group(block_group);
+                WARN_ON(1);
+        }
+        if (!list_empty(&block_group->io_list)) {
+                WARN_ON(1);
         }
         spin_unlock(&trans->transaction->dirty_bgs_lock);
-
         btrfs_remove_free_space_cache(block_group);
 
         spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 83532a245947..253cb74b0e27 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -226,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
                                     struct btrfs_trans_handle *trans,
+                                    struct btrfs_block_group_cache *block_group,
                                     struct inode *inode)
 {
         int ret = 0;
+        struct btrfs_path *path = btrfs_alloc_path();
+
+        if (!path) {
+                ret = -ENOMEM;
+                goto fail;
+        }
+
+        if (block_group) {
+                mutex_lock(&trans->transaction->cache_write_mutex);
+                if (!list_empty(&block_group->io_list)) {
+                        list_del_init(&block_group->io_list);
+
+                        btrfs_wait_cache_io(root, trans, block_group,
+                                            &block_group->io_ctl, path,
+                                            block_group->key.objectid);
+                        btrfs_put_block_group(block_group);
+                }
+
+                /*
+                 * now that we've truncated the cache away, its no longer
+                 * setup or written
+                 */
+                spin_lock(&block_group->lock);
+                block_group->disk_cache_state = BTRFS_DC_CLEAR;
+                spin_unlock(&block_group->lock);
+        }
+        btrfs_free_path(path);
 
         btrfs_i_size_write(inode, 0);
         truncate_pagecache(inode, 0);
@@ -242,11 +270,17 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
         ret = btrfs_truncate_inode_items(trans, root, inode,
                                          0, BTRFS_EXTENT_DATA_KEY);
         if (ret) {
+                mutex_unlock(&trans->transaction->cache_write_mutex);
                 btrfs_abort_transaction(trans, root, ret);
                 return ret;
         }
 
         ret = btrfs_update_inode(trans, root, inode);
+
+        if (block_group)
+                mutex_unlock(&trans->transaction->cache_write_mutex);
+
+fail:
         if (ret)
                 btrfs_abort_transaction(trans, root, ret);
 
@@ -876,6 +910,7 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 {
         int ret;
         struct btrfs_free_cluster *cluster = NULL;
+        struct btrfs_free_cluster *cluster_locked = NULL;
         struct rb_node *node = rb_first(&ctl->free_space_offset);
         struct btrfs_trim_range *trim_entry;
 
@@ -887,6 +922,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
         }
 
         if (!node && cluster) {
+                cluster_locked = cluster;
+                spin_lock(&cluster_locked->lock);
                 node = rb_first(&cluster->root);
                 cluster = NULL;
         }
@@ -910,9 +947,15 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
                 node = rb_next(node);
                 if (!node && cluster) {
                         node = rb_first(&cluster->root);
+                        cluster_locked = cluster;
+                        spin_lock(&cluster_locked->lock);
                         cluster = NULL;
                 }
         }
+        if (cluster_locked) {
+                spin_unlock(&cluster_locked->lock);
+                cluster_locked = NULL;
+        }
 
         /*
          * Make sure we don't miss any range that was removed from our rbtree
@@ -930,6 +973,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 
         return 0;
 fail:
+        if (cluster_locked)
+                spin_unlock(&cluster_locked->lock);
         return -ENOSPC;
 }
 
@@ -1101,6 +1146,9 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
         int ret;
         struct inode *inode = io_ctl->inode;
 
+        if (!inode)
+                return 0;
+
         root = root->fs_info->tree_root;
 
         /* Flush the dirty pages in the cache file. */
@@ -1127,11 +1175,16 @@ out:
         btrfs_update_inode(trans, root, inode);
 
         if (block_group) {
+                /* the dirty list is protected by the dirty_bgs_lock */
+                spin_lock(&trans->transaction->dirty_bgs_lock);
+
+                /* the disk_cache_state is protected by the block group lock */
                 spin_lock(&block_group->lock);
 
                 /*
                  * only mark this as written if we didn't get put back on
-                 * the dirty list while waiting for IO.
+                 * the dirty list while waiting for IO. Otherwise our
+                 * cache state won't be right, and we won't get written again
                  */
                 if (!ret && list_empty(&block_group->dirty_list))
                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
@@ -1139,6 +1192,7 @@ out:
                         block_group->disk_cache_state = BTRFS_DC_ERROR;
 
                 spin_unlock(&block_group->lock);
+                spin_unlock(&trans->transaction->dirty_bgs_lock);
                 io_ctl->inode = NULL;
                 iput(inode);
         }
@@ -1207,9 +1261,11 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
         mutex_lock(&ctl->cache_writeout_mutex);
         /* Write out the extent entries in the free space cache */
+        spin_lock(&ctl->tree_lock);
         ret = write_cache_extent_entries(io_ctl, ctl,
                                          block_group, &entries, &bitmaps,
                                          &bitmap_list);
+        spin_unlock(&ctl->tree_lock);
         if (ret) {
                 mutex_unlock(&ctl->cache_writeout_mutex);
                 goto out_nospc;
@@ -1219,6 +1275,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
          * Some spaces that are freed in the current transaction are pinned,
          * they will be added into free space cache after the transaction is
          * committed, we shouldn't lose them.
+         *
+         * If this changes while we are working we'll get added back to
+         * the dirty list and redo it. No locking needed
          */
         ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
         if (ret) {
@@ -1231,7 +1290,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
          * locked while doing it because a concurrent trim can be manipulating
          * or freeing the bitmap.
          */
+        spin_lock(&ctl->tree_lock);
         ret = write_bitmap_entries(io_ctl, &bitmap_list);
+        spin_unlock(&ctl->tree_lock);
         mutex_unlock(&ctl->cache_writeout_mutex);
         if (ret)
                 goto out_nospc;
@@ -1307,12 +1368,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                 spin_unlock(&block_group->lock);
                 return 0;
         }
-
-        if (block_group->delalloc_bytes) {
-                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
-                spin_unlock(&block_group->lock);
-                return 0;
-        }
         spin_unlock(&block_group->lock);
 
         inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index c4339863af05..a16a029ad3b1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -62,6 +62,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
                                        struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
                                     struct btrfs_trans_handle *trans,
+                                    struct btrfs_block_group_cache *block_group,
                                     struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
                           struct btrfs_block_group_cache *block_group);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 74faea3a516e..f6a596d5a637 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -456,7 +456,7 @@ again:
         }
 
         if (i_size_read(inode) > 0) {
-                ret = btrfs_truncate_free_space_cache(root, trans, inode);
+                ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
                 if (ret) {
                         if (ret != -ENOSPC)
                                 btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d83085381bcc..840a4eb0f396 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc,
 }
 
 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
-                                    struct inode *inode, u64 ino)
+                                    struct btrfs_block_group_cache *block_group,
+                                    struct inode *inode,
+                                    u64 ino)
 {
         struct btrfs_key key;
         struct btrfs_root *root = fs_info->tree_root;
@@ -3463,7 +3465,7 @@ truncate:
                 goto out;
         }
 
-        ret = btrfs_truncate_free_space_cache(root, trans, inode);
+        ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
 
         btrfs_end_transaction(trans, root);
         btrfs_btree_balance_dirty(root);
@@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc,
          */
         if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
                 ret = delete_block_group_cache(rc->extent_root->fs_info,
+                                               rc->block_group,
                                                NULL, ref_objectid);
                 if (ret != -ENOENT)
                         return ret;
@@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
         btrfs_free_path(path);
 
         if (!IS_ERR(inode))
-                ret = delete_block_group_cache(fs_info, inode, 0);
+                ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
         else
                 ret = PTR_ERR(inode);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 234d6063bbf3..5628e25250c0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -222,6 +222,7 @@ loop:
         atomic_set(&cur_trans->use_count, 2);
         cur_trans->have_free_bgs = 0;
         cur_trans->start_time = get_seconds();
+        cur_trans->dirty_bg_run = 0;
 
         cur_trans->delayed_refs.href_root = RB_ROOT;
         atomic_set(&cur_trans->delayed_refs.num_entries, 0);
@@ -251,6 +252,8 @@ loop:
         INIT_LIST_HEAD(&cur_trans->switch_commits);
         INIT_LIST_HEAD(&cur_trans->pending_ordered);
         INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+        INIT_LIST_HEAD(&cur_trans->io_bgs);
+        mutex_init(&cur_trans->cache_write_mutex);
         cur_trans->num_dirty_bgs = 0;
         spin_lock_init(&cur_trans->dirty_bgs_lock);
         list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1059,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 {
         struct btrfs_fs_info *fs_info = root->fs_info;
         struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+        struct list_head *io_bgs = &trans->transaction->io_bgs;
         struct list_head *next;
         struct extent_buffer *eb;
         int ret;
@@ -1112,7 +1116,7 @@ again:
                 return ret;
         }
 
-        while (!list_empty(dirty_bgs)) {
+        while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
                 ret = btrfs_write_dirty_block_groups(trans, root);
                 if (ret)
                         return ret;
@@ -1812,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                 return ret;
         }
 
+        if (!cur_trans->dirty_bg_run) {
+                int run_it = 0;
+
+                /* this mutex is also taken before trying to set
+                 * block groups readonly. We need to make sure
+                 * that nobody has set a block group readonly
+                 * after a extents from that block group have been
+                 * allocated for cache files. btrfs_set_block_group_ro
+                 * will wait for the transaction to commit if it
+                 * finds dirty_bg_run = 1
+                 *
+                 * The dirty_bg_run flag is also used to make sure only
+                 * one process starts all the block group IO. It wouldn't
+                 * hurt to have more than one go through, but there's no
+                 * real advantage to it either.
+                 */
+                mutex_lock(&root->fs_info->ro_block_group_mutex);
+                if (!cur_trans->dirty_bg_run) {
+                        run_it = 1;
+                        cur_trans->dirty_bg_run = 1;
+                }
+                mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+                if (run_it)
+                        ret = btrfs_start_dirty_block_groups(trans, root);
+        }
+        if (ret) {
+                btrfs_end_transaction(trans, root);
+                return ret;
+        }
+
         spin_lock(&root->fs_info->trans_lock);
         list_splice(&trans->ordered, &cur_trans->pending_ordered);
         if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
@@ -2005,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
         assert_qgroups_uptodate(trans);
         ASSERT(list_empty(&cur_trans->dirty_bgs));
+        ASSERT(list_empty(&cur_trans->io_bgs));
         update_super_roots(root);
 
         btrfs_set_super_log_root(root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4cb0ae264534..0b24755596ba 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,10 +64,19 @@ struct btrfs_transaction {
         struct list_head pending_ordered;
         struct list_head switch_commits;
         struct list_head dirty_bgs;
+        struct list_head io_bgs;
         u64 num_dirty_bgs;
+
+        /*
+         * we need to make sure block group deletion doesn't race with
+         * free space cache writeout. This mutex keeps them from stomping
+         * on each other
+         */
+        struct mutex cache_write_mutex;
         spinlock_t dirty_bgs_lock;
         struct btrfs_delayed_ref_root delayed_refs;
         int aborted;
+        int dirty_bg_run;
 };
 
 #define __TRANS_FREEZABLE (1U << 0)