aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-10-03 08:13:13 -0400
committerChris Mason <clm@fb.com>2015-10-05 19:56:38 -0400
commitd9a0540a79f87456907f2ce031f058cf745c5bff (patch)
tree5f733e6ed34569a804711ebf40f4c2c4116319be /fs
parent808f80b46790f27e145c72112189d6a3be2bc884 (diff)
Btrfs: fix deadlock when finalizing block group creation
Josef ran into a deadlock while a transaction handle was finalizing the creation of its block groups, which produced the following trace: [260445.593112] fio D ffff88022a9df468 0 8924 4518 0x00000084 [260445.593119] ffff88022a9df468 ffffffff81c134c0 ffff880429693c00 ffff88022a9df488 [260445.593126] ffff88022a9e0000 ffff8803490d7b00 ffff8803490d7b18 ffff88022a9df4b0 [260445.593132] ffff8803490d7af8 ffff88022a9df488 ffffffff8175a437 ffff8803490d7b00 [260445.593137] Call Trace: [260445.593145] [<ffffffff8175a437>] schedule+0x37/0x80 [260445.593189] [<ffffffffa0850f37>] btrfs_tree_lock+0xa7/0x1f0 [btrfs] [260445.593197] [<ffffffff810db7c0>] ? prepare_to_wait_event+0xf0/0xf0 [260445.593225] [<ffffffffa07eac44>] btrfs_lock_root_node+0x34/0x50 [btrfs] [260445.593253] [<ffffffffa07eff6b>] btrfs_search_slot+0x88b/0xa00 [btrfs] [260445.593295] [<ffffffffa08389df>] ? free_extent_buffer+0x4f/0x90 [btrfs] [260445.593324] [<ffffffffa07f1a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs] [260445.593351] [<ffffffffa07ea94a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs] [260445.593394] [<ffffffffa08403b9>] btrfs_finish_chunk_alloc+0x1c9/0x570 [btrfs] [260445.593427] [<ffffffffa08002ab>] btrfs_create_pending_block_groups+0x11b/0x200 [btrfs] [260445.593459] [<ffffffffa0800964>] do_chunk_alloc+0x2a4/0x2e0 [btrfs] [260445.593491] [<ffffffffa0803815>] find_free_extent+0xa55/0xd90 [btrfs] [260445.593524] [<ffffffffa0803c22>] btrfs_reserve_extent+0xd2/0x220 [btrfs] [260445.593532] [<ffffffff8119fe5d>] ? account_page_dirtied+0xdd/0x170 [260445.593564] [<ffffffffa0803e78>] btrfs_alloc_tree_block+0x108/0x4a0 [btrfs] [260445.593597] [<ffffffffa080c9de>] ? btree_set_page_dirty+0xe/0x10 [btrfs] [260445.593626] [<ffffffffa07eb5cd>] __btrfs_cow_block+0x12d/0x5b0 [btrfs] [260445.593654] [<ffffffffa07ebbff>] btrfs_cow_block+0x11f/0x1c0 [btrfs] [260445.593682] [<ffffffffa07ef8c7>] btrfs_search_slot+0x1e7/0xa00 [btrfs] [260445.593724] [<ffffffffa08389df>] ? free_extent_buffer+0x4f/0x90 [btrfs] [260445.593752] [<ffffffffa07f1a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs] [260445.593830] [<ffffffffa07ea94a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs] [260445.593905] [<ffffffffa08403b9>] btrfs_finish_chunk_alloc+0x1c9/0x570 [btrfs] [260445.593946] [<ffffffffa08002ab>] btrfs_create_pending_block_groups+0x11b/0x200 [btrfs] [260445.593990] [<ffffffffa0815798>] btrfs_commit_transaction+0xa8/0xb40 [btrfs] [260445.594042] [<ffffffffa085abcd>] ? btrfs_log_dentry_safe+0x6d/0x80 [btrfs] [260445.594089] [<ffffffffa082bc84>] btrfs_sync_file+0x294/0x350 [btrfs] [260445.594115] [<ffffffff8123e29b>] vfs_fsync_range+0x3b/0xa0 [260445.594133] [<ffffffff81023891>] ? syscall_trace_enter_phase1+0x131/0x180 [260445.594149] [<ffffffff8123e35d>] do_fsync+0x3d/0x70 [260445.594169] [<ffffffff81023bb8>] ? syscall_trace_leave+0xb8/0x110 [260445.594187] [<ffffffff8123e600>] SyS_fsync+0x10/0x20 [260445.594204] [<ffffffff8175de6e>] entry_SYSCALL_64_fastpath+0x12/0x71 This happened because the same transaction handle created a large number of block groups and while finalizing their creation (inserting new items and updating existing items in the chunk and device trees) a new metadata extent had to be allocated and no free space was found in the current metadata block groups, which made find_free_extent() attempt to allocate a new block group via do_chunk_alloc(). However at do_chunk_alloc() we ended up allocating a new system chunk too and exceeded the threshold of 2Mb of reserved chunk bytes, which makes do_chunk_alloc() enter the final part of block group creation again (at btrfs_create_pending_block_groups()) and attempt to lock again the root of the chunk tree when it's already write locked by the same task. Similarly we can deadlock on extent tree nodes/leafs if while we are running delayed references we end up creating a new metadata block group in order to allocate a new node/leaf for the extent tree (as part of a CoW operation or growing the tree), as btrfs_create_pending_block_groups inserts items into the extent tree as well. In this case we get the following trace: [14242.773581] fio D ffff880428ca3418 0 3615 3100 0x00000084 [14242.773588] ffff880428ca3418 ffff88042d66b000 ffff88042a03c800 ffff880428ca3438 [14242.773594] ffff880428ca4000 ffff8803e4b20190 ffff8803e4b201a8 ffff880428ca3460 [14242.773600] ffff8803e4b20188 ffff880428ca3438 ffffffff8175a437 ffff8803e4b20190 [14242.773606] Call Trace: [14242.773613] [<ffffffff8175a437>] schedule+0x37/0x80 [14242.773656] [<ffffffffa057ff07>] btrfs_tree_lock+0xa7/0x1f0 [btrfs] [14242.773664] [<ffffffff810db7c0>] ? prepare_to_wait_event+0xf0/0xf0 [14242.773692] [<ffffffffa0519c44>] btrfs_lock_root_node+0x34/0x50 [btrfs] [14242.773720] [<ffffffffa051ef6b>] btrfs_search_slot+0x88b/0xa00 [btrfs] [14242.773750] [<ffffffffa0520a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs] [14242.773758] [<ffffffff811ef4a2>] ? kmem_cache_alloc+0x1d2/0x200 [14242.773786] [<ffffffffa0520ad1>] btrfs_insert_item+0x71/0xf0 [btrfs] [14242.773818] [<ffffffffa052f292>] btrfs_create_pending_block_groups+0x102/0x200 [btrfs] [14242.773850] [<ffffffffa052f96e>] do_chunk_alloc+0x2ae/0x2f0 [btrfs] [14242.773934] [<ffffffffa0532825>] find_free_extent+0xa55/0xd90 [btrfs] [14242.773998] [<ffffffffa0532c22>] btrfs_reserve_extent+0xc2/0x1d0 [btrfs] [14242.774041] [<ffffffffa0532e38>] btrfs_alloc_tree_block+0x108/0x4a0 [btrfs] [14242.774078] [<ffffffffa051a5cd>] __btrfs_cow_block+0x12d/0x5b0 [btrfs] [14242.774118] [<ffffffffa051abff>] btrfs_cow_block+0x11f/0x1c0 [btrfs] [14242.774155] [<ffffffffa051e8c7>] btrfs_search_slot+0x1e7/0xa00 [btrfs] [14242.774194] [<ffffffffa0528021>] ? __btrfs_free_extent.isra.70+0x2e1/0xcb0 [btrfs] [14242.774235] [<ffffffffa0520a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs] [14242.774274] [<ffffffffa051994a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs] [14242.774318] [<ffffffffa052c433>] __btrfs_run_delayed_refs+0xbb3/0x1020 [btrfs] [14242.774358] [<ffffffffa052f404>] btrfs_run_delayed_refs.part.78+0x74/0x280 [btrfs] [14242.774391] [<ffffffffa052f627>] btrfs_run_delayed_refs+0x17/0x20 [btrfs] [14242.774432] [<ffffffffa05be236>] commit_cowonly_roots+0x8d/0x2bd [btrfs] [14242.774474] [<ffffffffa059d07f>] ? __btrfs_run_delayed_items+0x1cf/0x210 [btrfs] [14242.774516] [<ffffffffa05adac3>] ? btrfs_qgroup_account_extents+0x83/0x130 [btrfs] [14242.774558] [<ffffffffa0544c40>] btrfs_commit_transaction+0x590/0xb40 [btrfs] [14242.774599] [<ffffffffa0589b9d>] ? btrfs_log_dentry_safe+0x6d/0x80 [btrfs] [14242.774642] [<ffffffffa055ac54>] btrfs_sync_file+0x294/0x350 [btrfs] [14242.774650] [<ffffffff8123e29b>] vfs_fsync_range+0x3b/0xa0 [14242.774657] [<ffffffff81023891>] ? syscall_trace_enter_phase1+0x131/0x180 [14242.774663] [<ffffffff8123e35d>] do_fsync+0x3d/0x70 [14242.774669] [<ffffffff81023bb8>] ? syscall_trace_leave+0xb8/0x110 [14242.774675] [<ffffffff8123e600>] SyS_fsync+0x10/0x20 [14242.774681] [<ffffffff8175de6e>] entry_SYSCALL_64_fastpath+0x12/0x71 Fix this by never recursing into the finalization phase of block group creation and making sure we never trigger the finalization of block group creation while running delayed references. Reported-by: Josef Bacik <jbacik@fb.com> Fixes: 00d80e342c0f ("Btrfs: fix quick exhaustion of the system array in the superblock") Signed-off-by: Filipe Manana <fdmanana@suse.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/extent-tree.c9
-rw-r--r--fs/btrfs/transaction.c1
-rw-r--r--fs/btrfs/transaction.h1
3 files changed, 10 insertions, 1 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f9604201333..601d7d45d164 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2828,6 +2828,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2828 struct btrfs_delayed_ref_head *head; 2828 struct btrfs_delayed_ref_head *head;
2829 int ret; 2829 int ret;
2830 int run_all = count == (unsigned long)-1; 2830 int run_all = count == (unsigned long)-1;
2831 bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2831 2832
2832 /* We'll clean this up in btrfs_cleanup_transaction */ 2833 /* We'll clean this up in btrfs_cleanup_transaction */
2833 if (trans->aborted) 2834 if (trans->aborted)
@@ -2844,6 +2845,7 @@ again:
2844#ifdef SCRAMBLE_DELAYED_REFS 2845#ifdef SCRAMBLE_DELAYED_REFS
2845 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2846 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2846#endif 2847#endif
2848 trans->can_flush_pending_bgs = false;
2847 ret = __btrfs_run_delayed_refs(trans, root, count); 2849 ret = __btrfs_run_delayed_refs(trans, root, count);
2848 if (ret < 0) { 2850 if (ret < 0) {
2849 btrfs_abort_transaction(trans, root, ret); 2851 btrfs_abort_transaction(trans, root, ret);
@@ -2893,6 +2895,7 @@ again:
2893 } 2895 }
2894out: 2896out:
2895 assert_qgroups_uptodate(trans); 2897 assert_qgroups_uptodate(trans);
2898 trans->can_flush_pending_bgs = can_flush_pending_bgs;
2896 return 0; 2899 return 0;
2897} 2900}
2898 2901
@@ -4306,7 +4309,8 @@ out:
4306 * the block groups that were made dirty during the lifetime of the 4309 * the block groups that were made dirty during the lifetime of the
4307 * transaction. 4310 * transaction.
4308 */ 4311 */
4309 if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { 4312 if (trans->can_flush_pending_bgs &&
4313 trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
4310 btrfs_create_pending_block_groups(trans, trans->root); 4314 btrfs_create_pending_block_groups(trans, trans->root);
4311 btrfs_trans_release_chunk_metadata(trans); 4315 btrfs_trans_release_chunk_metadata(trans);
4312 } 4316 }
@@ -9560,7 +9564,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9560 struct btrfs_block_group_item item; 9564 struct btrfs_block_group_item item;
9561 struct btrfs_key key; 9565 struct btrfs_key key;
9562 int ret = 0; 9566 int ret = 0;
9567 bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
9563 9568
9569 trans->can_flush_pending_bgs = false;
9564 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 9570 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9565 if (ret) 9571 if (ret)
9566 goto next; 9572 goto next;
@@ -9581,6 +9587,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9581next: 9587next:
9582 list_del_init(&block_group->bg_list); 9588 list_del_init(&block_group->bg_list);
9583 } 9589 }
9590 trans->can_flush_pending_bgs = can_flush_pending_bgs;
9584} 9591}
9585 9592
9586int btrfs_make_block_group(struct btrfs_trans_handle *trans, 9593int btrfs_make_block_group(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a2d6f7bcef6c..376191c7da13 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -557,6 +557,7 @@ again:
557 h->delayed_ref_elem.seq = 0; 557 h->delayed_ref_elem.seq = 0;
558 h->type = type; 558 h->type = type;
559 h->allocating_chunk = false; 559 h->allocating_chunk = false;
560 h->can_flush_pending_bgs = true;
560 h->reloc_reserved = false; 561 h->reloc_reserved = false;
561 h->sync = false; 562 h->sync = false;
562 INIT_LIST_HEAD(&h->qgroup_ref_list); 563 INIT_LIST_HEAD(&h->qgroup_ref_list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 87964bf8892d..a994bb097ee5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -118,6 +118,7 @@ struct btrfs_trans_handle {
118 short aborted; 118 short aborted;
119 short adding_csums; 119 short adding_csums;
120 bool allocating_chunk; 120 bool allocating_chunk;
121 bool can_flush_pending_bgs;
121 bool reloc_reserved; 122 bool reloc_reserved;
122 bool sync; 123 bool sync;
123 unsigned int type; 124 unsigned int type;