aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-04-26 10:39:32 -0400
committerFilipe Manana <fdmanana@suse.com>2016-05-12 20:59:16 -0400
commit9cfa3e34e20e6798a671236000d9e97c8aa5d318 (patch)
tree34cfa1a9e68b0d1ab2a208c52334793855f05fba
parent578def7c50f236432ba140d35bb7ca4ef0a1b20b (diff)
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do calls to flush delalloc of all inodes and then wait for ordered extents to complete. However we do these flush calls just to make sure we don't race with concurrent tasks that have actually already started to run delalloc and have allocated an extent from the block group we want to relocate, right before we set it to readonly mode, but have not yet created the respective ordered extents. The flush calls make us wait for such concurrent tasks because they end up calling filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() -> __start_delalloc_inodes() -> btrfs_alloc_delalloc_work() -> btrfs_run_delalloc_work()) which ends up serializing us with those tasks due to attempts to lock the same pages (and the delalloc flush procedure calls the allocator and creates the ordered extents before unlocking the pages). These flushing calls not only make us waste time (cpu, IO) but also reduce the chances of writing larger extents (applications might be writing to contiguous ranges and we flush before they finish dirtying the whole ranges). So make sure we don't flush delalloc and just wait for concurrent tasks that have already started flushing delalloc and have allocated an extent from the block group we are about to relocate. This change also ends up fixing a race with direct IO writes that makes relocation not wait for direct IO ordered extents. This race is illustrated by the following diagram: CPU 1 CPU 2 btrfs_relocate_block_group(bg X) starts direct IO write, target inode currently has no ordered extents ongoing nor dirty pages (delalloc regions), therefore the root for our inode is not in the list fs_info->ordered_roots btrfs_direct_IO() __blockdev_direct_IO() btrfs_get_blocks_direct() btrfs_lock_extent_direct() locks range in the io tree btrfs_new_extent_direct() btrfs_reserve_extent() --> extent allocated from bg X btrfs_inc_block_group_ro(bg X) btrfs_start_delalloc_roots() __start_delalloc_inodes() --> does nothing, no dealloc ranges in the inode's io tree so the inode's root is not in the list fs_info->delalloc_roots btrfs_wait_ordered_roots() --> does not find the inode's root in the list fs_info->ordered_roots --> ends up not waiting for the direct IO write started by the task at CPU 2 relocate_block_group(rc->stage == MOVE_DATA_EXTENTS) prepare_to_relocate() btrfs_commit_transaction() iterates the extent tree, using its commit root and moves extents into new locations btrfs_add_ordered_extent_dio() --> now a ordered extent is created and added to the list root->ordered_extents and the root added to the list fs_info->ordered_roots --> this is too late and the task at CPU 1 already started the relocation btrfs_commit_transaction() btrfs_finish_ordered_io() btrfs_alloc_reserved_file_extent() --> adds delayed data reference for the extent allocated from bg X relocate_block_group(rc->stage == UPDATE_DATA_PTRS) prepare_to_relocate() btrfs_commit_transaction() --> delayed refs are run, so an extent item for the allocated extent from bg X is added to extent tree --> commit roots are switched, so the next scan in the extent tree will see the extent item sees the extent in the extent tree When this happens the relocation produces the following warning when it finishes: [ 7260.832836] ------------[ cut here ]------------ [ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]() [ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc [ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1 [ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000 [ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d [ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000 [ 7260.852998] Call Trace: [ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90 [ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2 [ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs] [ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c [ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs] [ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs] [ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs] [ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19 [ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs] [ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs] [ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63 [ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44 [ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc [ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34 [ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be [ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71 [ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79 [ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b [ 7260.893268] ---[ end trace eb7803b24ebab8ad ]--- This is because at the end of the first stage, in relocate_block_group(), we commit the current transaction, which makes delayed refs run, the commit roots are switched and so the second stage will find the extent item that the ordered extent added to the delayed refs. But this extent was not moved (ordered extent completed after first stage finished), so at the end of the relocation our block group item still has a positive used bytes counter, triggering a warning at the end of btrfs_relocate_block_group(). Later on when trying to read the extent contents from disk we hit a BUG_ON() due to the inability to map a block with a logical address that belongs to the block group we relocated and is no longer valid, resulting in the following trace: [ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096 [ 7344.887518] ------------[ cut here ]------------ [ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833! [ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc [ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1 [ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000 [ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs] [ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282 [ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001 [ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff [ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000 [ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000 [ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0 [ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000 [ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0 [ 7344.888431] Stack: [ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950 [ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000 [ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000 [ 7344.888431] Call Trace: [ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs] [ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs] [ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs] [ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf [ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs] [ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs] [ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10 [ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs] [ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd [ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs] [ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc [ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207 [ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207 [ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154 [ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f [ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1 [ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d [ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2 [ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e [ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b [ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0 [ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs] [ 7344.888431] RSP <ffff8802046878f0> [ 7344.970544] ---[ end trace eb7803b24ebab8ae ]--- Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <jbacik@fb.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
-rw-r--r--fs/btrfs/ctree.h14
-rw-r--r--fs/btrfs/extent-tree.c58
-rw-r--r--fs/btrfs/inode.c8
-rw-r--r--fs/btrfs/relocation.c6
4 files changed, 79 insertions, 7 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84a6a5b3384a..90e70e21e479 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1408,6 +1408,17 @@ struct btrfs_block_group_cache {
1408 1408
1409 struct btrfs_io_ctl io_ctl; 1409 struct btrfs_io_ctl io_ctl;
1410 1410
1411 /*
1412 * Incremented when doing extent allocations and holding a read lock
1413 * on the space_info's groups_sem semaphore.
1414 * Decremented when an ordered extent that represents an IO against this
1415 * block group's range is created (after it's added to its inode's
1416 * root's list of ordered extents) or immediately after the allocation
1417 * if it's a metadata extent or fallocate extent (for these cases we
1418 * don't create ordered extents).
1419 */
1420 atomic_t reservations;
1421
1411 /* Lock for free space tree operations. */ 1422 /* Lock for free space tree operations. */
1412 struct mutex free_space_lock; 1423 struct mutex free_space_lock;
1413 1424
@@ -3499,6 +3510,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
3499 struct btrfs_root *root); 3510 struct btrfs_root *root);
3500int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 3511int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
3501 struct btrfs_root *root); 3512 struct btrfs_root *root);
3513void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
3514 const u64 start);
3515void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
3502void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3516void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3503int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 3517int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *root, unsigned long count); 3518 struct btrfs_root *root, unsigned long count);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 251452a2b72c..09aad7b447f5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6175,6 +6175,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
6175 return 0; 6175 return 0;
6176} 6176}
6177 6177
6178static void
6179btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6180{
6181 atomic_inc(&bg->reservations);
6182}
6183
6184void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6185 const u64 start)
6186{
6187 struct btrfs_block_group_cache *bg;
6188
6189 bg = btrfs_lookup_block_group(fs_info, start);
6190 ASSERT(bg);
6191 if (atomic_dec_and_test(&bg->reservations))
6192 wake_up_atomic_t(&bg->reservations);
6193 btrfs_put_block_group(bg);
6194}
6195
6196static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6197{
6198 schedule();
6199 return 0;
6200}
6201
6202void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6203{
6204 struct btrfs_space_info *space_info = bg->space_info;
6205
6206 ASSERT(bg->ro);
6207
6208 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6209 return;
6210
6211 /*
6212 * Our block group is read only but before we set it to read only,
6213 * some task might have had allocated an extent from it already, but it
6214 * has not yet created a respective ordered extent (and added it to a
6215 * root's list of ordered extents).
6216 * Therefore wait for any task currently allocating extents, since the
6217 * block group's reservations counter is incremented while a read lock
6218 * on the groups' semaphore is held and decremented after releasing
6219 * the read access on that semaphore and creating the ordered extent.
6220 */
6221 down_write(&space_info->groups_sem);
6222 up_write(&space_info->groups_sem);
6223
6224 wait_on_atomic_t(&bg->reservations,
6225 btrfs_wait_bg_reservations_atomic_t,
6226 TASK_UNINTERRUPTIBLE);
6227}
6228
6178/** 6229/**
6179 * btrfs_update_reserved_bytes - update the block_group and space info counters 6230 * btrfs_update_reserved_bytes - update the block_group and space info counters
6180 * @cache: The cache we are manipulating 6231 * @cache: The cache we are manipulating
@@ -7434,6 +7485,7 @@ checks:
7434 btrfs_add_free_space(block_group, offset, num_bytes); 7485 btrfs_add_free_space(block_group, offset, num_bytes);
7435 goto loop; 7486 goto loop;
7436 } 7487 }
7488 btrfs_inc_block_group_reservations(block_group);
7437 7489
7438 /* we are all good, lets return */ 7490 /* we are all good, lets return */
7439 ins->objectid = search_start; 7491 ins->objectid = search_start;
@@ -7615,8 +7667,10 @@ again:
7615 WARN_ON(num_bytes < root->sectorsize); 7667 WARN_ON(num_bytes < root->sectorsize);
7616 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 7668 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7617 flags, delalloc); 7669 flags, delalloc);
7618 7670 if (!ret && !is_data) {
7619 if (ret == -ENOSPC) { 7671 btrfs_dec_block_group_reservations(root->fs_info,
7672 ins->objectid);
7673 } else if (ret == -ENOSPC) {
7620 if (!final_tried && ins->offset) { 7674 if (!final_tried && ins->offset) {
7621 num_bytes = min(num_bytes >> 1, ins->offset); 7675 num_bytes = min(num_bytes >> 1, ins->offset);
7622 num_bytes = round_down(num_bytes, root->sectorsize); 7676 num_bytes = round_down(num_bytes, root->sectorsize);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2aaba58b4856..3991d8a74f61 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -824,6 +824,7 @@ retry:
824 async_extent->ram_size - 1, 0); 824 async_extent->ram_size - 1, 0);
825 goto out_free_reserve; 825 goto out_free_reserve;
826 } 826 }
827 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
827 828
828 /* 829 /*
829 * clear dirty, set writeback and unlock the pages. 830 * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
861 } 862 }
862 return; 863 return;
863out_free_reserve: 864out_free_reserve:
865 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
864 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 866 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
865out_free: 867out_free:
866 extent_clear_unlock_delalloc(inode, async_extent->start, 868 extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
1038 goto out_drop_extent_cache; 1040 goto out_drop_extent_cache;
1039 } 1041 }
1040 1042
1043 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1044
1041 if (disk_num_bytes < cur_alloc_size) 1045 if (disk_num_bytes < cur_alloc_size)
1042 break; 1046 break;
1043 1047
@@ -1066,6 +1070,7 @@ out:
1066out_drop_extent_cache: 1070out_drop_extent_cache:
1067 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); 1071 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1068out_reserve: 1072out_reserve:
1073 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1069 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 1074 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1070out_unlock: 1075out_unlock:
1071 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1076 extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -7162,6 +7167,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7162 return ERR_PTR(ret); 7167 return ERR_PTR(ret);
7163 } 7168 }
7164 7169
7170 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
7171
7165 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, 7172 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
7166 ins.offset, ins.offset, ins.offset, 0); 7173 ins.offset, ins.offset, ins.offset, 0);
7167 if (IS_ERR(em)) { 7174 if (IS_ERR(em)) {
@@ -9942,6 +9949,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9942 btrfs_end_transaction(trans, root); 9949 btrfs_end_transaction(trans, root);
9943 break; 9950 break;
9944 } 9951 }
9952 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
9945 9953
9946 last_alloc = ins.offset; 9954 last_alloc = ins.offset;
9947 ret = insert_reserved_file_extent(trans, inode, 9955 ret = insert_reserved_file_extent(trans, inode,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 30f77ed60133..e78f8e44bd9a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4254,11 +4254,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4254 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4254 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4255 rc->block_group->key.objectid, rc->block_group->flags); 4255 rc->block_group->key.objectid, rc->block_group->flags);
4256 4256
4257 ret = btrfs_start_delalloc_roots(fs_info, 0, -1); 4257 btrfs_wait_block_group_reservations(rc->block_group);
4258 if (ret < 0) {
4259 err = ret;
4260 goto out;
4261 }
4262 btrfs_wait_ordered_roots(fs_info, -1, 4258 btrfs_wait_ordered_roots(fs_info, -1,
4263 rc->block_group->key.objectid, 4259 rc->block_group->key.objectid,
4264 rc->block_group->key.offset); 4260 rc->block_group->key.offset);