diff options
author | Filipe Manana <fdmanana@suse.com> | 2015-06-10 19:58:53 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2015-06-30 17:36:46 -0400 |
commit | 67c5e7d464bc466471b05e027abe8a6b29687ebd (patch) | |
tree | 20f6112838a0957f6c05e5100aabb581606ea20d /fs | |
parent | e82afc52abff07a4acbc90f899598ebafb662831 (diff) |
Btrfs: fix race between balance and unused block group deletion
We have a race between deleting an unused block group and balancing the
same block group that leads to an assertion failure/BUG(), producing the
following trace:
[181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622
[181631.220591] ------------[ cut here ]------------
[181631.222959] kernel BUG at fs/btrfs/ctree.h:4062!
[181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$
[181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000
[181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs]
[181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246
[181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce
[181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff
[181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000
[181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200
[181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000
[181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000
[181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0
[181631.224566] Stack:
[181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e
[181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001
[181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000
[181631.224566] Call Trace:
[181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs]
[181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs]
[181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs]
[181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs]
[181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs]
[181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs]
[181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf
[181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs]
[181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15
[181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc
[181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2
[181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2
[181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424
[181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479
(...)
The sequence of steps leading to this is:
        CPU 0                                        CPU 1

btrfs_balance()
  btrfs_relocate_chunk()
    btrfs_relocate_block_group(bg X)
      btrfs_lookup_block_group(bg X)

                                             cleaner_kthread
                                               locks fs_info->cleaner_mutex
                                               btrfs_delete_unused_bgs()
                                                 finds bg X, which became
                                                 unused in the previous
                                                 transaction
                                                 checks bg X ->ro == 0,
                                                 so it proceeds
      sets bg X ->ro to 1
      (btrfs_set_block_group_ro(bg X))
      blocks on fs_info->cleaner_mutex
                                                 btrfs_remove_chunk(bg X)
                                               unlocks fs_info->cleaner_mutex
      acquires fs_info->cleaner_mutex
      relocate_block_group()
        --> does nothing, no extents found in
            the extent tree from bg X
      unlocks fs_info->cleaner_mutex
      btrfs_relocate_block_group(bg X) returns
    btrfs_remove_chunk(bg X)
      extent map not found
        --> ASSERT(0)
Fix this by using a new mutex to make sure these 2 operations, block
group relocation and removal, are serialized.
This issue is reproducible by running fstests generic/038 (which stresses
chunk allocation and automatic removal of unused block groups) together
with the following balance loop:
while true; do btrfs balance start -dusage=0 <mountpoint> ; done
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.h | 1 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 12 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 3 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 48 |
4 files changed, 58 insertions, 6 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80a9aefb0c46..aac314e14188 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -1778,6 +1778,7 @@ struct btrfs_fs_info { | |||
1778 | spinlock_t unused_bgs_lock; | 1778 | spinlock_t unused_bgs_lock; |
1779 | struct list_head unused_bgs; | 1779 | struct list_head unused_bgs; |
1780 | struct mutex unused_bg_unpin_mutex; | 1780 | struct mutex unused_bg_unpin_mutex; |
1781 | struct mutex delete_unused_bgs_mutex; | ||
1781 | 1782 | ||
1782 | /* For btrfs to record security options */ | 1783 | /* For btrfs to record security options */ |
1783 | struct security_mnt_opts security_opts; | 1784 | struct security_mnt_opts security_opts; |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b977fc8d8201..b59deb2c63f4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -1772,7 +1772,6 @@ static int cleaner_kthread(void *arg) | |||
1772 | } | 1772 | } |
1773 | 1773 | ||
1774 | btrfs_run_delayed_iputs(root); | 1774 | btrfs_run_delayed_iputs(root); |
1775 | btrfs_delete_unused_bgs(root->fs_info); | ||
1776 | again = btrfs_clean_one_deleted_snapshot(root); | 1775 | again = btrfs_clean_one_deleted_snapshot(root); |
1777 | mutex_unlock(&root->fs_info->cleaner_mutex); | 1776 | mutex_unlock(&root->fs_info->cleaner_mutex); |
1778 | 1777 | ||
@@ -1781,6 +1780,16 @@ static int cleaner_kthread(void *arg) | |||
1781 | * needn't do anything special here. | 1780 | * needn't do anything special here. |
1782 | */ | 1781 | */ |
1783 | btrfs_run_defrag_inodes(root->fs_info); | 1782 | btrfs_run_defrag_inodes(root->fs_info); |
1783 | |||
1784 | /* | ||
1785 | * Acquires fs_info->delete_unused_bgs_mutex to avoid racing | ||
1786 | * with relocation (btrfs_relocate_chunk) and relocation | ||
1787 | * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) | ||
1788 | * after acquiring fs_info->delete_unused_bgs_mutex. So we | ||
1789 | * can't hold, nor need to, fs_info->cleaner_mutex when deleting | ||
1790 | * unused block groups. | ||
1791 | */ | ||
1792 | btrfs_delete_unused_bgs(root->fs_info); | ||
1784 | sleep: | 1793 | sleep: |
1785 | if (!try_to_freeze() && !again) { | 1794 | if (!try_to_freeze() && !again) { |
1786 | set_current_state(TASK_INTERRUPTIBLE); | 1795 | set_current_state(TASK_INTERRUPTIBLE); |
@@ -2492,6 +2501,7 @@ int open_ctree(struct super_block *sb, | |||
2492 | spin_lock_init(&fs_info->unused_bgs_lock); | 2501 | spin_lock_init(&fs_info->unused_bgs_lock); |
2493 | rwlock_init(&fs_info->tree_mod_log_lock); | 2502 | rwlock_init(&fs_info->tree_mod_log_lock); |
2494 | mutex_init(&fs_info->unused_bg_unpin_mutex); | 2503 | mutex_init(&fs_info->unused_bg_unpin_mutex); |
2504 | mutex_init(&fs_info->delete_unused_bgs_mutex); | ||
2495 | mutex_init(&fs_info->reloc_mutex); | 2505 | mutex_init(&fs_info->reloc_mutex); |
2496 | mutex_init(&fs_info->delalloc_root_mutex); | 2506 | mutex_init(&fs_info->delalloc_root_mutex); |
2497 | seqlock_init(&fs_info->profiles_lock); | 2507 | seqlock_init(&fs_info->profiles_lock); |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 38b76cc02f48..1c2bd1723e40 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -9889,6 +9889,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) | |||
9889 | } | 9889 | } |
9890 | spin_unlock(&fs_info->unused_bgs_lock); | 9890 | spin_unlock(&fs_info->unused_bgs_lock); |
9891 | 9891 | ||
9892 | mutex_lock(&root->fs_info->delete_unused_bgs_mutex); | ||
9893 | |||
9892 | /* Don't want to race with allocators so take the groups_sem */ | 9894 | /* Don't want to race with allocators so take the groups_sem */ |
9893 | down_write(&space_info->groups_sem); | 9895 | down_write(&space_info->groups_sem); |
9894 | spin_lock(&block_group->lock); | 9896 | spin_lock(&block_group->lock); |
@@ -9983,6 +9985,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) | |||
9983 | end_trans: | 9985 | end_trans: |
9984 | btrfs_end_transaction(trans, root); | 9986 | btrfs_end_transaction(trans, root); |
9985 | next: | 9987 | next: |
9988 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
9986 | btrfs_put_block_group(block_group); | 9989 | btrfs_put_block_group(block_group); |
9987 | spin_lock(&fs_info->unused_bgs_lock); | 9990 | spin_lock(&fs_info->unused_bgs_lock); |
9988 | } | 9991 | } |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d4cd4059bded..9b95503ddd00 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -2766,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, | |||
2766 | root = root->fs_info->chunk_root; | 2766 | root = root->fs_info->chunk_root; |
2767 | extent_root = root->fs_info->extent_root; | 2767 | extent_root = root->fs_info->extent_root; |
2768 | 2768 | ||
2769 | /* | ||
2770 | * Prevent races with automatic removal of unused block groups. | ||
2771 | * After we relocate and before we remove the chunk with offset | ||
2772 | * chunk_offset, automatic removal of the block group can kick in, | ||
2773 | * resulting in a failure when calling btrfs_remove_chunk() below. | ||
2774 | * | ||
2775 | * Make sure to acquire this mutex before doing a tree search (dev | ||
2776 | * or chunk trees) to find chunks. Otherwise the cleaner kthread might | ||
2777 | * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after | ||
2778 | * we release the path used to search the chunk/dev tree and before | ||
2779 | * the current task acquires this mutex and calls us. | ||
2780 | */ | ||
2781 | ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex)); | ||
2782 | |||
2769 | ret = btrfs_can_relocate(extent_root, chunk_offset); | 2783 | ret = btrfs_can_relocate(extent_root, chunk_offset); |
2770 | if (ret) | 2784 | if (ret) |
2771 | return -ENOSPC; | 2785 | return -ENOSPC; |
@@ -2814,13 +2828,18 @@ again: | |||
2814 | key.type = BTRFS_CHUNK_ITEM_KEY; | 2828 | key.type = BTRFS_CHUNK_ITEM_KEY; |
2815 | 2829 | ||
2816 | while (1) { | 2830 | while (1) { |
2831 | mutex_lock(&root->fs_info->delete_unused_bgs_mutex); | ||
2817 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); | 2832 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); |
2818 | if (ret < 0) | 2833 | if (ret < 0) { |
2834 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
2819 | goto error; | 2835 | goto error; |
2836 | } | ||
2820 | BUG_ON(ret == 0); /* Corruption */ | 2837 | BUG_ON(ret == 0); /* Corruption */ |
2821 | 2838 | ||
2822 | ret = btrfs_previous_item(chunk_root, path, key.objectid, | 2839 | ret = btrfs_previous_item(chunk_root, path, key.objectid, |
2823 | key.type); | 2840 | key.type); |
2841 | if (ret) | ||
2842 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
2824 | if (ret < 0) | 2843 | if (ret < 0) |
2825 | goto error; | 2844 | goto error; |
2826 | if (ret > 0) | 2845 | if (ret > 0) |
@@ -2843,6 +2862,7 @@ again: | |||
2843 | else | 2862 | else |
2844 | BUG_ON(ret); | 2863 | BUG_ON(ret); |
2845 | } | 2864 | } |
2865 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
2846 | 2866 | ||
2847 | if (found_key.offset == 0) | 2867 | if (found_key.offset == 0) |
2848 | break; | 2868 | break; |
@@ -3299,9 +3319,12 @@ again: | |||
3299 | goto error; | 3319 | goto error; |
3300 | } | 3320 | } |
3301 | 3321 | ||
3322 | mutex_lock(&fs_info->delete_unused_bgs_mutex); | ||
3302 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); | 3323 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); |
3303 | if (ret < 0) | 3324 | if (ret < 0) { |
3325 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); | ||
3304 | goto error; | 3326 | goto error; |
3327 | } | ||
3305 | 3328 | ||
3306 | /* | 3329 | /* |
3307 | * this shouldn't happen, it means the last relocate | 3330 | * this shouldn't happen, it means the last relocate |
@@ -3313,6 +3336,7 @@ again: | |||
3313 | ret = btrfs_previous_item(chunk_root, path, 0, | 3336 | ret = btrfs_previous_item(chunk_root, path, 0, |
3314 | BTRFS_CHUNK_ITEM_KEY); | 3337 | BTRFS_CHUNK_ITEM_KEY); |
3315 | if (ret) { | 3338 | if (ret) { |
3339 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); | ||
3316 | ret = 0; | 3340 | ret = 0; |
3317 | break; | 3341 | break; |
3318 | } | 3342 | } |
@@ -3321,8 +3345,10 @@ again: | |||
3321 | slot = path->slots[0]; | 3345 | slot = path->slots[0]; |
3322 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | 3346 | btrfs_item_key_to_cpu(leaf, &found_key, slot); |
3323 | 3347 | ||
3324 | if (found_key.objectid != key.objectid) | 3348 | if (found_key.objectid != key.objectid) { |
3349 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); | ||
3325 | break; | 3350 | break; |
3351 | } | ||
3326 | 3352 | ||
3327 | chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); | 3353 | chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); |
3328 | 3354 | ||
@@ -3335,10 +3361,13 @@ again: | |||
3335 | ret = should_balance_chunk(chunk_root, leaf, chunk, | 3361 | ret = should_balance_chunk(chunk_root, leaf, chunk, |
3336 | found_key.offset); | 3362 | found_key.offset); |
3337 | btrfs_release_path(path); | 3363 | btrfs_release_path(path); |
3338 | if (!ret) | 3364 | if (!ret) { |
3365 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); | ||
3339 | goto loop; | 3366 | goto loop; |
3367 | } | ||
3340 | 3368 | ||
3341 | if (counting) { | 3369 | if (counting) { |
3370 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); | ||
3342 | spin_lock(&fs_info->balance_lock); | 3371 | spin_lock(&fs_info->balance_lock); |
3343 | bctl->stat.expected++; | 3372 | bctl->stat.expected++; |
3344 | spin_unlock(&fs_info->balance_lock); | 3373 | spin_unlock(&fs_info->balance_lock); |
@@ -3348,6 +3377,7 @@ again: | |||
3348 | ret = btrfs_relocate_chunk(chunk_root, | 3377 | ret = btrfs_relocate_chunk(chunk_root, |
3349 | found_key.objectid, | 3378 | found_key.objectid, |
3350 | found_key.offset); | 3379 | found_key.offset); |
3380 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); | ||
3351 | if (ret && ret != -ENOSPC) | 3381 | if (ret && ret != -ENOSPC) |
3352 | goto error; | 3382 | goto error; |
3353 | if (ret == -ENOSPC) { | 3383 | if (ret == -ENOSPC) { |
@@ -4087,11 +4117,16 @@ again: | |||
4087 | key.type = BTRFS_DEV_EXTENT_KEY; | 4117 | key.type = BTRFS_DEV_EXTENT_KEY; |
4088 | 4118 | ||
4089 | do { | 4119 | do { |
4120 | mutex_lock(&root->fs_info->delete_unused_bgs_mutex); | ||
4090 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | 4121 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); |
4091 | if (ret < 0) | 4122 | if (ret < 0) { |
4123 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
4092 | goto done; | 4124 | goto done; |
4125 | } | ||
4093 | 4126 | ||
4094 | ret = btrfs_previous_item(root, path, 0, key.type); | 4127 | ret = btrfs_previous_item(root, path, 0, key.type); |
4128 | if (ret) | ||
4129 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
4095 | if (ret < 0) | 4130 | if (ret < 0) |
4096 | goto done; | 4131 | goto done; |
4097 | if (ret) { | 4132 | if (ret) { |
@@ -4105,6 +4140,7 @@ again: | |||
4105 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | 4140 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); |
4106 | 4141 | ||
4107 | if (key.objectid != device->devid) { | 4142 | if (key.objectid != device->devid) { |
4143 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
4108 | btrfs_release_path(path); | 4144 | btrfs_release_path(path); |
4109 | break; | 4145 | break; |
4110 | } | 4146 | } |
@@ -4113,6 +4149,7 @@ again: | |||
4113 | length = btrfs_dev_extent_length(l, dev_extent); | 4149 | length = btrfs_dev_extent_length(l, dev_extent); |
4114 | 4150 | ||
4115 | if (key.offset + length <= new_size) { | 4151 | if (key.offset + length <= new_size) { |
4152 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
4116 | btrfs_release_path(path); | 4153 | btrfs_release_path(path); |
4117 | break; | 4154 | break; |
4118 | } | 4155 | } |
@@ -4122,6 +4159,7 @@ again: | |||
4122 | btrfs_release_path(path); | 4159 | btrfs_release_path(path); |
4123 | 4160 | ||
4124 | ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); | 4161 | ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); |
4162 | mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); | ||
4125 | if (ret && ret != -ENOSPC) | 4163 | if (ret && ret != -ENOSPC) |
4126 | goto done; | 4164 | goto done; |
4127 | if (ret == -ENOSPC) | 4165 | if (ret == -ENOSPC) |