diff options
author | Filipe Manana <fdmanana@suse.com> | 2015-04-25 13:29:16 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2015-04-26 09:26:37 -0400 |
commit | b58d1a9ef92031a6fc2f418c01abafb4458ba009 (patch) | |
tree | edf1cf7d38effb41856f836cec5250192b68c5e7 /fs | |
parent | a3bdccc4e683f0ac69230707ed3fa20e7cf73a79 (diff) |
Btrfs: fix race between start dirty bg cache writeout and bg deletion
While running xfstests I ran into the following:
[20892.242791] ------------[ cut here ]------------
[20892.243776] WARNING: CPU: 0 PID: 13299 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[20892.245874] BTRFS: Transaction aborted (error -2)
[20892.247329] Modules linked in: btrfs dm_snapshot dm_bufio dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse$
[20892.258488] CPU: 0 PID: 13299 Comm: fsstress Tainted: G W 4.0.0-rc5-btrfs-next-9+ #2
[20892.262011] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[20892.264738] 0000000000000009 ffff880427f8bc18 ffffffff8142fa46 ffffffff8108b6a2
[20892.266244] ffff880427f8bc68 ffff880427f8bc58 ffffffff81045ea5 ffff880427f8bc48
[20892.267761] ffffffffa0509a6d 00000000fffffffe ffff8803545d6f40 ffffffffa05a15a0
[20892.269378] Call Trace:
[20892.269915] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[20892.271097] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[20892.272173] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[20892.273386] [<ffffffffa0509a6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[20892.274857] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[20892.275851] [<ffffffffa0509a6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[20892.277341] [<ffffffffa0515e10>] write_one_cache_group+0x68/0xaf [btrfs]
[20892.278628] [<ffffffffa052088a>] btrfs_start_dirty_block_groups+0x18d/0x29b [btrfs]
[20892.280191] [<ffffffffa052f077>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[20892.281781] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[20892.282873] [<ffffffffa054163b>] btrfs_sync_file+0x313/0x387 [btrfs]
[20892.284111] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[20892.285203] [<ffffffff810e603f>] ? time_hardirqs_on+0x15/0x28
[20892.286290] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[20892.287469] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[20892.288412] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[20892.289348] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[20892.290255] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[20892.291316] ---[ end trace 597f77e664245373 ]---
[20892.293955] BTRFS: error (device sdg) in write_one_cache_group:3184: errno=-2 No such entry
[20892.297390] BTRFS info (device sdg): forced readonly
This happens because in btrfs_start_dirty_block_groups() we splice the
transaction's list of dirty block groups into a local list and then we
keep extracting the first element of the list without holding the
cache_write_mutex mutex. This means that before we acquire that mutex
the first block group on the list might be removed by a conurrent task
running btrfs_remove_block_group(). So make sure we extract the first
element (and test the list emptyness) while holding that mutex.
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/extent-tree.c | 44 |
1 files changed, 27 insertions, 17 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1eef4ee01d1a..fc0db9887c0e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -3408,17 +3408,14 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, | |||
3408 | int loops = 0; | 3408 | int loops = 0; |
3409 | 3409 | ||
3410 | spin_lock(&cur_trans->dirty_bgs_lock); | 3410 | spin_lock(&cur_trans->dirty_bgs_lock); |
3411 | if (!list_empty(&cur_trans->dirty_bgs)) { | 3411 | if (list_empty(&cur_trans->dirty_bgs)) { |
3412 | list_splice_init(&cur_trans->dirty_bgs, &dirty); | 3412 | spin_unlock(&cur_trans->dirty_bgs_lock); |
3413 | return 0; | ||
3413 | } | 3414 | } |
3415 | list_splice_init(&cur_trans->dirty_bgs, &dirty); | ||
3414 | spin_unlock(&cur_trans->dirty_bgs_lock); | 3416 | spin_unlock(&cur_trans->dirty_bgs_lock); |
3415 | 3417 | ||
3416 | again: | 3418 | again: |
3417 | if (list_empty(&dirty)) { | ||
3418 | btrfs_free_path(path); | ||
3419 | return 0; | ||
3420 | } | ||
3421 | |||
3422 | /* | 3419 | /* |
3423 | * make sure all the block groups on our dirty list actually | 3420 | * make sure all the block groups on our dirty list actually |
3424 | * exist | 3421 | * exist |
@@ -3431,18 +3428,16 @@ again: | |||
3431 | return -ENOMEM; | 3428 | return -ENOMEM; |
3432 | } | 3429 | } |
3433 | 3430 | ||
3431 | /* | ||
3432 | * cache_write_mutex is here only to save us from balance or automatic | ||
3433 | * removal of empty block groups deleting this block group while we are | ||
3434 | * writing out the cache | ||
3435 | */ | ||
3436 | mutex_lock(&trans->transaction->cache_write_mutex); | ||
3434 | while (!list_empty(&dirty)) { | 3437 | while (!list_empty(&dirty)) { |
3435 | cache = list_first_entry(&dirty, | 3438 | cache = list_first_entry(&dirty, |
3436 | struct btrfs_block_group_cache, | 3439 | struct btrfs_block_group_cache, |
3437 | dirty_list); | 3440 | dirty_list); |
3438 | |||
3439 | /* | ||
3440 | * cache_write_mutex is here only to save us from balance | ||
3441 | * deleting this block group while we are writing out the | ||
3442 | * cache | ||
3443 | */ | ||
3444 | mutex_lock(&trans->transaction->cache_write_mutex); | ||
3445 | |||
3446 | /* | 3441 | /* |
3447 | * this can happen if something re-dirties a block | 3442 | * this can happen if something re-dirties a block |
3448 | * group that is already under IO. Just wait for it to | 3443 | * group that is already under IO. Just wait for it to |
@@ -3495,7 +3490,6 @@ again: | |||
3495 | } | 3490 | } |
3496 | if (!ret) | 3491 | if (!ret) |
3497 | ret = write_one_cache_group(trans, root, path, cache); | 3492 | ret = write_one_cache_group(trans, root, path, cache); |
3498 | mutex_unlock(&trans->transaction->cache_write_mutex); | ||
3499 | 3493 | ||
3500 | /* if its not on the io list, we need to put the block group */ | 3494 | /* if its not on the io list, we need to put the block group */ |
3501 | if (should_put) | 3495 | if (should_put) |
@@ -3503,7 +3497,16 @@ again: | |||
3503 | 3497 | ||
3504 | if (ret) | 3498 | if (ret) |
3505 | break; | 3499 | break; |
3500 | |||
3501 | /* | ||
3502 | * Avoid blocking other tasks for too long. It might even save | ||
3503 | * us from writing caches for block groups that are going to be | ||
3504 | * removed. | ||
3505 | */ | ||
3506 | mutex_unlock(&trans->transaction->cache_write_mutex); | ||
3507 | mutex_lock(&trans->transaction->cache_write_mutex); | ||
3506 | } | 3508 | } |
3509 | mutex_unlock(&trans->transaction->cache_write_mutex); | ||
3507 | 3510 | ||
3508 | /* | 3511 | /* |
3509 | * go through delayed refs for all the stuff we've just kicked off | 3512 | * go through delayed refs for all the stuff we've just kicked off |
@@ -3514,8 +3517,15 @@ again: | |||
3514 | loops++; | 3517 | loops++; |
3515 | spin_lock(&cur_trans->dirty_bgs_lock); | 3518 | spin_lock(&cur_trans->dirty_bgs_lock); |
3516 | list_splice_init(&cur_trans->dirty_bgs, &dirty); | 3519 | list_splice_init(&cur_trans->dirty_bgs, &dirty); |
3520 | /* | ||
3521 | * dirty_bgs_lock protects us from concurrent block group | ||
3522 | * deletes too (not just cache_write_mutex). | ||
3523 | */ | ||
3524 | if (!list_empty(&dirty)) { | ||
3525 | spin_unlock(&cur_trans->dirty_bgs_lock); | ||
3526 | goto again; | ||
3527 | } | ||
3517 | spin_unlock(&cur_trans->dirty_bgs_lock); | 3528 | spin_unlock(&cur_trans->dirty_bgs_lock); |
3518 | goto again; | ||
3519 | } | 3529 | } |
3520 | 3530 | ||
3521 | btrfs_free_path(path); | 3531 | btrfs_free_path(path); |