author    Filipe Manana <fdmanana@suse.com>    2015-02-09 16:14:24 -0500
committer Chris Mason <clm@fb.com>             2015-02-14 11:19:14 -0500
commit    f55985f4dda5cfb6967c17e96237f3c859076eb3 (patch)
tree      a821d5a175b47ecf2896ad0f0da451454c69f40e /fs/btrfs
parent    575849ecf5d103ca9bbf0a6b9e89eba335d4e750 (diff)
Btrfs: scrub, fix sleep in atomic context
My previous patch "Btrfs: fix scrub race leading to use-after-free"
introduced the possibility of sleeping in atomic context: it made
scrub_pending_bio_dec() take the scrub_lock mutex, and that function can
be called from atomic context (for example from a bio's end_io handler,
which runs in softirq context).
Chris ran into this on a debug kernel, which gave the following trace:
[ 1928.950319] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:621
[ 1928.967334] in_atomic(): 1, irqs_disabled(): 0, pid: 149670, name: fsstress
[ 1928.981324] INFO: lockdep is turned off.
[ 1928.989244] CPU: 24 PID: 149670 Comm: fsstress Tainted: G W 3.19.0-rc7-mason+ #41
[ 1929.006418] Hardware name: ZTSYSTEMS Echo Ridge T4 /A9DRPF-10D, BIOS 1.07 05/10/2012
[ 1929.022207] ffffffff81a22cf8 ffff881076e03b78 ffffffff816b8dd9 ffff881076e03b78
[ 1929.037267] ffff880d8e828710 ffff881076e03ba8 ffffffff810856c4 ffff881076e03bc8
[ 1929.052315] 0000000000000000 000000000000026d ffffffff81a22cf8 ffff881076e03bd8
[ 1929.067381] Call Trace:
[ 1929.072344] <IRQ> [<ffffffff816b8dd9>] dump_stack+0x4f/0x6e
[ 1929.083968] [<ffffffff810856c4>] ___might_sleep+0x174/0x230
[ 1929.095352] [<ffffffff810857d2>] __might_sleep+0x52/0x90
[ 1929.106223] [<ffffffff816bb68f>] mutex_lock_nested+0x2f/0x3b0
[ 1929.117951] [<ffffffff810ab37d>] ? trace_hardirqs_on+0xd/0x10
[ 1929.129708] [<ffffffffa05dc838>] scrub_pending_bio_dec+0x38/0x70 [btrfs]
[ 1929.143370] [<ffffffffa05dd0e0>] scrub_parity_bio_endio+0x50/0x70 [btrfs]
[ 1929.157191] [<ffffffff812fa603>] bio_endio+0x53/0xa0
[ 1929.167382] [<ffffffffa05f96bc>] rbio_orig_end_io+0x7c/0xa0 [btrfs]
[ 1929.180161] [<ffffffffa05f97ba>] raid_write_parity_end_io+0x5a/0x80 [btrfs]
[ 1929.194318] [<ffffffff812fa603>] bio_endio+0x53/0xa0
[ 1929.204496] [<ffffffff8130401b>] blk_update_request+0x1eb/0x450
[ 1929.216569] [<ffffffff81096e58>] ? trigger_load_balance+0x78/0x500
[ 1929.229176] [<ffffffff8144c74d>] scsi_end_request+0x3d/0x1f0
[ 1929.240740] [<ffffffff8144ccac>] scsi_io_completion+0xac/0x5b0
[ 1929.252654] [<ffffffff81441c50>] scsi_finish_command+0xf0/0x150
[ 1929.264725] [<ffffffff8144d317>] scsi_softirq_done+0x147/0x170
[ 1929.276635] [<ffffffff8130ace6>] blk_done_softirq+0x86/0xa0
[ 1929.288014] [<ffffffff8105d92e>] __do_softirq+0xde/0x600
[ 1929.298885] [<ffffffff8105df6d>] irq_exit+0xbd/0xd0
(...)
Fix this by using a reference count on the scrub context structure
instead of locking the scrub_lock mutex.
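
For reference, a minimal, self-contained sketch of the pattern (with
hypothetical names; the real code is in the diff below): each in-flight
bio pins the context with an extra reference before the wakeup, and the
final put, wherever it happens, is what frees the structure. Unlike
mutex_lock(), atomic_dec_and_test() never sleeps, so the put is safe
from bio end_io (softirq) context.

    #include <linux/atomic.h>
    #include <linux/slab.h>
    #include <linux/wait.h>

    /* Hypothetical stand-in for struct scrub_ctx, reduced to the relevant fields. */
    struct ctx {
            atomic_t refs;              /* one ref for the owner + one per in-flight bio */
            atomic_t bios_in_flight;
            wait_queue_head_t list_wait;
    };

    static void ctx_put(struct ctx *c)
    {
            /* Last reference frees the structure; atomic, never sleeps. */
            if (atomic_dec_and_test(&c->refs))
                    kfree(c);
    }

    static void pending_bio_inc(struct ctx *c)
    {
            atomic_inc(&c->refs);       /* pin ctx for the lifetime of the bio */
            atomic_inc(&c->bios_in_flight);
    }

    /* May run from bio end_io, i.e. softirq/atomic context. */
    static void pending_bio_dec(struct ctx *c)
    {
            atomic_dec(&c->bios_in_flight);
            wake_up(&c->list_wait);     /* waiter can't free ctx: we still hold a ref */
            ctx_put(c);
    }

The owner's reference, taken at allocation with atomic_set(&refs, 1),
is dropped by the main scrub task once its wait completes, which is why
btrfs_scrub_dev() switches from scrub_free_ctx() to scrub_put_ctx().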
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs')
 fs/btrfs/scrub.c | 39 +++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6d6e155c8c8b..db21f17df996 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -193,6 +193,15 @@ struct scrub_ctx {
 	 */
 	struct btrfs_scrub_progress stat;
 	spinlock_t stat_lock;
+
+	/*
+	 * Use a ref counter to avoid use-after-free issues. Scrub workers
+	 * decrement bios_in_flight and workers_pending and then do a wakeup
+	 * on the list_wait wait queue. We must ensure the main scrub task
+	 * doesn't free the scrub context before or while the workers are
+	 * doing the wakeup() call.
+	 */
+	atomic_t refs;
 };
 
 struct scrub_fixup_nodatasum {
@@ -297,26 +306,20 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 static void copy_nocow_pages_worker(struct btrfs_work *work);
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
 
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 {
+	atomic_inc(&sctx->refs);
 	atomic_inc(&sctx->bios_in_flight);
 }
 
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 {
-	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
-
-	/*
-	 * Hold the scrub_lock while doing the wakeup to ensure the
-	 * sctx (and its wait queue list_wait) isn't destroyed/freed
-	 * during the wakeup.
-	 */
-	mutex_lock(&fs_info->scrub_lock);
 	atomic_dec(&sctx->bios_in_flight);
 	wake_up(&sctx->list_wait);
-	mutex_unlock(&fs_info->scrub_lock);
+	scrub_put_ctx(sctx);
 }
 
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -350,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 {
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 
+	atomic_inc(&sctx->refs);
 	/*
 	 * increment scrubs_running to prevent cancel requests from
 	 * completing as long as a worker is running. we must also
@@ -388,15 +392,11 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 	mutex_lock(&fs_info->scrub_lock);
 	atomic_dec(&fs_info->scrubs_running);
 	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
 	atomic_dec(&sctx->workers_pending);
 	wake_up(&fs_info->scrub_pause_wait);
-	/*
-	 * Hold the scrub_lock while doing the wakeup to ensure the
-	 * sctx (and its wait queue list_wait) isn't destroyed/freed
-	 * during the wakeup.
-	 */
 	wake_up(&sctx->list_wait);
-	mutex_unlock(&fs_info->scrub_lock);
+	scrub_put_ctx(sctx);
 }
 
 static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -442,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 	kfree(sctx);
 }
 
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+	if (atomic_dec_and_test(&sctx->refs))
+		scrub_free_ctx(sctx);
+}
+
 static noinline_for_stack
 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
@@ -466,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 	if (!sctx)
 		goto nomem;
+	atomic_set(&sctx->refs, 1);
 	sctx->is_dev_replace = is_dev_replace;
 	sctx->pages_per_rd_bio = pages_per_rd_bio;
 	sctx->curr = -1;
@@ -3739,7 +3746,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	scrub_workers_put(fs_info);
 	mutex_unlock(&fs_info->scrub_lock);
 
-	scrub_free_ctx(sctx);
+	scrub_put_ctx(sctx);
 
 	return ret;
 }