author    Qu Wenruo <quwenruo@cn.fujitsu.com>    2017-04-13 20:35:54 -0400
committer David Sterba <dsterba@suse.com>        2017-04-18 08:07:27 -0400
commit    0966a7b1300f953b04b436aa82486d3d1b17c96d (patch)
tree      918f983ce93ded51f7c8176740cdaee6f1ee7eb6
parent    fa7aede2ab5f54352c7aec056930dd17b28f3a78 (diff)
btrfs: scrub: Introduce full stripe lock for RAID56
Unlike mirror-based profiles, RAID5/6 recovery needs to read out the
whole full stripe, and without proper protection this can easily cause
race conditions.

Introduce two new functions, lock_full_stripe() and unlock_full_stripe(),
for RAID5/6. They are backed by a per-block-group rb_tree of full stripe
mutexes, so scrub callers can use them to lock a full stripe and avoid
such races.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor comment adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
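
The expected usage is a strict pairing: lock_full_stripe() before touching a
RAID5/6 full stripe, unlock_full_stripe() in the same context afterwards, with
the bool out-parameter carried between the two so the unlock becomes a no-op
when the lock was skipped (non-RAID56 block group) or never taken. A minimal
caller sketch, assuming a hypothetical scrub recovery function - the real
callers are wired up separately and are not part of this patch:

static int scrub_recover_full_stripe_sketch(struct btrfs_fs_info *fs_info,
                                            u64 bytenr)
{
        bool full_stripe_locked;
        int ret;

        /* Blocks on the per-full-stripe mutex; no-op for non-RAID56 groups */
        ret = lock_full_stripe(fs_info, bytenr, &full_stripe_locked);
        if (ret < 0)
                return ret;

        /* ... read and rebuild the full stripe covering @bytenr ... */

        /* Must run in the same context; does nothing if we never locked */
        return unlock_full_stripe(fs_info, bytenr, full_stripe_locked);
}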
-rw-r--r--  fs/btrfs/ctree.h       |  17
-rw-r--r--  fs/btrfs/extent-tree.c |  11
-rw-r--r--  fs/btrfs/scrub.c       | 223
3 files changed, 251 insertions, 0 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 70631d773669..1e82516fe2d8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -539,6 +539,14 @@ struct btrfs_io_ctl {
         unsigned check_crcs:1;
 };
 
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+        struct rb_root root;
+        struct mutex lock;
+};
+
 struct btrfs_block_group_cache {
         struct btrfs_key key;
         struct btrfs_block_group_item item;
@@ -649,6 +657,9 @@ struct btrfs_block_group_cache {
          * Protected by free_space_lock.
          */
         int needs_free_space;
+
+        /* Record locked full stripes for RAID5/6 block group */
+        struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
 };
 
 /* delayed seq elem */
@@ -3653,6 +3664,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
                            struct btrfs_device *dev);
 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                          struct btrfs_scrub_progress *progress);
+static inline void btrfs_init_full_stripe_locks_tree(
+                        struct btrfs_full_stripe_locks_tree *locks_root)
+{
+        locks_root->root = RB_ROOT;
+        mutex_init(&locks_root->lock);
+}
 
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e870e60e33bc..e390451c72e6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
         if (atomic_dec_and_test(&cache->count)) {
                 WARN_ON(cache->pinned > 0);
                 WARN_ON(cache->reserved > 0);
+
+                /*
+                 * If not empty, someone is still holding mutex of
+                 * full_stripe_lock, which can only be released by caller.
+                 * And it will definitely cause use-after-free when caller
+                 * tries to release full stripe lock.
+                 *
+                 * No better way to resolve, but only to warn.
+                 */
+                WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
                 kfree(cache->free_space_ctl);
                 kfree(cache);
         }
@@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
         btrfs_init_free_space_ctl(cache);
         atomic_set(&cache->trimming, 0);
         mutex_init(&cache->free_space_lock);
+        btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
 
         return cache;
 }
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4786eff53011..34160d350c77 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -240,6 +240,13 @@ struct scrub_warning {
         struct btrfs_device *dev;
 };
 
+struct full_stripe_lock {
+        struct rb_node node;
+        u64 logical;
+        u64 refs;
+        struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -349,6 +356,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+                struct btrfs_full_stripe_locks_tree *locks_root,
+                u64 fstripe_logical)
+{
+        struct rb_node **p;
+        struct rb_node *parent = NULL;
+        struct full_stripe_lock *entry;
+        struct full_stripe_lock *ret;
+
+        WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+        p = &locks_root->root.rb_node;
+        while (*p) {
+                parent = *p;
+                entry = rb_entry(parent, struct full_stripe_lock, node);
+                if (fstripe_logical < entry->logical) {
+                        p = &(*p)->rb_left;
+                } else if (fstripe_logical > entry->logical) {
+                        p = &(*p)->rb_right;
+                } else {
+                        entry->refs++;
+                        return entry;
+                }
+        }
+
+        /* Insert new lock */
+        ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+        if (!ret)
+                return ERR_PTR(-ENOMEM);
+        ret->logical = fstripe_logical;
+        ret->refs = 1;
+        mutex_init(&ret->mutex);
+
+        rb_link_node(&ret->node, parent, p);
+        rb_insert_color(&ret->node, &locks_root->root);
+        return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+                struct btrfs_full_stripe_locks_tree *locks_root,
+                u64 fstripe_logical)
+{
+        struct rb_node *node;
+        struct full_stripe_lock *entry;
+
+        WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+        node = locks_root->root.rb_node;
+        while (node) {
+                entry = rb_entry(node, struct full_stripe_lock, node);
+                if (fstripe_logical < entry->logical)
+                        node = node->rb_left;
+                else if (fstripe_logical > entry->logical)
+                        node = node->rb_right;
+                else
+                        return entry;
+        }
+        return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+                                   u64 bytenr)
+{
+        u64 ret;
+
+        /*
+         * Due to chunk item size limit, full stripe length should not be
+         * larger than U32_MAX. Just a sanity check here.
+         */
+        WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+        /*
+         * round_down() can only handle power of 2, while RAID56 full
+         * stripe length can be 64KiB * n, so we need to manually round down.
+         */
+        ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+              cache->full_stripe_len + cache->key.objectid;
+        return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency of recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6), for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
+ * So caller must call unlock_full_stripe() at the same context.
+ *
+ * Return <0 if encounters error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                            bool *locked_ret)
+{
+        struct btrfs_block_group_cache *bg_cache;
+        struct btrfs_full_stripe_locks_tree *locks_root;
+        struct full_stripe_lock *existing;
+        u64 fstripe_start;
+        int ret = 0;
+
+        *locked_ret = false;
+        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+        if (!bg_cache) {
+                ASSERT(0);
+                return -ENOENT;
+        }
+
+        /* Profiles not based on parity don't need full stripe lock */
+        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+                goto out;
+        locks_root = &bg_cache->full_stripe_locks_root;
+
+        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+        /* Now insert the full stripe lock */
+        mutex_lock(&locks_root->lock);
+        existing = insert_full_stripe_lock(locks_root, fstripe_start);
+        mutex_unlock(&locks_root->lock);
+        if (IS_ERR(existing)) {
+                ret = PTR_ERR(existing);
+                goto out;
+        }
+        mutex_lock(&existing->mutex);
+        *locked_ret = true;
+out:
+        btrfs_put_block_group(bg_cache);
+        return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must ensure it's the same context calling corresponding
+ * lock_full_stripe().
+ *
+ * Return 0 if we unlock full stripe without problem.
+ * Return <0 for error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                              bool locked)
+{
+        struct btrfs_block_group_cache *bg_cache;
+        struct btrfs_full_stripe_locks_tree *locks_root;
+        struct full_stripe_lock *fstripe_lock;
+        u64 fstripe_start;
+        bool freeit = false;
+        int ret = 0;
+
+        /* If we didn't acquire full stripe lock, no need to continue */
+        if (!locked)
+                return 0;
+
+        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+        if (!bg_cache) {
+                ASSERT(0);
+                return -ENOENT;
+        }
+        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+                goto out;
+
+        locks_root = &bg_cache->full_stripe_locks_root;
+        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+        mutex_lock(&locks_root->lock);
+        fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+        /* Unpaired unlock_full_stripe() detected */
+        if (!fstripe_lock) {
+                WARN_ON(1);
+                ret = -ENOENT;
+                mutex_unlock(&locks_root->lock);
+                goto out;
+        }
+
+        if (fstripe_lock->refs == 0) {
+                WARN_ON(1);
+                btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+                           fstripe_lock->logical);
+        } else {
+                fstripe_lock->refs--;
+        }
+
+        if (fstripe_lock->refs == 0) {
+                rb_erase(&fstripe_lock->node, &locks_root->root);
+                freeit = true;
+        }
+        mutex_unlock(&locks_root->lock);
+
+        mutex_unlock(&fstripe_lock->mutex);
+        if (freeit)
+                kfree(fstripe_lock);
+out:
+        btrfs_put_block_group(bg_cache);
+        return ret;
+}
+
+/*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
  */
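
As the comment in get_full_stripe_logical() notes, a RAID5/6 full stripe
length is 64KiB times the number of data stripes and is generally not a power
of two, so the generic round_down() cannot be used; the helper instead rounds
down with an integer division and multiplication relative to the block group
start. A small standalone sketch of the same arithmetic, using made-up numbers
(3 data stripes, i.e. a 192KiB full stripe, in a block group starting at 1GiB):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical example values, not taken from a real filesystem */
        uint64_t objectid = 1024ULL * 1024 * 1024;     /* block group start: 1GiB */
        uint64_t full_stripe_len = 3 * 64 * 1024;      /* 192KiB, not a power of 2 */
        uint64_t bytenr = objectid + 500 * 1024;       /* an address inside the group */

        /* Same round-down as get_full_stripe_logical(), in plain C */
        uint64_t fstripe_start = (bytenr - objectid) / full_stripe_len *
                                 full_stripe_len + objectid;

        /* 500KiB into the group rounds down to 384KiB, i.e. two full stripes */
        printf("full stripe start: %llu (offset %llu into the block group)\n",
               (unsigned long long)fstripe_start,
               (unsigned long long)(fstripe_start - objectid));
        return 0;
}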