author     Qu Wenruo <quwenruo@cn.fujitsu.com>    2017-04-13 20:35:54 -0400
committer  David Sterba <dsterba@suse.com>        2017-04-18 08:07:27 -0400
commit     0966a7b1300f953b04b436aa82486d3d1b17c96d (patch)
tree       918f983ce93ded51f7c8176740cdaee6f1ee7eb6
parent     fa7aede2ab5f54352c7aec056930dd17b28f3a78 (diff)
btrfs: scrub: Introduce full stripe lock for RAID56
Unlike mirror-based profiles, RAID5/6 recovery needs to read out the whole full stripe, and without proper protection this can easily cause a race condition.

Introduce two new functions for RAID5/6, lock_full_stripe() and unlock_full_stripe(). They maintain an rb_tree of per-full-stripe mutexes, so scrub callers can lock a full stripe and avoid the race.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor comment adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
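For reference, a caller in the scrub path would be expected to pair the two new helpers around the recovery of a single full stripe. The sketch below is illustrative only; the caller name and surrounding logic are not part of this patch. lock_full_stripe() is a no-op for non-RAID56 block groups and reports via *locked_ret whether a lock was actually taken:

/* Illustrative caller sketch, not part of this patch. */
static int scrub_recover_full_stripe(struct btrfs_fs_info *fs_info, u64 logical)
{
	bool full_stripe_locked;
	int ret, ret2;

	/* Takes the per-full-stripe mutex; no-op for non-RAID56 block groups. */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0)
		return ret;

	/* ... read, verify and rebuild the full stripe here ... */

	/* Must run in the same context that called lock_full_stripe(). */
	ret2 = unlock_full_stripe(fs_info, logical, full_stripe_locked);
	if (!ret)
		ret = ret2;
	return ret;
}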
-rw-r--r--  fs/btrfs/ctree.h         17
-rw-r--r--  fs/btrfs/extent-tree.c   11
-rw-r--r--  fs/btrfs/scrub.c        223
3 files changed, 251 insertions, 0 deletions
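One detail in the scrub.c changes below is worth a standalone illustration: get_full_stripe_logical() cannot use round_down() because a RAID5/6 full stripe length is 64KiB * n rather than a power of two, so the patch rounds down with a 64-bit division. A minimal userspace sketch of the same arithmetic follows; the constants are made up and not taken from the patch:

/* Illustration of the manual round-down used by get_full_stripe_logical(). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bg_start = 1024ULL * 1024 * 1024;	/* block group start (made up) */
	uint64_t full_stripe_len = 3 * 64 * 1024;	/* e.g. 3 data stripes, not a power of 2 */
	uint64_t bytenr = bg_start + 500 * 1024;	/* some address inside the block group */

	/* Same formula as the patch: round down via integer division. */
	uint64_t fstripe_start = (bytenr - bg_start) / full_stripe_len *
				 full_stripe_len + bg_start;

	printf("full stripe starts at %llu (offset %llu into the block group)\n",
	       (unsigned long long)fstripe_start,
	       (unsigned long long)(fstripe_start - bg_start));
	return 0;
}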
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 70631d773669..1e82516fe2d8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -539,6 +539,14 @@ struct btrfs_io_ctl {
 	unsigned check_crcs:1;
 };
 
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+	struct rb_root root;
+	struct mutex lock;
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
@@ -649,6 +657,9 @@ struct btrfs_block_group_cache {
 	 * Protected by free_space_lock.
 	 */
 	int needs_free_space;
+
+	/* Record locked full stripes for RAID5/6 block group */
+	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
 };
 
 /* delayed seq elem */
@@ -3653,6 +3664,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
 				struct btrfs_device *dev);
 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
 			 struct btrfs_scrub_progress *progress);
+static inline void btrfs_init_full_stripe_locks_tree(
+			struct btrfs_full_stripe_locks_tree *locks_root)
+{
+	locks_root->root = RB_ROOT;
+	mutex_init(&locks_root->lock);
+}
 
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e870e60e33bc..e390451c72e6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
+
+		/*
+		 * If not empty, someone is still holding mutex of
+		 * full_stripe_lock, which can only be released by caller.
+		 * And it will definitely cause use-after-free when caller
+		 * tries to release full stripe lock.
+		 *
+		 * No better way to resolve, but only to warn.
+		 */
+		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 	btrfs_init_free_space_ctl(cache);
 	atomic_set(&cache->trimming, 0);
 	mutex_init(&cache->free_space_lock);
+	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
 
 	return cache;
 }
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4786eff53011..34160d350c77 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -240,6 +240,13 @@ struct scrub_warning {
 	struct btrfs_device *dev;
 };
 
+struct full_stripe_lock {
+	struct rb_node node;
+	u64 logical;
+	u64 refs;
+	struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -349,6 +356,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+		struct btrfs_full_stripe_locks_tree *locks_root,
+		u64 fstripe_logical)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct full_stripe_lock *entry;
+	struct full_stripe_lock *ret;
+
+	WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+	p = &locks_root->root.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct full_stripe_lock, node);
+		if (fstripe_logical < entry->logical) {
+			p = &(*p)->rb_left;
+		} else if (fstripe_logical > entry->logical) {
+			p = &(*p)->rb_right;
+		} else {
+			entry->refs++;
+			return entry;
+		}
+	}
+
+	/* Insert new lock */
+	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+	ret->logical = fstripe_logical;
+	ret->refs = 1;
+	mutex_init(&ret->mutex);
+
+	rb_link_node(&ret->node, parent, p);
+	rb_insert_color(&ret->node, &locks_root->root);
+	return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+		struct btrfs_full_stripe_locks_tree *locks_root,
+		u64 fstripe_logical)
+{
+	struct rb_node *node;
+	struct full_stripe_lock *entry;
+
+	WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+	node = locks_root->root.rb_node;
+	while (node) {
+		entry = rb_entry(node, struct full_stripe_lock, node);
+		if (fstripe_logical < entry->logical)
+			node = node->rb_left;
+		else if (fstripe_logical > entry->logical)
+			node = node->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+				   u64 bytenr)
+{
+	u64 ret;
+
+	/*
+	 * Due to chunk item size limit, full stripe length should not be
+	 * larger than U32_MAX. Just a sanity check here.
+	 */
+	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+	/*
+	 * round_down() can only handle power of 2, while RAID56 full
+	 * stripe length can be 64KiB * n, so we need to manually round down.
+	 */
+	ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+	      cache->full_stripe_len + cache->key.objectid;
+	return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency of recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6), for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
+ * So caller must call unlock_full_stripe() at the same context.
+ *
+ * Return <0 if encounters error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+			    bool *locked_ret)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct btrfs_full_stripe_locks_tree *locks_root;
+	struct full_stripe_lock *existing;
+	u64 fstripe_start;
+	int ret = 0;
+
+	*locked_ret = false;
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache) {
+		ASSERT(0);
+		return -ENOENT;
+	}
+
+	/* Profiles not based on parity don't need full stripe lock */
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+	locks_root = &bg_cache->full_stripe_locks_root;
+
+	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+	/* Now insert the full stripe lock */
+	mutex_lock(&locks_root->lock);
+	existing = insert_full_stripe_lock(locks_root, fstripe_start);
+	mutex_unlock(&locks_root->lock);
+	if (IS_ERR(existing)) {
+		ret = PTR_ERR(existing);
+		goto out;
+	}
+	mutex_lock(&existing->mutex);
+	*locked_ret = true;
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must ensure it's the same context calling corresponding
+ * lock_full_stripe().
+ *
+ * Return 0 if we unlock full stripe without problem.
+ * Return <0 for error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+			      bool locked)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct btrfs_full_stripe_locks_tree *locks_root;
+	struct full_stripe_lock *fstripe_lock;
+	u64 fstripe_start;
+	bool freeit = false;
+	int ret = 0;
+
+	/* If we didn't acquire full stripe lock, no need to continue */
+	if (!locked)
+		return 0;
+
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache) {
+		ASSERT(0);
+		return -ENOENT;
+	}
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+
+	locks_root = &bg_cache->full_stripe_locks_root;
+	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+	mutex_lock(&locks_root->lock);
+	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+	/* Unpaired unlock_full_stripe() detected */
+	if (!fstripe_lock) {
+		WARN_ON(1);
+		ret = -ENOENT;
+		mutex_unlock(&locks_root->lock);
+		goto out;
+	}
+
+	if (fstripe_lock->refs == 0) {
+		WARN_ON(1);
+		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+			   fstripe_lock->logical);
+	} else {
+		fstripe_lock->refs--;
+	}
+
+	if (fstripe_lock->refs == 0) {
+		rb_erase(&fstripe_lock->node, &locks_root->root);
+		freeit = true;
+	}
+	mutex_unlock(&locks_root->lock);
+
+	mutex_unlock(&fstripe_lock->mutex);
+	if (freeit)
+		kfree(fstripe_lock);
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+/*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
  */