diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-09-13 14:19:52 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-09-13 14:19:52 -0400 |
commit | 106f2e59ee3b89a2f93735f65499eae4e8d55abc (patch) | |
tree | 113f2892a5ebf0ba2b3be38133b47df2e3795c2d | |
parent | 309a18ae360d1d8741c676a37a3daae319fe722a (diff) | |
parent | c94455558337eece474eebb6a16b905f98930418 (diff) |
Merge tag 'md/4.8-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD fixes from Shaohua Li:
"A few bug fixes for MD:
- Guoqing fixed a bug that kept md-cluster from working when compiled into the kernel
- I fixed a potential deadlock in raid5-cache superblock write, a
hang in raid5 reshape resume and a race condition introduced in
rc4"
* tag 'md/4.8-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
raid5: fix a small race condition
md-cluster: make md-cluster also can work when compiled into kernel
raid5: guarantee enough stripes to avoid reshape hang
raid5-cache: fix a deadlock in superblock write
-rw-r--r-- | drivers/md/md.c | 12 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 46 | ||||
-rw-r--r-- | drivers/md/raid5.c | 14 |
3 files changed, 31 insertions, 41 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 67642bacd597..915e84d631a2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations); | |||
7610 | 7610 | ||
7611 | int md_setup_cluster(struct mddev *mddev, int nodes) | 7611 | int md_setup_cluster(struct mddev *mddev, int nodes) |
7612 | { | 7612 | { |
7613 | int err; | 7613 | if (!md_cluster_ops) |
7614 | 7614 | request_module("md-cluster"); | |
7615 | err = request_module("md-cluster"); | ||
7616 | if (err) { | ||
7617 | pr_err("md-cluster module not found.\n"); | ||
7618 | return -ENOENT; | ||
7619 | } | ||
7620 | |||
7621 | spin_lock(&pers_lock); | 7615 | spin_lock(&pers_lock); |
7616 | /* ensure module won't be unloaded */ | ||
7622 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { | 7617 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { |
7618 | pr_err("can't find md-cluster module or get it's reference.\n"); | ||
7623 | spin_unlock(&pers_lock); | 7619 | spin_unlock(&pers_lock); |
7624 | return -ENOENT; | 7620 | return -ENOENT; |
7625 | } | 7621 | } |
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 51f76ddbe265..1b1ab4a1d132 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
@@ -96,7 +96,6 @@ struct r5l_log { | |||
96 | spinlock_t no_space_stripes_lock; | 96 | spinlock_t no_space_stripes_lock; |
97 | 97 | ||
98 | bool need_cache_flush; | 98 | bool need_cache_flush; |
99 | bool in_teardown; | ||
100 | }; | 99 | }; |
101 | 100 | ||
102 | /* | 101 | /* |
@@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, | |||
704 | 703 | ||
705 | mddev = log->rdev->mddev; | 704 | mddev = log->rdev->mddev; |
706 | /* | 705 | /* |
707 | * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and | 706 | * Discard could zero data, so before discard we must make sure |
708 | * wait for this thread to finish. This thread waits for | 707 | * superblock is updated to new log tail. Updating superblock (either |
709 | * MD_CHANGE_PENDING clear, which is supposed to be done in | 708 | * directly call md_update_sb() or depend on md thread) must hold |
710 | * md_check_recovery(). md_check_recovery() tries to get | 709 | * reconfig mutex. On the other hand, raid5_quiesce is called with |
711 | * reconfig_mutex. Since r5l_quiesce already holds the mutex, | 710 | * reconfig_mutex held. The first step of raid5_quiesce() is waiting |
712 | * md_check_recovery() fails, so the PENDING never get cleared. The | 711 | * for all IO to finish, hence waiting for the reclaim thread, while the reclaim |
713 | * in_teardown check workaround this issue. | 712 | * thread is calling this function and waiting for reconfig mutex. So |
713 | * there is a deadlock. We work around this issue with a trylock. | ||
714 | * FIXME: we could miss discard if we can't take reconfig mutex | ||
714 | */ | 715 | */ |
715 | if (!log->in_teardown) { | 716 | set_mask_bits(&mddev->flags, 0, |
716 | set_mask_bits(&mddev->flags, 0, | 717 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); |
717 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 718 | if (!mddev_trylock(mddev)) |
718 | md_wakeup_thread(mddev->thread); | 719 | return; |
719 | wait_event(mddev->sb_wait, | 720 | md_update_sb(mddev, 1); |
720 | !test_bit(MD_CHANGE_PENDING, &mddev->flags) || | 721 | mddev_unlock(mddev); |
721 | log->in_teardown); | ||
722 | /* | ||
723 | * r5l_quiesce could run after in_teardown check and hold | ||
724 | * mutex first. Superblock might get updated twice. | ||
725 | */ | ||
726 | if (log->in_teardown) | ||
727 | md_update_sb(mddev, 1); | ||
728 | } else { | ||
729 | WARN_ON(!mddev_is_locked(mddev)); | ||
730 | md_update_sb(mddev, 1); | ||
731 | } | ||
732 | 722 | ||
733 | /* discard IO error really doesn't matter, ignore it */ | 723 | /* discard IO error really doesn't matter, ignore it */ |
734 | if (log->last_checkpoint < end) { | 724 | if (log->last_checkpoint < end) { |
@@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state) | |||
827 | if (!log || state == 2) | 817 | if (!log || state == 2) |
828 | return; | 818 | return; |
829 | if (state == 0) { | 819 | if (state == 0) { |
830 | log->in_teardown = 0; | ||
831 | /* | 820 | /* |
832 | * This is a special case for hotadd. In suspend, the array has | 821 | * This is a special case for hotadd. In suspend, the array has |
833 | * no journal. In resume, journal is initialized as well as the | 822 | * no journal. In resume, journal is initialized as well as the |
@@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state) | |||
838 | log->reclaim_thread = md_register_thread(r5l_reclaim_thread, | 827 | log->reclaim_thread = md_register_thread(r5l_reclaim_thread, |
839 | log->rdev->mddev, "reclaim"); | 828 | log->rdev->mddev, "reclaim"); |
840 | } else if (state == 1) { | 829 | } else if (state == 1) { |
841 | /* | ||
842 | * at this point all stripes are finished, so io_unit is at | ||
843 | * least in STRIPE_END state | ||
844 | */ | ||
845 | log->in_teardown = 1; | ||
846 | /* make sure r5l_write_super_and_discard_space exits */ | 830 | /* make sure r5l_write_super_and_discard_space exits */ |
847 | mddev = log->rdev->mddev; | 831 | mddev = log->rdev->mddev; |
848 | wake_up(&mddev->sb_wait); | 832 | wake_up(&mddev->sb_wait); |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index da583bb43c84..ee7fc3701700 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -2423,10 +2423,10 @@ static void raid5_end_read_request(struct bio * bi) | |||
2423 | } | 2423 | } |
2424 | } | 2424 | } |
2425 | rdev_dec_pending(rdev, conf->mddev); | 2425 | rdev_dec_pending(rdev, conf->mddev); |
2426 | bio_reset(bi); | ||
2426 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 2427 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
2427 | set_bit(STRIPE_HANDLE, &sh->state); | 2428 | set_bit(STRIPE_HANDLE, &sh->state); |
2428 | raid5_release_stripe(sh); | 2429 | raid5_release_stripe(sh); |
2429 | bio_reset(bi); | ||
2430 | } | 2430 | } |
2431 | 2431 | ||
2432 | static void raid5_end_write_request(struct bio *bi) | 2432 | static void raid5_end_write_request(struct bio *bi) |
@@ -2498,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi) | |||
2498 | if (sh->batch_head && bi->bi_error && !replacement) | 2498 | if (sh->batch_head && bi->bi_error && !replacement) |
2499 | set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); | 2499 | set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); |
2500 | 2500 | ||
2501 | bio_reset(bi); | ||
2501 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) | 2502 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) |
2502 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 2503 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
2503 | set_bit(STRIPE_HANDLE, &sh->state); | 2504 | set_bit(STRIPE_HANDLE, &sh->state); |
@@ -2505,7 +2506,6 @@ static void raid5_end_write_request(struct bio *bi) | |||
2505 | 2506 | ||
2506 | if (sh->batch_head && sh != sh->batch_head) | 2507 | if (sh->batch_head && sh != sh->batch_head) |
2507 | raid5_release_stripe(sh->batch_head); | 2508 | raid5_release_stripe(sh->batch_head); |
2508 | bio_reset(bi); | ||
2509 | } | 2509 | } |
2510 | 2510 | ||
2511 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) | 2511 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) |
@@ -6639,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
6639 | } | 6639 | } |
6640 | 6640 | ||
6641 | conf->min_nr_stripes = NR_STRIPES; | 6641 | conf->min_nr_stripes = NR_STRIPES; |
6642 | if (mddev->reshape_position != MaxSector) { | ||
6643 | int stripes = max_t(int, | ||
6644 | ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, | ||
6645 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); | ||
6646 | conf->min_nr_stripes = max(NR_STRIPES, stripes); | ||
6647 | if (conf->min_nr_stripes != NR_STRIPES) | ||
6648 | printk(KERN_INFO | ||
6649 | "md/raid:%s: force stripe size %d for reshape\n", | ||
6650 | mdname(mddev), conf->min_nr_stripes); | ||
6651 | } | ||
6642 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + | 6652 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + |
6643 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 6653 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
6644 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); | 6654 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |