aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorShaohua Li <shli@fb.com>2016-08-25 13:09:39 -0400
committerShaohua Li <shli@fb.com>2016-08-31 12:05:18 -0400
commit8e018c21da3febb558586b48c8db0d6d66cb6593 (patch)
tree70d8d62900570c7cade6a7554236f0014ee1e43e
parent86a1679860babbacd61fc1e8c0c0f43641d5860d (diff)
raid5-cache: fix a deadlock in superblock write
There is a potential deadlock in superblock write. Discard could zero data, so before discard we must make sure superblock is updated to new log tail. Updating superblock (either directly call md_update_sb() or depend on md thread) must hold reconfig mutex. On the other hand, raid5_quiesce is called with reconfig_mutex hold. The first step of raid5_quiesce() is waitting for all IO finish, hence waitting for reclaim thread, while reclaim thread is calling this function and waitting for reconfig mutex. So there is a deadlock. We workaround this issue with a trylock. The downside of the solution is we could miss discard if we can't take reconfig mutex. But this should happen rarely (mainly in raid array stop), so miss discard shouldn't be a big problem. Cc: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
-rw-r--r--drivers/md/raid5-cache.c46
1 files changed, 15 insertions, 31 deletions
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 51f76ddbe265..1b1ab4a1d132 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -96,7 +96,6 @@ struct r5l_log {
96 spinlock_t no_space_stripes_lock; 96 spinlock_t no_space_stripes_lock;
97 97
98 bool need_cache_flush; 98 bool need_cache_flush;
99 bool in_teardown;
100}; 99};
101 100
102/* 101/*
@@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
704 703
705 mddev = log->rdev->mddev; 704 mddev = log->rdev->mddev;
706 /* 705 /*
707 * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and 706 * Discard could zero data, so before discard we must make sure
708 * wait for this thread to finish. This thread waits for 707 * superblock is updated to new log tail. Updating superblock (either
709 * MD_CHANGE_PENDING clear, which is supposed to be done in 708 * directly call md_update_sb() or depend on md thread) must hold
710 * md_check_recovery(). md_check_recovery() tries to get 709 * reconfig mutex. On the other hand, raid5_quiesce is called with
711 * reconfig_mutex. Since r5l_quiesce already holds the mutex, 710 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
712 * md_check_recovery() fails, so the PENDING never get cleared. The 711 * for all IO finish, hence waitting for reclaim thread, while reclaim
713 * in_teardown check workaround this issue. 712 * thread is calling this function and waitting for reconfig mutex. So
713 * there is a deadlock. We workaround this issue with a trylock.
714 * FIXME: we could miss discard if we can't take reconfig mutex
714 */ 715 */
715 if (!log->in_teardown) { 716 set_mask_bits(&mddev->flags, 0,
716 set_mask_bits(&mddev->flags, 0, 717 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
717 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 718 if (!mddev_trylock(mddev))
718 md_wakeup_thread(mddev->thread); 719 return;
719 wait_event(mddev->sb_wait, 720 md_update_sb(mddev, 1);
720 !test_bit(MD_CHANGE_PENDING, &mddev->flags) || 721 mddev_unlock(mddev);
721 log->in_teardown);
722 /*
723 * r5l_quiesce could run after in_teardown check and hold
724 * mutex first. Superblock might get updated twice.
725 */
726 if (log->in_teardown)
727 md_update_sb(mddev, 1);
728 } else {
729 WARN_ON(!mddev_is_locked(mddev));
730 md_update_sb(mddev, 1);
731 }
732 722
733 /* discard IO error really doesn't matter, ignore it */ 723 /* discard IO error really doesn't matter, ignore it */
734 if (log->last_checkpoint < end) { 724 if (log->last_checkpoint < end) {
@@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
827 if (!log || state == 2) 817 if (!log || state == 2)
828 return; 818 return;
829 if (state == 0) { 819 if (state == 0) {
830 log->in_teardown = 0;
831 /* 820 /*
832 * This is a special case for hotadd. In suspend, the array has 821 * This is a special case for hotadd. In suspend, the array has
833 * no journal. In resume, journal is initialized as well as the 822 * no journal. In resume, journal is initialized as well as the
@@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
838 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 827 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
839 log->rdev->mddev, "reclaim"); 828 log->rdev->mddev, "reclaim");
840 } else if (state == 1) { 829 } else if (state == 1) {
841 /*
842 * at this point all stripes are finished, so io_unit is at
843 * least in STRIPE_END state
844 */
845 log->in_teardown = 1;
846 /* make sure r5l_write_super_and_discard_space exits */ 830 /* make sure r5l_write_super_and_discard_space exits */
847 mddev = log->rdev->mddev; 831 mddev = log->rdev->mddev;
848 wake_up(&mddev->sb_wait); 832 wake_up(&mddev->sb_wait);