aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlex Wu <alexwu@synology.com>2018-09-21 04:05:03 -0400
committerShaohua Li <shli@fb.com>2018-09-28 14:42:47 -0400
commitee37d7314a32ab6809eacc3389bad0406c69a81f (patch)
tree4ee862af088839bb3f8a25c7919442774a1ccfb6
parentfb73b357fb985cc652a72a41541d25915c7f9635 (diff)
md/raid10: Fix raid10 replace hang when new added disk faulty
[Symptom] Resync thread hang when new added disk faulty during replacing. [Root Cause] In raid10_sync_request(), we expect to issue a bio with callback end_sync_read(), and a bio with callback end_sync_write(). In normal situation, we will add resyncing sectors into mddev->recovery_active when raid10_sync_request() returned, and sub resynced sectors from mddev->recovery_active when end_sync_write() calls end_sync_request(). If new added disk, which are replacing the old disk, is set faulty, there is a race condition: 1. In the first rcu protected section, resync thread did not detect that mreplace is set faulty and pass the condition. 2. In the second rcu protected section, mreplace is set faulty. 3. But, resync thread will prepare the read object first, and then check the write condition. 4. It will find that mreplace is set faulty and do not have to prepare write object. This cause we add resync sectors but never sub it. [How to Reproduce] This issue can be easily reproduced by the following steps: mdadm -C /dev/md0 --assume-clean -l 10 -n 4 /dev/sd[abcd] mdadm /dev/md0 -a /dev/sde mdadm /dev/md0 --replace /dev/sdd sleep 1 mdadm /dev/md0 -f /dev/sde [How to Fix] This issue can be fixed by using local variables to record the result of test conditions. Once the conditions are satisfied, we can make sure that we need to issue a bio for read and a bio for write. Previous 'commit 24afd80d99f8 ("md/raid10: handle recovery of replacement devices.")' will also check whether bio is NULL, but leave the comment saying that it is a pointless test. So we remove this dummy check. Reported-by: Alex Chen <alexchen@synology.com> Reviewed-by: Allen Peng <allenpeng@synology.com> Reviewed-by: BingJing Chang <bingjingc@synology.com> Signed-off-by: Alex Wu <alexwu@synology.com> Signed-off-by: Shaohua Li <shli@fb.com>
-rw-r--r--drivers/md/raid10.c27
1 files changed, 14 insertions, 13 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d6f7978b4449..749848b2c477 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3079,6 +3079,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3079 sector_t sect; 3079 sector_t sect;
3080 int must_sync; 3080 int must_sync;
3081 int any_working; 3081 int any_working;
3082 int need_recover = 0;
3083 int need_replace = 0;
3082 struct raid10_info *mirror = &conf->mirrors[i]; 3084 struct raid10_info *mirror = &conf->mirrors[i];
3083 struct md_rdev *mrdev, *mreplace; 3085 struct md_rdev *mrdev, *mreplace;
3084 3086
@@ -3086,11 +3088,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3086 mrdev = rcu_dereference(mirror->rdev); 3088 mrdev = rcu_dereference(mirror->rdev);
3087 mreplace = rcu_dereference(mirror->replacement); 3089 mreplace = rcu_dereference(mirror->replacement);
3088 3090
3089 if ((mrdev == NULL || 3091 if (mrdev != NULL &&
3090 test_bit(Faulty, &mrdev->flags) || 3092 !test_bit(Faulty, &mrdev->flags) &&
3091 test_bit(In_sync, &mrdev->flags)) && 3093 !test_bit(In_sync, &mrdev->flags))
3092 (mreplace == NULL || 3094 need_recover = 1;
3093 test_bit(Faulty, &mreplace->flags))) { 3095 if (mreplace != NULL &&
3096 !test_bit(Faulty, &mreplace->flags))
3097 need_replace = 1;
3098
3099 if (!need_recover && !need_replace) {
3094 rcu_read_unlock(); 3100 rcu_read_unlock();
3095 continue; 3101 continue;
3096 } 3102 }
@@ -3213,7 +3219,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3213 r10_bio->devs[1].devnum = i; 3219 r10_bio->devs[1].devnum = i;
3214 r10_bio->devs[1].addr = to_addr; 3220 r10_bio->devs[1].addr = to_addr;
3215 3221
3216 if (!test_bit(In_sync, &mrdev->flags)) { 3222 if (need_recover) {
3217 bio = r10_bio->devs[1].bio; 3223 bio = r10_bio->devs[1].bio;
3218 bio->bi_next = biolist; 3224 bio->bi_next = biolist;
3219 biolist = bio; 3225 biolist = bio;
@@ -3230,16 +3236,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3230 bio = r10_bio->devs[1].repl_bio; 3236 bio = r10_bio->devs[1].repl_bio;
3231 if (bio) 3237 if (bio)
3232 bio->bi_end_io = NULL; 3238 bio->bi_end_io = NULL;
3233 /* Note: if mreplace != NULL, then bio 3239 /* Note: if need_replace, then bio
3234 * cannot be NULL as r10buf_pool_alloc will 3240 * cannot be NULL as r10buf_pool_alloc will
3235 * have allocated it. 3241 * have allocated it.
3236 * So the second test here is pointless.
3237 * But it keeps semantic-checkers happy, and
3238 * this comment keeps human reviewers
3239 * happy.
3240 */ 3242 */
3241 if (mreplace == NULL || bio == NULL || 3243 if (!need_replace)
3242 test_bit(Faulty, &mreplace->flags))
3243 break; 3244 break;
3244 bio->bi_next = biolist; 3245 bio->bi_next = biolist;
3245 biolist = bio; 3246 biolist = bio;