author    Shaohua Li <shli@fb.com>   2016-07-28 12:34:14 -0400
committer Shaohua Li <shli@fb.com>   2016-07-28 12:34:14 -0400
commit    3f35e210ed4617a68b6baa9b7ac6c72bf7e313d9 (patch)
tree      2cce851a454be4deea141593c3db62001ab65108
parent    194dc870a5890e855ecffb30f3b80ba7c88f96d6 (diff)
parent    5d8817833c7609c24da9a92f71c53caa9c1424eb (diff)

Merge branch 'mymd/for-next' into mymd/for-linus
-rw-r--r--   drivers/md/md.c         74
-rw-r--r--   drivers/md/md.h         10
-rw-r--r--   drivers/md/multipath.c  29
-rw-r--r--   drivers/md/raid1.c     130
-rw-r--r--   drivers/md/raid10.c    250
-rw-r--r--   drivers/md/raid10.h      3
-rw-r--r--   drivers/md/raid5.c      45
7 files changed, 328 insertions, 213 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1f123f5a29da..2c3ab6f5e6be 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2482,8 +2482,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
 		if (add_journal)
 			mddev_resume(mddev);
 		if (err) {
-			unbind_rdev_from_array(rdev);
-			export_rdev(rdev);
+			md_kick_rdev_from_array(rdev);
 			return err;
 		}
 	}
@@ -2600,6 +2599,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 		else
 			err = -EBUSY;
 	} else if (cmd_match(buf, "remove")) {
+		if (rdev->mddev->pers) {
+			clear_bit(Blocked, &rdev->flags);
+			remove_and_add_spares(rdev->mddev, rdev);
+		}
 		if (rdev->raid_disk >= 0)
 			err = -EBUSY;
 		else {
@@ -3176,8 +3179,7 @@ int md_rdev_init(struct md_rdev *rdev)
 	rdev->data_offset = 0;
 	rdev->new_data_offset = 0;
 	rdev->sb_events = 0;
-	rdev->last_read_error.tv_sec = 0;
-	rdev->last_read_error.tv_nsec = 0;
+	rdev->last_read_error = 0;
 	rdev->sb_loaded = 0;
 	rdev->bb_page = NULL;
 	atomic_set(&rdev->nr_pending, 0);
@@ -3583,6 +3585,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->to_remove = &md_redundancy_group;
 	}
 
+	module_put(oldpers->owner);
+
 	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk < 0)
 			continue;
@@ -3940,6 +3944,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 		} else
 			err = -EBUSY;
 	}
+	if (!err)
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	spin_unlock(&mddev->lock);
 	return err ?: len;
 }
@@ -4191,7 +4197,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
 		return err;
 	if (mddev->pers) {
 		err = update_size(mddev, sectors);
-		md_update_sb(mddev, 1);
+		if (err == 0)
+			md_update_sb(mddev, 1);
 	} else {
 		if (mddev->dev_sectors == 0 ||
 		    mddev->dev_sectors > sectors)
@@ -7813,6 +7820,7 @@ void md_do_sync(struct md_thread *thread)
 		if (ret)
 			goto skip;
 
+		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
 		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
 			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
 			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
@@ -8151,18 +8159,11 @@ void md_do_sync(struct md_thread *thread)
 		}
 	}
  skip:
-	if (mddev_is_clustered(mddev) &&
-	    ret == 0) {
-		/* set CHANGE_PENDING here since maybe another
-		 * update is needed, so other nodes are informed */
-		set_mask_bits(&mddev->flags, 0,
-			      BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
-		md_wakeup_thread(mddev->thread);
-		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
-		md_cluster_ops->resync_finish(mddev);
-	} else
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	/* set CHANGE_PENDING here since maybe another update is needed,
+	 * so other nodes are informed. It should be harmless for normal
+	 * raid */
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
 
 	spin_lock(&mddev->lock);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8188,15 +8189,34 @@ static int remove_and_add_spares(struct mddev *mddev,
 	struct md_rdev *rdev;
 	int spares = 0;
 	int removed = 0;
+	bool remove_some = false;
 
-	rdev_for_each(rdev, mddev)
+	rdev_for_each(rdev, mddev) {
+		if ((this == NULL || rdev == this) &&
+		    rdev->raid_disk >= 0 &&
+		    !test_bit(Blocked, &rdev->flags) &&
+		    test_bit(Faulty, &rdev->flags) &&
+		    atomic_read(&rdev->nr_pending)==0) {
+			/* Faulty non-Blocked devices with nr_pending == 0
+			 * never get nr_pending incremented,
+			 * never get Faulty cleared, and never get Blocked set.
+			 * So we can synchronize_rcu now rather than once per device
+			 */
+			remove_some = true;
+			set_bit(RemoveSynchronized, &rdev->flags);
+		}
+	}
+
+	if (remove_some)
+		synchronize_rcu();
+	rdev_for_each(rdev, mddev) {
 		if ((this == NULL || rdev == this) &&
 		    rdev->raid_disk >= 0 &&
 		    !test_bit(Blocked, &rdev->flags) &&
-		    (test_bit(Faulty, &rdev->flags) ||
+		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
 		     (!test_bit(In_sync, &rdev->flags) &&
 		      !test_bit(Journal, &rdev->flags))) &&
-		    atomic_read(&rdev->nr_pending)==0) {
+		    atomic_read(&rdev->nr_pending)==0)) {
 			if (mddev->pers->hot_remove_disk(
 				    mddev, rdev) == 0) {
 				sysfs_unlink_rdev(mddev, rdev);
@@ -8204,6 +8224,10 @@ static int remove_and_add_spares(struct mddev *mddev,
 				removed++;
 			}
 		}
+		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
+			clear_bit(RemoveSynchronized, &rdev->flags);
+	}
+
 	if (removed && mddev->kobj.sd)
 		sysfs_notify(&mddev->kobj, NULL, "degraded");
 
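The two-pass loop above is the heart of the new RemoveSynchronized scheme: pass one flags every Faulty, non-Blocked device with no pending IO, a single synchronize_rcu() then covers the whole batch, and pass two hands only the flagged devices to ->hot_remove_disk(). A schematic of the pattern, with made-up names (struct item, remove_ready, release_item) standing in for the md types:

	/* Illustrative sketch only; md uses struct md_rdev and the
	 * RemoveSynchronized flag bit. */
	struct item {
		struct list_head list;
		bool dead;		/* like Faulty */
		int  in_use;		/* like nr_pending */
		bool remove_ready;	/* like RemoveSynchronized */
	};

	static void reap_items(struct list_head *items)
	{
		struct item *it;
		bool marked = false;

		list_for_each_entry(it, items, list) {	/* pass 1: mark */
			if (it->dead && !it->in_use) {
				it->remove_ready = true;
				marked = true;
			}
		}
		if (marked)
			synchronize_rcu();	/* one grace period for all */
		list_for_each_entry(it, items, list) {	/* pass 2: reap */
			if (it->remove_ready)
				release_item(it);	/* hypothetical helper */
		}
	}

One grace period amortised across the array replaces a synchronize_rcu() per removed device, and the comment in the hunk explains why this is safe: a Faulty, non-Blocked device with nr_pending == 0 can never regain references.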
@@ -8506,6 +8530,11 @@ void md_reap_sync_thread(struct mddev *mddev)
 			rdev->saved_raid_disk = -1;
 
 	md_update_sb(mddev, 1);
+	/* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can
+	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
+	 * clustered raid */
+	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
+		md_cluster_ops->resync_finish(mddev);
 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
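md_do_sync() records that this node took the cluster resync lock by setting MD_CLUSTER_RESYNC_LOCKED (see the hunk at 7820 above), and md_reap_sync_thread() releases it only after md_update_sb() has cleared MD_CHANGE_PENDING. The hand-off reduces to an atomic set/test-and-clear pair on mddev->flags; schematically, with the surrounding logic elided:

	/* in md_do_sync(), after taking the cluster resync lock */
	set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);

	/* in md_reap_sync_thread(), after md_update_sb(mddev, 1) */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);

Because test_and_clear_bit() is atomic, resync_finish() runs exactly once per lock acquisition, and non-clustered arrays never see the bit set.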
@@ -8803,6 +8832,7 @@ EXPORT_SYMBOL(md_reload_sb);
  * at boot time.
  */
 
+static DEFINE_MUTEX(detected_devices_mutex);
 static LIST_HEAD(all_detected_devices);
 struct detected_devices_node {
 	struct list_head list;
@@ -8816,7 +8846,9 @@ void md_autodetect_dev(dev_t dev)
 	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
 	if (node_detected_dev) {
 		node_detected_dev->dev = dev;
+		mutex_lock(&detected_devices_mutex);
 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
+		mutex_unlock(&detected_devices_mutex);
 	} else {
 		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
 			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
@@ -8835,6 +8867,7 @@ static void autostart_arrays(int part)
 
 	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
 
+	mutex_lock(&detected_devices_mutex);
 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
 		i_scanned++;
 		node_detected_dev = list_entry(all_detected_devices.next,
@@ -8853,6 +8886,7 @@ static void autostart_arrays(int part)
 		list_add(&rdev->same_set, &pending_raid_disks);
 		i_passed++;
 	}
+	mutex_unlock(&detected_devices_mutex);
 
 	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
 		i_scanned, i_passed);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b4f335245bd6..20c667579ede 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -99,7 +99,7 @@ struct md_rdev {
 	atomic_t	read_errors;	/* number of consecutive read errors that
 					 * we have tried to ignore.
 					 */
-	struct timespec last_read_error;	/* monotonic time since our
+	time64_t	last_read_error;	/* monotonic time since our
 					 * last read error
 					 */
 	atomic_t	corrected_errors; /* number of corrected read errors,
@@ -163,6 +163,11 @@ enum flag_bits {
 				 * than other devices in the array
 				 */
 	ClusterRemove,
+	RemoveSynchronized,	/* synchronize_rcu() was called after
+				 * this device was known to be faulty,
+				 * so it is safe to remove without
+				 * another synchronize_rcu() call.
+				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -204,6 +209,9 @@ struct mddev {
 #define MD_RELOAD_SB	7	/* Reload the superblock because another node
 				 * updated it.
 				 */
+#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
+				    * already took resync lock, need to
+				    * release the lock */
 
 	int				suspended;
 	atomic_t			active_io;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 72ea98e89e57..4974682842ae 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -43,7 +43,8 @@ static int multipath_map (struct mpconf *conf)
 	rcu_read_lock();
 	for (i = 0; i < disks; i++) {
 		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
-		if (rdev && test_bit(In_sync, &rdev->flags)) {
+		if (rdev && test_bit(In_sync, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			return i;
@@ -141,17 +142,19 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	return;
 }
 
-static void multipath_status (struct seq_file *seq, struct mddev *mddev)
+static void multipath_status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct mpconf *conf = mddev->private;
 	int i;
 
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
 		    conf->raid_disks - mddev->degraded);
-	for (i = 0; i < conf->raid_disks; i++)
-		seq_printf (seq, "%s",
-			    conf->multipaths[i].rdev &&
-			    test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
+	rcu_read_lock();
+	for (i = 0; i < conf->raid_disks; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+	}
+	rcu_read_unlock();
 	seq_printf (seq, "]");
 }
 
@@ -295,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			goto abort;
 		}
 		p->rdev = NULL;
-		synchronize_rcu();
-		if (atomic_read(&rdev->nr_pending)) {
-			/* lost the race, try later */
-			err = -EBUSY;
-			p->rdev = rdev;
-			goto abort;
+		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+			synchronize_rcu();
+			if (atomic_read(&rdev->nr_pending)) {
+				/* lost the race, try later */
+				err = -EBUSY;
+				p->rdev = rdev;
+				goto abort;
+			}
 		}
 		err = md_integrity_register(mddev);
 	}
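Each personality's remove path now skips its private grace period when remove_and_add_spares() has already flagged the device; raid1 and raid10 below get the identical transformation. The common shape:

	p->rdev = NULL;
	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			p->rdev = rdev;
			goto abort;
		}
	}

When the flag is set, the batched synchronize_rcu() already guaranteed that no RCU reader still holds the stale pointer, and nr_pending cannot rise again for a Faulty device, so both the grace period and the re-check are safely skipped.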
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4e6da4497553..46168ef2e279 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,14 +319,13 @@ static void raid1_end_read_request(struct bio *bio)
 {
 	int uptodate = !bio->bi_error;
 	struct r1bio *r1_bio = bio->bi_private;
-	int mirror;
 	struct r1conf *conf = r1_bio->mddev->private;
+	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
 
-	mirror = r1_bio->read_disk;
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	update_head_pos(mirror, r1_bio);
+	update_head_pos(r1_bio->read_disk, r1_bio);
 
 	if (uptodate)
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
@@ -339,14 +338,14 @@ static void raid1_end_read_request(struct bio *bio)
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (r1_bio->mddev->degraded == conf->raid_disks ||
 		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
-		     test_bit(In_sync, &conf->mirrors[mirror].rdev->flags)))
+		     test_bit(In_sync, &rdev->flags)))
 			uptodate = 1;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
 	if (uptodate) {
 		raid_end_bio_io(r1_bio);
-		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 	} else {
 		/*
 		 * oops, read error:
@@ -356,7 +355,7 @@ static void raid1_end_read_request(struct bio *bio)
 			       KERN_ERR "md/raid1:%s: %s: "
 			       "rescheduling sector %llu\n",
 			       mdname(conf->mddev),
-			       bdevname(conf->mirrors[mirror].rdev->bdev,
+			       bdevname(rdev->bdev,
 					b),
 			       (unsigned long long)r1_bio->sector);
 		set_bit(R1BIO_ReadError, &r1_bio->state);
@@ -403,20 +402,18 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
 static void raid1_end_write_request(struct bio *bio)
 {
 	struct r1bio *r1_bio = bio->bi_private;
-	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+	int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 	struct r1conf *conf = r1_bio->mddev->private;
 	struct bio *to_put = NULL;
-
-	mirror = find_bio_disk(r1_bio, bio);
+	int mirror = find_bio_disk(r1_bio, bio);
+	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
 
 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
 	if (bio->bi_error) {
-		set_bit(WriteErrorSeen,
-			&conf->mirrors[mirror].rdev->flags);
-		if (!test_and_set_bit(WantReplacement,
-				      &conf->mirrors[mirror].rdev->flags))
+		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement, &rdev->flags))
 			set_bit(MD_RECOVERY_NEEDED, &
 				conf->mddev->recovery);
 
@@ -445,13 +442,12 @@ static void raid1_end_write_request(struct bio *bio)
 		 * before rdev->recovery_offset, but for simplicity we don't
 		 * check this here.
 		 */
-		if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
-		    !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
+		if (test_bit(In_sync, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags))
 			set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
-		if (is_badblock(conf->mirrors[mirror].rdev,
-				r1_bio->sector, r1_bio->sectors,
+		if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
 				&first_bad, &bad_sectors)) {
 			r1_bio->bios[mirror] = IO_MADE_GOOD;
 			set_bit(R1BIO_MadeGood, &r1_bio->state);
@@ -459,7 +455,7 @@ static void raid1_end_write_request(struct bio *bio)
 	}
 
 	if (behind) {
-		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+		if (test_bit(WriteMostly, &rdev->flags))
 			atomic_dec(&r1_bio->behind_remaining);
 
 		/*
@@ -483,8 +479,7 @@ static void raid1_end_write_request(struct bio *bio)
 		}
 	}
 	if (r1_bio->bios[mirror] == NULL)
-		rdev_dec_pending(conf->mirrors[mirror].rdev,
-				 conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 
 	/*
 	 * Let's see if all mirrored write operations have finished
@@ -689,13 +684,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		if (!rdev)
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
-		if (test_bit(Faulty, &rdev->flags)) {
-			/* cannot risk returning a device that failed
-			 * before we inc'ed nr_pending
-			 */
-			rdev_dec_pending(rdev, conf->mddev);
-			goto retry;
-		}
 		sectors = best_good_sectors;
 
 		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
@@ -1666,13 +1654,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			goto abort;
 		}
 		p->rdev = NULL;
-		synchronize_rcu();
-		if (atomic_read(&rdev->nr_pending)) {
-			/* lost the race, try later */
-			err = -EBUSY;
-			p->rdev = rdev;
-			goto abort;
-		} else if (conf->mirrors[conf->raid_disks + number].rdev) {
+		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+			synchronize_rcu();
+			if (atomic_read(&rdev->nr_pending)) {
+				/* lost the race, try later */
+				err = -EBUSY;
+				p->rdev = rdev;
+				goto abort;
+			}
+		}
+		if (conf->mirrors[conf->raid_disks + number].rdev) {
 			/* We just removed a device that is being replaced.
 			 * Move down the replacement. We drain all IO before
 			 * doing this to avoid confusion.
@@ -1719,11 +1710,9 @@ static void end_sync_write(struct bio *bio)
 	struct r1bio *r1_bio = bio->bi_private;
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
-	int mirror=0;
 	sector_t first_bad;
 	int bad_sectors;
-
-	mirror = find_bio_disk(r1_bio, bio);
+	struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
 
 	if (!uptodate) {
 		sector_t sync_blocks = 0;
@@ -1736,16 +1725,12 @@ static void end_sync_write(struct bio *bio)
 			s += sync_blocks;
 			sectors_to_go -= sync_blocks;
 		} while (sectors_to_go > 0);
-		set_bit(WriteErrorSeen,
-			&conf->mirrors[mirror].rdev->flags);
-		if (!test_and_set_bit(WantReplacement,
-				      &conf->mirrors[mirror].rdev->flags))
+		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement, &rdev->flags))
 			set_bit(MD_RECOVERY_NEEDED, &
 				mddev->recovery);
 		set_bit(R1BIO_WriteError, &r1_bio->state);
-	} else if (is_badblock(conf->mirrors[mirror].rdev,
-			       r1_bio->sector,
-			       r1_bio->sectors,
+	} else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
 			       &first_bad, &bad_sectors) &&
 		   !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
 				r1_bio->sector,
@@ -2072,29 +2057,30 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			s = PAGE_SIZE >> 9;
 
 		do {
-			/* Note: no rcu protection needed here
-			 * as this is synchronous in the raid1d thread
-			 * which is the thread that might remove
-			 * a device. If raid1d ever becomes multi-threaded....
-			 */
 			sector_t first_bad;
 			int bad_sectors;
 
-			rdev = conf->mirrors[d].rdev;
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
 			    (test_bit(In_sync, &rdev->flags) ||
 			     (!test_bit(Faulty, &rdev->flags) &&
 			      rdev->recovery_offset >= sect + s)) &&
 			    is_badblock(rdev, sect, s,
-					&first_bad, &bad_sectors) == 0 &&
-			    sync_page_io(rdev, sect, s<<9,
+					&first_bad, &bad_sectors) == 0) {
+				atomic_inc(&rdev->nr_pending);
+				rcu_read_unlock();
+				if (sync_page_io(rdev, sect, s<<9,
 					 conf->tmppage, REQ_OP_READ, 0, false))
 				success = 1;
-			else {
-				d++;
-				if (d == conf->raid_disks * 2)
-					d = 0;
-			}
+				rdev_dec_pending(rdev, mddev);
+				if (success)
+					break;
+			} else
+				rcu_read_unlock();
+			d++;
+			if (d == conf->raid_disks * 2)
+				d = 0;
 		} while (!success && d != read_disk);
 
 		if (!success) {
@@ -2110,11 +2096,17 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			if (d==0)
 				d = conf->raid_disks * 2;
 			d--;
-			rdev = conf->mirrors[d].rdev;
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
-			    !test_bit(Faulty, &rdev->flags))
+			    !test_bit(Faulty, &rdev->flags)) {
+				atomic_inc(&rdev->nr_pending);
+				rcu_read_unlock();
 				r1_sync_page_io(rdev, sect, s,
 						conf->tmppage, WRITE);
+				rdev_dec_pending(rdev, mddev);
+			} else
+				rcu_read_unlock();
 		}
 		d = start;
 		while (d != read_disk) {
@@ -2122,9 +2114,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			if (d==0)
 				d = conf->raid_disks * 2;
 			d--;
-			rdev = conf->mirrors[d].rdev;
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
 			    !test_bit(Faulty, &rdev->flags)) {
+				atomic_inc(&rdev->nr_pending);
+				rcu_read_unlock();
 				if (r1_sync_page_io(rdev, sect, s,
 						    conf->tmppage, READ)) {
 					atomic_add(s, &rdev->corrected_errors);
@@ -2133,10 +2128,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 						"(%d sectors at %llu on %s)\n",
 						mdname(mddev), s,
 						(unsigned long long)(sect +
 						    rdev->data_offset),
 						bdevname(rdev->bdev, b));
 				}
-			}
+				rdev_dec_pending(rdev, mddev);
+			} else
+				rcu_read_unlock();
 		}
 		sectors -= s;
 		sect += s;
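fix_read_error() used to rely on being single-threaded in raid1d and touched conf->mirrors[d].rdev with no protection; all three of its loops now use the same pin-then-sleep idiom as the hot paths. Distilled from the hunks above:

	rcu_read_lock();
	rdev = rcu_dereference(conf->mirrors[d].rdev);
	if (rdev && !test_bit(Faulty, &rdev->flags)) {
		atomic_inc(&rdev->nr_pending);	/* pin the device */
		rcu_read_unlock();		/* now safe to sleep */
		/* ... synchronous page IO against rdev ... */
		rdev_dec_pending(rdev, mddev);	/* unpin */
	} else
		rcu_read_unlock();

The nr_pending reference is what allows dropping the RCU read lock before sleeping in sync_page_io(); a concurrent raid1_remove_disk() sees the non-zero count and backs off with -EBUSY.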
@@ -2534,6 +2531,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		return sync_blocks;
 	}
 
+	/*
+	 * If there is non-resync activity waiting for a turn, then let it
+	 * though before starting on this new sync request.
+	 */
+	if (conf->nr_waiting)
+		schedule_timeout_uninterruptible(1);
+
 	/* we are incrementing sector_nr below. To be safe, we check against
 	 * sector_nr + two times RESYNC_SECTORS
 	 */
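The same one-jiffy yield is added to raid10_sync_request() further down. Before claiming the barrier for another resync window, the resync thread lets queued regular IO go first:

	if (conf->nr_waiting)
		schedule_timeout_uninterruptible(1);

Without this, a tight resync loop can re-raise the barrier before the waiters in wait_barrier() get scheduled, starving normal IO for long stretches.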
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 26ae74fd0d01..ed29fc899f06 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -707,7 +707,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
-retry:
 	sectors = r10_bio->sectors;
 	best_slot = -1;
 	best_rdev = NULL;
@@ -804,13 +803,6 @@ retry:
 
 	if (slot >= 0) {
 		atomic_inc(&rdev->nr_pending);
-		if (test_bit(Faulty, &rdev->flags)) {
-			/* Cannot risk returning a device that failed
-			 * before we inc'ed nr_pending
-			 */
-			rdev_dec_pending(rdev, conf->mddev);
-			goto retry;
-		}
 		r10_bio->read_slot = slot;
 	} else
 		rdev = NULL;
@@ -913,7 +905,7 @@ static void raise_barrier(struct r10conf *conf, int force)
 
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
 			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
@@ -944,23 +936,23 @@ static void wait_barrier(struct r10conf *conf)
 		 */
 		wait_event_lock_irq(conf->wait_barrier,
 				    !conf->barrier ||
-				    (conf->nr_pending &&
+				    (atomic_read(&conf->nr_pending) &&
 				     current->bio_list &&
 				     !bio_list_empty(current->bio_list)),
 				    conf->resync_lock);
 		conf->nr_waiting--;
+		if (!conf->nr_waiting)
+			wake_up(&conf->wait_barrier);
 	}
-	conf->nr_pending++;
+	atomic_inc(&conf->nr_pending);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
 static void allow_barrier(struct r10conf *conf)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->nr_pending--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
-	wake_up(&conf->wait_barrier);
+	if ((atomic_dec_and_test(&conf->nr_pending)) ||
+			(conf->array_freeze_pending))
+		wake_up(&conf->wait_barrier);
 }
 
 static void freeze_array(struct r10conf *conf, int extra)
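With nr_pending converted to atomic_t, allow_barrier() runs with no spinlock at all: an atomic decrement plus a conditional wake_up(). The sleepers in raise_barrier() and freeze_array() still re-check their conditions under resync_lock, so the only requirement on the IO side is that the final decrement (or any decrement during a pending freeze) issues a wakeup. A minimal user-space analogue of the counter half, using C11 atomics in place of atomic_t (the wait-queue machinery is elided):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int nr_pending;

	static void start_io(void)
	{
		atomic_fetch_add(&nr_pending, 1);	/* wait_barrier() side */
	}

	/* allow_barrier() side: returns true when a barrier/freeze
	 * waiter must be woken */
	static bool end_io(bool freeze_pending)
	{
		return atomic_fetch_sub(&nr_pending, 1) == 1 || freeze_pending;
	}

The win is on the completion path, which previously took and released resync_lock for every request; the rare freeze path pays instead, via the new array_freeze_pending counter that forces extra wakeups while a freeze is waiting.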
@@ -978,13 +970,15 @@ static void freeze_array(struct r10conf *conf, int extra)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
+	conf->array_freeze_pending++;
 	conf->barrier++;
 	conf->nr_waiting++;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
-				conf->nr_pending == conf->nr_queued+extra,
+				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
 				conf->resync_lock,
 				flush_pending_writes(conf));
 
+	conf->array_freeze_pending--;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -1499,10 +1493,12 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev)
 	}
 	seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
 					conf->geo.raid_disks - mddev->degraded);
-	for (i = 0; i < conf->geo.raid_disks; i++)
-		seq_printf(seq, "%s",
-			   conf->mirrors[i].rdev &&
-			   test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
+	rcu_read_lock();
+	for (i = 0; i < conf->geo.raid_disks; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+	}
+	rcu_read_unlock();
 	seq_printf(seq, "]");
 }
 
@@ -1600,7 +1596,7 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 static void print_conf(struct r10conf *conf)
 {
 	int i;
-	struct raid10_info *tmp;
+	struct md_rdev *rdev;
 
 	printk(KERN_DEBUG "RAID10 conf printout:\n");
 	if (!conf) {
@@ -1610,14 +1606,16 @@ static void print_conf(struct r10conf *conf)
 	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
 		conf->geo.raid_disks);
 
+	/* This is only called with ->reconfig_mutex held, so
+	 * rcu protection of rdev is not needed */
 	for (i = 0; i < conf->geo.raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
-		tmp = conf->mirrors + i;
-		if (tmp->rdev)
+		rdev = conf->mirrors[i].rdev;
+		if (rdev)
 			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
-				i, !test_bit(In_sync, &tmp->rdev->flags),
-				!test_bit(Faulty, &tmp->rdev->flags),
-				bdevname(tmp->rdev->bdev,b));
+				i, !test_bit(In_sync, &rdev->flags),
+				!test_bit(Faulty, &rdev->flags),
+				bdevname(rdev->bdev,b));
 	}
 }
 
@@ -1766,7 +1764,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		err = -EBUSY;
 		goto abort;
 	}
-	/* Only remove faulty devices if recovery
+	/* Only remove non-faulty devices if recovery
 	 * is not possible.
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
@@ -1778,13 +1776,16 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		goto abort;
 	}
 	*rdevp = NULL;
-	synchronize_rcu();
-	if (atomic_read(&rdev->nr_pending)) {
-		/* lost the race, try later */
-		err = -EBUSY;
-		*rdevp = rdev;
-		goto abort;
-	} else if (p->replacement) {
+	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+		synchronize_rcu();
+		if (atomic_read(&rdev->nr_pending)) {
+			/* lost the race, try later */
+			err = -EBUSY;
+			*rdevp = rdev;
+			goto abort;
+		}
+	}
+	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
 		clear_bit(Replacement, &p->replacement->flags);
@@ -2171,21 +2172,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
  */
 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
 {
-	struct timespec cur_time_mon;
+	long cur_time_mon;
 	unsigned long hours_since_last;
 	unsigned int read_errors = atomic_read(&rdev->read_errors);
 
-	ktime_get_ts(&cur_time_mon);
+	cur_time_mon = ktime_get_seconds();
 
-	if (rdev->last_read_error.tv_sec == 0 &&
-	    rdev->last_read_error.tv_nsec == 0) {
+	if (rdev->last_read_error == 0) {
 		/* first time we've seen a read error */
 		rdev->last_read_error = cur_time_mon;
 		return;
 	}
 
-	hours_since_last = (cur_time_mon.tv_sec -
-			    rdev->last_read_error.tv_sec) / 3600;
+	hours_since_last = (long)(cur_time_mon -
+			    rdev->last_read_error) / 3600;
 
 	rdev->last_read_error = cur_time_mon;
 
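With last_read_error held as plain seconds (time64_t in md.h above), the y2038-unsafe ktime_get_ts() call disappears and the decay arithmetic becomes integer math on a monotonic seconds counter. The halving rule in the rest of the function lies below the visible hunk; this sketch paraphrases it and should be read as illustrative, not verbatim:

	/* sketch of check_decay_read_errors(): halve the error count
	 * once per elapsed hour */
	long now = ktime_get_seconds();
	unsigned long hours = (long)(now - rdev->last_read_error) / 3600;
	unsigned int read_errors = atomic_read(&rdev->read_errors);

	rdev->last_read_error = now;
	if (hours >= 8 * sizeof(read_errors))	/* shift would be undefined */
		atomic_set(&rdev->read_errors, 0);
	else
		atomic_set(&rdev->read_errors, read_errors >> hours);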
@@ -2264,7 +2264,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 		printk(KERN_NOTICE
 		       "md/raid10:%s: %s: Failing raid device\n",
 		       mdname(mddev), b);
-		md_error(mddev, conf->mirrors[d].rdev);
+		md_error(mddev, rdev);
 		r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
 		return;
 	}
@@ -2287,6 +2287,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
 			    test_bit(In_sync, &rdev->flags) &&
+			    !test_bit(Faulty, &rdev->flags) &&
 			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
 					&first_bad, &bad_sectors) == 0) {
 				atomic_inc(&rdev->nr_pending);
@@ -2340,6 +2341,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (!rdev ||
+			    test_bit(Faulty, &rdev->flags) ||
 			    !test_bit(In_sync, &rdev->flags))
 				continue;
 
@@ -2379,6 +2381,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (!rdev ||
+			    test_bit(Faulty, &rdev->flags) ||
 			    !test_bit(In_sync, &rdev->flags))
 				continue;
 
@@ -2876,11 +2879,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			/* Completed a full sync so the replacements
 			 * are now fully recovered.
 			 */
-			for (i = 0; i < conf->geo.raid_disks; i++)
-				if (conf->mirrors[i].replacement)
-					conf->mirrors[i].replacement
-						->recovery_offset
-						= MaxSector;
+			rcu_read_lock();
+			for (i = 0; i < conf->geo.raid_disks; i++) {
+				struct md_rdev *rdev =
+					rcu_dereference(conf->mirrors[i].replacement);
+				if (rdev)
+					rdev->recovery_offset = MaxSector;
+			}
+			rcu_read_unlock();
 		}
 		conf->fullsync = 0;
 	}
@@ -2911,6 +2917,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	    max_sector > (sector_nr | chunk_mask))
 		max_sector = (sector_nr | chunk_mask) + 1;
 
+	/*
+	 * If there is non-resync activity waiting for a turn, then let it
+	 * though before starting on this new sync request.
+	 */
+	if (conf->nr_waiting)
+		schedule_timeout_uninterruptible(1);
+
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
 	 * have bi_end_io, bi_sector, bi_bdev set,
@@ -2939,14 +2952,20 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			int must_sync;
 			int any_working;
 			struct raid10_info *mirror = &conf->mirrors[i];
+			struct md_rdev *mrdev, *mreplace;
 
-			if ((mirror->rdev == NULL ||
-			     test_bit(In_sync, &mirror->rdev->flags))
-			    &&
-			    (mirror->replacement == NULL ||
-			     test_bit(Faulty,
-				      &mirror->replacement->flags)))
+			rcu_read_lock();
+			mrdev = rcu_dereference(mirror->rdev);
+			mreplace = rcu_dereference(mirror->replacement);
+
+			if ((mrdev == NULL ||
+			     test_bit(Faulty, &mrdev->flags) ||
+			     test_bit(In_sync, &mrdev->flags)) &&
+			    (mreplace == NULL ||
+			     test_bit(Faulty, &mreplace->flags))) {
+				rcu_read_unlock();
 				continue;
+			}
 
 			still_degraded = 0;
 			/* want to reconstruct this device */
@@ -2956,8 +2975,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				/* last stripe is not complete - don't
 				 * try to recover this sector.
 				 */
+				rcu_read_unlock();
 				continue;
 			}
+			if (mreplace && test_bit(Faulty, &mreplace->flags))
+				mreplace = NULL;
 			/* Unless we are doing a full sync, or a replacement
 			 * we only need to recover the block if it is set in
 			 * the bitmap
@@ -2967,14 +2989,19 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (sync_blocks < max_sync)
 				max_sync = sync_blocks;
 			if (!must_sync &&
-			    mirror->replacement == NULL &&
+			    mreplace == NULL &&
 			    !conf->fullsync) {
 				/* yep, skip the sync_blocks here, but don't assume
 				 * that there will never be anything to do here
 				 */
 				chunks_skipped = -1;
+				rcu_read_unlock();
 				continue;
 			}
+			atomic_inc(&mrdev->nr_pending);
+			if (mreplace)
+				atomic_inc(&mreplace->nr_pending);
+			rcu_read_unlock();
 
 			r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 			r10_bio->state = 0;
@@ -2993,12 +3020,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			/* Need to check if the array will still be
 			 * degraded
 			 */
-			for (j = 0; j < conf->geo.raid_disks; j++)
-				if (conf->mirrors[j].rdev == NULL ||
-				    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+			rcu_read_lock();
+			for (j = 0; j < conf->geo.raid_disks; j++) {
+				struct md_rdev *rdev = rcu_dereference(
+					conf->mirrors[j].rdev);
+				if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
 					still_degraded = 1;
 					break;
 				}
+			}
 
 			must_sync = bitmap_start_sync(mddev->bitmap, sect,
 						      &sync_blocks, still_degraded);
@@ -3008,15 +3038,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				int k;
 				int d = r10_bio->devs[j].devnum;
 				sector_t from_addr, to_addr;
-				struct md_rdev *rdev;
+				struct md_rdev *rdev =
+					rcu_dereference(conf->mirrors[d].rdev);
 				sector_t sector, first_bad;
 				int bad_sectors;
-				if (!conf->mirrors[d].rdev ||
-				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+				if (!rdev ||
+				    !test_bit(In_sync, &rdev->flags))
 					continue;
 				/* This is where we read from */
 				any_working = 1;
-				rdev = conf->mirrors[d].rdev;
 				sector = r10_bio->devs[j].addr;
 
 				if (is_badblock(rdev, sector, max_sync,
@@ -3055,8 +3085,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;
 
-				rdev = mirror->rdev;
-				if (!test_bit(In_sync, &rdev->flags)) {
+				if (!test_bit(In_sync, &mrdev->flags)) {
 					bio = r10_bio->devs[1].bio;
 					bio_reset(bio);
 					bio->bi_next = biolist;
@@ -3065,8 +3094,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 					bio->bi_end_io = end_sync_write;
 					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 					bio->bi_iter.bi_sector = to_addr
-						+ rdev->data_offset;
-					bio->bi_bdev = rdev->bdev;
+						+ mrdev->data_offset;
+					bio->bi_bdev = mrdev->bdev;
 					atomic_inc(&r10_bio->remaining);
 				} else
 					r10_bio->devs[1].bio->bi_end_io = NULL;
@@ -3075,8 +3104,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				bio = r10_bio->devs[1].repl_bio;
 				if (bio)
 					bio->bi_end_io = NULL;
-				rdev = mirror->replacement;
-				/* Note: if rdev != NULL, then bio
+				/* Note: if mreplace != NULL, then bio
 				 * cannot be NULL as r10buf_pool_alloc will
 				 * have allocated it.
 				 * So the second test here is pointless.
@@ -3084,8 +3112,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				 * this comment keeps human reviewers
 				 * happy.
 				 */
-				if (rdev == NULL || bio == NULL ||
-				    test_bit(Faulty, &rdev->flags))
+				if (mreplace == NULL || bio == NULL ||
+				    test_bit(Faulty, &mreplace->flags))
 					break;
 				bio_reset(bio);
 				bio->bi_next = biolist;
@@ -3094,11 +3122,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				bio->bi_end_io = end_sync_write;
 				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 				bio->bi_iter.bi_sector = to_addr +
-					rdev->data_offset;
-				bio->bi_bdev = rdev->bdev;
+					mreplace->data_offset;
+				bio->bi_bdev = mreplace->bdev;
 				atomic_inc(&r10_bio->remaining);
 				break;
 			}
+			rcu_read_unlock();
 			if (j == conf->copies) {
 				/* Cannot recover, so abort the recovery or
 				 * record a bad block */
@@ -3111,15 +3140,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 					if (r10_bio->devs[k].devnum == i)
 						break;
 				if (!test_bit(In_sync,
-					      &mirror->rdev->flags)
+					      &mrdev->flags)
 				    && !rdev_set_badblocks(
-					    mirror->rdev,
+					    mrdev,
 					    r10_bio->devs[k].addr,
 					    max_sync, 0))
 					any_working = 0;
-				if (mirror->replacement &&
+				if (mreplace &&
 				    !rdev_set_badblocks(
-					    mirror->replacement,
+					    mreplace,
 					    r10_bio->devs[k].addr,
 					    max_sync, 0))
 					any_working = 0;
@@ -3137,8 +3166,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				if (rb2)
 					atomic_dec(&rb2->remaining);
 				r10_bio = rb2;
+				rdev_dec_pending(mrdev, mddev);
+				if (mreplace)
+					rdev_dec_pending(mreplace, mddev);
 				break;
 			}
+			rdev_dec_pending(mrdev, mddev);
+			if (mreplace)
+				rdev_dec_pending(mreplace, mddev);
 		}
 		if (biolist == NULL) {
 			while (r10_bio) {
@@ -3183,6 +3218,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			int d = r10_bio->devs[i].devnum;
 			sector_t first_bad, sector;
 			int bad_sectors;
+			struct md_rdev *rdev;
 
 			if (r10_bio->devs[i].repl_bio)
 				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
@@ -3190,12 +3226,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio = r10_bio->devs[i].bio;
 			bio_reset(bio);
 			bio->bi_error = -EIO;
-			if (conf->mirrors[d].rdev == NULL ||
-			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->mirrors[d].rdev);
+			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
+				rcu_read_unlock();
 				continue;
+			}
 			sector = r10_bio->devs[i].addr;
-			if (is_badblock(conf->mirrors[d].rdev,
-					sector, max_sync,
+			if (is_badblock(rdev, sector, max_sync,
 					&first_bad, &bad_sectors)) {
 				if (first_bad > sector)
 					max_sync = first_bad - sector;
@@ -3203,25 +3241,28 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 					bad_sectors -= (sector - first_bad);
 					if (max_sync > bad_sectors)
 						max_sync = bad_sectors;
+					rcu_read_unlock();
 					continue;
 				}
 			}
-			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&r10_bio->remaining);
 			bio->bi_next = biolist;
 			biolist = bio;
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio_set_op_attrs(bio, REQ_OP_READ, 0);
-			bio->bi_iter.bi_sector = sector +
-				conf->mirrors[d].rdev->data_offset;
-			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+			bio->bi_iter.bi_sector = sector + rdev->data_offset;
+			bio->bi_bdev = rdev->bdev;
 			count++;
 
-			if (conf->mirrors[d].replacement == NULL ||
-			    test_bit(Faulty,
-				     &conf->mirrors[d].replacement->flags))
+			rdev = rcu_dereference(conf->mirrors[d].replacement);
+			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
+				rcu_read_unlock();
 				continue;
+			}
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
 
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
@@ -3229,15 +3270,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio->bi_error = -EIO;
 
 			sector = r10_bio->devs[i].addr;
-			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 			bio->bi_next = biolist;
 			biolist = bio;
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_write;
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-			bio->bi_iter.bi_sector = sector +
-				conf->mirrors[d].replacement->data_offset;
-			bio->bi_bdev = conf->mirrors[d].replacement->bdev;
+			bio->bi_iter.bi_sector = sector + rdev->data_offset;
+			bio->bi_bdev = rdev->bdev;
 			count++;
 		}
 
@@ -3504,6 +3543,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3504 3543
3505 spin_lock_init(&conf->resync_lock); 3544 spin_lock_init(&conf->resync_lock);
3506 init_waitqueue_head(&conf->wait_barrier); 3545 init_waitqueue_head(&conf->wait_barrier);
3546 atomic_set(&conf->nr_pending, 0);
3507 3547
3508 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 3548 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3509 if (!conf->thread) 3549 if (!conf->thread)
@@ -4333,15 +4373,16 @@ read_more:
4333 blist = read_bio; 4373 blist = read_bio;
4334 read_bio->bi_next = NULL; 4374 read_bio->bi_next = NULL;
4335 4375
4376 rcu_read_lock();
4336 for (s = 0; s < conf->copies*2; s++) { 4377 for (s = 0; s < conf->copies*2; s++) {
4337 struct bio *b; 4378 struct bio *b;
4338 int d = r10_bio->devs[s/2].devnum; 4379 int d = r10_bio->devs[s/2].devnum;
4339 struct md_rdev *rdev2; 4380 struct md_rdev *rdev2;
4340 if (s&1) { 4381 if (s&1) {
4341 rdev2 = conf->mirrors[d].replacement; 4382 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4342 b = r10_bio->devs[s/2].repl_bio; 4383 b = r10_bio->devs[s/2].repl_bio;
4343 } else { 4384 } else {
4344 rdev2 = conf->mirrors[d].rdev; 4385 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4345 b = r10_bio->devs[s/2].bio; 4386 b = r10_bio->devs[s/2].bio;
4346 } 4387 }
4347 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4388 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
@@ -4386,6 +4427,7 @@ read_more:
4386 nr_sectors += len >> 9; 4427 nr_sectors += len >> 9;
4387 } 4428 }
4388bio_full: 4429bio_full:
4430 rcu_read_unlock();
4389 r10_bio->sectors = nr_sectors; 4431 r10_bio->sectors = nr_sectors;
4390 4432
4391 /* Now submit the read */ 4433 /* Now submit the read */
@@ -4437,16 +4479,20 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4437 struct bio *b; 4479 struct bio *b;
4438 int d = r10_bio->devs[s/2].devnum; 4480 int d = r10_bio->devs[s/2].devnum;
4439 struct md_rdev *rdev; 4481 struct md_rdev *rdev;
4482 rcu_read_lock();
4440 if (s&1) { 4483 if (s&1) {
4441 rdev = conf->mirrors[d].replacement; 4484 rdev = rcu_dereference(conf->mirrors[d].replacement);
4442 b = r10_bio->devs[s/2].repl_bio; 4485 b = r10_bio->devs[s/2].repl_bio;
4443 } else { 4486 } else {
4444 rdev = conf->mirrors[d].rdev; 4487 rdev = rcu_dereference(conf->mirrors[d].rdev);
4445 b = r10_bio->devs[s/2].bio; 4488 b = r10_bio->devs[s/2].bio;
4446 } 4489 }
4447 if (!rdev || test_bit(Faulty, &rdev->flags)) 4490 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4491 rcu_read_unlock();
4448 continue; 4492 continue;
4493 }
4449 atomic_inc(&rdev->nr_pending); 4494 atomic_inc(&rdev->nr_pending);
4495 rcu_read_unlock();
4450 md_sync_acct(b->bi_bdev, r10_bio->sectors); 4496 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4451 atomic_inc(&r10_bio->remaining); 4497 atomic_inc(&r10_bio->remaining);
4452 b->bi_next = NULL; 4498 b->bi_next = NULL;
@@ -4507,9 +4553,10 @@ static int handle_reshape_read_error(struct mddev *mddev,
4507 if (s > (PAGE_SIZE >> 9)) 4553 if (s > (PAGE_SIZE >> 9))
4508 s = PAGE_SIZE >> 9; 4554 s = PAGE_SIZE >> 9;
4509 4555
4556 rcu_read_lock();
4510 while (!success) { 4557 while (!success) {
4511 int d = r10b->devs[slot].devnum; 4558 int d = r10b->devs[slot].devnum;
4512 struct md_rdev *rdev = conf->mirrors[d].rdev; 4559 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4513 sector_t addr; 4560 sector_t addr;
4514 if (rdev == NULL || 4561 if (rdev == NULL ||
4515 test_bit(Faulty, &rdev->flags) || 4562 test_bit(Faulty, &rdev->flags) ||
@@ -4517,11 +4564,15 @@ static int handle_reshape_read_error(struct mddev *mddev,
4517 goto failed; 4564 goto failed;
4518 4565
4519 addr = r10b->devs[slot].addr + idx * PAGE_SIZE; 4566 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4567 atomic_inc(&rdev->nr_pending);
4568 rcu_read_unlock();
4520 success = sync_page_io(rdev, 4569 success = sync_page_io(rdev,
4521 addr, 4570 addr,
4522 s << 9, 4571 s << 9,
4523 bvec[idx].bv_page, 4572 bvec[idx].bv_page,
4524 REQ_OP_READ, 0, false); 4573 REQ_OP_READ, 0, false);
4574 rdev_dec_pending(rdev, mddev);
4575 rcu_read_lock();
4525 if (success) 4576 if (success)
4526 break; 4577 break;
4527 failed: 4578 failed:
@@ -4531,6 +4582,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
4531 if (slot == first_slot) 4582 if (slot == first_slot)
4532 break; 4583 break;
4533 } 4584 }
4585 rcu_read_unlock();
4534 if (!success) { 4586 if (!success) {
4535 /* couldn't read this block, must give up */ 4587 /* couldn't read this block, must give up */
4536 set_bit(MD_RECOVERY_INTR, 4588 set_bit(MD_RECOVERY_INTR,
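Note: sync_page_io() blocks, and sleeping inside an RCU read-side critical section is not allowed. The hunk above handles this by pinning the rdev, dropping the lock across the synchronous read, and re-taking it before the next lookup. Condensed, with the recovery-offset check and retry bookkeeping elided:

    rcu_read_lock();
    while (!success) {
        struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);

        if (rdev == NULL || test_bit(Faulty, &rdev->flags))
            goto failed;
        atomic_inc(&rdev->nr_pending);
        rcu_read_unlock();              /* must not sleep under RCU */
        success = sync_page_io(rdev, addr, s << 9,
                               bvec[idx].bv_page, REQ_OP_READ, 0, false);
        rdev_dec_pending(rdev, mddev);
        rcu_read_lock();                /* back under RCU for the retry */
        if (success)
            break;
    failed:
        /* advance to the next slot ... */;
    }
    rcu_read_unlock();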
@@ -4600,16 +4652,18 @@ static void raid10_finish_reshape(struct mddev *mddev)
4600 } 4652 }
4601 } else { 4653 } else {
4602 int d; 4654 int d;
4655 rcu_read_lock();
4603 for (d = conf->geo.raid_disks ; 4656 for (d = conf->geo.raid_disks ;
4604 d < conf->geo.raid_disks - mddev->delta_disks; 4657 d < conf->geo.raid_disks - mddev->delta_disks;
4605 d++) { 4658 d++) {
4606 struct md_rdev *rdev = conf->mirrors[d].rdev; 4659 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4607 if (rdev) 4660 if (rdev)
4608 clear_bit(In_sync, &rdev->flags); 4661 clear_bit(In_sync, &rdev->flags);
4609 rdev = conf->mirrors[d].replacement; 4662 rdev = rcu_dereference(conf->mirrors[d].replacement);
4610 if (rdev) 4663 if (rdev)
4611 clear_bit(In_sync, &rdev->flags); 4664 clear_bit(In_sync, &rdev->flags);
4612 } 4665 }
4666 rcu_read_unlock();
4613 } 4667 }
4614 mddev->layout = mddev->new_layout; 4668 mddev->layout = mddev->new_layout;
4615 mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 4669 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
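Note: the loop above runs when a reshape has shrunk the array, so mddev->delta_disks is negative and the bounds walk exactly the slots that just left the geometry. With hypothetical numbers, shrinking 6 -> 4 devices gives:

    /* conf->geo.raid_disks == 4, mddev->delta_disks == -2 */
    for (d = conf->geo.raid_disks;                          /* d = 4 */
         d < conf->geo.raid_disks - mddev->delta_disks;     /* d < 6 */
         d++)
        /* clears In_sync on slots 4 and 5 only */;

The new rcu_read_lock()/rcu_dereference() pair is needed because nothing else pins these rdevs while their In_sync bits are cleared.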
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 6fc2c75759bf..18ec1f7a98bf 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -64,10 +64,11 @@ struct r10conf {
64 int pending_count; 64 int pending_count;
65 65
66 spinlock_t resync_lock; 66 spinlock_t resync_lock;
67 int nr_pending; 67 atomic_t nr_pending;
68 int nr_waiting; 68 int nr_waiting;
69 int nr_queued; 69 int nr_queued;
70 int barrier; 70 int barrier;
71 int array_freeze_pending;
71 sector_t next_resync; 72 sector_t next_resync;
72 int fullsync; /* set to 1 if a full sync is needed, 73 int fullsync; /* set to 1 if a full sync is needed,
73 * (fresh device added). 74 * (fresh device added).
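Note: turning r10conf.nr_pending into an atomic_t lets the request fast path count in-flight I/O without taking resync_lock; the barrier/resync logic that consumes the counter is outside this section. Roughly the difference, sketched from the wait_barrier()/allow_barrier() idiom (condensed, not the verbatim code):

    /* before: a spinlock round-trip per request just to bump a counter */
    spin_lock_irq(&conf->resync_lock);
    conf->nr_pending++;
    spin_unlock_irq(&conf->resync_lock);

    /* after: lock-free on the fast path */
    atomic_inc(&conf->nr_pending);

array_freeze_pending is a new counter; presumably freeze_array() uses it to coordinate with this lock-free path, but that code is not shown in this section.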
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6953d78297b0..d189e894b921 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3080,7 +3080,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3080 struct md_rdev *rdev; 3080 struct md_rdev *rdev;
3081 rcu_read_lock(); 3081 rcu_read_lock();
3082 rdev = rcu_dereference(conf->disks[i].rdev); 3082 rdev = rcu_dereference(conf->disks[i].rdev);
3083 if (rdev && test_bit(In_sync, &rdev->flags)) 3083 if (rdev && test_bit(In_sync, &rdev->flags) &&
3084 !test_bit(Faulty, &rdev->flags))
3084 atomic_inc(&rdev->nr_pending); 3085 atomic_inc(&rdev->nr_pending);
3085 else 3086 else
3086 rdev = NULL; 3087 rdev = NULL;
@@ -3210,15 +3211,16 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3210 /* During recovery devices cannot be removed, so 3211 /* During recovery devices cannot be removed, so
3211 * locking and refcounting of rdevs is not needed 3212 * locking and refcounting of rdevs is not needed
3212 */ 3213 */
3214 rcu_read_lock();
3213 for (i = 0; i < conf->raid_disks; i++) { 3215 for (i = 0; i < conf->raid_disks; i++) {
3214 struct md_rdev *rdev = conf->disks[i].rdev; 3216 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3215 if (rdev 3217 if (rdev
3216 && !test_bit(Faulty, &rdev->flags) 3218 && !test_bit(Faulty, &rdev->flags)
3217 && !test_bit(In_sync, &rdev->flags) 3219 && !test_bit(In_sync, &rdev->flags)
3218 && !rdev_set_badblocks(rdev, sh->sector, 3220 && !rdev_set_badblocks(rdev, sh->sector,
3219 STRIPE_SECTORS, 0)) 3221 STRIPE_SECTORS, 0))
3220 abort = 1; 3222 abort = 1;
3221 rdev = conf->disks[i].replacement; 3223 rdev = rcu_dereference(conf->disks[i].replacement);
3222 if (rdev 3224 if (rdev
3223 && !test_bit(Faulty, &rdev->flags) 3225 && !test_bit(Faulty, &rdev->flags)
3224 && !test_bit(In_sync, &rdev->flags) 3226 && !test_bit(In_sync, &rdev->flags)
@@ -3226,6 +3228,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3226 STRIPE_SECTORS, 0)) 3228 STRIPE_SECTORS, 0))
3227 abort = 1; 3229 abort = 1;
3228 } 3230 }
3231 rcu_read_unlock();
3229 if (abort) 3232 if (abort)
3230 conf->recovery_disabled = 3233 conf->recovery_disabled =
3231 conf->mddev->recovery_disabled; 3234 conf->mddev->recovery_disabled;
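Note: the pre-existing comment above this loop ("locking and refcounting of rdevs is not needed") is now only half true: the nr_pending refcount is still skipped, since devices cannot be removed during recovery, but the pointer loads now go through rcu_read_lock()/rcu_dereference() so they pair correctly with writers that publish ->rdev and ->replacement via RCU. rdev_set_badblocks() updates the bad-block list under a seqlock and does not sleep, which is why, unlike the sync_page_io() case above, the lock can span the whole loop:

    rcu_read_lock();
    for (i = 0; i < conf->raid_disks; i++) {
        struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);

        if (rdev && !test_bit(Faulty, &rdev->flags) &&
            !test_bit(In_sync, &rdev->flags) &&
            !rdev_set_badblocks(rdev, sh->sector, STRIPE_SECTORS, 0))
            abort = 1;
        /* the ->replacement slot gets the same treatment */
    }
    rcu_read_unlock();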
@@ -3237,15 +3240,16 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
3237 { 3240 {
3238 struct md_rdev *rdev; 3241 struct md_rdev *rdev;
3239 int rv = 0; 3242 int rv = 0;
3240 /* Doing recovery so rcu locking not required */ 3243
3241 rdev = sh->raid_conf->disks[disk_idx].replacement; 3244 rcu_read_lock();
3245 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3242 if (rdev 3246 if (rdev
3243 && !test_bit(Faulty, &rdev->flags) 3247 && !test_bit(Faulty, &rdev->flags)
3244 && !test_bit(In_sync, &rdev->flags) 3248 && !test_bit(In_sync, &rdev->flags)
3245 && (rdev->recovery_offset <= sh->sector 3249 && (rdev->recovery_offset <= sh->sector
3246 || rdev->mddev->recovery_cp <= sh->sector)) 3250 || rdev->mddev->recovery_cp <= sh->sector))
3247 rv = 1; 3251 rv = 1;
3248 3252 rcu_read_unlock();
3249 return rv; 3253 return rv;
3250} 3254}
3251 3255
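Note: here the RCU section replaces a comment that claimed locking was unnecessary. This is the simplest variant in the series: the dereferenced pointer never escapes the critical section, only the boolean result does, so no nr_pending pin is required:

    rcu_read_lock();
    rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
    rv = rdev && !test_bit(Faulty, &rdev->flags) &&
         !test_bit(In_sync, &rdev->flags) &&
         (rdev->recovery_offset <= sh->sector ||
          rdev->mddev->recovery_cp <= sh->sector);
    rcu_read_unlock();  /* rdev is not used after this point */
    return rv;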
@@ -3600,7 +3604,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3600 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 3604 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3601 (unsigned long long)sh->sector, rmw, rcw); 3605 (unsigned long long)sh->sector, rmw, rcw);
3602 set_bit(STRIPE_HANDLE, &sh->state); 3606 set_bit(STRIPE_HANDLE, &sh->state);
3603 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { 3607 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3604 /* prefer read-modify-write, but need to get some data */ 3608 /* prefer read-modify-write, but need to get some data */
3605 if (conf->mddev->queue) 3609 if (conf->mddev->queue)
3606 blk_add_trace_msg(conf->mddev->queue, 3610 blk_add_trace_msg(conf->mddev->queue,
@@ -3627,7 +3631,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3627 } 3631 }
3628 } 3632 }
3629 } 3633 }
3630 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { 3634 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
3631 /* want reconstruct write, but need to get some data */ 3635 /* want reconstruct write, but need to get some data */
3632 int qread =0; 3636 int qread =0;
3633 rcw = 0; 3637 rcw = 0;
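Note: the two hunks in handle_stripe_dirtying() only change the tie-break. rmw and rcw count how many reads each strategy would need for this stripe; when they differ the cheaper one wins, and the rename tightens what happens on a tie: previously rmw_level == PARITY_ENABLE_RMW broke ties toward read-modify-write, now only PARITY_PREFER_RMW does, so merely-enabled RMW falls back to reconstruct-write on equal cost. Condensed:

    /* rmw/rcw = reads needed by each strategy for this stripe */
    bool do_rmw = rmw > 0 &&
                  (rmw < rcw ||
                   (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW));
    bool do_rcw = rcw > 0 &&
                  (rcw < rmw ||
                   (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW));

On a tie the two tests agree on a single winner; each branch additionally requires its own read count to be non-zero.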
@@ -7066,10 +7070,12 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7066 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7070 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7067 conf->chunk_sectors / 2, mddev->layout); 7071 conf->chunk_sectors / 2, mddev->layout);
7068 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7072 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7069 for (i = 0; i < conf->raid_disks; i++) 7073 rcu_read_lock();
7070 seq_printf (seq, "%s", 7074 for (i = 0; i < conf->raid_disks; i++) {
7071 conf->disks[i].rdev && 7075 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7072 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 7076 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7077 }
7078 rcu_read_unlock();
7073 seq_printf (seq, "]"); 7079 seq_printf (seq, "]");
7074} 7080}
7075 7081
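Note: the old status printer loaded conf->disks[i].rdev twice with no protection, so the pointer could change, or go NULL, between the existence check and the flags dereference. The rewrite reads it once through rcu_dereference() into a local, under rcu_read_lock(), which both closes the NULL-dereference window and pairs with the RCU publication of ->rdev:

    rcu_read_lock();
    for (i = 0; i < conf->raid_disks; i++) {
        /* single load: no window between check and use */
        struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);

        seq_printf(seq, "%s",
                   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
    }
    rcu_read_unlock();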
@@ -7191,12 +7197,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7191 goto abort; 7197 goto abort;
7192 } 7198 }
7193 *rdevp = NULL; 7199 *rdevp = NULL;
7194 synchronize_rcu(); 7200 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7195 if (atomic_read(&rdev->nr_pending)) { 7201 synchronize_rcu();
7196 /* lost the race, try later */ 7202 if (atomic_read(&rdev->nr_pending)) {
7197 err = -EBUSY; 7203 /* lost the race, try later */
7198 *rdevp = rdev; 7204 err = -EBUSY;
7199 } else if (p->replacement) { 7205 *rdevp = rdev;
7206 }
7207 }
7208 if (p->replacement) {
7200 /* We must have just cleared 'rdev' */ 7209 /* We must have just cleared 'rdev' */
7201 p->rdev = p->replacement; 7210 p->rdev = p->replacement;
7202 clear_bit(Replacement, &p->replacement->flags); 7211 clear_bit(Replacement, &p->replacement->flags);
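Note: synchronize_rcu() is expensive (it waits out a full grace period), and the old code paid that cost once per removed device. The RemoveSynchronized bit, set elsewhere in this series once a grace period has already elapsed for a failed device, lets raid5_remove_disk() skip the redundant wait; the nr_pending re-check is likewise only needed when this call is the one that waited. Condensed, assuming rdev/rdevp/p from the surrounding function:

    *rdevp = NULL;
    if (!test_bit(RemoveSynchronized, &rdev->flags)) {
        /* no one has waited out the readers for this device yet */
        synchronize_rcu();
        if (atomic_read(&rdev->nr_pending)) {
            /* a reader pinned it in the meantime: back off */
            err = -EBUSY;
            *rdevp = rdev;
        }
    }
    /* ...replacement promotion continues as in the hunk above... */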