author    Linus Torvalds <torvalds@linux-foundation.org>  2012-01-08 16:28:33 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-01-08 16:28:33 -0500
commit    2943c833222ef87c111ee0c6b7b8519ad2983e99 (patch)
tree      0ef8cc4f72a63b325e7ae858ec68822ec4f3c64f /drivers
parent    98793265b429a3f0b3f1750e74d67cd4d740d162 (diff)
parent    19d671695e1931ebfd75b2b888778201aefe35ca (diff)
Merge tag 'md-3.3' of git://neil.brown.name/md
md update for 3.3

Big change is new hot-replacement.
A slot in an array can hold 2 devices - one that wants-replacement and
one that is the replacement.  Once the replacement is built - either
from the original or (in the case of errors) from elsewhere, the
wants-replacement device will be removed.

* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
  md/raid1: Mark device want_replacement when we see a write error.
  md/raid1: If there is a spare and a want_replacement device, start replacement.
  md/raid1: recognise replacements when assembling arrays.
  md/raid1: handle activation of replacement device when recovery completes.
  md/raid1: Allow a failed replacement device to be removed.
  md/raid1: Allocate spare to store replacement devices and their bios.
  md/raid1: Replace use of mddev->raid_disks with conf->raid_disks.
  md/raid10: If there is a spare and a want_replacement device, start replacement.
  md/raid10: recognise replacements when assembling array.
  md/raid10: Allow replacement device to be replace old drive.
  md/raid10: handle recovery of replacement devices.
  md/raid10: Handle replacement devices during resync.
  md/raid10: writes should get directed to replacement as well as original.
  md/raid10: allow removal of failed replacement devices.
  md/raid10: preferentially read from replacement device if possible.
  md/raid10: change read_balance to return an rdev
  md/raid10: prepare data structures for handling replacement.
  md/raid5: Mark device want_replacement when we see a write error.
  md/raid5: If there is a spare and a want_replacement device, start replacement.
  md/raid5: recognise replacements when assembling array.
  ...
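The new per-device states are driven through each member device's sysfs
'state' attribute; the state_store()/state_show() hunks in
drivers/md/md.c below add the want_replacement and replacement keywords.
As an illustration only (not part of this merge), here is a minimal
userspace sketch that requests hot replacement of a member, assuming the
usual /sys/block/mdX/md/dev-NAME/state layout; the array and device
names are hypothetical:

#include <stdio.h>

int main(void)
{
	/* Hypothetical example paths: md0 is the array, sda1 the member
	 * we want replaced.  state_store() below matches the string
	 * "want_replacement", sets the WantReplacement flag and wakes
	 * the md thread so recovery onto a spare can start. */
	const char *path = "/sys/block/md0/md/dev-sda1/state";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("want_replacement", f);
	fclose(f);
	return 0;
}

Writing "-want_replacement" withdraws the request again; per the
comments in the diff, that is only effective before a replacement has
started building.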
Diffstat (limited to 'drivers')
 -rw-r--r--  drivers/md/bitmap.c     |  12
 -rw-r--r--  drivers/md/md.c         | 107
 -rw-r--r--  drivers/md/md.h         |  82
 -rw-r--r--  drivers/md/multipath.c  |   7
 -rw-r--r--  drivers/md/raid1.c      | 174
 -rw-r--r--  drivers/md/raid1.h      |   7
 -rw-r--r--  drivers/md/raid10.c     | 582
 -rw-r--r--  drivers/md/raid10.h     |  61
 -rw-r--r--  drivers/md/raid5.c      | 557
 -rw-r--r--  drivers/md/raid5.h      |  98
 10 files changed, 1256 insertions(+), 431 deletions(-)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 6d03774b176e..cdf36b1e9aa6 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
 		return;
 	}
 	if (time_before(jiffies, bitmap->daemon_lastrun
-			+ bitmap->mddev->bitmap_info.daemon_sleep))
+			+ mddev->bitmap_info.daemon_sleep))
 		goto done;
 
 	bitmap->daemon_lastrun = jiffies;
 	if (bitmap->allclean) {
-		bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 		goto done;
 	}
 	bitmap->allclean = 1;
@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * sure that events_cleared is up-to-date.
 	 */
 	if (bitmap->need_sync &&
-	    bitmap->mddev->bitmap_info.external == 0) {
+	    mddev->bitmap_info.external == 0) {
 		bitmap_super_t *sb;
 		bitmap->need_sync = 0;
 		sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)
 
  done:
 	if (bitmap->allclean == 0)
-		bitmap->mddev->thread->timeout =
-			bitmap->mddev->bitmap_info.daemon_sleep;
+		mddev->thread->timeout =
+			mddev->bitmap_info.daemon_sleep;
 	mutex_unlock(&mddev->bitmap_info.mutex);
 }
 
@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
 	}
 	if (!*bmc) {
 		struct page *page;
-		*bmc = 1 | (needed ? NEEDED_MASK : 0);
+		*bmc = 2 | (needed ? NEEDED_MASK : 0);
 		bitmap_count_page(bitmap, offset, 1);
 		page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
 		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5d1b6762f108..ca8527fe77eb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		}
 		if (sb->devflags & WriteMostly1)
 			set_bit(WriteMostly, &rdev->flags);
+		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+			set_bit(Replacement, &rdev->flags);
 	} else /* MULTIPATH are always insync */
 		set_bit(In_sync, &rdev->flags);
 
@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 			sb->recovery_offset =
 				cpu_to_le64(rdev->recovery_offset);
 		}
+		if (test_bit(Replacement, &rdev->flags))
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_REPLACEMENT);
 
 	if (mddev->reshape_position != MaxSector) {
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
 		len += sprintf(page+len, "%swrite_error", sep);
 		sep = ",";
 	}
+	if (test_bit(WantReplacement, &rdev->flags)) {
+		len += sprintf(page+len, "%swant_replacement", sep);
+		sep = ",";
+	}
+	if (test_bit(Replacement, &rdev->flags)) {
+		len += sprintf(page+len, "%sreplacement", sep);
+		sep = ",";
+	}
+
 	return len+sprintf(page+len, "\n");
 }
 
@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	} else if (cmd_match(buf, "-write_error")) {
 		clear_bit(WriteErrorSeen, &rdev->flags);
 		err = 0;
+	} else if (cmd_match(buf, "want_replacement")) {
+		/* Any non-spare device that is not a replacement can
+		 * become want_replacement at any time, but we then need to
+		 * check if recovery is needed.
+		 */
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(Replacement, &rdev->flags))
+			set_bit(WantReplacement, &rdev->flags);
+		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+		md_wakeup_thread(rdev->mddev->thread);
+		err = 0;
+	} else if (cmd_match(buf, "-want_replacement")) {
+		/* Clearing 'want_replacement' is always allowed.
+		 * Once replacements starts it is too late though.
+		 */
+		err = 0;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else if (cmd_match(buf, "replacement")) {
+		/* Can only set a device as a replacement when array has not
+		 * yet been started.  Once running, replacement is automatic
+		 * from spares, or by assigning 'slot'.
+		 */
+		if (rdev->mddev->pers)
+			err = -EBUSY;
+		else {
+			set_bit(Replacement, &rdev->flags);
+			err = 0;
+		}
+	} else if (cmd_match(buf, "-replacement")) {
+		/* Similarly, can only clear Replacement before start */
+		if (rdev->mddev->pers)
+			err = -EBUSY;
+		else {
+			clear_bit(Replacement, &rdev->flags);
+			err = 0;
+		}
 	}
 	if (!err)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		if (rdev->mddev->pers->hot_remove_disk == NULL)
 			return -EINVAL;
 		err = rdev->mddev->pers->
-			hot_remove_disk(rdev->mddev, rdev->raid_disk);
+			hot_remove_disk(rdev->mddev, rdev);
 		if (err)
 			return err;
 		sysfs_unlink_rdev(rdev->mddev, rdev);
@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
 	} else if (rdev->mddev->pers) {
-		struct md_rdev *rdev2;
 		/* Activating a spare .. or possibly reactivating
 		 * if we ever get bitmaps working here.
 		 */
@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		if (rdev->mddev->pers->hot_add_disk == NULL)
 			return -EINVAL;
 
-		list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
-			if (rdev2->raid_disk == slot)
-				return -EEXIST;
-
 		if (slot >= rdev->mddev->raid_disks &&
 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
 			return -ENOSPC;
@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	struct mddev *mddev = NULL;
 	int ro;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+	switch (cmd) {
+	case RAID_VERSION:
+	case GET_ARRAY_INFO:
+	case GET_DISK_INFO:
+		break;
+	default:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
 
 	/*
 	 * Commands dealing with the RAID driver but not any
@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			if (test_bit(Faulty, &rdev->flags)) {
 				seq_printf(seq, "(F)");
 				continue;
-			} else if (rdev->raid_disk < 0)
+			}
+			if (rdev->raid_disk < 0)
 				seq_printf(seq, "(S)"); /* spare */
+			if (test_bit(Replacement, &rdev->flags))
+				seq_printf(seq, "(R)");
 			sectors += rdev->sectors;
 		}
 
@@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev)
 		     ! test_bit(In_sync, &rdev->flags)) &&
 		    atomic_read(&rdev->nr_pending)==0) {
 			if (mddev->pers->hot_remove_disk(
-				    mddev, rdev->raid_disk)==0) {
+				    mddev, rdev) == 0) {
 				sysfs_unlink_rdev(mddev, rdev);
 				rdev->raid_disk = -1;
 			}
 		}
 
-	if (mddev->degraded) {
-		list_for_each_entry(rdev, &mddev->disks, same_set) {
-			if (rdev->raid_disk >= 0 &&
-			    !test_bit(In_sync, &rdev->flags) &&
-			    !test_bit(Faulty, &rdev->flags))
-				spares++;
-			if (rdev->raid_disk < 0
-			    && !test_bit(Faulty, &rdev->flags)) {
-				rdev->recovery_offset = 0;
-				if (mddev->pers->
-				    hot_add_disk(mddev, rdev) == 0) {
-					if (sysfs_link_rdev(mddev, rdev))
-						/* failure here is OK */;
-					spares++;
-					md_new_event(mddev);
-					set_bit(MD_CHANGE_DEVS, &mddev->flags);
-				}
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(In_sync, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags))
+			spares++;
+		if (rdev->raid_disk < 0
+		    && !test_bit(Faulty, &rdev->flags)) {
+			rdev->recovery_offset = 0;
+			if (mddev->pers->
+			    hot_add_disk(mddev, rdev) == 0) {
+				if (sysfs_link_rdev(mddev, rdev))
+					/* failure here is OK */;
+				spares++;
+				md_new_event(mddev);
+				set_bit(MD_CHANGE_DEVS, &mddev->flags);
 			}
 		}
 	}
@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev)
 			    test_bit(Faulty, &rdev->flags) &&
 			    atomic_read(&rdev->nr_pending)==0) {
 				if (mddev->pers->hot_remove_disk(
-					    mddev, rdev->raid_disk)==0) {
+					    mddev, rdev) == 0) {
 					sysfs_unlink_rdev(mddev, rdev);
 					rdev->raid_disk = -1;
 				}
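One behavioural note from the md_ioctl() hunk above: the read-only
queries RAID_VERSION, GET_ARRAY_INFO and GET_DISK_INFO no longer require
CAP_SYS_ADMIN. A sketch of an unprivileged status query using the
long-standing GET_ARRAY_INFO ioctl from the uapi headers (the device
path is just an example, and error handling is kept minimal):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>      /* MD_MAJOR, used by the md ioctl numbers */
#include <linux/raid/md_u.h>  /* GET_ARRAY_INFO, mdu_array_info_t */

int main(void)
{
	mdu_array_info_t info;
	int fd = open("/dev/md0", O_RDONLY);  /* example device */

	if (fd < 0 || ioctl(fd, GET_ARRAY_INFO, &info) < 0) {
		perror("GET_ARRAY_INFO");
		return 1;
	}
	printf("level=%d raid_disks=%d active=%d failed=%d spare=%d\n",
	       info.level, info.raid_disks, info.active_disks,
	       info.failed_disks, info.spare_disks);
	close(fd);
	return 0;
}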
diff --git a/drivers/md/md.h b/drivers/md/md.h
index cf742d9306ec..44c63dfeeb2b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -72,34 +72,7 @@ struct md_rdev {
 	 * This reduces the burden of testing multiple flags in many cases
 	 */
 
-	unsigned long	flags;
-#define	Faulty		1		/* device is known to have a fault */
-#define	In_sync		2		/* device is in_sync with rest of array */
-#define	WriteMostly	4		/* Avoid reading if at all possible */
-#define	AutoDetected	7		/* added by auto-detect */
-#define Blocked		8		/* An error occurred but has not yet
-					 * been acknowledged by the metadata
-					 * handler, so don't allow writes
-					 * until it is cleared */
-#define WriteErrorSeen	9		/* A write error has been seen on this
-					 * device
-					 */
-#define FaultRecorded	10		/* Intermediate state for clearing
-					 * Blocked.  The Fault is/will-be
-					 * recorded in the metadata, but that
-					 * metadata hasn't been stored safely
-					 * on disk yet.
-					 */
-#define BlockedBadBlocks 11		/* A writer is blocked because they
-					 * found an unacknowledged bad-block.
-					 * This can safely be cleared at any
-					 * time, and the writer will re-check.
-					 * It may be set at any time, and at
-					 * worst the writer will timeout and
-					 * re-check.  So setting it as
-					 * accurately as possible is good, but
-					 * not absolutely critical.
-					 */
+	unsigned long	flags;	/* bit set of 'enum flag_bits' bits. */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
 		sector_t size;		/* in sectors */
 	} badblocks;
 };
+enum flag_bits {
+	Faulty,			/* device is known to have a fault */
+	In_sync,		/* device is in_sync with rest of array */
+	WriteMostly,		/* Avoid reading if at all possible */
+	AutoDetected,		/* added by auto-detect */
+	Blocked,		/* An error occurred but has not yet
+				 * been acknowledged by the metadata
+				 * handler, so don't allow writes
+				 * until it is cleared */
+	WriteErrorSeen,		/* A write error has been seen on this
+				 * device
+				 */
+	FaultRecorded,		/* Intermediate state for clearing
+				 * Blocked.  The Fault is/will-be
+				 * recorded in the metadata, but that
+				 * metadata hasn't been stored safely
+				 * on disk yet.
+				 */
+	BlockedBadBlocks,	/* A writer is blocked because they
+				 * found an unacknowledged bad-block.
+				 * This can safely be cleared at any
+				 * time, and the writer will re-check.
+				 * It may be set at any time, and at
+				 * worst the writer will timeout and
+				 * re-check.  So setting it as
+				 * accurately as possible is good, but
+				 * not absolutely critical.
+				 */
+	WantReplacement,	/* This device is a candidate to be
+				 * hot-replaced, either because it has
+				 * reported some faults, or because
+				 * of explicit request.
+				 */
+	Replacement,		/* This device is a replacement for
+				 * a want_replacement device with same
+				 * raid_disk number.
+				 */
+};
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
 #define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
@@ -428,7 +439,7 @@ struct md_personality
 	 */
 	void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
-	int (*hot_remove_disk) (struct mddev *mddev, int number);
+	int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
 	int (*spare_active) (struct mddev *mddev);
 	sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
 	int (*resize) (struct mddev *mddev, sector_t sectors);
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
 static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	sprintf(nm, "rd%d", rdev->raid_disk);
-	return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+	if (!test_bit(Replacement, &rdev->flags)) {
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+	} else
+		return 0;
 }
 
 static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	sprintf(nm, "rd%d", rdev->raid_disk);
-	sysfs_remove_link(&mddev->kobj, nm);
+	if (!test_bit(Replacement, &rdev->flags)) {
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		sysfs_remove_link(&mddev->kobj, nm);
+	}
 }
 
 /*
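The md.h hunks above replace manually numbered flag #defines with an
automatically numbered enum flag_bits, which is what lets
WantReplacement and Replacement be appended without hand-picking free
bit numbers. A self-contained userspace analogue of the pattern (not
kernel code; set_flag()/test_flag() merely stand in for the kernel's
set_bit()/test_bit()):

#include <stdio.h>

/* Enumerators are consecutive bit indices into an unsigned long,
 * exactly as enum flag_bits indexes rdev->flags in md.h. */
enum flag_bits {
	Faulty,
	In_sync,
	WriteMostly,
	WantReplacement,
	Replacement,
};

static void set_flag(unsigned long *flags, enum flag_bits bit)
{
	*flags |= 1UL << bit;
}

static int test_flag(unsigned long flags, enum flag_bits bit)
{
	return !!(flags & (1UL << bit));
}

int main(void)
{
	unsigned long flags = 0;

	set_flag(&flags, WantReplacement);
	printf("want_replacement=%d replacement=%d\n",
	       test_flag(flags, WantReplacement),
	       test_flag(flags, Replacement));
	return 0;
}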
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 5899246fa37e..a222f516660e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return err;
 }
 
-static int multipath_remove_disk(struct mddev *mddev, int number)
+static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct mpconf *conf = mddev->private;
 	int err = 0;
-	struct md_rdev *rdev;
+	int number = rdev->raid_disk;
 	struct multipath_info *p = conf->multipaths + number;
 
 	print_multipath_conf(conf);
 
-	rdev = p->rdev;
-	if (rdev) {
+	if (rdev == p->rdev) {
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
 			printk(KERN_ERR "hot-remove-disk, slot %d is identified"
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ede2461e79c5..cc24f0cb7ee3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -135,7 +135,7 @@ out_free_pages:
 		put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
-	while ( ++j < pi->raid_disks )
+	while (++j < pi->raid_disks)
 		bio_put(r1_bio->bios[j]);
 	r1bio_pool_free(r1_bio, data);
 	return NULL;
@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int i;
 
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct bio **bio = r1_bio->bios + i;
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio)
 	struct r1conf *conf = r1_bio->mddev->private;
 	int i;
 
-	for (i=0; i<conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct bio *bio = r1_bio->bios[i];
 		if (bio->bi_end_io)
 			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio)
 static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
 {
 	int mirror;
-	int raid_disks = r1_bio->mddev->raid_disks;
+	struct r1conf *conf = r1_bio->mddev->private;
+	int raid_disks = conf->raid_disks;
 
-	for (mirror = 0; mirror < raid_disks; mirror++)
+	for (mirror = 0; mirror < raid_disks * 2; mirror++)
 		if (r1_bio->bios[mirror] == bio)
 			break;
 
-	BUG_ON(mirror == raid_disks);
+	BUG_ON(mirror == raid_disks * 2);
 	update_head_pos(mirror, r1_bio);
 
 	return mirror;
@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
 		if (!uptodate) {
 			set_bit(WriteErrorSeen,
 				&conf->mirrors[mirror].rdev->flags);
+			if (!test_and_set_bit(WantReplacement,
+					      &conf->mirrors[mirror].rdev->flags))
+				set_bit(MD_RECOVERY_NEEDED, &
+					conf->mddev->recovery);
+
 			set_bit(R1BIO_WriteError, &r1_bio->state);
 		} else {
 			/*
@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
 		start_disk = conf->last_used;
 	}
 
-	for (i = 0 ; i < conf->raid_disks ; i++) {
+	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
@@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
 		return 1;
 
 	rcu_read_lock();
-	for (i = 0; i < mddev->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks; i++) {
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -974,7 +980,7 @@ read_again:
 	 */
 	plugged = mddev_check_plugged(mddev);
 
-	disks = conf->raid_disks;
+	disks = conf->raid_disks * 2;
  retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
@@ -988,7 +994,8 @@ read_again:
 		}
 		r1_bio->bios[i] = NULL;
 		if (!rdev || test_bit(Faulty, &rdev->flags)) {
-			set_bit(R1BIO_Degraded, &r1_bio->state);
+			if (i < conf->raid_disks)
+				set_bit(R1BIO_Degraded, &r1_bio->state);
 			continue;
 		}
 
@@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev)
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
+		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
+		if (repl
+		    && repl->recovery_offset == MaxSector
+		    && !test_bit(Faulty, &repl->flags)
+		    && !test_and_set_bit(In_sync, &repl->flags)) {
+			/* replacement has just become active */
+			if (!rdev ||
+			    !test_and_clear_bit(In_sync, &rdev->flags))
+				count++;
+			if (rdev) {
+				/* Replaced device not technically
+				 * faulty, but we need to be sure
+				 * it gets removed and never re-added
+				 */
+				set_bit(Faulty, &rdev->flags);
+				sysfs_notify_dirent_safe(
+					rdev->sysfs_state);
+			}
+		}
 		if (rdev
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int mirror = 0;
 	struct mirror_info *p;
 	int first = 0;
-	int last = mddev->raid_disks - 1;
+	int last = conf->raid_disks - 1;
 
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
@@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
-	for (mirror = first; mirror <= last; mirror++)
-		if ( !(p=conf->mirrors+mirror)->rdev) {
+	for (mirror = first; mirror <= last; mirror++) {
+		p = conf->mirrors+mirror;
+		if (!p->rdev) {
 
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
@@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
+		if (test_bit(WantReplacement, &p->rdev->flags) &&
+		    p[conf->raid_disks].rdev == NULL) {
+			/* Add this device as a replacement */
+			clear_bit(In_sync, &rdev->flags);
+			set_bit(Replacement, &rdev->flags);
+			rdev->raid_disk = mirror;
+			err = 0;
+			conf->fullsync = 1;
+			rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+			break;
+		}
+	}
 	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
 }
 
-static int raid1_remove_disk(struct mddev *mddev, int number)
+static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r1conf *conf = mddev->private;
 	int err = 0;
-	struct md_rdev *rdev;
+	int number = rdev->raid_disk;
 	struct mirror_info *p = conf->mirrors+ number;
 
+	if (rdev != p->rdev)
+		p = conf->mirrors + conf->raid_disks + number;
+
 	print_conf(conf);
-	rdev = p->rdev;
-	if (rdev) {
+	if (rdev == p->rdev) {
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
 			err = -EBUSY;
@@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number)
 			err = -EBUSY;
 			p->rdev = rdev;
 			goto abort;
-		}
+		} else if (conf->mirrors[conf->raid_disks + number].rdev) {
+			/* We just removed a device that is being replaced.
+			 * Move down the replacement.  We drain all IO before
+			 * doing this to avoid confusion.
+			 */
+			struct md_rdev *repl =
+				conf->mirrors[conf->raid_disks + number].rdev;
+			raise_barrier(conf);
+			clear_bit(Replacement, &repl->flags);
+			p->rdev = repl;
+			conf->mirrors[conf->raid_disks + number].rdev = NULL;
+			lower_barrier(conf);
+			clear_bit(WantReplacement, &rdev->flags);
+		} else
+			clear_bit(WantReplacement, &rdev->flags);
 		err = md_integrity_register(mddev);
 	}
 abort:
@@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error)
 		} while (sectors_to_go > 0);
 		set_bit(WriteErrorSeen,
 			&conf->mirrors[mirror].rdev->flags);
+		if (!test_and_set_bit(WantReplacement,
+				      &conf->mirrors[mirror].rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED, &
+				mddev->recovery);
 		set_bit(R1BIO_WriteError, &r1_bio->state);
 	} else if (is_badblock(conf->mirrors[mirror].rdev,
 			       r1_bio->sector,
@@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
 	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
 		/* success */
 		return 1;
-	if (rw == WRITE)
+	if (rw == WRITE) {
 		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement,
+				      &rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED, &
+				rdev->mddev->recovery);
+	}
 	/* need to record an error - either for the block or the device */
 	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 		md_error(rdev->mddev, rdev);
@@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 			}
 		}
 		d++;
-		if (d == conf->raid_disks)
+		if (d == conf->raid_disks * 2)
 			d = 0;
 	} while (!success && d != r1_bio->read_disk);
 
@@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 		       mdname(mddev),
 		       bdevname(bio->bi_bdev, b),
 		       (unsigned long long)r1_bio->sector);
-		for (d = 0; d < conf->raid_disks; d++) {
+		for (d = 0; d < conf->raid_disks * 2; d++) {
 			rdev = conf->mirrors[d].rdev;
 			if (!rdev || test_bit(Faulty, &rdev->flags))
 				continue;
@@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 	/* write it back and re-read */
 	while (d != r1_bio->read_disk) {
 		if (d == 0)
-			d = conf->raid_disks;
+			d = conf->raid_disks * 2;
 		d--;
 		if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 			continue;
@@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 	d = start;
 	while (d != r1_bio->read_disk) {
 		if (d == 0)
-			d = conf->raid_disks;
+			d = conf->raid_disks * 2;
 		d--;
 		if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 			continue;
@@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio)
 	int primary;
 	int i;
 
-	for (primary = 0; primary < conf->raid_disks; primary++)
+	for (primary = 0; primary < conf->raid_disks * 2; primary++)
 		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
 		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
 			r1_bio->bios[primary]->bi_end_io = NULL;
@@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio)
 			break;
 		}
 	r1_bio->read_disk = primary;
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
 		struct bio *pbio = r1_bio->bios[primary];
@@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 {
 	struct r1conf *conf = mddev->private;
 	int i;
-	int disks = conf->raid_disks;
+	int disks = conf->raid_disks * 2;
 	struct bio *bio, *wbio;
 
 	bio = r1_bio->bios[r1_bio->read_disk];
@@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			success = 1;
 		else {
 			d++;
-			if (d == conf->raid_disks)
+			if (d == conf->raid_disks * 2)
 				d = 0;
 		}
 	} while (!success && d != read_disk);
@@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 	start = d;
 	while (d != read_disk) {
 		if (d==0)
-			d = conf->raid_disks;
+			d = conf->raid_disks * 2;
 		d--;
 		rdev = conf->mirrors[d].rdev;
 		if (rdev &&
@@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 	while (d != read_disk) {
 		char b[BDEVNAME_SIZE];
 		if (d==0)
-			d = conf->raid_disks;
+			d = conf->raid_disks * 2;
 		d--;
 		rdev = conf->mirrors[d].rdev;
 		if (rdev &&
@@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
 	int s = r1_bio->sectors;
-	for (m = 0; m < conf->raid_disks ; m++) {
+	for (m = 0; m < conf->raid_disks * 2 ; m++) {
 		struct md_rdev *rdev = conf->mirrors[m].rdev;
 		struct bio *bio = r1_bio->bios[m];
 		if (bio->bi_end_io == NULL)
@@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
-	for (m = 0; m < conf->raid_disks ; m++)
+	for (m = 0; m < conf->raid_disks * 2 ; m++)
 		if (r1_bio->bios[m] == IO_MADE_GOOD) {
 			struct md_rdev *rdev = conf->mirrors[m].rdev;
 			rdev_clear_badblocks(rdev,
@@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	r1_bio->state = 0;
 	set_bit(R1BIO_IsSync, &r1_bio->state);
 
-	for (i=0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
 		bio = r1_bio->bios[i];
 
@@ -2203,7 +2267,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
 		    test_bit(Faulty, &rdev->flags)) {
-			still_degraded = 1;
+			if (i < conf->raid_disks)
+				still_degraded = 1;
 		} else if (!test_bit(In_sync, &rdev->flags)) {
 			bio->bi_rw = WRITE;
 			bio->bi_end_io = end_sync_write;
@@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		 * need to mark them bad on all write targets
 		 */
 		int ok = 1;
-		for (i = 0 ; i < conf->raid_disks ; i++)
+		for (i = 0 ; i < conf->raid_disks * 2 ; i++)
 			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
 				struct md_rdev *rdev =
 					rcu_dereference(conf->mirrors[i].rdev);
@@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 			len = sync_blocks<<9;
 		}
 
-		for (i=0 ; i < conf->raid_disks; i++) {
+		for (i = 0 ; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
 				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
@@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	 */
 	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		atomic_set(&r1_bio->remaining, read_targets);
-		for (i=0; i<conf->raid_disks; i++) {
+		for (i = 0; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io == end_sync_read) {
 				md_sync_acct(bio->bi_bdev, nr_sectors);
@@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+				* mddev->raid_disks * 2,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
 		goto abort;
@@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
 		goto abort;
-	conf->poolinfo->raid_disks = mddev->raid_disks;
+	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
 	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 					  r1bio_pool_free,
 					  conf->poolinfo);
@@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
 	conf->poolinfo->mddev = mddev;
 
+	err = -EINVAL;
 	spin_lock_init(&conf->device_lock);
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		int disk_idx = rdev->raid_disk;
 		if (disk_idx >= mddev->raid_disks
 		    || disk_idx < 0)
 			continue;
-		disk = conf->mirrors + disk_idx;
+		if (test_bit(Replacement, &rdev->flags))
+			disk = conf->mirrors + conf->raid_disks + disk_idx;
+		else
+			disk = conf->mirrors + disk_idx;
 
+		if (disk->rdev)
+			goto abort;
 		disk->rdev = rdev;
 
 		disk->head_position = 0;
@@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	err = -EIO;
 	conf->last_used = -1;
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
 
+		if (i < conf->raid_disks &&
+		    disk[conf->raid_disks].rdev) {
+			/* This slot has a replacement. */
+			if (!disk->rdev) {
+				/* No original, just make the replacement
+				 * a recovering spare
+				 */
+				disk->rdev =
+					disk[conf->raid_disks].rdev;
+				disk[conf->raid_disks].rdev = NULL;
+			} else if (!test_bit(In_sync, &disk->rdev->flags))
+				/* Original is not in_sync - bad */
+				goto abort;
+		}
+
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
@@ -2455,7 +2543,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			conf->last_used = i;
 	}
 
-	err = -EIO;
 	if (conf->last_used < 0) {
 		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
 		       mdname(mddev));
@@ -2665,7 +2752,7 @@ static int raid1_reshape(struct mddev *mddev)
 	if (!newpoolinfo)
 		return -ENOMEM;
 	newpoolinfo->mddev = mddev;
-	newpoolinfo->raid_disks = raid_disks;
+	newpoolinfo->raid_disks = raid_disks * 2;
 
 	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 				 r1bio_pool_free, newpoolinfo);
@@ -2673,7 +2760,8 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
 		mempool_destroy(newpool);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c732b6cce935..80ded139314c 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -12,6 +12,9 @@ struct mirror_info {
  * pool was allocated for, so they know how much to allocate and free.
  * mddev->raid_disks cannot be used, as it can change while a pool is active
  * These two datums are stored in a kmalloced struct.
+ * The 'raid_disks' here is twice the raid_disks in r1conf.
+ * This allows space for each 'real' device can have a replacement in the
+ * second half of the array.
  */
 
 struct pool_info {
@@ -21,7 +24,9 @@ struct pool_info {
 
 struct r1conf {
 	struct mddev		*mddev;
-	struct mirror_info	*mirrors;
+	struct mirror_info	*mirrors;	/* twice 'raid_disks' to
+						 * allow for replacements.
+						 */
 	int			raid_disks;
 
 	/* When choose the best device for a read (read_balance())
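The raid1.h comments above record the layout convention the raid1.c
hunks rely on: conf->mirrors[] now has 2 * raid_disks entries, with slot
i holding the original device and slot raid_disks + i its replacement.
A small standalone sketch of that indexing (device names are invented;
struct mirror stands in for the kernel's mirror_info):

#include <stdio.h>
#include <stdlib.h>

struct mirror {
	const char *name;	/* NULL means the slot is empty */
};

int main(void)
{
	int raid_disks = 2;
	/* twice 'raid_disks' to allow for replacements */
	struct mirror *mirrors = calloc(raid_disks * 2, sizeof(*mirrors));

	if (!mirrors)
		return 1;
	mirrors[0].name = "sda1";		/* original in slot 0 */
	mirrors[raid_disks + 0].name = "sdc1";	/* its replacement */

	for (int i = 0; i < raid_disks; i++)
		printf("slot %d: orig=%s repl=%s\n", i,
		       mirrors[i].name ? mirrors[i].name : "-",
		       mirrors[raid_disks + i].name ?
				mirrors[raid_disks + i].name : "-");
	free(mirrors);
	return 0;
}

This is why so many loops in raid1.c change from conf->raid_disks to
conf->raid_disks * 2: any pass over the bios or rdevs must now visit the
replacement half of the array as well.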
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 685ddf325ee4..6e8aa213f0d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
73 struct r10conf *conf = data; 73 struct r10conf *conf = data;
74 int size = offsetof(struct r10bio, devs[conf->copies]); 74 int size = offsetof(struct r10bio, devs[conf->copies]);
75 75
76 /* allocate a r10bio with room for raid_disks entries in the bios array */ 76 /* allocate a r10bio with room for raid_disks entries in the
77 * bios array */
77 return kzalloc(size, gfp_flags); 78 return kzalloc(size, gfp_flags);
78} 79}
79 80
@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 if (!bio) 124 if (!bio)
124 goto out_free_bio; 125 goto out_free_bio;
125 r10_bio->devs[j].bio = bio; 126 r10_bio->devs[j].bio = bio;
127 if (!conf->have_replacement)
128 continue;
129 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
130 if (!bio)
131 goto out_free_bio;
132 r10_bio->devs[j].repl_bio = bio;
126 } 133 }
127 /* 134 /*
128 * Allocate RESYNC_PAGES data pages and attach them 135 * Allocate RESYNC_PAGES data pages and attach them
129 * where needed. 136 * where needed.
130 */ 137 */
131 for (j = 0 ; j < nalloc; j++) { 138 for (j = 0 ; j < nalloc; j++) {
139 struct bio *rbio = r10_bio->devs[j].repl_bio;
132 bio = r10_bio->devs[j].bio; 140 bio = r10_bio->devs[j].bio;
133 for (i = 0; i < RESYNC_PAGES; i++) { 141 for (i = 0; i < RESYNC_PAGES; i++) {
134 if (j == 1 && !test_bit(MD_RECOVERY_SYNC, 142 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
143 goto out_free_pages; 151 goto out_free_pages;
144 152
145 bio->bi_io_vec[i].bv_page = page; 153 bio->bi_io_vec[i].bv_page = page;
154 if (rbio)
155 rbio->bi_io_vec[i].bv_page = page;
146 } 156 }
147 } 157 }
148 158
@@ -156,8 +166,11 @@ out_free_pages:
156 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 166 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
157 j = -1; 167 j = -1;
158out_free_bio: 168out_free_bio:
159 while ( ++j < nalloc ) 169 while (++j < nalloc) {
160 bio_put(r10_bio->devs[j].bio); 170 bio_put(r10_bio->devs[j].bio);
171 if (r10_bio->devs[j].repl_bio)
172 bio_put(r10_bio->devs[j].repl_bio);
173 }
161 r10bio_pool_free(r10_bio, conf); 174 r10bio_pool_free(r10_bio, conf);
162 return NULL; 175 return NULL;
163} 176}
@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
178 } 191 }
179 bio_put(bio); 192 bio_put(bio);
180 } 193 }
194 bio = r10bio->devs[j].repl_bio;
195 if (bio)
196 bio_put(bio);
181 } 197 }
182 r10bio_pool_free(r10bio, conf); 198 r10bio_pool_free(r10bio, conf);
183} 199}
@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
191 if (!BIO_SPECIAL(*bio)) 207 if (!BIO_SPECIAL(*bio))
192 bio_put(*bio); 208 bio_put(*bio);
193 *bio = NULL; 209 *bio = NULL;
210 bio = &r10_bio->devs[i].repl_bio;
211 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
212 bio_put(*bio);
213 *bio = NULL;
194 } 214 }
195} 215}
196 216
@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
275 * Find the disk number which triggered given bio 295 * Find the disk number which triggered given bio
276 */ 296 */
277static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 297static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
278 struct bio *bio, int *slotp) 298 struct bio *bio, int *slotp, int *replp)
279{ 299{
280 int slot; 300 int slot;
301 int repl = 0;
281 302
282 for (slot = 0; slot < conf->copies; slot++) 303 for (slot = 0; slot < conf->copies; slot++) {
283 if (r10_bio->devs[slot].bio == bio) 304 if (r10_bio->devs[slot].bio == bio)
284 break; 305 break;
306 if (r10_bio->devs[slot].repl_bio == bio) {
307 repl = 1;
308 break;
309 }
310 }
285 311
286 BUG_ON(slot == conf->copies); 312 BUG_ON(slot == conf->copies);
287 update_head_pos(slot, r10_bio); 313 update_head_pos(slot, r10_bio);
288 314
289 if (slotp) 315 if (slotp)
290 *slotp = slot; 316 *slotp = slot;
317 if (replp)
318 *replp = repl;
291 return r10_bio->devs[slot].devnum; 319 return r10_bio->devs[slot].devnum;
292} 320}
293 321
@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
296 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 324 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
297 struct r10bio *r10_bio = bio->bi_private; 325 struct r10bio *r10_bio = bio->bi_private;
298 int slot, dev; 326 int slot, dev;
327 struct md_rdev *rdev;
299 struct r10conf *conf = r10_bio->mddev->private; 328 struct r10conf *conf = r10_bio->mddev->private;
300 329
301 330
302 slot = r10_bio->read_slot; 331 slot = r10_bio->read_slot;
303 dev = r10_bio->devs[slot].devnum; 332 dev = r10_bio->devs[slot].devnum;
333 rdev = r10_bio->devs[slot].rdev;
304 /* 334 /*
305 * this branch is our 'one mirror IO has finished' event handler: 335 * this branch is our 'one mirror IO has finished' event handler:
306 */ 336 */
@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
318 */ 348 */
319 set_bit(R10BIO_Uptodate, &r10_bio->state); 349 set_bit(R10BIO_Uptodate, &r10_bio->state);
320 raid_end_bio_io(r10_bio); 350 raid_end_bio_io(r10_bio);
321 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); 351 rdev_dec_pending(rdev, conf->mddev);
322 } else { 352 } else {
323 /* 353 /*
324 * oops, read error - keep the refcount on the rdev 354 * oops, read error - keep the refcount on the rdev
@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
327 printk_ratelimited(KERN_ERR 357 printk_ratelimited(KERN_ERR
328 "md/raid10:%s: %s: rescheduling sector %llu\n", 358 "md/raid10:%s: %s: rescheduling sector %llu\n",
329 mdname(conf->mddev), 359 mdname(conf->mddev),
330 bdevname(conf->mirrors[dev].rdev->bdev, b), 360 bdevname(rdev->bdev, b),
331 (unsigned long long)r10_bio->sector); 361 (unsigned long long)r10_bio->sector);
332 set_bit(R10BIO_ReadError, &r10_bio->state); 362 set_bit(R10BIO_ReadError, &r10_bio->state);
333 reschedule_retry(r10_bio); 363 reschedule_retry(r10_bio);
@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error)
366 int dev; 396 int dev;
367 int dec_rdev = 1; 397 int dec_rdev = 1;
368 struct r10conf *conf = r10_bio->mddev->private; 398 struct r10conf *conf = r10_bio->mddev->private;
369 int slot; 399 int slot, repl;
400 struct md_rdev *rdev = NULL;
370 401
371 dev = find_bio_disk(conf, r10_bio, bio, &slot); 402 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
372 403
404 if (repl)
405 rdev = conf->mirrors[dev].replacement;
406 if (!rdev) {
407 smp_rmb();
408 repl = 0;
409 rdev = conf->mirrors[dev].rdev;
410 }
373 /* 411 /*
374 * this branch is our 'one mirror IO has finished' event handler: 412 * this branch is our 'one mirror IO has finished' event handler:
375 */ 413 */
376 if (!uptodate) { 414 if (!uptodate) {
377 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); 415 if (repl)
378 set_bit(R10BIO_WriteError, &r10_bio->state); 416 /* Never record new bad blocks to replacement,
379 dec_rdev = 0; 417 * just fail it.
418 */
419 md_error(rdev->mddev, rdev);
420 else {
421 set_bit(WriteErrorSeen, &rdev->flags);
422 if (!test_and_set_bit(WantReplacement, &rdev->flags))
423 set_bit(MD_RECOVERY_NEEDED,
424 &rdev->mddev->recovery);
425 set_bit(R10BIO_WriteError, &r10_bio->state);
426 dec_rdev = 0;
427 }
380 } else { 428 } else {
381 /* 429 /*
382 * Set R10BIO_Uptodate in our master bio, so that 430 * Set R10BIO_Uptodate in our master bio, so that
@@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error)
393 set_bit(R10BIO_Uptodate, &r10_bio->state); 441 set_bit(R10BIO_Uptodate, &r10_bio->state);
394 442
395 /* Maybe we can clear some bad blocks. */ 443 /* Maybe we can clear some bad blocks. */
396 if (is_badblock(conf->mirrors[dev].rdev, 444 if (is_badblock(rdev,
397 r10_bio->devs[slot].addr, 445 r10_bio->devs[slot].addr,
398 r10_bio->sectors, 446 r10_bio->sectors,
399 &first_bad, &bad_sectors)) { 447 &first_bad, &bad_sectors)) {
400 bio_put(bio); 448 bio_put(bio);
401 r10_bio->devs[slot].bio = IO_MADE_GOOD; 449 if (repl)
450 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
451 else
452 r10_bio->devs[slot].bio = IO_MADE_GOOD;
402 dec_rdev = 0; 453 dec_rdev = 0;
403 set_bit(R10BIO_MadeGood, &r10_bio->state); 454 set_bit(R10BIO_MadeGood, &r10_bio->state);
404 } 455 }
@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
414 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); 465 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
415} 466}
416 467
417
418/* 468/*
419 * RAID10 layout manager 469 * RAID10 layout manager
420 * As well as the chunksize and raid_disks count, there are two 470 * As well as the chunksize and raid_disks count, there are two
@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q,
562 * FIXME: possibly should rethink readbalancing and do it differently 612 * FIXME: possibly should rethink readbalancing and do it differently
563 * depending on near_copies / far_copies geometry. 613 * depending on near_copies / far_copies geometry.
564 */ 614 */
565static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors) 615static struct md_rdev *read_balance(struct r10conf *conf,
616 struct r10bio *r10_bio,
617 int *max_sectors)
566{ 618{
567 const sector_t this_sector = r10_bio->sector; 619 const sector_t this_sector = r10_bio->sector;
568 int disk, slot; 620 int disk, slot;
569 int sectors = r10_bio->sectors; 621 int sectors = r10_bio->sectors;
570 int best_good_sectors; 622 int best_good_sectors;
571 sector_t new_distance, best_dist; 623 sector_t new_distance, best_dist;
572 struct md_rdev *rdev; 624 struct md_rdev *rdev, *best_rdev;
573 int do_balance; 625 int do_balance;
574 int best_slot; 626 int best_slot;
575 627
@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
578retry: 630retry:
579 sectors = r10_bio->sectors; 631 sectors = r10_bio->sectors;
580 best_slot = -1; 632 best_slot = -1;
633 best_rdev = NULL;
581 best_dist = MaxSector; 634 best_dist = MaxSector;
582 best_good_sectors = 0; 635 best_good_sectors = 0;
583 do_balance = 1; 636 do_balance = 1;
@@ -599,10 +652,16 @@ retry:
599 if (r10_bio->devs[slot].bio == IO_BLOCKED) 652 if (r10_bio->devs[slot].bio == IO_BLOCKED)
600 continue; 653 continue;
601 disk = r10_bio->devs[slot].devnum; 654 disk = r10_bio->devs[slot].devnum;
602 rdev = rcu_dereference(conf->mirrors[disk].rdev); 655 rdev = rcu_dereference(conf->mirrors[disk].replacement);
656 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
657 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
658 rdev = rcu_dereference(conf->mirrors[disk].rdev);
603 if (rdev == NULL) 659 if (rdev == NULL)
604 continue; 660 continue;
605 if (!test_bit(In_sync, &rdev->flags)) 661 if (test_bit(Faulty, &rdev->flags))
662 continue;
663 if (!test_bit(In_sync, &rdev->flags) &&
664 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
606 continue; 665 continue;
607 666
608 dev_sector = r10_bio->devs[slot].addr; 667 dev_sector = r10_bio->devs[slot].addr;
@@ -627,6 +686,7 @@ retry:
627 if (good_sectors > best_good_sectors) { 686 if (good_sectors > best_good_sectors) {
628 best_good_sectors = good_sectors; 687 best_good_sectors = good_sectors;
629 best_slot = slot; 688 best_slot = slot;
689 best_rdev = rdev;
630 } 690 }
631 if (!do_balance) 691 if (!do_balance)
632 /* Must read from here */ 692 /* Must read from here */
@@ -655,16 +715,15 @@ retry:
655 if (new_distance < best_dist) { 715 if (new_distance < best_dist) {
656 best_dist = new_distance; 716 best_dist = new_distance;
657 best_slot = slot; 717 best_slot = slot;
718 best_rdev = rdev;
658 } 719 }
659 } 720 }
660 if (slot == conf->copies) 721 if (slot >= conf->copies) {
661 slot = best_slot; 722 slot = best_slot;
723 rdev = best_rdev;
724 }
662 725
663 if (slot >= 0) { 726 if (slot >= 0) {
664 disk = r10_bio->devs[slot].devnum;
665 rdev = rcu_dereference(conf->mirrors[disk].rdev);
666 if (!rdev)
667 goto retry;
668 atomic_inc(&rdev->nr_pending); 727 atomic_inc(&rdev->nr_pending);
669 if (test_bit(Faulty, &rdev->flags)) { 728 if (test_bit(Faulty, &rdev->flags)) {
670 /* Cannot risk returning a device that failed 729 /* Cannot risk returning a device that failed
@@ -675,11 +734,11 @@ retry:
675 } 734 }
676 r10_bio->read_slot = slot; 735 r10_bio->read_slot = slot;
677 } else 736 } else
678 disk = -1; 737 rdev = NULL;
679 rcu_read_unlock(); 738 rcu_read_unlock();
680 *max_sectors = best_good_sectors; 739 *max_sectors = best_good_sectors;
681 740
682 return disk; 741 return rdev;
683} 742}
684 743
685static int raid10_congested(void *data, int bits) 744static int raid10_congested(void *data, int bits)
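
The hunk above changes read_balance() to hand back the chosen md_rdev itself rather than a disk number, and to prefer a replacement device that has already been rebuilt past the requested range. Returning the rdev directly also removes the old post-selection re-lookup and its retry when the device had vanished. A minimal user-space model of the new preference order follows; the types are simplified stand-ins, not the kernel's, and the real code additionally checks In_sync and known bad blocks under rcu_read_lock().

#include <stdbool.h>
#include <stddef.h>

struct rdev {
	bool faulty;
	unsigned long long recovery_offset;	/* sectors rebuilt so far */
};

struct mirror {
	struct rdev *rdev;		/* original device */
	struct rdev *replacement;	/* may be NULL */
};

/* Prefer the replacement, but only if it exists, is healthy, and has
 * been rebuilt past the end of the requested range; otherwise fall
 * back to the original device (which may itself be NULL).
 */
static struct rdev *choose_rdev(struct mirror *m,
				unsigned long long addr,
				unsigned long long sectors)
{
	struct rdev *r = m->replacement;

	if (r == NULL || r->faulty || addr + sectors > r->recovery_offset)
		r = m->rdev;
	return r;
}
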
@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf)
846static void make_request(struct mddev *mddev, struct bio * bio) 905static void make_request(struct mddev *mddev, struct bio * bio)
847{ 906{
848 struct r10conf *conf = mddev->private; 907 struct r10conf *conf = mddev->private;
849 struct mirror_info *mirror;
850 struct r10bio *r10_bio; 908 struct r10bio *r10_bio;
851 struct bio *read_bio; 909 struct bio *read_bio;
852 int i; 910 int i;
@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
945 /* 1003 /*
946 * read balancing logic: 1004 * read balancing logic:
947 */ 1005 */
948 int disk; 1006 struct md_rdev *rdev;
949 int slot; 1007 int slot;
950 1008
951read_again: 1009read_again:
952 disk = read_balance(conf, r10_bio, &max_sectors); 1010 rdev = read_balance(conf, r10_bio, &max_sectors);
953 slot = r10_bio->read_slot; 1011 if (!rdev) {
954 if (disk < 0) {
955 raid_end_bio_io(r10_bio); 1012 raid_end_bio_io(r10_bio);
956 return; 1013 return;
957 } 1014 }
958 mirror = conf->mirrors + disk; 1015 slot = r10_bio->read_slot;
959 1016
960 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1017 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
961 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, 1018 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
962 max_sectors); 1019 max_sectors);
963 1020
964 r10_bio->devs[slot].bio = read_bio; 1021 r10_bio->devs[slot].bio = read_bio;
1022 r10_bio->devs[slot].rdev = rdev;
965 1023
966 read_bio->bi_sector = r10_bio->devs[slot].addr + 1024 read_bio->bi_sector = r10_bio->devs[slot].addr +
967 mirror->rdev->data_offset; 1025 rdev->data_offset;
968 read_bio->bi_bdev = mirror->rdev->bdev; 1026 read_bio->bi_bdev = rdev->bdev;
969 read_bio->bi_end_io = raid10_end_read_request; 1027 read_bio->bi_end_io = raid10_end_read_request;
970 read_bio->bi_rw = READ | do_sync; 1028 read_bio->bi_rw = READ | do_sync;
971 read_bio->bi_private = r10_bio; 1029 read_bio->bi_private = r10_bio;
@@ -1025,6 +1083,7 @@ read_again:
1025 */ 1083 */
1026 plugged = mddev_check_plugged(mddev); 1084 plugged = mddev_check_plugged(mddev);
1027 1085
1086 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1028 raid10_find_phys(conf, r10_bio); 1087 raid10_find_phys(conf, r10_bio);
1029retry_write: 1088retry_write:
1030 blocked_rdev = NULL; 1089 blocked_rdev = NULL;
@@ -1034,12 +1093,25 @@ retry_write:
1034 for (i = 0; i < conf->copies; i++) { 1093 for (i = 0; i < conf->copies; i++) {
1035 int d = r10_bio->devs[i].devnum; 1094 int d = r10_bio->devs[i].devnum;
1036 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1095 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1096 struct md_rdev *rrdev = rcu_dereference(
1097 conf->mirrors[d].replacement);
1098 if (rdev == rrdev)
1099 rrdev = NULL;
1037 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1100 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1038 atomic_inc(&rdev->nr_pending); 1101 atomic_inc(&rdev->nr_pending);
1039 blocked_rdev = rdev; 1102 blocked_rdev = rdev;
1040 break; 1103 break;
1041 } 1104 }
1105 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1106 atomic_inc(&rrdev->nr_pending);
1107 blocked_rdev = rrdev;
1108 break;
1109 }
1110 if (rrdev && test_bit(Faulty, &rrdev->flags))
1111 rrdev = NULL;
1112
1042 r10_bio->devs[i].bio = NULL; 1113 r10_bio->devs[i].bio = NULL;
1114 r10_bio->devs[i].repl_bio = NULL;
1043 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1115 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1044 set_bit(R10BIO_Degraded, &r10_bio->state); 1116 set_bit(R10BIO_Degraded, &r10_bio->state);
1045 continue; 1117 continue;
@@ -1088,6 +1160,10 @@ retry_write:
1088 } 1160 }
1089 r10_bio->devs[i].bio = bio; 1161 r10_bio->devs[i].bio = bio;
1090 atomic_inc(&rdev->nr_pending); 1162 atomic_inc(&rdev->nr_pending);
1163 if (rrdev) {
1164 r10_bio->devs[i].repl_bio = bio;
1165 atomic_inc(&rrdev->nr_pending);
1166 }
1091 } 1167 }
1092 rcu_read_unlock(); 1168 rcu_read_unlock();
1093 1169
@@ -1096,11 +1172,23 @@ retry_write:
1096 int j; 1172 int j;
1097 int d; 1173 int d;
1098 1174
1099 for (j = 0; j < i; j++) 1175 for (j = 0; j < i; j++) {
1100 if (r10_bio->devs[j].bio) { 1176 if (r10_bio->devs[j].bio) {
1101 d = r10_bio->devs[j].devnum; 1177 d = r10_bio->devs[j].devnum;
1102 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1178 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1103 } 1179 }
1180 if (r10_bio->devs[j].repl_bio) {
1181 struct md_rdev *rdev;
1182 d = r10_bio->devs[j].devnum;
1183 rdev = conf->mirrors[d].replacement;
1184 if (!rdev) {
1185 /* Race with remove_disk */
1186 smp_mb();
1187 rdev = conf->mirrors[d].rdev;
1188 }
1189 rdev_dec_pending(rdev, mddev);
1190 }
1191 }
1104 allow_barrier(conf); 1192 allow_barrier(conf);
1105 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1193 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1106 wait_barrier(conf); 1194 wait_barrier(conf);
@@ -1147,6 +1235,31 @@ retry_write:
1147 bio_list_add(&conf->pending_bio_list, mbio); 1235 bio_list_add(&conf->pending_bio_list, mbio);
1148 conf->pending_count++; 1236 conf->pending_count++;
1149 spin_unlock_irqrestore(&conf->device_lock, flags); 1237 spin_unlock_irqrestore(&conf->device_lock, flags);
1238
1239 if (!r10_bio->devs[i].repl_bio)
1240 continue;
1241
1242 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1243 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1244 max_sectors);
1245 r10_bio->devs[i].repl_bio = mbio;
1246
1247 /* We are actively writing to the original device
1248 * so it cannot disappear, so the replacement cannot
1249 * become NULL here
1250 */
1251 mbio->bi_sector = (r10_bio->devs[i].addr+
1252 conf->mirrors[d].replacement->data_offset);
1253 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1254 mbio->bi_end_io = raid10_end_write_request;
1255 mbio->bi_rw = WRITE | do_sync | do_fua;
1256 mbio->bi_private = r10_bio;
1257
1258 atomic_inc(&r10_bio->remaining);
1259 spin_lock_irqsave(&conf->device_lock, flags);
1260 bio_list_add(&conf->pending_bio_list, mbio);
1261 conf->pending_count++;
1262 spin_unlock_irqrestore(&conf->device_lock, flags);
1150 } 1263 }
1151 1264
1152 /* Don't remove the bias on 'remaining' (one_write_done) until 1265 /* Don't remove the bias on 'remaining' (one_write_done) until
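
On the write path each copy can now fan out into two bios: the normal one, plus an optional clone (repl_bio) aimed at the slot's replacement and queued on the same pending list, so both land before the write completes. A rough self-contained model of that fan-out; submit() is a hypothetical stand-in for the bio_clone_mddev()/generic_make_request() pair.

#include <stdio.h>

struct rdev { const char *name; };

struct mirror {
	struct rdev *rdev;		/* original device */
	struct rdev *replacement;	/* optional, may be NULL */
};

/* Hypothetical stand-in for clone-and-queue of one bio. */
static void submit(struct rdev *r)
{
	printf("write -> %s\n", r->name);
}

static void write_all_copies(struct mirror *m, int copies)
{
	for (int i = 0; i < copies; i++) {
		if (m[i].rdev)
			submit(m[i].rdev);		/* normal copy */
		if (m[i].replacement)
			submit(m[i].replacement);	/* keep it in step */
	}
}

int main(void)
{
	struct rdev a = { "sda" }, b = { "sdb" }, spare = { "sdc" };
	struct mirror m[2] = { { &a, &spare }, { &b, NULL } };

	write_all_copies(m, 2);
	return 0;
}
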
@@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev)
1309 */ 1422 */
1310 for (i = 0; i < conf->raid_disks; i++) { 1423 for (i = 0; i < conf->raid_disks; i++) {
1311 tmp = conf->mirrors + i; 1424 tmp = conf->mirrors + i;
1312 if (tmp->rdev 1425 if (tmp->replacement
1313 && !test_bit(Faulty, &tmp->rdev->flags) 1426 && tmp->replacement->recovery_offset == MaxSector
1314 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 1427 && !test_bit(Faulty, &tmp->replacement->flags)
1428 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1429 /* Replacement has just become active */
1430 if (!tmp->rdev
1431 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1432 count++;
1433 if (tmp->rdev) {
1434 /* Replaced device not technically faulty,
1435 * but we need to be sure it gets removed
1436 * and never re-added.
1437 */
1438 set_bit(Faulty, &tmp->rdev->flags);
1439 sysfs_notify_dirent_safe(
1440 tmp->rdev->sysfs_state);
1441 }
1442 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1443 } else if (tmp->rdev
1444 && !test_bit(Faulty, &tmp->rdev->flags)
1445 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1315 count++; 1446 count++;
1316 sysfs_notify_dirent(tmp->rdev->sysfs_state); 1447 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1317 } 1448 }
@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1353 struct mirror_info *p = &conf->mirrors[mirror]; 1484 struct mirror_info *p = &conf->mirrors[mirror];
1354 if (p->recovery_disabled == mddev->recovery_disabled) 1485 if (p->recovery_disabled == mddev->recovery_disabled)
1355 continue; 1486 continue;
1356 if (p->rdev) 1487 if (p->rdev) {
1357 continue; 1488 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1489 p->replacement != NULL)
1490 continue;
1491 clear_bit(In_sync, &rdev->flags);
1492 set_bit(Replacement, &rdev->flags);
1493 rdev->raid_disk = mirror;
1494 err = 0;
1495 disk_stack_limits(mddev->gendisk, rdev->bdev,
1496 rdev->data_offset << 9);
1497 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1498 blk_queue_max_segments(mddev->queue, 1);
1499 blk_queue_segment_boundary(mddev->queue,
1500 PAGE_CACHE_SIZE - 1);
1501 }
1502 conf->fullsync = 1;
1503 rcu_assign_pointer(p->replacement, rdev);
1504 break;
1505 }
1358 1506
1359 disk_stack_limits(mddev->gendisk, rdev->bdev, 1507 disk_stack_limits(mddev->gendisk, rdev->bdev,
1360 rdev->data_offset << 9); 1508 rdev->data_offset << 9);
@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1385 return err; 1533 return err;
1386} 1534}
1387 1535
1388static int raid10_remove_disk(struct mddev *mddev, int number) 1536static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1389{ 1537{
1390 struct r10conf *conf = mddev->private; 1538 struct r10conf *conf = mddev->private;
1391 int err = 0; 1539 int err = 0;
1392 struct md_rdev *rdev; 1540 int number = rdev->raid_disk;
1393 struct mirror_info *p = conf->mirrors+ number; 1541 struct md_rdev **rdevp;
1542 struct mirror_info *p = conf->mirrors + number;
1394 1543
1395 print_conf(conf); 1544 print_conf(conf);
1396 rdev = p->rdev; 1545 if (rdev == p->rdev)
1397 if (rdev) { 1546 rdevp = &p->rdev;
1398 if (test_bit(In_sync, &rdev->flags) || 1547 else if (rdev == p->replacement)
1399 atomic_read(&rdev->nr_pending)) { 1548 rdevp = &p->replacement;
1400 err = -EBUSY; 1549 else
1401 goto abort; 1550 return 0;
1402 } 1551
1403 /* Only remove faulty devices in recovery 1552 if (test_bit(In_sync, &rdev->flags) ||
1404 * is not possible. 1553 atomic_read(&rdev->nr_pending)) {
1405 */ 1554 err = -EBUSY;
1406 if (!test_bit(Faulty, &rdev->flags) && 1555 goto abort;
1407 mddev->recovery_disabled != p->recovery_disabled &&
1408 enough(conf, -1)) {
1409 err = -EBUSY;
1410 goto abort;
1411 }
1412 p->rdev = NULL;
1413 synchronize_rcu();
1414 if (atomic_read(&rdev->nr_pending)) {
1415 /* lost the race, try later */
1416 err = -EBUSY;
1417 p->rdev = rdev;
1418 goto abort;
1419 }
1420 err = md_integrity_register(mddev);
1421 } 1556 }
1557 /* Only remove faulty devices if recovery
1558 * is not possible.
1559 */
1560 if (!test_bit(Faulty, &rdev->flags) &&
1561 mddev->recovery_disabled != p->recovery_disabled &&
1562 (!p->replacement || p->replacement == rdev) &&
1563 enough(conf, -1)) {
1564 err = -EBUSY;
1565 goto abort;
1566 }
1567 *rdevp = NULL;
1568 synchronize_rcu();
1569 if (atomic_read(&rdev->nr_pending)) {
1570 /* lost the race, try later */
1571 err = -EBUSY;
1572 *rdevp = rdev;
1573 goto abort;
1574 } else if (p->replacement) {
1575 /* We must have just cleared 'rdev' */
1576 p->rdev = p->replacement;
1577 clear_bit(Replacement, &p->replacement->flags);
 1578 smp_mb(); /* Make sure other CPUs can see both as identical
1579 * but will never see neither -- if they are careful.
1580 */
1581 p->replacement = NULL;
1582 clear_bit(WantReplacement, &rdev->flags);
1583 } else
 1584 /* We might have just removed the Replacement as faulty
1585 * Clear the flag just in case
1586 */
1587 clear_bit(WantReplacement, &rdev->flags);
1588
1589 err = md_integrity_register(mddev);
1590
1422abort: 1591abort:
1423 1592
1424 print_conf(conf); 1593 print_conf(conf);
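
The interesting part of the rewritten raid10_remove_disk() is the hand-over: when the original rdev is removed and a replacement exists, both pointers briefly name the surviving device before the secondary slot is cleared, with a barrier in between, so a lock-free reader may observe the device twice but never an empty slot. A sketch of that ordering using C11 atomics in place of the kernel's smp_mb(); this is a model of the idea, not the kernel code.

#include <stdatomic.h>
#include <stddef.h>

struct rdev;

struct mirror {
	_Atomic(struct rdev *) rdev;		/* primary slot */
	_Atomic(struct rdev *) replacement;	/* secondary slot, may be NULL */
};

/* Make both slots name the surviving device, fence, then clear the
 * secondary.  A reader that loads one slot and falls back to the
 * other may see the same device twice, but never sees neither.
 */
static void promote_replacement(struct mirror *p)
{
	struct rdev *r = atomic_load(&p->replacement);

	atomic_store(&p->rdev, r);
	atomic_thread_fence(memory_order_seq_cst);	/* role of smp_mb() */
	atomic_store(&p->replacement, NULL);
}
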
@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error)
1432 struct r10conf *conf = r10_bio->mddev->private; 1601 struct r10conf *conf = r10_bio->mddev->private;
1433 int d; 1602 int d;
1434 1603
1435 d = find_bio_disk(conf, r10_bio, bio, NULL); 1604 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1436 1605
1437 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1606 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1438 set_bit(R10BIO_Uptodate, &r10_bio->state); 1607 set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error)
1493 sector_t first_bad; 1662 sector_t first_bad;
1494 int bad_sectors; 1663 int bad_sectors;
1495 int slot; 1664 int slot;
1496 1665 int repl;
1497 d = find_bio_disk(conf, r10_bio, bio, &slot); 1666 struct md_rdev *rdev = NULL;
1667
1668 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1669 if (repl)
1670 rdev = conf->mirrors[d].replacement;
1671 if (!rdev) {
1672 smp_mb();
1673 rdev = conf->mirrors[d].rdev;
1674 }
1498 1675
1499 if (!uptodate) { 1676 if (!uptodate) {
1500 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); 1677 if (repl)
1501 set_bit(R10BIO_WriteError, &r10_bio->state); 1678 md_error(mddev, rdev);
1502 } else if (is_badblock(conf->mirrors[d].rdev, 1679 else {
1680 set_bit(WriteErrorSeen, &rdev->flags);
1681 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1682 set_bit(MD_RECOVERY_NEEDED,
1683 &rdev->mddev->recovery);
1684 set_bit(R10BIO_WriteError, &r10_bio->state);
1685 }
1686 } else if (is_badblock(rdev,
1503 r10_bio->devs[slot].addr, 1687 r10_bio->devs[slot].addr,
1504 r10_bio->sectors, 1688 r10_bio->sectors,
1505 &first_bad, &bad_sectors)) 1689 &first_bad, &bad_sectors))
1506 set_bit(R10BIO_MadeGood, &r10_bio->state); 1690 set_bit(R10BIO_MadeGood, &r10_bio->state);
1507 1691
1508 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1692 rdev_dec_pending(rdev, mddev);
1509 1693
1510 end_sync_request(r10_bio); 1694 end_sync_request(r10_bio);
1511} 1695}
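
end_sync_write() also shows the error-handling idiom this series adds throughout raid1/raid10/raid5: the first write error on a device sets WantReplacement and kicks MD_RECOVERY_NEEDED, so the recovery thread can draft a spare as that device's replacement. The idiom modelled in plain C, with booleans standing in for the atomic bit operations:

#include <stdbool.h>

struct rdev {
	bool write_error_seen;
	bool want_replacement;
};

struct mddev {
	bool recovery_needed;
};

/* First write error on a device: remember it, ask for a replacement,
 * and wake the recovery thread exactly once.
 */
static void note_write_error(struct mddev *m, struct rdev *r)
{
	r->write_error_seen = true;
	if (!r->want_replacement) {
		r->want_replacement = true;
		m->recovery_needed = true;	/* kick md_check_recovery() */
	}
}
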
@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1609 generic_make_request(tbio); 1793 generic_make_request(tbio);
1610 } 1794 }
1611 1795
1796 /* Now write out to any replacement devices
1797 * that are active
1798 */
1799 for (i = 0; i < conf->copies; i++) {
1800 int j, d;
1801 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1802
1803 tbio = r10_bio->devs[i].repl_bio;
1804 if (!tbio || !tbio->bi_end_io)
1805 continue;
1806 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
1807 && r10_bio->devs[i].bio != fbio)
1808 for (j = 0; j < vcnt; j++)
1809 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1810 page_address(fbio->bi_io_vec[j].bv_page),
1811 PAGE_SIZE);
1812 d = r10_bio->devs[i].devnum;
1813 atomic_inc(&r10_bio->remaining);
1814 md_sync_acct(conf->mirrors[d].replacement->bdev,
1815 tbio->bi_size >> 9);
1816 generic_make_request(tbio);
1817 }
1818
1612done: 1819done:
1613 if (atomic_dec_and_test(&r10_bio->remaining)) { 1820 if (atomic_dec_and_test(&r10_bio->remaining)) {
1614 md_done_sync(mddev, r10_bio->sectors, 1); 1821 md_done_sync(mddev, r10_bio->sectors, 1);
@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
1668 s << 9, 1875 s << 9,
1669 bio->bi_io_vec[idx].bv_page, 1876 bio->bi_io_vec[idx].bv_page,
1670 WRITE, false); 1877 WRITE, false);
1671 if (!ok) 1878 if (!ok) {
1672 set_bit(WriteErrorSeen, &rdev->flags); 1879 set_bit(WriteErrorSeen, &rdev->flags);
1880 if (!test_and_set_bit(WantReplacement,
1881 &rdev->flags))
1882 set_bit(MD_RECOVERY_NEEDED,
1883 &rdev->mddev->recovery);
1884 }
1673 } 1885 }
1674 if (!ok) { 1886 if (!ok) {
1675 /* We don't worry if we cannot set a bad block - 1887 /* We don't worry if we cannot set a bad block -
@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1709{ 1921{
1710 struct r10conf *conf = mddev->private; 1922 struct r10conf *conf = mddev->private;
1711 int d; 1923 int d;
1712 struct bio *wbio; 1924 struct bio *wbio, *wbio2;
1713 1925
1714 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 1926 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1715 fix_recovery_read_error(r10_bio); 1927 fix_recovery_read_error(r10_bio);
@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1721 * share the pages with the first bio 1933 * share the pages with the first bio
1722 * and submit the write request 1934 * and submit the write request
1723 */ 1935 */
1724 wbio = r10_bio->devs[1].bio;
1725 d = r10_bio->devs[1].devnum; 1936 d = r10_bio->devs[1].devnum;
1726 1937 wbio = r10_bio->devs[1].bio;
1727 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1938 wbio2 = r10_bio->devs[1].repl_bio;
1728 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1939 if (wbio->bi_end_io) {
1729 generic_make_request(wbio); 1940 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1941 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1942 generic_make_request(wbio);
1943 }
1944 if (wbio2 && wbio2->bi_end_io) {
1945 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
1946 md_sync_acct(conf->mirrors[d].replacement->bdev,
1947 wbio2->bi_size >> 9);
1948 generic_make_request(wbio2);
1949 }
1730} 1950}
1731 1951
1732 1952
@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
1779 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 1999 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1780 /* success */ 2000 /* success */
1781 return 1; 2001 return 1;
1782 if (rw == WRITE) 2002 if (rw == WRITE) {
1783 set_bit(WriteErrorSeen, &rdev->flags); 2003 set_bit(WriteErrorSeen, &rdev->flags);
2004 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2005 set_bit(MD_RECOVERY_NEEDED,
2006 &rdev->mddev->recovery);
2007 }
1784 /* need to record an error - either for the block or the device */ 2008 /* need to record an error - either for the block or the device */
1785 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2009 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1786 md_error(rdev->mddev, rdev); 2010 md_error(rdev->mddev, rdev);
@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2060static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2284static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2061{ 2285{
2062 int slot = r10_bio->read_slot; 2286 int slot = r10_bio->read_slot;
2063 int mirror = r10_bio->devs[slot].devnum;
2064 struct bio *bio; 2287 struct bio *bio;
2065 struct r10conf *conf = mddev->private; 2288 struct r10conf *conf = mddev->private;
2066 struct md_rdev *rdev; 2289 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2067 char b[BDEVNAME_SIZE]; 2290 char b[BDEVNAME_SIZE];
2068 unsigned long do_sync; 2291 unsigned long do_sync;
2069 int max_sectors; 2292 int max_sectors;
@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2081 fix_read_error(conf, mddev, r10_bio); 2304 fix_read_error(conf, mddev, r10_bio);
2082 unfreeze_array(conf); 2305 unfreeze_array(conf);
2083 } 2306 }
2084 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); 2307 rdev_dec_pending(rdev, mddev);
2085 2308
2086 bio = r10_bio->devs[slot].bio; 2309 bio = r10_bio->devs[slot].bio;
2087 bdevname(bio->bi_bdev, b); 2310 bdevname(bio->bi_bdev, b);
2088 r10_bio->devs[slot].bio = 2311 r10_bio->devs[slot].bio =
2089 mddev->ro ? IO_BLOCKED : NULL; 2312 mddev->ro ? IO_BLOCKED : NULL;
2090read_more: 2313read_more:
2091 mirror = read_balance(conf, r10_bio, &max_sectors); 2314 rdev = read_balance(conf, r10_bio, &max_sectors);
2092 if (mirror == -1) { 2315 if (rdev == NULL) {
2093 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 2316 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2094 " read error for block %llu\n", 2317 " read error for block %llu\n",
2095 mdname(mddev), b, 2318 mdname(mddev), b,
@@ -2103,7 +2326,6 @@ read_more:
2103 if (bio) 2326 if (bio)
2104 bio_put(bio); 2327 bio_put(bio);
2105 slot = r10_bio->read_slot; 2328 slot = r10_bio->read_slot;
2106 rdev = conf->mirrors[mirror].rdev;
2107 printk_ratelimited( 2329 printk_ratelimited(
2108 KERN_ERR 2330 KERN_ERR
2109 "md/raid10:%s: %s: redirecting" 2331 "md/raid10:%s: %s: redirecting"
@@ -2117,6 +2339,7 @@ read_more:
2117 r10_bio->sector - bio->bi_sector, 2339 r10_bio->sector - bio->bi_sector,
2118 max_sectors); 2340 max_sectors);
2119 r10_bio->devs[slot].bio = bio; 2341 r10_bio->devs[slot].bio = bio;
2342 r10_bio->devs[slot].rdev = rdev;
2120 bio->bi_sector = r10_bio->devs[slot].addr 2343 bio->bi_sector = r10_bio->devs[slot].addr
2121 + rdev->data_offset; 2344 + rdev->data_offset;
2122 bio->bi_bdev = rdev->bdev; 2345 bio->bi_bdev = rdev->bdev;
@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2187 r10_bio->sectors, 0)) 2410 r10_bio->sectors, 0))
2188 md_error(conf->mddev, rdev); 2411 md_error(conf->mddev, rdev);
2189 } 2412 }
2413 rdev = conf->mirrors[dev].replacement;
2414 if (r10_bio->devs[m].repl_bio == NULL)
2415 continue;
2416 if (test_bit(BIO_UPTODATE,
2417 &r10_bio->devs[m].repl_bio->bi_flags)) {
2418 rdev_clear_badblocks(
2419 rdev,
2420 r10_bio->devs[m].addr,
2421 r10_bio->sectors);
2422 } else {
2423 if (!rdev_set_badblocks(
2424 rdev,
2425 r10_bio->devs[m].addr,
2426 r10_bio->sectors, 0))
2427 md_error(conf->mddev, rdev);
2428 }
2190 } 2429 }
2191 put_buf(r10_bio); 2430 put_buf(r10_bio);
2192 } else { 2431 } else {
@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2209 } 2448 }
2210 rdev_dec_pending(rdev, conf->mddev); 2449 rdev_dec_pending(rdev, conf->mddev);
2211 } 2450 }
2451 bio = r10_bio->devs[m].repl_bio;
2452 rdev = conf->mirrors[dev].replacement;
2453 if (rdev && bio == IO_MADE_GOOD) {
2454 rdev_clear_badblocks(
2455 rdev,
2456 r10_bio->devs[m].addr,
2457 r10_bio->sectors);
2458 rdev_dec_pending(rdev, conf->mddev);
2459 }
2212 } 2460 }
2213 if (test_bit(R10BIO_WriteError, 2461 if (test_bit(R10BIO_WriteError,
2214 &r10_bio->state)) 2462 &r10_bio->state))
@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev)
2272static int init_resync(struct r10conf *conf) 2520static int init_resync(struct r10conf *conf)
2273{ 2521{
2274 int buffs; 2522 int buffs;
2523 int i;
2275 2524
2276 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2525 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2277 BUG_ON(conf->r10buf_pool); 2526 BUG_ON(conf->r10buf_pool);
2527 conf->have_replacement = 0;
2528 for (i = 0; i < conf->raid_disks; i++)
2529 if (conf->mirrors[i].replacement)
2530 conf->have_replacement = 1;
2278 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2531 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2279 if (!conf->r10buf_pool) 2532 if (!conf->r10buf_pool)
2280 return -ENOMEM; 2533 return -ENOMEM;
@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2355 bitmap_end_sync(mddev->bitmap, sect, 2608 bitmap_end_sync(mddev->bitmap, sect,
2356 &sync_blocks, 1); 2609 &sync_blocks, 1);
2357 } 2610 }
2358 } else /* completed sync */ 2611 } else {
2612 /* completed sync */
2613 if ((!mddev->bitmap || conf->fullsync)
2614 && conf->have_replacement
2615 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2616 /* Completed a full sync so the replacements
2617 * are now fully recovered.
2618 */
2619 for (i = 0; i < conf->raid_disks; i++)
2620 if (conf->mirrors[i].replacement)
2621 conf->mirrors[i].replacement
2622 ->recovery_offset
2623 = MaxSector;
2624 }
2359 conf->fullsync = 0; 2625 conf->fullsync = 0;
2360 2626 }
2361 bitmap_close_sync(mddev->bitmap); 2627 bitmap_close_sync(mddev->bitmap);
2362 close_sync(conf); 2628 close_sync(conf);
2363 *skipped = 1; 2629 *skipped = 1;
@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2414 sector_t sect; 2680 sector_t sect;
2415 int must_sync; 2681 int must_sync;
2416 int any_working; 2682 int any_working;
2417 2683 struct mirror_info *mirror = &conf->mirrors[i];
2418 if (conf->mirrors[i].rdev == NULL || 2684
2419 test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 2685 if ((mirror->rdev == NULL ||
2686 test_bit(In_sync, &mirror->rdev->flags))
2687 &&
2688 (mirror->replacement == NULL ||
2689 test_bit(Faulty,
2690 &mirror->replacement->flags)))
2420 continue; 2691 continue;
2421 2692
2422 still_degraded = 0; 2693 still_degraded = 0;
2423 /* want to reconstruct this device */ 2694 /* want to reconstruct this device */
2424 rb2 = r10_bio; 2695 rb2 = r10_bio;
2425 sect = raid10_find_virt(conf, sector_nr, i); 2696 sect = raid10_find_virt(conf, sector_nr, i);
 2426 /* Unless we are doing a full sync, we only need 2697 /* Unless we are doing a full sync, or a replacement,
2427 * to recover the block if it is set in the bitmap 2698 * we only need to recover the block if it is set in
2699 * the bitmap
2428 */ 2700 */
2429 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2701 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2430 &sync_blocks, 1); 2702 &sync_blocks, 1);
2431 if (sync_blocks < max_sync) 2703 if (sync_blocks < max_sync)
2432 max_sync = sync_blocks; 2704 max_sync = sync_blocks;
2433 if (!must_sync && 2705 if (!must_sync &&
2706 mirror->replacement == NULL &&
2434 !conf->fullsync) { 2707 !conf->fullsync) {
2435 /* yep, skip the sync_blocks here, but don't assume 2708 /* yep, skip the sync_blocks here, but don't assume
2436 * that there will never be anything to do here 2709 * that there will never be anything to do here
@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2500 bio->bi_end_io = end_sync_read; 2773 bio->bi_end_io = end_sync_read;
2501 bio->bi_rw = READ; 2774 bio->bi_rw = READ;
2502 from_addr = r10_bio->devs[j].addr; 2775 from_addr = r10_bio->devs[j].addr;
2503 bio->bi_sector = from_addr + 2776 bio->bi_sector = from_addr + rdev->data_offset;
2504 conf->mirrors[d].rdev->data_offset; 2777 bio->bi_bdev = rdev->bdev;
2505 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2778 atomic_inc(&rdev->nr_pending);
2506 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2779 /* and we write to 'i' (if not in_sync) */
2507 atomic_inc(&r10_bio->remaining);
2508 /* and we write to 'i' */
2509 2780
2510 for (k=0; k<conf->copies; k++) 2781 for (k=0; k<conf->copies; k++)
2511 if (r10_bio->devs[k].devnum == i) 2782 if (r10_bio->devs[k].devnum == i)
2512 break; 2783 break;
2513 BUG_ON(k == conf->copies); 2784 BUG_ON(k == conf->copies);
2514 bio = r10_bio->devs[1].bio;
2515 bio->bi_next = biolist;
2516 biolist = bio;
2517 bio->bi_private = r10_bio;
2518 bio->bi_end_io = end_sync_write;
2519 bio->bi_rw = WRITE;
2520 to_addr = r10_bio->devs[k].addr; 2785 to_addr = r10_bio->devs[k].addr;
2521 bio->bi_sector = to_addr +
2522 conf->mirrors[i].rdev->data_offset;
2523 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
2524
2525 r10_bio->devs[0].devnum = d; 2786 r10_bio->devs[0].devnum = d;
2526 r10_bio->devs[0].addr = from_addr; 2787 r10_bio->devs[0].addr = from_addr;
2527 r10_bio->devs[1].devnum = i; 2788 r10_bio->devs[1].devnum = i;
2528 r10_bio->devs[1].addr = to_addr; 2789 r10_bio->devs[1].addr = to_addr;
2529 2790
2791 rdev = mirror->rdev;
2792 if (!test_bit(In_sync, &rdev->flags)) {
2793 bio = r10_bio->devs[1].bio;
2794 bio->bi_next = biolist;
2795 biolist = bio;
2796 bio->bi_private = r10_bio;
2797 bio->bi_end_io = end_sync_write;
2798 bio->bi_rw = WRITE;
2799 bio->bi_sector = to_addr
2800 + rdev->data_offset;
2801 bio->bi_bdev = rdev->bdev;
2802 atomic_inc(&r10_bio->remaining);
2803 } else
2804 r10_bio->devs[1].bio->bi_end_io = NULL;
2805
2806 /* and maybe write to replacement */
2807 bio = r10_bio->devs[1].repl_bio;
2808 if (bio)
2809 bio->bi_end_io = NULL;
2810 rdev = mirror->replacement;
2811 /* Note: if rdev != NULL, then bio
2812 * cannot be NULL as r10buf_pool_alloc will
2813 * have allocated it.
2814 * So the second test here is pointless.
2815 * But it keeps semantic-checkers happy, and
2816 * this comment keeps human reviewers
2817 * happy.
2818 */
2819 if (rdev == NULL || bio == NULL ||
2820 test_bit(Faulty, &rdev->flags))
2821 break;
2822 bio->bi_next = biolist;
2823 biolist = bio;
2824 bio->bi_private = r10_bio;
2825 bio->bi_end_io = end_sync_write;
2826 bio->bi_rw = WRITE;
2827 bio->bi_sector = to_addr + rdev->data_offset;
2828 bio->bi_bdev = rdev->bdev;
2829 atomic_inc(&r10_bio->remaining);
2530 break; 2830 break;
2531 } 2831 }
2532 if (j == conf->copies) { 2832 if (j == conf->copies) {
@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2544 for (k = 0; k < conf->copies; k++) 2844 for (k = 0; k < conf->copies; k++)
2545 if (r10_bio->devs[k].devnum == i) 2845 if (r10_bio->devs[k].devnum == i)
2546 break; 2846 break;
2547 if (!rdev_set_badblocks( 2847 if (!test_bit(In_sync,
2548 conf->mirrors[i].rdev, 2848 &mirror->rdev->flags)
2849 && !rdev_set_badblocks(
2850 mirror->rdev,
2851 r10_bio->devs[k].addr,
2852 max_sync, 0))
2853 any_working = 0;
2854 if (mirror->replacement &&
2855 !rdev_set_badblocks(
2856 mirror->replacement,
2549 r10_bio->devs[k].addr, 2857 r10_bio->devs[k].addr,
2550 max_sync, 0)) 2858 max_sync, 0))
2551 any_working = 0; 2859 any_working = 0;
@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2556 printk(KERN_INFO "md/raid10:%s: insufficient " 2864 printk(KERN_INFO "md/raid10:%s: insufficient "
2557 "working devices for recovery.\n", 2865 "working devices for recovery.\n",
2558 mdname(mddev)); 2866 mdname(mddev));
2559 conf->mirrors[i].recovery_disabled 2867 mirror->recovery_disabled
2560 = mddev->recovery_disabled; 2868 = mddev->recovery_disabled;
2561 } 2869 }
2562 break; 2870 break;
@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2605 sector_t first_bad, sector; 2913 sector_t first_bad, sector;
2606 int bad_sectors; 2914 int bad_sectors;
2607 2915
2916 if (r10_bio->devs[i].repl_bio)
2917 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
2918
2608 bio = r10_bio->devs[i].bio; 2919 bio = r10_bio->devs[i].bio;
2609 bio->bi_end_io = NULL; 2920 bio->bi_end_io = NULL;
2610 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2921 clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2635 conf->mirrors[d].rdev->data_offset; 2946 conf->mirrors[d].rdev->data_offset;
2636 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2947 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
2637 count++; 2948 count++;
2949
2950 if (conf->mirrors[d].replacement == NULL ||
2951 test_bit(Faulty,
2952 &conf->mirrors[d].replacement->flags))
2953 continue;
2954
2955 /* Need to set up for writing to the replacement */
2956 bio = r10_bio->devs[i].repl_bio;
2957 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2958
2959 sector = r10_bio->devs[i].addr;
2960 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2961 bio->bi_next = biolist;
2962 biolist = bio;
2963 bio->bi_private = r10_bio;
2964 bio->bi_end_io = end_sync_write;
2965 bio->bi_rw = WRITE;
2966 bio->bi_sector = sector +
2967 conf->mirrors[d].replacement->data_offset;
2968 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
2969 count++;
2638 } 2970 }
2639 2971
2640 if (count < 2) { 2972 if (count < 2) {
@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2643 if (r10_bio->devs[i].bio->bi_end_io) 2975 if (r10_bio->devs[i].bio->bi_end_io)
2644 rdev_dec_pending(conf->mirrors[d].rdev, 2976 rdev_dec_pending(conf->mirrors[d].rdev,
2645 mddev); 2977 mddev);
2978 if (r10_bio->devs[i].repl_bio &&
2979 r10_bio->devs[i].repl_bio->bi_end_io)
2980 rdev_dec_pending(
2981 conf->mirrors[d].replacement,
2982 mddev);
2646 } 2983 }
2647 put_buf(r10_bio); 2984 put_buf(r10_bio);
2648 biolist = NULL; 2985 biolist = NULL;
@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev)
2896 continue; 3233 continue;
2897 disk = conf->mirrors + disk_idx; 3234 disk = conf->mirrors + disk_idx;
2898 3235
3236 if (test_bit(Replacement, &rdev->flags)) {
3237 if (disk->replacement)
3238 goto out_free_conf;
3239 disk->replacement = rdev;
3240 } else {
3241 if (disk->rdev)
3242 goto out_free_conf;
3243 disk->rdev = rdev;
3244 }
3245
2899 disk->rdev = rdev; 3246 disk->rdev = rdev;
2900 disk_stack_limits(mddev->gendisk, rdev->bdev, 3247 disk_stack_limits(mddev->gendisk, rdev->bdev,
2901 rdev->data_offset << 9); 3248 rdev->data_offset << 9);
@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev)
2923 3270
2924 disk = conf->mirrors + i; 3271 disk = conf->mirrors + i;
2925 3272
3273 if (!disk->rdev && disk->replacement) {
3274 /* The replacement is all we have - use it */
3275 disk->rdev = disk->replacement;
3276 disk->replacement = NULL;
3277 clear_bit(Replacement, &disk->rdev->flags);
3278 }
3279
2926 if (!disk->rdev || 3280 if (!disk->rdev ||
2927 !test_bit(In_sync, &disk->rdev->flags)) { 3281 !test_bit(In_sync, &disk->rdev->flags)) {
2928 disk->head_position = 0; 3282 disk->head_position = 0;
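
At assembly time run() now routes each incoming device by its Replacement flag, and a surviving replacement whose original is gone is promoted so the array can still start. A compact model of that placement and promotion; the flag and slot names are simplified stand-ins.

#include <stdbool.h>
#include <stddef.h>

struct rdev {
	bool replacement_flag;	/* stands in for test_bit(Replacement, ...) */
};

struct mirror {
	struct rdev *rdev;
	struct rdev *replacement;
};

/* Route a device to the primary or secondary slot; refuse duplicates. */
static bool place_device(struct mirror *m, struct rdev *r)
{
	struct rdev **slot = r->replacement_flag ? &m->replacement : &m->rdev;

	if (*slot)
		return false;
	*slot = r;
	return true;
}

/* If only the replacement survived, promote it before starting. */
static void settle(struct mirror *m)
{
	if (!m->rdev && m->replacement) {
		m->rdev = m->replacement;
		m->replacement = NULL;
		m->rdev->replacement_flag = false;
	}
}
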
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 7facfdf841f4..7c615613c381 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,7 +2,7 @@
2#define _RAID10_H 2#define _RAID10_H
3 3
4struct mirror_info { 4struct mirror_info {
5 struct md_rdev *rdev; 5 struct md_rdev *rdev, *replacement;
6 sector_t head_position; 6 sector_t head_position;
7 int recovery_disabled; /* matches 7 int recovery_disabled; /* matches
8 * mddev->recovery_disabled 8 * mddev->recovery_disabled
@@ -18,12 +18,13 @@ struct r10conf {
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
21 int near_copies; /* number of copies laid out raid0 style */ 21 int near_copies; /* number of copies laid out
22 * raid0 style */
22 int far_copies; /* number of copies laid out 23 int far_copies; /* number of copies laid out
23 * at large strides across drives 24 * at large strides across drives
24 */ 25 */
25 int far_offset; /* far_copies are offset by 1 stripe 26 int far_offset; /* far_copies are offset by 1
26 * instead of many 27 * stripe instead of many
27 */ 28 */
28 int copies; /* near_copies * far_copies. 29 int copies; /* near_copies * far_copies.
29 * must be <= raid_disks 30 * must be <= raid_disks
@@ -34,10 +35,11 @@ struct r10conf {
34 * 1 stripe. 35 * 1 stripe.
35 */ 36 */
36 37
37 sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ 38 sector_t dev_sectors; /* temp copy of
39 * mddev->dev_sectors */
38 40
39 int chunk_shift; /* shift from chunks to sectors */ 41 int chunk_shift; /* shift from chunks to sectors */
40 sector_t chunk_mask; 42 sector_t chunk_mask;
41 43
42 struct list_head retry_list; 44 struct list_head retry_list;
43 /* queue pending writes and submit them on unplug */ 45 /* queue pending writes and submit them on unplug */
@@ -45,20 +47,22 @@ struct r10conf {
45 int pending_count; 47 int pending_count;
46 48
47 spinlock_t resync_lock; 49 spinlock_t resync_lock;
48 int nr_pending; 50 int nr_pending;
49 int nr_waiting; 51 int nr_waiting;
50 int nr_queued; 52 int nr_queued;
51 int barrier; 53 int barrier;
52 sector_t next_resync; 54 sector_t next_resync;
53 int fullsync; /* set to 1 if a full sync is needed, 55 int fullsync; /* set to 1 if a full sync is needed,
54 * (fresh device added). 56 * (fresh device added).
55 * Cleared when a sync completes. 57 * Cleared when a sync completes.
56 */ 58 */
57 59 int have_replacement; /* There is at least one
60 * replacement device.
61 */
58 wait_queue_head_t wait_barrier; 62 wait_queue_head_t wait_barrier;
59 63
60 mempool_t *r10bio_pool; 64 mempool_t *r10bio_pool;
61 mempool_t *r10buf_pool; 65 mempool_t *r10buf_pool;
62 struct page *tmppage; 66 struct page *tmppage;
63 67
64 /* When taking over an array from a different personality, we store 68 /* When taking over an array from a different personality, we store
@@ -98,11 +102,18 @@ struct r10bio {
98 * When resyncing we also use one for each copy. 102 * When resyncing we also use one for each copy.
99 * When reconstructing, we use 2 bios, one for read, one for write. 103 * When reconstructing, we use 2 bios, one for read, one for write.
100 * We choose the number when they are allocated. 104 * We choose the number when they are allocated.
105 * We sometimes need an extra bio to write to the replacement.
101 */ 106 */
102 struct { 107 struct {
103 struct bio *bio; 108 struct bio *bio;
104 sector_t addr; 109 union {
105 int devnum; 110 struct bio *repl_bio; /* used for resync and
111 * writes */
112 struct md_rdev *rdev; /* used for reads
113 * (read_slot >= 0) */
114 };
115 sector_t addr;
116 int devnum;
106 } devs[0]; 117 } devs[0];
107}; 118};
108 119
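
The union added to the per-copy slot works because the two members are never live at once: a read records the rdev chosen by read_balance() (read_slot >= 0), while a write or resync records the optional replacement bio, and make_request() sets read_slot to -1 on the write path precisely so cleanup knows repl_bio is the live member. Restated as a standalone declaration for clarity (sector_t replaced by a plain integer):

#include <stddef.h>

struct bio;
struct md_rdev;

struct r10dev {
	struct bio *bio;
	union {
		struct bio *repl_bio;	/* live for writes and resync */
		struct md_rdev *rdev;	/* live for reads (read_slot >= 0) */
	};
	unsigned long long addr;	/* sector_t in the kernel */
	int devnum;
};
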
@@ -121,17 +132,19 @@ struct r10bio {
121#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 132#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
122 133
123/* bits for r10bio.state */ 134/* bits for r10bio.state */
124#define R10BIO_Uptodate 0 135enum r10bio_state {
125#define R10BIO_IsSync 1 136 R10BIO_Uptodate,
126#define R10BIO_IsRecover 2 137 R10BIO_IsSync,
127#define R10BIO_Degraded 3 138 R10BIO_IsRecover,
139 R10BIO_Degraded,
128/* Set ReadError on bios that experience a read error 140/* Set ReadError on bios that experience a read error
129 * so that raid10d knows what to do with them. 141 * so that raid10d knows what to do with them.
130 */ 142 */
131#define R10BIO_ReadError 4 143 R10BIO_ReadError,
132/* If a write for this request means we can clear some 144/* If a write for this request means we can clear some
133 * known-bad-block records, we set this flag. 145 * known-bad-block records, we set this flag.
134 */ 146 */
135#define R10BIO_MadeGood 5 147 R10BIO_MadeGood,
136#define R10BIO_WriteError 6 148 R10BIO_WriteError,
149};
137#endif 150#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 858fdbb7eb07..360f2b98f62b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
370 * of the two sections, and some non-in_sync devices may 370 * of the two sections, and some non-in_sync devices may
371 * be insync in the section most affected by failed devices. 371 * be insync in the section most affected by failed devices.
372 */ 372 */
373static int has_failed(struct r5conf *conf) 373static int calc_degraded(struct r5conf *conf)
374{ 374{
375 int degraded; 375 int degraded, degraded2;
376 int i; 376 int i;
377 if (conf->mddev->reshape_position == MaxSector)
378 return conf->mddev->degraded > conf->max_degraded;
379 377
380 rcu_read_lock(); 378 rcu_read_lock();
381 degraded = 0; 379 degraded = 0;
@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf)
399 degraded++; 397 degraded++;
400 } 398 }
401 rcu_read_unlock(); 399 rcu_read_unlock();
402 if (degraded > conf->max_degraded) 400 if (conf->raid_disks == conf->previous_raid_disks)
403 return 1; 401 return degraded;
404 rcu_read_lock(); 402 rcu_read_lock();
405 degraded = 0; 403 degraded2 = 0;
406 for (i = 0; i < conf->raid_disks; i++) { 404 for (i = 0; i < conf->raid_disks; i++) {
407 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 405 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
408 if (!rdev || test_bit(Faulty, &rdev->flags)) 406 if (!rdev || test_bit(Faulty, &rdev->flags))
409 degraded++; 407 degraded2++;
410 else if (test_bit(In_sync, &rdev->flags)) 408 else if (test_bit(In_sync, &rdev->flags))
411 ; 409 ;
412 else 410 else
@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf)
416 * almost certainly hasn't. 414 * almost certainly hasn't.
417 */ 415 */
418 if (conf->raid_disks <= conf->previous_raid_disks) 416 if (conf->raid_disks <= conf->previous_raid_disks)
419 degraded++; 417 degraded2++;
420 } 418 }
421 rcu_read_unlock(); 419 rcu_read_unlock();
420 if (degraded2 > degraded)
421 return degraded2;
422 return degraded;
423}
424
425static int has_failed(struct r5conf *conf)
426{
427 int degraded;
428
429 if (conf->mddev->reshape_position == MaxSector)
430 return conf->mddev->degraded > conf->max_degraded;
431
432 degraded = calc_degraded(conf);
422 if (degraded > conf->max_degraded) 433 if (degraded > conf->max_degraded)
423 return 1; 434 return 1;
424 return 0; 435 return 0;
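
calc_degraded() splits the counting out of has_failed() so md_error() can reuse it: while a reshape is in flight the array has two widths, so the failed-device count is taken over both the previous and the new geometry and the worse value wins. A simplified model; the real function is subtler about devices that the reshape itself is still rebuilding.

#include <stdbool.h>

struct disk {
	bool present;
	bool faulty;
	bool in_sync;
};

static int count_degraded(const struct disk *d, int ndisks)
{
	int degraded = 0;

	for (int i = 0; i < ndisks; i++)
		if (!d[i].present || d[i].faulty || !d[i].in_sync)
			degraded++;	/* still-recovering counts as missing */
	return degraded;
}

/* A stripe may live in either section of a reshaping array, so report
 * the worse of the two counts.
 */
static int calc_degraded_model(const struct disk *d,
			       int prev_disks, int new_disks)
{
	int a = count_degraded(d, prev_disks);
	int b;

	if (prev_disks == new_disks)
		return a;
	b = count_degraded(d, new_disks);
	return b > a ? b : a;
}
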
@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
492 503
493 for (i = disks; i--; ) { 504 for (i = disks; i--; ) {
494 int rw; 505 int rw;
495 struct bio *bi; 506 int replace_only = 0;
496 struct md_rdev *rdev; 507 struct bio *bi, *rbi;
508 struct md_rdev *rdev, *rrdev = NULL;
497 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
498 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
499 rw = WRITE_FUA; 511 rw = WRITE_FUA;
@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
501 rw = WRITE; 513 rw = WRITE;
502 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
503 rw = READ; 515 rw = READ;
504 else 516 else if (test_and_clear_bit(R5_WantReplace,
517 &sh->dev[i].flags)) {
518 rw = WRITE;
519 replace_only = 1;
520 } else
505 continue; 521 continue;
506 522
507 bi = &sh->dev[i].req; 523 bi = &sh->dev[i].req;
524 rbi = &sh->dev[i].rreq; /* For writing to replacement */
508 525
509 bi->bi_rw = rw; 526 bi->bi_rw = rw;
510 if (rw & WRITE) 527 rbi->bi_rw = rw;
528 if (rw & WRITE) {
511 bi->bi_end_io = raid5_end_write_request; 529 bi->bi_end_io = raid5_end_write_request;
512 else 530 rbi->bi_end_io = raid5_end_write_request;
531 } else
513 bi->bi_end_io = raid5_end_read_request; 532 bi->bi_end_io = raid5_end_read_request;
514 533
515 rcu_read_lock(); 534 rcu_read_lock();
535 rrdev = rcu_dereference(conf->disks[i].replacement);
536 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
516 rdev = rcu_dereference(conf->disks[i].rdev); 537 rdev = rcu_dereference(conf->disks[i].rdev);
538 if (!rdev) {
539 rdev = rrdev;
540 rrdev = NULL;
541 }
542 if (rw & WRITE) {
543 if (replace_only)
544 rdev = NULL;
545 if (rdev == rrdev)
546 /* We raced and saw duplicates */
547 rrdev = NULL;
548 } else {
549 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
550 rdev = rrdev;
551 rrdev = NULL;
552 }
553
517 if (rdev && test_bit(Faulty, &rdev->flags)) 554 if (rdev && test_bit(Faulty, &rdev->flags))
518 rdev = NULL; 555 rdev = NULL;
519 if (rdev) 556 if (rdev)
520 atomic_inc(&rdev->nr_pending); 557 atomic_inc(&rdev->nr_pending);
558 if (rrdev && test_bit(Faulty, &rrdev->flags))
559 rrdev = NULL;
560 if (rrdev)
561 atomic_inc(&rrdev->nr_pending);
521 rcu_read_unlock(); 562 rcu_read_unlock();
522 563
523 /* We have already checked bad blocks for reads. Now 564 /* We have already checked bad blocks for reads. Now
524 * need to check for writes. 565 * need to check for writes. We never accept write errors
 566 * on the replacement, so we don't need to check rrdev.
525 */ 567 */
526 while ((rw & WRITE) && rdev && 568 while ((rw & WRITE) && rdev &&
527 test_bit(WriteErrorSeen, &rdev->flags)) { 569 test_bit(WriteErrorSeen, &rdev->flags)) {
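
The device selection at the top of ops_run_io() reduces to a small decision table: writes may target both devices, or only the replacement for a rebuild write flagged R5_WantReplace, while reads go to exactly one of them, preferring the replacement when R5_ReadRepl is set; a duplicate seen after racing with a promotion is dropped. A user-space model of that table, with booleans standing in for the flag bits:

#include <stdbool.h>
#include <stddef.h>

struct rdev { bool faulty; };

static void pick_devices(struct rdev *rdev, struct rdev *rrdev,
			 bool is_write, bool replace_only, bool read_repl,
			 struct rdev **out, struct rdev **out_repl)
{
	if (!rdev) {			/* original retired: promote */
		rdev = rrdev;
		rrdev = NULL;
	}
	if (is_write) {
		if (replace_only)
			rdev = NULL;	/* rebuild write: replacement only */
		if (rdev == rrdev)
			rrdev = NULL;	/* raced: drop the duplicate */
	} else {
		if (read_repl && rrdev)
			rdev = rrdev;	/* read from the replacement */
		rrdev = NULL;		/* reads never use the second slot */
	}
	if (rdev && rdev->faulty)
		rdev = NULL;
	if (rrdev && rrdev->faulty)
		rrdev = NULL;
	*out = rdev;
	*out_repl = rrdev;
}
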
@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
551 } 593 }
552 594
553 if (rdev) { 595 if (rdev) {
554 if (s->syncing || s->expanding || s->expanded) 596 if (s->syncing || s->expanding || s->expanded
597 || s->replacing)
555 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 598 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
556 599
557 set_bit(STRIPE_IO_STARTED, &sh->state); 600 set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
563 atomic_inc(&sh->count); 606 atomic_inc(&sh->count);
564 bi->bi_sector = sh->sector + rdev->data_offset; 607 bi->bi_sector = sh->sector + rdev->data_offset;
565 bi->bi_flags = 1 << BIO_UPTODATE; 608 bi->bi_flags = 1 << BIO_UPTODATE;
566 bi->bi_vcnt = 1;
567 bi->bi_max_vecs = 1;
568 bi->bi_idx = 0; 609 bi->bi_idx = 0;
569 bi->bi_io_vec = &sh->dev[i].vec;
570 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 610 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
571 bi->bi_io_vec[0].bv_offset = 0; 611 bi->bi_io_vec[0].bv_offset = 0;
572 bi->bi_size = STRIPE_SIZE; 612 bi->bi_size = STRIPE_SIZE;
573 bi->bi_next = NULL; 613 bi->bi_next = NULL;
614 if (rrdev)
615 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
574 generic_make_request(bi); 616 generic_make_request(bi);
575 } else { 617 }
618 if (rrdev) {
619 if (s->syncing || s->expanding || s->expanded
620 || s->replacing)
621 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
622
623 set_bit(STRIPE_IO_STARTED, &sh->state);
624
625 rbi->bi_bdev = rrdev->bdev;
626 pr_debug("%s: for %llu schedule op %ld on "
627 "replacement disc %d\n",
628 __func__, (unsigned long long)sh->sector,
629 rbi->bi_rw, i);
630 atomic_inc(&sh->count);
631 rbi->bi_sector = sh->sector + rrdev->data_offset;
632 rbi->bi_flags = 1 << BIO_UPTODATE;
633 rbi->bi_idx = 0;
634 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
635 rbi->bi_io_vec[0].bv_offset = 0;
636 rbi->bi_size = STRIPE_SIZE;
637 rbi->bi_next = NULL;
638 generic_make_request(rbi);
639 }
640 if (!rdev && !rrdev) {
576 if (rw & WRITE) 641 if (rw & WRITE)
577 set_bit(STRIPE_DEGRADED, &sh->state); 642 set_bit(STRIPE_DEGRADED, &sh->state);
578 pr_debug("skip op %ld on disc %d for sector %llu\n", 643 pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1583 int disks = sh->disks, i; 1648 int disks = sh->disks, i;
1584 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1649 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1585 char b[BDEVNAME_SIZE]; 1650 char b[BDEVNAME_SIZE];
1586 struct md_rdev *rdev; 1651 struct md_rdev *rdev = NULL;
1587 1652
1588 1653
1589 for (i=0 ; i<disks; i++) 1654 for (i=0 ; i<disks; i++)
@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error)
1597 BUG(); 1662 BUG();
1598 return; 1663 return;
1599 } 1664 }
1665 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1666 /* If replacement finished while this request was outstanding,
1667 * 'replacement' might be NULL already.
1668 * In that case it moved down to 'rdev'.
1669 * rdev is not removed until all requests are finished.
1670 */
1671 rdev = conf->disks[i].replacement;
1672 if (!rdev)
1673 rdev = conf->disks[i].rdev;
1600 1674
1601 if (uptodate) { 1675 if (uptodate) {
1602 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1676 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1603 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1677 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1604 rdev = conf->disks[i].rdev; 1678 /* Note that this cannot happen on a
1679 * replacement device. We just fail those on
1680 * any error
1681 */
1605 printk_ratelimited( 1682 printk_ratelimited(
1606 KERN_INFO 1683 KERN_INFO
1607 "md/raid:%s: read error corrected" 1684 "md/raid:%s: read error corrected"
@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
1614 clear_bit(R5_ReadError, &sh->dev[i].flags); 1691 clear_bit(R5_ReadError, &sh->dev[i].flags);
1615 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1692 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1616 } 1693 }
1617 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1694 if (atomic_read(&rdev->read_errors))
1618 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1695 atomic_set(&rdev->read_errors, 0);
1619 } else { 1696 } else {
1620 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1697 const char *bdn = bdevname(rdev->bdev, b);
1621 int retry = 0; 1698 int retry = 0;
1622 rdev = conf->disks[i].rdev;
1623 1699
1624 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1700 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1625 atomic_inc(&rdev->read_errors); 1701 atomic_inc(&rdev->read_errors);
1626 if (conf->mddev->degraded >= conf->max_degraded) 1702 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1703 printk_ratelimited(
1704 KERN_WARNING
1705 "md/raid:%s: read error on replacement device "
1706 "(sector %llu on %s).\n",
1707 mdname(conf->mddev),
1708 (unsigned long long)(sh->sector
1709 + rdev->data_offset),
1710 bdn);
1711 else if (conf->mddev->degraded >= conf->max_degraded)
1627 printk_ratelimited( 1712 printk_ratelimited(
1628 KERN_WARNING 1713 KERN_WARNING
1629 "md/raid:%s: read error not correctable " 1714 "md/raid:%s: read error not correctable "
@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1657 md_error(conf->mddev, rdev); 1742 md_error(conf->mddev, rdev);
1658 } 1743 }
1659 } 1744 }
1660 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1745 rdev_dec_pending(rdev, conf->mddev);
1661 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1746 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1662 set_bit(STRIPE_HANDLE, &sh->state); 1747 set_bit(STRIPE_HANDLE, &sh->state);
1663 release_stripe(sh); 1748 release_stripe(sh);
@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error)
1668 struct stripe_head *sh = bi->bi_private; 1753 struct stripe_head *sh = bi->bi_private;
1669 struct r5conf *conf = sh->raid_conf; 1754 struct r5conf *conf = sh->raid_conf;
1670 int disks = sh->disks, i; 1755 int disks = sh->disks, i;
1756 struct md_rdev *uninitialized_var(rdev);
1671 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1757 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1672 sector_t first_bad; 1758 sector_t first_bad;
1673 int bad_sectors; 1759 int bad_sectors;
1760 int replacement = 0;
1674 1761
1675 for (i=0 ; i<disks; i++) 1762 for (i = 0 ; i < disks; i++) {
1676 if (bi == &sh->dev[i].req) 1763 if (bi == &sh->dev[i].req) {
1764 rdev = conf->disks[i].rdev;
1677 break; 1765 break;
1678 1766 }
1767 if (bi == &sh->dev[i].rreq) {
1768 rdev = conf->disks[i].replacement;
1769 if (rdev)
1770 replacement = 1;
1771 else
1772 /* rdev was removed and 'replacement'
1773 * replaced it. rdev is not removed
1774 * until all requests are finished.
1775 */
1776 rdev = conf->disks[i].rdev;
1777 break;
1778 }
1779 }
1679 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1780 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1680 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1781 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1681 uptodate); 1782 uptodate);
@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error)
1684 return; 1785 return;
1685 } 1786 }
1686 1787
1687 if (!uptodate) { 1788 if (replacement) {
1688 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); 1789 if (!uptodate)
1689 set_bit(R5_WriteError, &sh->dev[i].flags); 1790 md_error(conf->mddev, rdev);
1690 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, 1791 else if (is_badblock(rdev, sh->sector,
1691 &first_bad, &bad_sectors)) 1792 STRIPE_SECTORS,
1692 set_bit(R5_MadeGood, &sh->dev[i].flags); 1793 &first_bad, &bad_sectors))
1794 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1795 } else {
1796 if (!uptodate) {
1797 set_bit(WriteErrorSeen, &rdev->flags);
1798 set_bit(R5_WriteError, &sh->dev[i].flags);
1799 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1800 set_bit(MD_RECOVERY_NEEDED,
1801 &rdev->mddev->recovery);
1802 } else if (is_badblock(rdev, sh->sector,
1803 STRIPE_SECTORS,
1804 &first_bad, &bad_sectors))
1805 set_bit(R5_MadeGood, &sh->dev[i].flags);
1806 }
1807 rdev_dec_pending(rdev, conf->mddev);
1693 1808
1694 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1809 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
1695 1810 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1696 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1697 set_bit(STRIPE_HANDLE, &sh->state); 1811 set_bit(STRIPE_HANDLE, &sh->state);
1698 release_stripe(sh); 1812 release_stripe(sh);
1699} 1813}
1700 1814
1701
1702static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1815static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1703 1816
1704static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1817static void raid5_build_block(struct stripe_head *sh, int i, int previous)
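
The R5_DOUBLE_LOCKED bit introduced above solves the completion-accounting problem created by issuing one write to each of two devices from a single stripe member: the member must stay R5_LOCKED until both bios finish, so the first completion only drops the extra lock. Modelled with plain booleans in place of the atomic bit operations:

#include <stdbool.h>

struct slot {
	bool locked;		/* I/O outstanding on this stripe member */
	bool double_locked;	/* two writes in flight (original + replacement) */
};

static void write_done(struct slot *s)
{
	if (s->double_locked)
		s->double_locked = false;	/* first completion: still busy */
	else
		s->locked = false;		/* last completion: release */
}
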
@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1709 dev->req.bi_io_vec = &dev->vec; 1822 dev->req.bi_io_vec = &dev->vec;
1710 dev->req.bi_vcnt++; 1823 dev->req.bi_vcnt++;
1711 dev->req.bi_max_vecs++; 1824 dev->req.bi_max_vecs++;
1825 dev->req.bi_private = sh;
1712 dev->vec.bv_page = dev->page; 1826 dev->vec.bv_page = dev->page;
1713 dev->vec.bv_len = STRIPE_SIZE;
1714 dev->vec.bv_offset = 0;
1715 1827
1716 dev->req.bi_sector = sh->sector; 1828 bio_init(&dev->rreq);
1717 dev->req.bi_private = sh; 1829 dev->rreq.bi_io_vec = &dev->rvec;
1830 dev->rreq.bi_vcnt++;
1831 dev->rreq.bi_max_vecs++;
1832 dev->rreq.bi_private = sh;
1833 dev->rvec.bv_page = dev->page;
1718 1834
1719 dev->flags = 0; 1835 dev->flags = 0;
1720 dev->sector = compute_blocknr(sh, i, previous); 1836 dev->sector = compute_blocknr(sh, i, previous);
@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1724{ 1840{
1725 char b[BDEVNAME_SIZE]; 1841 char b[BDEVNAME_SIZE];
1726 struct r5conf *conf = mddev->private; 1842 struct r5conf *conf = mddev->private;
1843 unsigned long flags;
1727 pr_debug("raid456: error called\n"); 1844 pr_debug("raid456: error called\n");
1728 1845
1729 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1846 spin_lock_irqsave(&conf->device_lock, flags);
1730 unsigned long flags; 1847 clear_bit(In_sync, &rdev->flags);
1731 spin_lock_irqsave(&conf->device_lock, flags); 1848 mddev->degraded = calc_degraded(conf);
1732 mddev->degraded++; 1849 spin_unlock_irqrestore(&conf->device_lock, flags);
1733 spin_unlock_irqrestore(&conf->device_lock, flags); 1850 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1734 /* 1851
1735 * if recovery was running, make sure it aborts.
1736 */
1737 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1738 }
1739 set_bit(Blocked, &rdev->flags); 1852 set_bit(Blocked, &rdev->flags);
1740 set_bit(Faulty, &rdev->flags); 1853 set_bit(Faulty, &rdev->flags);
1741 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1854 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2362 md_done_sync(conf->mddev, STRIPE_SECTORS, 0); 2475 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2363 clear_bit(STRIPE_SYNCING, &sh->state); 2476 clear_bit(STRIPE_SYNCING, &sh->state);
2364 s->syncing = 0; 2477 s->syncing = 0;
2478 s->replacing = 0;
2365 /* There is nothing more to do for sync/check/repair. 2479 /* There is nothing more to do for sync/check/repair.
2366 * For recover we need to record a bad block on all 2480 * For recover/replace we need to record a bad block on all
2367 * non-sync devices, or abort the recovery 2481 * non-sync devices, or abort the recovery
2368 */ 2482 */
2369 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2483 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
@@ -2373,12 +2487,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2373 */ 2487 */
2374 for (i = 0; i < conf->raid_disks; i++) { 2488 for (i = 0; i < conf->raid_disks; i++) {
2375 struct md_rdev *rdev = conf->disks[i].rdev; 2489 struct md_rdev *rdev = conf->disks[i].rdev;
2376 if (!rdev 2490 if (rdev
2377 || test_bit(Faulty, &rdev->flags) 2491 && !test_bit(Faulty, &rdev->flags)
2378 || test_bit(In_sync, &rdev->flags)) 2492 && !test_bit(In_sync, &rdev->flags)
2379 continue; 2493 && !rdev_set_badblocks(rdev, sh->sector,
2380 if (!rdev_set_badblocks(rdev, sh->sector, 2494 STRIPE_SECTORS, 0))
2381 STRIPE_SECTORS, 0)) 2495 abort = 1;
2496 rdev = conf->disks[i].replacement;
2497 if (rdev
2498 && !test_bit(Faulty, &rdev->flags)
2499 && !test_bit(In_sync, &rdev->flags)
2500 && !rdev_set_badblocks(rdev, sh->sector,
2501 STRIPE_SECTORS, 0))
2382 abort = 1; 2502 abort = 1;
2383 } 2503 }
2384 if (abort) { 2504 if (abort) {
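
The rewritten loop above must record a bad block on the replacement as well as the original whenever a sync is being failed. A compact userspace model of that pass (not kernel code; every name is local to the sketch, and 'can_record' stands in for rdev_set_badblocks() succeeding):

#include <stdbool.h>

/* Model only: a device needs a bad block recorded when it is
 * present, not faulty and not in-sync; if recording fails on any
 * such device - original or replacement - the recovery is aborted.
 */
struct bb_dev { bool present, faulty, in_sync, can_record; };
struct bb_slot { struct bb_dev rdev, replacement; };

static bool record_fails(const struct bb_dev *d)
{
	return d->present && !d->faulty && !d->in_sync && !d->can_record;
}

static int must_abort(const struct bb_slot *slots, int n)
{
	int i, abort = 0;

	for (i = 0; i < n; i++)
		if (record_fails(&slots[i].rdev) ||
		    record_fails(&slots[i].replacement))
			abort = 1;
	return abort;
}
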
@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2387 } 2507 }
2388} 2508}
2389 2509
2510static int want_replace(struct stripe_head *sh, int disk_idx)
2511{
2512 struct md_rdev *rdev;
2513 int rv = 0;
2514 /* Doing recovery so rcu locking not required */
2515 rdev = sh->raid_conf->disks[disk_idx].replacement;
2516 if (rdev
2517 && !test_bit(Faulty, &rdev->flags)
2518 && !test_bit(In_sync, &rdev->flags)
2519 && (rdev->recovery_offset <= sh->sector
2520 || rdev->mddev->recovery_cp <= sh->sector))
2521 rv = 1;
2522
2523 return rv;
2524}
2525
2390/* fetch_block - checks the given member device to see if its data needs 2526/* fetch_block - checks the given member device to see if its data needs
2391 * to be read or computed to satisfy a request. 2527 * to be read or computed to satisfy a request.
2392 * 2528 *
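
want_replace(), shown just above, is the predicate that keeps the replacement copy going: the replacement must exist, be usable, not yet be In_sync, and either its own recovery_offset or the array's recovery_cp must still be at or below this stripe. A self-contained model with two worked cases (names local to the sketch, plain integers instead of kernel types):

#include <stdbool.h>
#include <stdio.h>

/* Model of the want_replace() predicate.  recovery_offset <= sector
 * means the replacement has not been copied up to this stripe yet.
 */
struct repl_model {
	bool present, faulty, in_sync;
	unsigned long long recovery_offset;	/* of the replacement */
	unsigned long long recovery_cp;		/* of the whole array */
};

static bool want_replace_model(const struct repl_model *r,
			       unsigned long long sector)
{
	return r->present && !r->faulty && !r->in_sync &&
	       (r->recovery_offset <= sector || r->recovery_cp <= sector);
}

int main(void)
{
	struct repl_model r = { true, false, false, 1024, ~0ULL };

	/* already copied past sector 512, still needed at 2048 */
	printf("%d %d\n", want_replace_model(&r, 512),
	       want_replace_model(&r, 2048));	/* prints: 0 1 */
	return 0;
}
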
@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2406 (dev->toread || 2542 (dev->toread ||
2407 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2543 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2408 s->syncing || s->expanding || 2544 s->syncing || s->expanding ||
2545 (s->replacing && want_replace(sh, disk_idx)) ||
2409 (s->failed >= 1 && fdev[0]->toread) || 2546 (s->failed >= 1 && fdev[0]->toread) ||
2410 (s->failed >= 2 && fdev[1]->toread) || 2547 (s->failed >= 2 && fdev[1]->toread) ||
2411 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2548 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
2959 } 3096 }
2960} 3097}
2961 3098
2962
2963/* 3099/*
2964 * handle_stripe - do things to a stripe. 3100 * handle_stripe - do things to a stripe.
2965 * 3101 *
2966 * We lock the stripe and then examine the state of various bits 3102 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
2967 * to see what needs to be done. 3103 * state of various bits to see what needs to be done.
2968 * Possible results: 3104 * Possible results:
2969 * return some read request which now have data 3105 * return some read requests which now have data
2970 * return some write requests which are safely on disc 3106 * return some write requests which are safely on storage
2971 * schedule a read on some buffers 3107 * schedule a read on some buffers
2972 * schedule a write of some buffers 3108 * schedule a write of some buffers
2973 * return confirmation of parity correctness 3109 * return confirmation of parity correctness
2974 * 3110 *
2975 * buffers are taken off read_list or write_list, and bh_cache buffers
2976 * get BH_Lock set before the stripe lock is released.
2977 *
2978 */ 3111 */
2979 3112
2980static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3113static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
2983 int disks = sh->disks; 3116 int disks = sh->disks;
2984 struct r5dev *dev; 3117 struct r5dev *dev;
2985 int i; 3118 int i;
3119 int do_recovery = 0;
2986 3120
2987 memset(s, 0, sizeof(*s)); 3121 memset(s, 0, sizeof(*s));
2988 3122
2989 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
2990 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3123 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2991 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3124 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2992 s->failed_num[0] = -1; 3125 s->failed_num[0] = -1;
@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3004 dev = &sh->dev[i]; 3137 dev = &sh->dev[i];
3005 3138
3006 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3139 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3007 i, dev->flags, dev->toread, dev->towrite, dev->written); 3140 i, dev->flags,
3141 dev->toread, dev->towrite, dev->written);
3008 /* maybe we can reply to a read 3142 /* maybe we can reply to a read
3009 * 3143 *
3010 * new wantfill requests are only permitted while 3144 * new wantfill requests are only permitted while
@@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3035 } 3169 }
3036 if (dev->written) 3170 if (dev->written)
3037 s->written++; 3171 s->written++;
3038 rdev = rcu_dereference(conf->disks[i].rdev); 3172 /* Prefer to use the replacement for reads, but only
3173 * if it is recovered enough and has no bad blocks.
3174 */
3175 rdev = rcu_dereference(conf->disks[i].replacement);
3176 if (rdev && !test_bit(Faulty, &rdev->flags) &&
3177 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3178 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3179 &first_bad, &bad_sectors))
3180 set_bit(R5_ReadRepl, &dev->flags);
3181 else {
3182 if (rdev)
3183 set_bit(R5_NeedReplace, &dev->flags);
3184 rdev = rcu_dereference(conf->disks[i].rdev);
3185 clear_bit(R5_ReadRepl, &dev->flags);
3186 }
3039 if (rdev && test_bit(Faulty, &rdev->flags)) 3187 if (rdev && test_bit(Faulty, &rdev->flags))
3040 rdev = NULL; 3188 rdev = NULL;
3041 if (rdev) { 3189 if (rdev) {
@@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3077 set_bit(R5_Insync, &dev->flags); 3225 set_bit(R5_Insync, &dev->flags);
3078 3226
3079 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3227 if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3080 clear_bit(R5_Insync, &dev->flags); 3228 /* This flag does not apply to '.replacement',
3081 if (!test_bit(Faulty, &rdev->flags)) { 3229 * only to '.rdev', so make sure to check that. */
3230 struct md_rdev *rdev2 = rcu_dereference(
3231 conf->disks[i].rdev);
3232 if (rdev2 == rdev)
3233 clear_bit(R5_Insync, &dev->flags);
3234 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3082 s->handle_bad_blocks = 1; 3235 s->handle_bad_blocks = 1;
3083 atomic_inc(&rdev->nr_pending); 3236 atomic_inc(&rdev2->nr_pending);
3084 } else 3237 } else
3085 clear_bit(R5_WriteError, &dev->flags); 3238 clear_bit(R5_WriteError, &dev->flags);
3086 } 3239 }
3087 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3240 if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3088 if (!test_bit(Faulty, &rdev->flags)) { 3241 /* This flag does not apply to '.replacement',
3242 * only to '.rdev', so make sure to check that. */
3243 struct md_rdev *rdev2 = rcu_dereference(
3244 conf->disks[i].rdev);
3245 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3089 s->handle_bad_blocks = 1; 3246 s->handle_bad_blocks = 1;
3090 atomic_inc(&rdev->nr_pending); 3247 atomic_inc(&rdev2->nr_pending);
3091 } else 3248 } else
3092 clear_bit(R5_MadeGood, &dev->flags); 3249 clear_bit(R5_MadeGood, &dev->flags);
3093 } 3250 }
3251 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3252 struct md_rdev *rdev2 = rcu_dereference(
3253 conf->disks[i].replacement);
3254 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3255 s->handle_bad_blocks = 1;
3256 atomic_inc(&rdev2->nr_pending);
3257 } else
3258 clear_bit(R5_MadeGoodRepl, &dev->flags);
3259 }
3094 if (!test_bit(R5_Insync, &dev->flags)) { 3260 if (!test_bit(R5_Insync, &dev->flags)) {
3095 /* The ReadError flag will just be confusing now */ 3261 /* The ReadError flag will just be confusing now */
3096 clear_bit(R5_ReadError, &dev->flags); 3262 clear_bit(R5_ReadError, &dev->flags);
@@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3102 if (s->failed < 2) 3268 if (s->failed < 2)
3103 s->failed_num[s->failed] = i; 3269 s->failed_num[s->failed] = i;
3104 s->failed++; 3270 s->failed++;
3271 if (rdev && !test_bit(Faulty, &rdev->flags))
3272 do_recovery = 1;
3105 } 3273 }
3106 } 3274 }
3107 spin_unlock_irq(&conf->device_lock); 3275 spin_unlock_irq(&conf->device_lock);
3276 if (test_bit(STRIPE_SYNCING, &sh->state)) {
3277 /* If there is a failed device being replaced,
3278 * we must be recovering;
3279 * else, if we are after recovery_cp, we must be syncing;
3280 * else we can only be replacing.
3281 * Sync and recovery both need to read all devices, and so
3282 * use the same flag.
3283 */
3284 if (do_recovery ||
3285 sh->sector >= conf->mddev->recovery_cp)
3286 s->syncing = 1;
3287 else
3288 s->replacing = 1;
3289 }
3108 rcu_read_unlock(); 3290 rcu_read_unlock();
3109} 3291}
3110 3292
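
The classification added to analyse_stripe() above can be read as a three-way decision on a STRIPE_SYNCING stripe. A userspace model of exactly that branch (not kernel code; names are local to the sketch):

#include <stdbool.h>
#include <stdio.h>

/* Model of the decision: a STRIPE_SYNCING stripe counts as "syncing"
 * if a failed-but-present device forces a recovery, or if the stripe
 * lies at or after recovery_cp; otherwise it can only be a
 * replacement pass.
 */
struct model {
	bool stripe_syncing;		/* STRIPE_SYNCING set on the stripe */
	bool do_recovery;		/* a failed slot still has a usable rdev */
	unsigned long long sector;	/* sh->sector */
	unsigned long long recovery_cp;	/* mddev->recovery_cp */
};

static void classify(const struct model *m, bool *syncing, bool *replacing)
{
	*syncing = *replacing = false;
	if (!m->stripe_syncing)
		return;
	if (m->do_recovery || m->sector >= m->recovery_cp)
		*syncing = true;
	else
		*replacing = true;
}

int main(void)
{
	struct model m = { true, false, 0, ~0ULL }; /* clean array: replace pass */
	bool sync, repl;

	classify(&m, &sync, &repl);
	printf("syncing=%d replacing=%d\n", sync, repl);	/* 0 1 */
	return 0;
}

Recovery and ordinary resync both end up with syncing set because both must read every device; only a pure replacement pass can restrict itself to the replacement targets.
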
@@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh)
3146 3328
3147 if (unlikely(s.blocked_rdev)) { 3329 if (unlikely(s.blocked_rdev)) {
3148 if (s.syncing || s.expanding || s.expanded || 3330 if (s.syncing || s.expanding || s.expanded ||
3149 s.to_write || s.written) { 3331 s.replacing || s.to_write || s.written) {
3150 set_bit(STRIPE_HANDLE, &sh->state); 3332 set_bit(STRIPE_HANDLE, &sh->state);
3151 goto finish; 3333 goto finish;
3152 } 3334 }
@@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh)
3172 sh->reconstruct_state = 0; 3354 sh->reconstruct_state = 0;
3173 if (s.to_read+s.to_write+s.written) 3355 if (s.to_read+s.to_write+s.written)
3174 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3356 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3175 if (s.syncing) 3357 if (s.syncing + s.replacing)
3176 handle_failed_sync(conf, sh, &s); 3358 handle_failed_sync(conf, sh, &s);
3177 } 3359 }
3178 3360
@@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh)
3203 */ 3385 */
3204 if (s.to_read || s.non_overwrite 3386 if (s.to_read || s.non_overwrite
3205 || (conf->level == 6 && s.to_write && s.failed) 3387 || (conf->level == 6 && s.to_write && s.failed)
3206 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3388 || (s.syncing && (s.uptodate + s.compute < disks))
3389 || s.replacing
3390 || s.expanding)
3207 handle_stripe_fill(sh, &s, disks); 3391 handle_stripe_fill(sh, &s, disks);
3208 3392
3209 /* Now we check to see if any write operations have recently 3393 /* Now we check to see if any write operations have recently
@@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh)
3265 handle_parity_checks5(conf, sh, &s, disks); 3449 handle_parity_checks5(conf, sh, &s, disks);
3266 } 3450 }
3267 3451
3268 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3452 if (s.replacing && s.locked == 0
3453 && !test_bit(STRIPE_INSYNC, &sh->state)) {
3454 /* Write out to replacement devices where possible */
3455 for (i = 0; i < conf->raid_disks; i++)
3456 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3457 test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3458 set_bit(R5_WantReplace, &sh->dev[i].flags);
3459 set_bit(R5_LOCKED, &sh->dev[i].flags);
3460 s.locked++;
3461 }
3462 set_bit(STRIPE_INSYNC, &sh->state);
3463 }
3464 if ((s.syncing || s.replacing) && s.locked == 0 &&
3465 test_bit(STRIPE_INSYNC, &sh->state)) {
3269 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3466 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3270 clear_bit(STRIPE_SYNCING, &sh->state); 3467 clear_bit(STRIPE_SYNCING, &sh->state);
3271 } 3468 }
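
The new block above turns a 'replacing' stripe into replacement writes once nothing is locked: every up-to-date block whose slot still needs its replacement refreshed is queued and counted as locked, and the stripe is marked STRIPE_INSYNC so a later fully-unlocked pass completes it via md_done_sync(). A model of the queueing step (local names, no kernel types):

#include <stdbool.h>

/* Model of the write-out step: queue a replacement write for every
 * up-to-date block whose slot still needs one, and report how many
 * buffers that locked.
 */
struct dev_state { bool uptodate, need_replace, want_replace, locked; };

static int queue_replacement_writes(struct dev_state *dev, int disks)
{
	int i, locked = 0;

	for (i = 0; i < disks; i++)
		if (dev[i].uptodate && dev[i].need_replace) {
			dev[i].want_replace = true;
			dev[i].locked = true;
			locked++;
		}
	return locked;
}
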
@@ -3363,6 +3560,15 @@ finish:
3363 STRIPE_SECTORS); 3560 STRIPE_SECTORS);
3364 rdev_dec_pending(rdev, conf->mddev); 3561 rdev_dec_pending(rdev, conf->mddev);
3365 } 3562 }
3563 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3564 rdev = conf->disks[i].replacement;
3565 if (!rdev)
3566 /* rdev has been moved down */
3567 rdev = conf->disks[i].rdev;
3568 rdev_clear_badblocks(rdev, sh->sector,
3569 STRIPE_SECTORS);
3570 rdev_dec_pending(rdev, conf->mddev);
3571 }
3366 } 3572 }
3367 3573
3368 if (s.ops_request) 3574 if (s.ops_request)
@@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3586 int dd_idx; 3792 int dd_idx;
3587 struct bio* align_bi; 3793 struct bio* align_bi;
3588 struct md_rdev *rdev; 3794 struct md_rdev *rdev;
3795 sector_t end_sector;
3589 3796
3590 if (!in_chunk_boundary(mddev, raid_bio)) { 3797 if (!in_chunk_boundary(mddev, raid_bio)) {
3591 pr_debug("chunk_aligned_read : non aligned\n"); 3798 pr_debug("chunk_aligned_read : non aligned\n");
@@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3610 0, 3817 0,
3611 &dd_idx, NULL); 3818 &dd_idx, NULL);
3612 3819
3820 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3613 rcu_read_lock(); 3821 rcu_read_lock();
3614 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3822 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3615 if (rdev && test_bit(In_sync, &rdev->flags)) { 3823 if (!rdev || test_bit(Faulty, &rdev->flags) ||
3824 rdev->recovery_offset < end_sector) {
3825 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3826 if (rdev &&
3827 (test_bit(Faulty, &rdev->flags) ||
3828 !(test_bit(In_sync, &rdev->flags) ||
3829 rdev->recovery_offset >= end_sector)))
3830 rdev = NULL;
3831 }
3832 if (rdev) {
3616 sector_t first_bad; 3833 sector_t first_bad;
3617 int bad_sectors; 3834 int bad_sectors;
3618 3835
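
The selection logic added to chunk_aligned_read() above tries the replacement first, but only when it covers the whole request; otherwise the original must. A model of that order of preference (local names; 'covers' folds together the In_sync and recovery_offset tests from the patch):

#include <stdbool.h>
#include <stddef.h>

/* Model of the rdev selection: a device is an acceptable read target
 * if it exists, is not faulty, and covers the whole request, i.e. it
 * is in-sync or recovered past end_sector.  The replacement wins.
 */
struct rd {
	bool present, faulty, in_sync;
	unsigned long long recovery_offset;
};

static bool covers(const struct rd *d, unsigned long long end_sector)
{
	return d->present && !d->faulty &&
	       (d->in_sync || d->recovery_offset >= end_sector);
}

static const struct rd *pick_read_dev(const struct rd *rdev,
				      const struct rd *replacement,
				      unsigned long long end_sector)
{
	if (covers(replacement, end_sector))
		return replacement;
	if (covers(rdev, end_sector))
		return rdev;
	return NULL;	/* fall back to the stripe-cache path */
}
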
@@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4137 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4354 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4138 } 4355 }
4139 4356
4140
4141 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4357 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4142 4358
4143 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4359 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
@@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4208 return handled; 4424 return handled;
4209 } 4425 }
4210 4426
4211 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4212 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4427 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4213 release_stripe(sh); 4428 release_stripe(sh);
4214 raid5_set_bi_hw_segments(raid_bio, scnt); 4429 raid5_set_bi_hw_segments(raid_bio, scnt);
@@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4635 continue; 4850 continue;
4636 disk = conf->disks + raid_disk; 4851 disk = conf->disks + raid_disk;
4637 4852
4638 disk->rdev = rdev; 4853 if (test_bit(Replacement, &rdev->flags)) {
4854 if (disk->replacement)
4855 goto abort;
4856 disk->replacement = rdev;
4857 } else {
4858 if (disk->rdev)
4859 goto abort;
4860 disk->rdev = rdev;
4861 }
4639 4862
4640 if (test_bit(In_sync, &rdev->flags)) { 4863 if (test_bit(In_sync, &rdev->flags)) {
4641 char b[BDEVNAME_SIZE]; 4864 char b[BDEVNAME_SIZE];
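
setup_conf() above routes each assembling device into its slot by the Replacement flag and refuses a second claimant for either position. The same routing as a standalone model (names local to the sketch):

#include <stdbool.h>

/* Model of the slot routing in setup_conf(): one primary and one
 * replacement per slot; a duplicate claim for either position makes
 * assembly abort.
 */
struct member { int id; bool replacement_flagged; };
struct di { struct member *rdev, *replacement; };

static int place_member(struct di *slot, struct member *m)
{
	struct member **pos = m->replacement_flagged
		? &slot->replacement : &slot->rdev;

	if (*pos)
		return -1;	/* position already taken: abort assembly */
	*pos = m;
	return 0;
}
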
@@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev)
4724 int dirty_parity_disks = 0; 4947 int dirty_parity_disks = 0;
4725 struct md_rdev *rdev; 4948 struct md_rdev *rdev;
4726 sector_t reshape_offset = 0; 4949 sector_t reshape_offset = 0;
4950 int i;
4727 4951
4728 if (mddev->recovery_cp != MaxSector) 4952 if (mddev->recovery_cp != MaxSector)
4729 printk(KERN_NOTICE "md/raid:%s: not clean" 4953 printk(KERN_NOTICE "md/raid:%s: not clean"
@@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev)
4813 conf->thread = NULL; 5037 conf->thread = NULL;
4814 mddev->private = conf; 5038 mddev->private = conf;
4815 5039
4816 /* 5040 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
4817 * 0 for a fully functional array, 1 or 2 for a degraded array. 5041 i++) {
4818 */ 5042 rdev = conf->disks[i].rdev;
4819 list_for_each_entry(rdev, &mddev->disks, same_set) { 5043 if (!rdev && conf->disks[i].replacement) {
4820 if (rdev->raid_disk < 0) 5044 /* The replacement is all we have so far */
5045 rdev = conf->disks[i].replacement;
5046 conf->disks[i].replacement = NULL;
5047 clear_bit(Replacement, &rdev->flags);
5048 conf->disks[i].rdev = rdev;
5049 }
5050 if (!rdev)
4821 continue; 5051 continue;
5052 if (conf->disks[i].replacement &&
5053 conf->reshape_progress != MaxSector) {
5054 /* replacements and reshape simply do not mix. */
5055 printk(KERN_ERR "md: cannot handle concurrent "
5056 "replacement and reshape.\n");
5057 goto abort;
5058 }
4822 if (test_bit(In_sync, &rdev->flags)) { 5059 if (test_bit(In_sync, &rdev->flags)) {
4823 working_disks++; 5060 working_disks++;
4824 continue; 5061 continue;
@@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev)
4852 dirty_parity_disks++; 5089 dirty_parity_disks++;
4853 } 5090 }
4854 5091
4855 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) 5092 /*
4856 - working_disks); 5093 * 0 for a fully functional array, 1 or 2 for a degraded array.
5094 */
5095 mddev->degraded = calc_degraded(conf);
4857 5096
4858 if (has_failed(conf)) { 5097 if (has_failed(conf)) {
4859 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5098 printk(KERN_ERR "md/raid:%s: not enough operational devices"
@@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev)
5016 5255
5017 for (i = 0; i < conf->raid_disks; i++) { 5256 for (i = 0; i < conf->raid_disks; i++) {
5018 tmp = conf->disks + i; 5257 tmp = conf->disks + i;
5019 if (tmp->rdev 5258 if (tmp->replacement
5259 && tmp->replacement->recovery_offset == MaxSector
5260 && !test_bit(Faulty, &tmp->replacement->flags)
5261 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
5262 /* Replacement has just become active. */
5263 if (!tmp->rdev
5264 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
5265 count++;
5266 if (tmp->rdev) {
5267 /* Replaced device is not technically faulty,
5268 * but we need to be sure it gets removed
5269 * and never re-added.
5270 */
5271 set_bit(Faulty, &tmp->rdev->flags);
5272 sysfs_notify_dirent_safe(
5273 tmp->rdev->sysfs_state);
5274 }
5275 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
5276 } else if (tmp->rdev
5020 && tmp->rdev->recovery_offset == MaxSector 5277 && tmp->rdev->recovery_offset == MaxSector
5021 && !test_bit(Faulty, &tmp->rdev->flags) 5278 && !test_bit(Faulty, &tmp->rdev->flags)
5022 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5279 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
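
The new branch in raid5_spare_active() above activates a finished replacement: it becomes the In_sync device of the slot, and the device it replaced - though not actually broken - is marked Faulty so it is removed and never re-added. A userspace model of that bookkeeping (local names, not kernel code):

#include <stdbool.h>

/* Model of replacement activation: the slot becomes less degraded
 * only if the old device was not contributing; otherwise the old
 * device just hands over its In_sync status.
 */
struct sa_dev { bool present, faulty, in_sync, fully_recovered; };
struct sa_slot { struct sa_dev rdev, replacement; };

static int activate_replacement(struct sa_slot *s)
{
	int count = 0;

	if (s->replacement.present && s->replacement.fully_recovered &&
	    !s->replacement.faulty && !s->replacement.in_sync) {
		s->replacement.in_sync = true;
		if (!s->rdev.present || !s->rdev.in_sync)
			count++;		/* slot was degraded before */
		else
			s->rdev.in_sync = false;
		if (s->rdev.present)
			s->rdev.faulty = true;	/* force removal */
	}
	return count;
}
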
@@ -5025,49 +5282,68 @@ static int raid5_spare_active(struct mddev *mddev)
5025 } 5282 }
5026 } 5283 }
5027 spin_lock_irqsave(&conf->device_lock, flags); 5284 spin_lock_irqsave(&conf->device_lock, flags);
5028 mddev->degraded -= count; 5285 mddev->degraded = calc_degraded(conf);
5029 spin_unlock_irqrestore(&conf->device_lock, flags); 5286 spin_unlock_irqrestore(&conf->device_lock, flags);
5030 print_raid5_conf(conf); 5287 print_raid5_conf(conf);
5031 return count; 5288 return count;
5032} 5289}
5033 5290
5034static int raid5_remove_disk(struct mddev *mddev, int number) 5291static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
5035{ 5292{
5036 struct r5conf *conf = mddev->private; 5293 struct r5conf *conf = mddev->private;
5037 int err = 0; 5294 int err = 0;
5038 struct md_rdev *rdev; 5295 int number = rdev->raid_disk;
5296 struct md_rdev **rdevp;
5039 struct disk_info *p = conf->disks + number; 5297 struct disk_info *p = conf->disks + number;
5040 5298
5041 print_raid5_conf(conf); 5299 print_raid5_conf(conf);
5042 rdev = p->rdev; 5300 if (rdev == p->rdev)
5043 if (rdev) { 5301 rdevp = &p->rdev;
5044 if (number >= conf->raid_disks && 5302 else if (rdev == p->replacement)
5045 conf->reshape_progress == MaxSector) 5303 rdevp = &p->replacement;
5046 clear_bit(In_sync, &rdev->flags); 5304 else
5305 return 0;
5047 5306
5048 if (test_bit(In_sync, &rdev->flags) || 5307 if (number >= conf->raid_disks &&
5049 atomic_read(&rdev->nr_pending)) { 5308 conf->reshape_progress == MaxSector)
5050 err = -EBUSY; 5309 clear_bit(In_sync, &rdev->flags);
5051 goto abort; 5310
5052 } 5311 if (test_bit(In_sync, &rdev->flags) ||
5053 /* Only remove non-faulty devices if recovery 5312 atomic_read(&rdev->nr_pending)) {
5054 * isn't possible. 5313 err = -EBUSY;
5055 */ 5314 goto abort;
5056 if (!test_bit(Faulty, &rdev->flags) &&
5057 mddev->recovery_disabled != conf->recovery_disabled &&
5058 !has_failed(conf) &&
5059 number < conf->raid_disks) {
5060 err = -EBUSY;
5061 goto abort;
5062 }
5063 p->rdev = NULL;
5064 synchronize_rcu();
5065 if (atomic_read(&rdev->nr_pending)) {
5066 /* lost the race, try later */
5067 err = -EBUSY;
5068 p->rdev = rdev;
5069 }
5070 } 5315 }
5316 /* Only remove non-faulty devices if recovery
5317 * isn't possible.
5318 */
5319 if (!test_bit(Faulty, &rdev->flags) &&
5320 mddev->recovery_disabled != conf->recovery_disabled &&
5321 !has_failed(conf) &&
5322 (!p->replacement || p->replacement == rdev) &&
5323 number < conf->raid_disks) {
5324 err = -EBUSY;
5325 goto abort;
5326 }
5327 *rdevp = NULL;
5328 synchronize_rcu();
5329 if (atomic_read(&rdev->nr_pending)) {
5330 /* lost the race, try later */
5331 err = -EBUSY;
5332 *rdevp = rdev;
5333 } else if (p->replacement) {
5334 /* We must have just cleared 'rdev' */
5335 p->rdev = p->replacement;
5336 clear_bit(Replacement, &p->replacement->flags);
5337 smp_mb(); /* Make sure other CPUs may see both as identical
5338 * but will never see neither - if they are careful
5339 */
5340 p->replacement = NULL;
5341 clear_bit(WantReplacement, &rdev->flags);
5342 } else
5343 /* We might have just removed the Replacement as faulty-
5344 * clear the bit just in case
5345 */
5346 clear_bit(WantReplacement, &rdev->flags);
5071abort: 5347abort:
5072 5348
5073 print_raid5_conf(conf); 5349 print_raid5_conf(conf);
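
The delicate step in the new raid5_remove_disk() is promoting p->replacement into p->rdev so that a reader checking rdev and then replacement may briefly see both name the same device but can never see neither. A userspace model of that ordering, with a C11 fence standing in for the kernel's smp_mb() (names local to the sketch):

#include <stdatomic.h>
#include <stddef.h>

/* Model of the promotion: publish the replacement as the primary
 * first, fence, and only then clear the replacement slot, so the two
 * pointers are never both NULL from a reader's point of view.
 */
struct mdev;
struct slot {
	_Atomic(struct mdev *) rdev;
	_Atomic(struct mdev *) replacement;
};

static void promote_replacement(struct slot *p)
{
	struct mdev *r = atomic_load(&p->replacement);

	atomic_store(&p->rdev, r);
	atomic_thread_fence(memory_order_seq_cst);	/* kernel: smp_mb() */
	atomic_store(&p->replacement, NULL);
}
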
@@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5103 disk = rdev->saved_raid_disk; 5379 disk = rdev->saved_raid_disk;
5104 else 5380 else
5105 disk = first; 5381 disk = first;
5106 for ( ; disk <= last ; disk++) 5382 for ( ; disk <= last ; disk++) {
5107 if ((p=conf->disks + disk)->rdev == NULL) { 5383 p = conf->disks + disk;
5384 if (p->rdev == NULL) {
5108 clear_bit(In_sync, &rdev->flags); 5385 clear_bit(In_sync, &rdev->flags);
5109 rdev->raid_disk = disk; 5386 rdev->raid_disk = disk;
5110 err = 0; 5387 err = 0;
@@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5113 rcu_assign_pointer(p->rdev, rdev); 5390 rcu_assign_pointer(p->rdev, rdev);
5114 break; 5391 break;
5115 } 5392 }
5393 if (test_bit(WantReplacement, &p->rdev->flags) &&
5394 p->replacement == NULL) {
5395 clear_bit(In_sync, &rdev->flags);
5396 set_bit(Replacement, &rdev->flags);
5397 rdev->raid_disk = disk;
5398 err = 0;
5399 conf->fullsync = 1;
5400 rcu_assign_pointer(p->replacement, rdev);
5401 break;
5402 }
5403 }
5116 print_raid5_conf(conf); 5404 print_raid5_conf(conf);
5117 return err; 5405 return err;
5118} 5406}
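
raid5_add_disk() above now has two ways to place a spare: into an empty primary position, or as the replacement of the first slot that is flagged WantReplacement and has none yet. The search order as a model (local names, not kernel code):

#include <stdbool.h>

/* Model of the slot search: walk the permitted range and take the
 * first slot offering either an empty primary position or an
 * unfilled replacement request.
 */
struct slot2 { bool has_rdev, want_replacement, has_replacement; };

enum placement { PLACE_NONE, PLACE_PRIMARY, PLACE_REPLACEMENT };

static enum placement place_spare(const struct slot2 *s, int first,
				  int last, int *disk_out)
{
	int d;

	for (d = first; d <= last; d++) {
		if (!s[d].has_rdev) {
			*disk_out = d;
			return PLACE_PRIMARY;
		}
		if (s[d].want_replacement && !s[d].has_replacement) {
			*disk_out = d;
			return PLACE_REPLACEMENT;
		}
	}
	return PLACE_NONE;
}
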
@@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5286 * pre and post number of devices. 5574 * pre and post number of devices.
5287 */ 5575 */
5288 spin_lock_irqsave(&conf->device_lock, flags); 5576 spin_lock_irqsave(&conf->device_lock, flags);
5289 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5577 mddev->degraded = calc_degraded(conf);
5290 - added_devices;
5291 spin_unlock_irqrestore(&conf->device_lock, flags); 5578 spin_unlock_irqrestore(&conf->device_lock, flags);
5292 } 5579 }
5293 mddev->raid_disks = conf->raid_disks; 5580 mddev->raid_disks = conf->raid_disks;
@@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev)
5356 revalidate_disk(mddev->gendisk); 5643 revalidate_disk(mddev->gendisk);
5357 } else { 5644 } else {
5358 int d; 5645 int d;
5359 mddev->degraded = conf->raid_disks; 5646 spin_lock_irq(&conf->device_lock);
5360 for (d = 0; d < conf->raid_disks ; d++) 5647 mddev->degraded = calc_degraded(conf);
5361 if (conf->disks[d].rdev && 5648 spin_unlock_irq(&conf->device_lock);
5362 test_bit(In_sync,
5363 &conf->disks[d].rdev->flags))
5364 mddev->degraded--;
5365 for (d = conf->raid_disks ; 5649 for (d = conf->raid_disks ;
5366 d < conf->raid_disks - mddev->delta_disks; 5650 d < conf->raid_disks - mddev->delta_disks;
5367 d++) { 5651 d++) {
5368 struct md_rdev *rdev = conf->disks[d].rdev; 5652 struct md_rdev *rdev = conf->disks[d].rdev;
5369 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5653 if (rdev &&
5654 raid5_remove_disk(mddev, rdev) == 0) {
5370 sysfs_unlink_rdev(mddev, rdev); 5655 sysfs_unlink_rdev(mddev, rdev);
5371 rdev->raid_disk = -1; 5656 rdev->raid_disk = -1;
5372 } 5657 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e10c5531f9c5..8d8e13934a48 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -27,7 +27,7 @@
27 * The possible state transitions are: 27 * The possible state transitions are:
28 * 28 *
29 * Empty -> Want - on read or write to get old data for parity calc 29 * Empty -> Want - on read or write to get old data for parity calc
30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) 30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.
31 * Empty -> Clean - on compute_block when computing a block for failed drive 31 * Empty -> Clean - on compute_block when computing a block for failed drive
32 * Want -> Empty - on failed read 32 * Want -> Empty - on failed read
33 * Want -> Clean - on successful completion of read request 33 * Want -> Clean - on successful completion of read request
@@ -226,8 +226,11 @@ struct stripe_head {
226 #endif 226 #endif
227 } ops; 227 } ops;
228 struct r5dev { 228 struct r5dev {
229 struct bio req; 229 /* rreq and rvec are used for the replacement device when
230 struct bio_vec vec; 230 * writing data to both devices.
231 */
232 struct bio req, rreq;
233 struct bio_vec vec, rvec;
231 struct page *page; 234 struct page *page;
232 struct bio *toread, *read, *towrite, *written; 235 struct bio *toread, *read, *towrite, *written;
233 sector_t sector; /* sector of this page */ 236 sector_t sector; /* sector of this page */
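
The struct change above embeds a second bio and bio_vec in each r5dev so one cached page can be written to the original device and its replacement in the same stripe pass. A minimal model of that shape - two single-segment descriptors sharing one buffer (local names, not kernel types):

/* Model of the new r5dev layout: 'req' and 'rreq' both carry the
 * same data page; one is destined for the original device, the other
 * for its replacement.
 */
struct segment { void *page; unsigned int len, offset; };
struct request_model { struct segment *vec; int nsegs; int target; };

enum { TARGET_ORIG, TARGET_REPL };

struct r5dev_model {
	struct request_model req, rreq;
	struct segment vec, rvec;
	unsigned char page[4096];
};

static void init_dev(struct r5dev_model *dev)
{
	dev->vec  = (struct segment){ dev->page, sizeof(dev->page), 0 };
	dev->rvec = (struct segment){ dev->page, sizeof(dev->page), 0 };
	dev->req  = (struct request_model){ &dev->vec,  1, TARGET_ORIG };
	dev->rreq = (struct request_model){ &dev->rvec, 1, TARGET_REPL };
}
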
@@ -239,7 +242,13 @@ struct stripe_head {
239 * for handle_stripe. 242 * for handle_stripe.
240 */ 243 */
241struct stripe_head_state { 244struct stripe_head_state {
242 int syncing, expanding, expanded; 245 /* 'syncing' means that we need to read all devices, either
246 * to check/correct parity, or to reconstruct a missing device.
247 * 'replacing' means we are replacing one or more drives and
248 * the source is valid at this point so we don't need to
249 * read all devices, just the replacement targets.
250 */
251 int syncing, expanding, expanded, replacing;
243 int locked, uptodate, to_read, to_write, failed, written; 252 int locked, uptodate, to_read, to_write, failed, written;
244 int to_fill, compute, req_compute, non_overwrite; 253 int to_fill, compute, req_compute, non_overwrite;
245 int failed_num[2]; 254 int failed_num[2];
@@ -252,38 +261,41 @@ struct stripe_head_state {
252 int handle_bad_blocks; 261 int handle_bad_blocks;
253}; 262};
254 263
255/* Flags */ 264/* Flags for struct r5dev.flags */
256#define R5_UPTODATE 0 /* page contains current data */ 265enum r5dev_flags {
257#define R5_LOCKED 1 /* IO has been submitted on "req" */ 266 R5_UPTODATE, /* page contains current data */
258#define R5_OVERWRITE 2 /* towrite covers whole page */ 267 R5_LOCKED, /* IO has been submitted on "req" */
268 R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
269 R5_OVERWRITE, /* towrite covers whole page */
259/* and some that are internal to handle_stripe */ 270/* and some that are internal to handle_stripe */
260#define R5_Insync 3 /* rdev && rdev->in_sync at start */ 271 R5_Insync, /* rdev && rdev->in_sync at start */
261#define R5_Wantread 4 /* want to schedule a read */ 272 R5_Wantread, /* want to schedule a read */
262#define R5_Wantwrite 5 273 R5_Wantwrite,
263#define R5_Overlap 7 /* There is a pending overlapping request on this block */ 274 R5_Overlap, /* There is a pending overlapping request
264#define R5_ReadError 8 /* seen a read error here recently */ 275 * on this block */
265#define R5_ReWrite 9 /* have tried to over-write the readerror */ 276 R5_ReadError, /* seen a read error here recently */
277 R5_ReWrite, /* have tried to over-write the readerror */
266 278
267#define R5_Expanded 10 /* This block now has post-expand data */ 279 R5_Expanded, /* This block now has post-expand data */
268#define R5_Wantcompute 11 /* compute_block in progress treat as 280 R5_Wantcompute, /* compute_block in progress treat as
269 * uptodate 281 * uptodate
270 */ 282 */
271#define R5_Wantfill 12 /* dev->toread contains a bio that needs 283 R5_Wantfill, /* dev->toread contains a bio that needs
272 * filling 284 * filling
273 */ 285 */
274#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 286 R5_Wantdrain, /* dev->towrite needs to be drained */
275#define R5_WantFUA 14 /* Write should be FUA */ 287 R5_WantFUA, /* Write should be FUA */
276#define R5_WriteError 15 /* got a write error - need to record it */ 288 R5_WriteError, /* got a write error - need to record it */
277#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ 289 R5_MadeGood, /* A bad block has been fixed by writing to it */
278/* 290 R5_ReadRepl, /* Will/did read from replacement rather than orig */
279 * Write method 291 R5_MadeGoodRepl,/* A bad block on the replacement device has been
280 */ 292 * fixed by writing to it */
281#define RECONSTRUCT_WRITE 1 293 R5_NeedReplace, /* This device has a replacement which is not
282#define READ_MODIFY_WRITE 2 294 * up-to-date at this stripe. */
283/* not a write method, but a compute_parity mode */ 295 R5_WantReplace, /* We need to update the replacement, we have read
284#define CHECK_PARITY 3 296 * data in, and now is a good time to write it out.
285/* Additional compute_parity mode -- updates the parity w/o LOCKING */ 297 */
286#define UPDATE_PARITY 4 298};
287 299
288/* 300/*
289 * Stripe state 301 * Stripe state
@@ -311,13 +323,14 @@ enum {
311/* 323/*
312 * Operation request flags 324 * Operation request flags
313 */ 325 */
314#define STRIPE_OP_BIOFILL 0 326enum {
315#define STRIPE_OP_COMPUTE_BLK 1 327 STRIPE_OP_BIOFILL,
316#define STRIPE_OP_PREXOR 2 328 STRIPE_OP_COMPUTE_BLK,
317#define STRIPE_OP_BIODRAIN 3 329 STRIPE_OP_PREXOR,
318#define STRIPE_OP_RECONSTRUCT 4 330 STRIPE_OP_BIODRAIN,
319#define STRIPE_OP_CHECK 5 331 STRIPE_OP_RECONSTRUCT,
320 332 STRIPE_OP_CHECK,
333};
321/* 334/*
322 * Plugging: 335 * Plugging:
323 * 336 *
@@ -344,13 +357,12 @@ enum {
344 357
345 358
346struct disk_info { 359struct disk_info {
347 struct md_rdev *rdev; 360 struct md_rdev *rdev, *replacement;
348}; 361};
349 362
350struct r5conf { 363struct r5conf {
351 struct hlist_head *stripe_hashtbl; 364 struct hlist_head *stripe_hashtbl;
352 struct mddev *mddev; 365 struct mddev *mddev;
353 struct disk_info *spare;
354 int chunk_sectors; 366 int chunk_sectors;
355 int level, algorithm; 367 int level, algorithm;
356 int max_degraded; 368 int max_degraded;