 Documentation/md.txt      |  22
 drivers/md/bitmap.c       |  12
 drivers/md/md.c           | 107
 drivers/md/md.h           |  82
 drivers/md/multipath.c    |   7
 drivers/md/raid1.c        | 174
 drivers/md/raid1.h        |   7
 drivers/md/raid10.c       | 582
 drivers/md/raid10.h       |  61
 drivers/md/raid5.c        | 557
 drivers/md/raid5.h        |  98
 include/linux/raid/md_p.h |   7
 include/linux/raid/pq.h   |   2
 13 files changed, 1280 insertions(+), 438 deletions(-)
diff --git a/Documentation/md.txt b/Documentation/md.txt
index fc94770f44ab..993fba37b7d1 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -357,14 +357,14 @@ Each directory contains:
 	written to, that device.
 
    state
 	A file recording the current state of the device in the array
 	which can be a comma separated list of
 	      faulty - device has been kicked from active use due to
-		   a detected fault or it has unacknowledged bad
+		   a detected fault, or it has unacknowledged bad
 		   blocks
 	      in_sync - device is a fully in-sync member of the array
 	      writemostly - device will only be subject to read
 		   requests if there are no other options.
 		   This applies only to raid1 arrays.
 	      blocked - device has failed, and the failure hasn't been
 		   acknowledged yet by the metadata handler.
@@ -374,6 +374,13 @@ Each directory contains:
 		   This includes spares that are in the process
 		   of being recovered to
 	      write_error - device has ever seen a write error.
+	      want_replacement - device is (mostly) working but probably
+		   should be replaced, either due to errors or
+		   due to user request.
+	      replacement - device is a replacement for another active
+		   device with same raid_disk.
+
+
 	This list may grow in future.
 	This can be written to.
 	Writing "faulty" simulates a failure on the device.
@@ -386,6 +393,13 @@ Each directory contains:
 	Writing "in_sync" sets the in_sync flag.
 	Writing "write_error" sets writeerrorseen flag.
 	Writing "-write_error" clears writeerrorseen flag.
+	Writing "want_replacement" is allowed at any time except to a
+	  replacement device or a spare.  It sets the flag.
+	Writing "-want_replacement" is allowed at any time.  It clears
+	  the flag.
+	Writing "replacement" or "-replacement" is only allowed before
+	  starting the array.  It sets or clears the flag.
+
 
 	This file responds to select/poll.  Any change to 'faulty'
 	or 'blocked' causes an event.
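The state file described above takes plain-text commands, so a hot-replace can be requested from user space with a simple write. A minimal sketch (the md0 array and dev-sdb1 member names are illustrative assumptions, not taken from this patch):

/* Sketch: request hot-replacement of an md member via the sysfs
 * "state" file documented above.  md0/dev-sdb1 are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/block/md0/md/dev-sdb1/state";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	/* md then recovers onto a spare (or a pre-assigned
	 * "replacement" device) before this one is kicked out. */
	if (write(fd, "want_replacement", strlen("want_replacement")) < 0)
		perror("write");
	close(fd);
	return 0;
}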
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 6d03774b176e..cdf36b1e9aa6 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
 		return;
 	}
 	if (time_before(jiffies, bitmap->daemon_lastrun
-			+ bitmap->mddev->bitmap_info.daemon_sleep))
+			+ mddev->bitmap_info.daemon_sleep))
 		goto done;
 
 	bitmap->daemon_lastrun = jiffies;
 	if (bitmap->allclean) {
-		bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 		goto done;
 	}
 	bitmap->allclean = 1;
@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 			 * sure that events_cleared is up-to-date.
 			 */
 			if (bitmap->need_sync &&
-			    bitmap->mddev->bitmap_info.external == 0) {
+			    mddev->bitmap_info.external == 0) {
 				bitmap_super_t *sb;
 				bitmap->need_sync = 0;
 				sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)
 
  done:
 	if (bitmap->allclean == 0)
-		bitmap->mddev->thread->timeout =
-			bitmap->mddev->bitmap_info.daemon_sleep;
+		mddev->thread->timeout =
+			mddev->bitmap_info.daemon_sleep;
 	mutex_unlock(&mddev->bitmap_info.mutex);
 }
 
@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 	}
 	if (!*bmc) {
 		struct page *page;
-		*bmc = 1 | (needed ? NEEDED_MASK : 0);
+		*bmc = 2 | (needed ? NEEDED_MASK : 0);
 		bitmap_count_page(bitmap, offset, 1);
 		page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
 		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5d1b6762f108..ca8527fe77eb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		}
 		if (sb->devflags & WriteMostly1)
 			set_bit(WriteMostly, &rdev->flags);
+		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+			set_bit(Replacement, &rdev->flags);
 	} else /* MULTIPATH are always insync */
 		set_bit(In_sync, &rdev->flags);
 
@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 			sb->recovery_offset =
 				cpu_to_le64(rdev->recovery_offset);
 	}
+	if (test_bit(Replacement, &rdev->flags))
+		sb->feature_map |=
+			cpu_to_le32(MD_FEATURE_REPLACEMENT);
 
 	if (mddev->reshape_position != MaxSector) {
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
 		len += sprintf(page+len, "%swrite_error", sep);
 		sep = ",";
 	}
+	if (test_bit(WantReplacement, &rdev->flags)) {
+		len += sprintf(page+len, "%swant_replacement", sep);
+		sep = ",";
+	}
+	if (test_bit(Replacement, &rdev->flags)) {
+		len += sprintf(page+len, "%sreplacement", sep);
+		sep = ",";
+	}
+
 	return len+sprintf(page+len, "\n");
 }
 
@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	} else if (cmd_match(buf, "-write_error")) {
 		clear_bit(WriteErrorSeen, &rdev->flags);
 		err = 0;
+	} else if (cmd_match(buf, "want_replacement")) {
+		/* Any non-spare device that is not a replacement can
+		 * become want_replacement at any time, but we then need to
+		 * check if recovery is needed.
+		 */
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(Replacement, &rdev->flags))
+			set_bit(WantReplacement, &rdev->flags);
+		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+		md_wakeup_thread(rdev->mddev->thread);
+		err = 0;
+	} else if (cmd_match(buf, "-want_replacement")) {
+		/* Clearing 'want_replacement' is always allowed.
+		 * Once replacements starts it is too late though.
+		 */
+		err = 0;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else if (cmd_match(buf, "replacement")) {
+		/* Can only set a device as a replacement when array has not
+		 * yet been started.  Once running, replacement is automatic
+		 * from spares, or by assigning 'slot'.
+		 */
+		if (rdev->mddev->pers)
+			err = -EBUSY;
+		else {
+			set_bit(Replacement, &rdev->flags);
+			err = 0;
+		}
+	} else if (cmd_match(buf, "-replacement")) {
+		/* Similarly, can only clear Replacement before start */
+		if (rdev->mddev->pers)
+			err = -EBUSY;
+		else {
+			clear_bit(Replacement, &rdev->flags);
+			err = 0;
+		}
 	}
 	if (!err)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		if (rdev->mddev->pers->hot_remove_disk == NULL)
 			return -EINVAL;
 		err = rdev->mddev->pers->
-			hot_remove_disk(rdev->mddev, rdev->raid_disk);
+			hot_remove_disk(rdev->mddev, rdev);
 		if (err)
 			return err;
 		sysfs_unlink_rdev(rdev->mddev, rdev);
@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
 	} else if (rdev->mddev->pers) {
-		struct md_rdev *rdev2;
 		/* Activating a spare .. or possibly reactivating
 		 * if we ever get bitmaps working here.
 		 */
@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		if (rdev->mddev->pers->hot_add_disk == NULL)
 			return -EINVAL;
 
-		list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
-			if (rdev2->raid_disk == slot)
-				return -EEXIST;
-
 		if (slot >= rdev->mddev->raid_disks &&
 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
 			return -ENOSPC;
@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	struct mddev *mddev = NULL;
 	int ro;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+	switch (cmd) {
+	case RAID_VERSION:
+	case GET_ARRAY_INFO:
+	case GET_DISK_INFO:
+		break;
+	default:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
 
 	/*
 	 * Commands dealing with the RAID driver but not any
@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			if (test_bit(Faulty, &rdev->flags)) {
 				seq_printf(seq, "(F)");
 				continue;
-			} else if (rdev->raid_disk < 0)
+			}
+			if (rdev->raid_disk < 0)
 				seq_printf(seq, "(S)"); /* spare */
+			if (test_bit(Replacement, &rdev->flags))
+				seq_printf(seq, "(R)");
 			sectors += rdev->sectors;
 		}
 
@@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev)
 		    ! test_bit(In_sync, &rdev->flags)) &&
 		    atomic_read(&rdev->nr_pending)==0) {
 			if (mddev->pers->hot_remove_disk(
-				    mddev, rdev->raid_disk)==0) {
+				    mddev, rdev) == 0) {
 				sysfs_unlink_rdev(mddev, rdev);
 				rdev->raid_disk = -1;
 			}
 		}
 
-	if (mddev->degraded) {
-		list_for_each_entry(rdev, &mddev->disks, same_set) {
-			if (rdev->raid_disk >= 0 &&
-			    !test_bit(In_sync, &rdev->flags) &&
-			    !test_bit(Faulty, &rdev->flags))
-				spares++;
-			if (rdev->raid_disk < 0
-			    && !test_bit(Faulty, &rdev->flags)) {
-				rdev->recovery_offset = 0;
-				if (mddev->pers->
-				    hot_add_disk(mddev, rdev) == 0) {
-					if (sysfs_link_rdev(mddev, rdev))
-						/* failure here is OK */;
-					spares++;
-					md_new_event(mddev);
-					set_bit(MD_CHANGE_DEVS, &mddev->flags);
-				}
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(In_sync, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags))
+			spares++;
+		if (rdev->raid_disk < 0
+		    && !test_bit(Faulty, &rdev->flags)) {
+			rdev->recovery_offset = 0;
+			if (mddev->pers->
+			    hot_add_disk(mddev, rdev) == 0) {
+				if (sysfs_link_rdev(mddev, rdev))
+					/* failure here is OK */;
+				spares++;
+				md_new_event(mddev);
+				set_bit(MD_CHANGE_DEVS, &mddev->flags);
 			}
 		}
 	}
@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev)
 			    test_bit(Faulty, &rdev->flags) &&
 			    atomic_read(&rdev->nr_pending)==0) {
 				if (mddev->pers->hot_remove_disk(
-					    mddev, rdev->raid_disk)==0) {
+					    mddev, rdev) == 0) {
 					sysfs_unlink_rdev(mddev, rdev);
 					rdev->raid_disk = -1;
 				}
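One user-visible effect of the md_ioctl() hunk above: the three read-only ioctls no longer require CAP_SYS_ADMIN, so monitoring tools can query arrays unprivileged. A rough sketch of what that permits (error handling trimmed; the /dev/md0 path is an assumption):

/* Sketch: query array geometry without CAP_SYS_ADMIN, which the
 * md_ioctl() change above allows for RAID_VERSION, GET_ARRAY_INFO
 * and GET_DISK_INFO.  /dev/md0 is an illustrative assumption.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/raid/md_u.h>

int main(void)
{
	mdu_array_info_t info;
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0) {
		perror("/dev/md0");
		return 1;
	}
	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
		printf("level=%d raid_disks=%d active=%d failed=%d\n",
		       info.level, info.raid_disks,
		       info.active_disks, info.failed_disks);
	close(fd);
	return 0;
}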
diff --git a/drivers/md/md.h b/drivers/md/md.h
index cf742d9306ec..44c63dfeeb2b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -72,34 +72,7 @@ struct md_rdev {
 	 * This reduces the burden of testing multiple flags in many cases
 	 */
 
-	unsigned long	flags;
-#define	Faulty		1	/* device is known to have a fault */
-#define	In_sync		2	/* device is in_sync with rest of array */
-#define	WriteMostly	4	/* Avoid reading if at all possible */
-#define	AutoDetected	7	/* added by auto-detect */
-#define Blocked		8	/* An error occurred but has not yet
-				 * been acknowledged by the metadata
-				 * handler, so don't allow writes
-				 * until it is cleared */
-#define WriteErrorSeen	9	/* A write error has been seen on this
-				 * device
-				 */
-#define FaultRecorded	10	/* Intermediate state for clearing
-				 * Blocked.  The Fault is/will-be
-				 * recorded in the metadata, but that
-				 * metadata hasn't been stored safely
-				 * on disk yet.
-				 */
-#define BlockedBadBlocks 11	/* A writer is blocked because they
-				 * found an unacknowledged bad-block.
-				 * This can safely be cleared at any
-				 * time, and the writer will re-check.
-				 * It may be set at any time, and at
-				 * worst the writer will timeout and
-				 * re-check.  So setting it as
-				 * accurately as possible is good, but
-				 * not absolutely critical.
-				 */
+	unsigned long	flags;	/* bit set of 'enum flag_bits' bits. */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;		/* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
 		sector_t size;		/* in sectors */
 	} badblocks;
 };
+enum flag_bits {
+	Faulty,			/* device is known to have a fault */
+	In_sync,		/* device is in_sync with rest of array */
+	WriteMostly,		/* Avoid reading if at all possible */
+	AutoDetected,		/* added by auto-detect */
+	Blocked,		/* An error occurred but has not yet
+				 * been acknowledged by the metadata
+				 * handler, so don't allow writes
+				 * until it is cleared */
+	WriteErrorSeen,		/* A write error has been seen on this
+				 * device
+				 */
+	FaultRecorded,		/* Intermediate state for clearing
+				 * Blocked.  The Fault is/will-be
+				 * recorded in the metadata, but that
+				 * metadata hasn't been stored safely
+				 * on disk yet.
+				 */
+	BlockedBadBlocks,	/* A writer is blocked because they
+				 * found an unacknowledged bad-block.
+				 * This can safely be cleared at any
+				 * time, and the writer will re-check.
+				 * It may be set at any time, and at
+				 * worst the writer will timeout and
+				 * re-check.  So setting it as
+				 * accurately as possible is good, but
+				 * not absolutely critical.
+				 */
+	WantReplacement,	/* This device is a candidate to be
+				 * hot-replaced, either because it has
+				 * reported some faults, or because
+				 * of explicit request.
+				 */
+	Replacement,		/* This device is a replacement for
+				 * a want_replacement device with same
+				 * raid_disk number.
+				 */
+};
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
 #define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
@@ -428,7 +439,7 @@ struct md_personality
 	 */
 	void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
-	int (*hot_remove_disk) (struct mddev *mddev, int number);
+	int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
 	int (*spare_active) (struct mddev *mddev);
 	sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
 	int (*resize) (struct mddev *mddev, sector_t sectors);
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
 static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	sprintf(nm, "rd%d", rdev->raid_disk);
-	return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+	if (!test_bit(Replacement, &rdev->flags)) {
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+	} else
+		return 0;
 }
 
 static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	sprintf(nm, "rd%d", rdev->raid_disk);
-	sysfs_remove_link(&mddev->kobj, nm);
+	if (!test_bit(Replacement, &rdev->flags)) {
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		sysfs_remove_link(&mddev->kobj, nm);
+	}
 }
 
 /*
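The #define-to-enum conversion above works because set_bit()/test_bit() take a bit number rather than a mask, and enum members number themselves from 0. The renumbering (Faulty moves from bit 1 to bit 0) is harmless because rdev->flags is in-memory state only; the on-disk superblock carries its own bits such as MD_FEATURE_REPLACEMENT, as the super_1_validate()/super_1_sync() hunks show. A stand-alone sketch of the pattern (set_flag/test_flag are stand-ins for the kernel's atomic bitops):

/* Sketch of the flags pattern used above: enum members supply bit
 * numbers, and an unsigned long holds the bit set.
 */
#include <stdio.h>

enum flag_bits { Faulty, In_sync, WriteMostly, WantReplacement, Replacement };

static void set_flag(int bit, unsigned long *flags)
{
	*flags |= 1UL << bit;		/* non-atomic stand-in for set_bit() */
}

static int test_flag(int bit, const unsigned long *flags)
{
	return !!(*flags & (1UL << bit));	/* stand-in for test_bit() */
}

int main(void)
{
	unsigned long flags = 0;

	set_flag(In_sync, &flags);
	set_flag(WantReplacement, &flags);
	printf("in_sync=%d want_replacement=%d faulty=%d\n",
	       test_flag(In_sync, &flags),
	       test_flag(WantReplacement, &flags),
	       test_flag(Faulty, &flags));
	return 0;
}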
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 5899246fa37e..a222f516660e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return err;
 }
 
-static int multipath_remove_disk(struct mddev *mddev, int number)
+static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct mpconf *conf = mddev->private;
 	int err = 0;
-	struct md_rdev *rdev;
+	int number = rdev->raid_disk;
 	struct multipath_info *p = conf->multipaths + number;
 
 	print_multipath_conf(conf);
 
-	rdev = p->rdev;
-	if (rdev) {
+	if (rdev == p->rdev) {
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
 			printk(KERN_ERR "hot-remove-disk, slot %d is identified"
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ede2461e79c5..cc24f0cb7ee3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -135,7 +135,7 @@ out_free_pages:
 			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
-	while ( ++j < pi->raid_disks )
+	while (++j < pi->raid_disks)
 		bio_put(r1_bio->bios[j]);
 	r1bio_pool_free(r1_bio, data);
 	return NULL;
@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int i;
 
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct bio **bio = r1_bio->bios + i;
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio)
 	struct r1conf *conf = r1_bio->mddev->private;
 	int i;
 
-	for (i=0; i<conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct bio *bio = r1_bio->bios[i];
 		if (bio->bi_end_io)
 			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio)
 static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
 {
 	int mirror;
-	int raid_disks = r1_bio->mddev->raid_disks;
+	struct r1conf *conf = r1_bio->mddev->private;
+	int raid_disks = conf->raid_disks;
 
-	for (mirror = 0; mirror < raid_disks; mirror++)
+	for (mirror = 0; mirror < raid_disks * 2; mirror++)
 		if (r1_bio->bios[mirror] == bio)
 			break;
 
-	BUG_ON(mirror == raid_disks);
+	BUG_ON(mirror == raid_disks * 2);
 	update_head_pos(mirror, r1_bio);
 
 	return mirror;
@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	if (!uptodate) {
 		set_bit(WriteErrorSeen,
 			&conf->mirrors[mirror].rdev->flags);
+		if (!test_and_set_bit(WantReplacement,
+				      &conf->mirrors[mirror].rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED, &
+				conf->mddev->recovery);
+
 		set_bit(R1BIO_WriteError, &r1_bio->state);
 	} else {
 		/*
@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
 		start_disk = conf->last_used;
 	}
 
-	for (i = 0 ; i < conf->raid_disks ; i++) {
+	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
@@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
 		return 1;
 
 	rcu_read_lock();
-	for (i = 0; i < mddev->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks; i++) {
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -974,7 +980,7 @@ read_again:
 	 */
 	plugged = mddev_check_plugged(mddev);
 
-	disks = conf->raid_disks;
+	disks = conf->raid_disks * 2;
 retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
@@ -988,7 +994,8 @@ read_again:
 		}
 		r1_bio->bios[i] = NULL;
 		if (!rdev || test_bit(Faulty, &rdev->flags)) {
-			set_bit(R1BIO_Degraded, &r1_bio->state);
+			if (i < conf->raid_disks)
+				set_bit(R1BIO_Degraded, &r1_bio->state);
 			continue;
 		}
 
@@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev)
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
+		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
+		if (repl
+		    && repl->recovery_offset == MaxSector
+		    && !test_bit(Faulty, &repl->flags)
+		    && !test_and_set_bit(In_sync, &repl->flags)) {
+			/* replacement has just become active */
+			if (!rdev ||
+			    !test_and_clear_bit(In_sync, &rdev->flags))
+				count++;
+			if (rdev) {
+				/* Replaced device not technically
+				 * faulty, but we need to be sure
+				 * it gets removed and never re-added
+				 */
+				set_bit(Faulty, &rdev->flags);
+				sysfs_notify_dirent_safe(
+					rdev->sysfs_state);
+			}
+		}
 		if (rdev
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int mirror = 0;
 	struct mirror_info *p;
 	int first = 0;
-	int last = mddev->raid_disks - 1;
+	int last = conf->raid_disks - 1;
 
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
@@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
-	for (mirror = first; mirror <= last; mirror++)
-		if ( !(p=conf->mirrors+mirror)->rdev) {
+	for (mirror = first; mirror <= last; mirror++) {
+		p = conf->mirrors+mirror;
+		if (!p->rdev) {
 
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
@@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
+		if (test_bit(WantReplacement, &p->rdev->flags) &&
+		    p[conf->raid_disks].rdev == NULL) {
+			/* Add this device as a replacement */
+			clear_bit(In_sync, &rdev->flags);
+			set_bit(Replacement, &rdev->flags);
+			rdev->raid_disk = mirror;
+			err = 0;
+			conf->fullsync = 1;
+			rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+			break;
+		}
+	}
 	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
 }
 
-static int raid1_remove_disk(struct mddev *mddev, int number)
+static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r1conf *conf = mddev->private;
 	int err = 0;
-	struct md_rdev *rdev;
+	int number = rdev->raid_disk;
 	struct mirror_info *p = conf->mirrors+ number;
 
+	if (rdev != p->rdev)
+		p = conf->mirrors + conf->raid_disks + number;
+
 	print_conf(conf);
-	rdev = p->rdev;
-	if (rdev) {
+	if (rdev == p->rdev) {
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
 			err = -EBUSY;
@@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number)
 			err = -EBUSY;
 			p->rdev = rdev;
 			goto abort;
-		}
+		} else if (conf->mirrors[conf->raid_disks + number].rdev) {
+			/* We just removed a device that is being replaced.
+			 * Move down the replacement.  We drain all IO before
+			 * doing this to avoid confusion.
+			 */
+			struct md_rdev *repl =
+				conf->mirrors[conf->raid_disks + number].rdev;
+			raise_barrier(conf);
+			clear_bit(Replacement, &repl->flags);
+			p->rdev = repl;
+			conf->mirrors[conf->raid_disks + number].rdev = NULL;
+			lower_barrier(conf);
+			clear_bit(WantReplacement, &rdev->flags);
+		} else
+			clear_bit(WantReplacement, &rdev->flags);
 		err = md_integrity_register(mddev);
 	}
 abort:
@@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error)
 		} while (sectors_to_go > 0);
 		set_bit(WriteErrorSeen,
 			&conf->mirrors[mirror].rdev->flags);
+		if (!test_and_set_bit(WantReplacement,
+				      &conf->mirrors[mirror].rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED, &
+				mddev->recovery);
 		set_bit(R1BIO_WriteError, &r1_bio->state);
 	} else if (is_badblock(conf->mirrors[mirror].rdev,
 			       r1_bio->sector,
@@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
 	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
 		/* success */
 		return 1;
-	if (rw == WRITE)
+	if (rw == WRITE) {
 		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement,
+				      &rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED, &
+				rdev->mddev->recovery);
+	}
 	/* need to record an error - either for the block or the device */
 	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 		md_error(rdev->mddev, rdev);
@@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 			}
 		}
 		d++;
-		if (d == conf->raid_disks)
+		if (d == conf->raid_disks * 2)
 			d = 0;
 	} while (!success && d != r1_bio->read_disk);
 
@@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 		       mdname(mddev),
 		       bdevname(bio->bi_bdev, b),
 		       (unsigned long long)r1_bio->sector);
-		for (d = 0; d < conf->raid_disks; d++) {
+		for (d = 0; d < conf->raid_disks * 2; d++) {
 			rdev = conf->mirrors[d].rdev;
 			if (!rdev || test_bit(Faulty, &rdev->flags))
 				continue;
@@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 		/* write it back and re-read */
 		while (d != r1_bio->read_disk) {
 			if (d == 0)
-				d = conf->raid_disks;
+				d = conf->raid_disks * 2;
 			d--;
 			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 				continue;
@@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 		d = start;
 		while (d != r1_bio->read_disk) {
 			if (d == 0)
-				d = conf->raid_disks;
+				d = conf->raid_disks * 2;
 			d--;
 			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 				continue;
@@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio)
 	int primary;
 	int i;
 
-	for (primary = 0; primary < conf->raid_disks; primary++)
+	for (primary = 0; primary < conf->raid_disks * 2; primary++)
 		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
 		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
 			r1_bio->bios[primary]->bi_end_io = NULL;
@@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio)
 			break;
 		}
 	r1_bio->read_disk = primary;
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
 		struct bio *pbio = r1_bio->bios[primary];
@@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 {
 	struct r1conf *conf = mddev->private;
 	int i;
-	int disks = conf->raid_disks;
+	int disks = conf->raid_disks * 2;
 	struct bio *bio, *wbio;
 
 	bio = r1_bio->bios[r1_bio->read_disk];
@@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 				success = 1;
 			else {
 				d++;
-				if (d == conf->raid_disks)
+				if (d == conf->raid_disks * 2)
 					d = 0;
 			}
 		} while (!success && d != read_disk);
@@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 		start = d;
 		while (d != read_disk) {
 			if (d==0)
-				d = conf->raid_disks;
+				d = conf->raid_disks * 2;
 			d--;
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
@@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 		while (d != read_disk) {
 			char b[BDEVNAME_SIZE];
 			if (d==0)
-				d = conf->raid_disks;
+				d = conf->raid_disks * 2;
 			d--;
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
@@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
 	int s = r1_bio->sectors;
-	for (m = 0; m < conf->raid_disks ; m++) {
+	for (m = 0; m < conf->raid_disks * 2 ; m++) {
 		struct md_rdev *rdev = conf->mirrors[m].rdev;
 		struct bio *bio = r1_bio->bios[m];
 		if (bio->bi_end_io == NULL)
@@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
-	for (m = 0; m < conf->raid_disks ; m++)
+	for (m = 0; m < conf->raid_disks * 2 ; m++)
 		if (r1_bio->bios[m] == IO_MADE_GOOD) {
 			struct md_rdev *rdev = conf->mirrors[m].rdev;
 			rdev_clear_badblocks(rdev,
@@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	r1_bio->state = 0;
 	set_bit(R1BIO_IsSync, &r1_bio->state);
 
-	for (i=0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
 		bio = r1_bio->bios[i];
 
@@ -2203,7 +2267,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
 		    test_bit(Faulty, &rdev->flags)) {
-			still_degraded = 1;
+			if (i < conf->raid_disks)
+				still_degraded = 1;
 		} else if (!test_bit(In_sync, &rdev->flags)) {
 			bio->bi_rw = WRITE;
 			bio->bi_end_io = end_sync_write;
@@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 			 * need to mark them bad on all write targets
 			 */
 			int ok = 1;
-			for (i = 0 ; i < conf->raid_disks ; i++)
+			for (i = 0 ; i < conf->raid_disks * 2 ; i++)
 				if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
 					struct md_rdev *rdev =
 						rcu_dereference(conf->mirrors[i].rdev);
@@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 			len = sync_blocks<<9;
 		}
 
-		for (i=0 ; i < conf->raid_disks; i++) {
+		for (i = 0 ; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
 				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
@@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 	 */
 	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		atomic_set(&r1_bio->remaining, read_targets);
-		for (i=0; i<conf->raid_disks; i++) {
+		for (i = 0; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io == end_sync_read) {
 				md_sync_acct(bio->bi_bdev, nr_sectors);
@@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+				* mddev->raid_disks * 2,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
 		goto abort;
@@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
 		goto abort;
-	conf->poolinfo->raid_disks = mddev->raid_disks;
+	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
 	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 					  r1bio_pool_free,
 					  conf->poolinfo);
@@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
 	conf->poolinfo->mddev = mddev;
 
+	err = -EINVAL;
 	spin_lock_init(&conf->device_lock);
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		int disk_idx = rdev->raid_disk;
 		if (disk_idx >= mddev->raid_disks
 		    || disk_idx < 0)
 			continue;
-		disk = conf->mirrors + disk_idx;
+		if (test_bit(Replacement, &rdev->flags))
+			disk = conf->mirrors + conf->raid_disks + disk_idx;
+		else
+			disk = conf->mirrors + disk_idx;
 
+		if (disk->rdev)
+			goto abort;
 		disk->rdev = rdev;
 
 		disk->head_position = 0;
@@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	err = -EIO;
 	conf->last_used = -1;
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
 
+		if (i < conf->raid_disks &&
+		    disk[conf->raid_disks].rdev) {
+			/* This slot has a replacement. */
+			if (!disk->rdev) {
+				/* No original, just make the replacement
+				 * a recovering spare
+				 */
+				disk->rdev =
+					disk[conf->raid_disks].rdev;
+				disk[conf->raid_disks].rdev = NULL;
+			} else if (!test_bit(In_sync, &disk->rdev->flags))
+				/* Original is not in_sync - bad */
+				goto abort;
+		}
+
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
@@ -2455,7 +2543,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			conf->last_used = i;
 	}
 
-	err = -EIO;
 	if (conf->last_used < 0) {
 		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
 		       mdname(mddev));
@@ -2665,7 +2752,7 @@ static int raid1_reshape(struct mddev *mddev)
 	if (!newpoolinfo)
 		return -ENOMEM;
 	newpoolinfo->mddev = mddev;
-	newpoolinfo->raid_disks = raid_disks;
+	newpoolinfo->raid_disks = raid_disks * 2;
 
 	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 				 r1bio_pool_free, newpoolinfo);
@@ -2673,7 +2760,8 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
 		mempool_destroy(newpool);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c732b6cce935..80ded139314c 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -12,6 +12,9 @@ struct mirror_info {
  * pool was allocated for, so they know how much to allocate and free.
  * mddev->raid_disks cannot be used, as it can change while a pool is active
  * These two datums are stored in a kmalloced struct.
+ * The 'raid_disks' here is twice the raid_disks in r1conf.
+ * This allows space for each 'real' device to have a replacement in the
+ * second half of the array.
  */
 
 struct pool_info {
@@ -21,7 +24,9 @@ struct pool_info {
 
 struct r1conf {
 	struct mddev		*mddev;
-	struct mirror_info	*mirrors;
+	struct mirror_info	*mirrors;	/* twice 'raid_disks' to
+						 * allow for replacements.
+						 */
 	int			raid_disks;
 
 	/* When choose the best device for a read (read_balance())
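Concretely, the layout these raid1.h comments describe is: mirrors[i] holds original member i, and mirrors[conf->raid_disks + i] holds its optional replacement, which is why the raid1.c loops above now run to conf->raid_disks * 2. A stand-alone sketch of that indexing (the struct and helper names are illustrative, not the kernel's):

/* Sketch of the doubled mirrors[] layout used by raid1 above:
 * slot i holds the original member, slot raid_disks + i holds its
 * replacement (or NULL).  "struct disk" stands in for mirror_info.
 */
#include <stdio.h>
#include <stdlib.h>

struct disk { const char *name; };

struct conf {
	struct disk **mirrors;	/* 2 * raid_disks entries */
	int raid_disks;
};

static struct disk **replacement_slot(struct conf *c, int i)
{
	return &c->mirrors[c->raid_disks + i];
}

int main(void)
{
	struct conf c = { calloc(4, sizeof(struct disk *)), 2 };
	struct disk sda = { "sda" }, sdc = { "sdc" };

	c.mirrors[0] = &sda;			/* original in slot 0 */
	*replacement_slot(&c, 0) = &sdc;	/* its replacement */

	/* walk both halves, as the raid_disks * 2 loops do */
	for (int i = 0; i < c.raid_disks * 2; i++)
		printf("slot %d: %s\n", i,
		       c.mirrors[i] ? c.mirrors[i]->name : "-");
	free(c.mirrors);
	return 0;
}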
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 685ddf325ee4..6e8aa213f0d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
73 struct r10conf *conf = data; 73 struct r10conf *conf = data;
74 int size = offsetof(struct r10bio, devs[conf->copies]); 74 int size = offsetof(struct r10bio, devs[conf->copies]);
75 75
76 /* allocate a r10bio with room for raid_disks entries in the bios array */ 76 /* allocate a r10bio with room for raid_disks entries in the
77 * bios array */
77 return kzalloc(size, gfp_flags); 78 return kzalloc(size, gfp_flags);
78} 79}
79 80
@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 if (!bio) 124 if (!bio)
124 goto out_free_bio; 125 goto out_free_bio;
125 r10_bio->devs[j].bio = bio; 126 r10_bio->devs[j].bio = bio;
127 if (!conf->have_replacement)
128 continue;
129 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
130 if (!bio)
131 goto out_free_bio;
132 r10_bio->devs[j].repl_bio = bio;
126 } 133 }
127 /* 134 /*
128 * Allocate RESYNC_PAGES data pages and attach them 135 * Allocate RESYNC_PAGES data pages and attach them
129 * where needed. 136 * where needed.
130 */ 137 */
131 for (j = 0 ; j < nalloc; j++) { 138 for (j = 0 ; j < nalloc; j++) {
139 struct bio *rbio = r10_bio->devs[j].repl_bio;
132 bio = r10_bio->devs[j].bio; 140 bio = r10_bio->devs[j].bio;
133 for (i = 0; i < RESYNC_PAGES; i++) { 141 for (i = 0; i < RESYNC_PAGES; i++) {
134 if (j == 1 && !test_bit(MD_RECOVERY_SYNC, 142 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
143 goto out_free_pages; 151 goto out_free_pages;
144 152
145 bio->bi_io_vec[i].bv_page = page; 153 bio->bi_io_vec[i].bv_page = page;
154 if (rbio)
155 rbio->bi_io_vec[i].bv_page = page;
146 } 156 }
147 } 157 }
148 158
@@ -156,8 +166,11 @@ out_free_pages:
156 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 166 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
157 j = -1; 167 j = -1;
158out_free_bio: 168out_free_bio:
159 while ( ++j < nalloc ) 169 while (++j < nalloc) {
160 bio_put(r10_bio->devs[j].bio); 170 bio_put(r10_bio->devs[j].bio);
171 if (r10_bio->devs[j].repl_bio)
172 bio_put(r10_bio->devs[j].repl_bio);
173 }
161 r10bio_pool_free(r10_bio, conf); 174 r10bio_pool_free(r10_bio, conf);
162 return NULL; 175 return NULL;
163} 176}
@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
178 } 191 }
179 bio_put(bio); 192 bio_put(bio);
180 } 193 }
194 bio = r10bio->devs[j].repl_bio;
195 if (bio)
196 bio_put(bio);
181 } 197 }
182 r10bio_pool_free(r10bio, conf); 198 r10bio_pool_free(r10bio, conf);
183} 199}
@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
191 if (!BIO_SPECIAL(*bio)) 207 if (!BIO_SPECIAL(*bio))
192 bio_put(*bio); 208 bio_put(*bio);
193 *bio = NULL; 209 *bio = NULL;
210 bio = &r10_bio->devs[i].repl_bio;
211 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
212 bio_put(*bio);
213 *bio = NULL;
194 } 214 }
195} 215}
196 216
@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
275 * Find the disk number which triggered given bio 295 * Find the disk number which triggered given bio
276 */ 296 */
277static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 297static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
278 struct bio *bio, int *slotp) 298 struct bio *bio, int *slotp, int *replp)
279{ 299{
280 int slot; 300 int slot;
301 int repl = 0;
281 302
282 for (slot = 0; slot < conf->copies; slot++) 303 for (slot = 0; slot < conf->copies; slot++) {
283 if (r10_bio->devs[slot].bio == bio) 304 if (r10_bio->devs[slot].bio == bio)
284 break; 305 break;
306 if (r10_bio->devs[slot].repl_bio == bio) {
307 repl = 1;
308 break;
309 }
310 }
285 311
286 BUG_ON(slot == conf->copies); 312 BUG_ON(slot == conf->copies);
287 update_head_pos(slot, r10_bio); 313 update_head_pos(slot, r10_bio);
288 314
289 if (slotp) 315 if (slotp)
290 *slotp = slot; 316 *slotp = slot;
317 if (replp)
318 *replp = repl;
291 return r10_bio->devs[slot].devnum; 319 return r10_bio->devs[slot].devnum;
292} 320}
293 321
@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
296 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 324 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
297 struct r10bio *r10_bio = bio->bi_private; 325 struct r10bio *r10_bio = bio->bi_private;
298 int slot, dev; 326 int slot, dev;
327 struct md_rdev *rdev;
299 struct r10conf *conf = r10_bio->mddev->private; 328 struct r10conf *conf = r10_bio->mddev->private;
300 329
301 330
302 slot = r10_bio->read_slot; 331 slot = r10_bio->read_slot;
303 dev = r10_bio->devs[slot].devnum; 332 dev = r10_bio->devs[slot].devnum;
333 rdev = r10_bio->devs[slot].rdev;
304 /* 334 /*
305 * this branch is our 'one mirror IO has finished' event handler: 335 * this branch is our 'one mirror IO has finished' event handler:
306 */ 336 */
@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
318 */ 348 */
319 set_bit(R10BIO_Uptodate, &r10_bio->state); 349 set_bit(R10BIO_Uptodate, &r10_bio->state);
320 raid_end_bio_io(r10_bio); 350 raid_end_bio_io(r10_bio);
321 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); 351 rdev_dec_pending(rdev, conf->mddev);
322 } else { 352 } else {
323 /* 353 /*
324 * oops, read error - keep the refcount on the rdev 354 * oops, read error - keep the refcount on the rdev
@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
327 printk_ratelimited(KERN_ERR 357 printk_ratelimited(KERN_ERR
328 "md/raid10:%s: %s: rescheduling sector %llu\n", 358 "md/raid10:%s: %s: rescheduling sector %llu\n",
329 mdname(conf->mddev), 359 mdname(conf->mddev),
330 bdevname(conf->mirrors[dev].rdev->bdev, b), 360 bdevname(rdev->bdev, b),
331 (unsigned long long)r10_bio->sector); 361 (unsigned long long)r10_bio->sector);
332 set_bit(R10BIO_ReadError, &r10_bio->state); 362 set_bit(R10BIO_ReadError, &r10_bio->state);
333 reschedule_retry(r10_bio); 363 reschedule_retry(r10_bio);
@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error)
366 int dev; 396 int dev;
367 int dec_rdev = 1; 397 int dec_rdev = 1;
368 struct r10conf *conf = r10_bio->mddev->private; 398 struct r10conf *conf = r10_bio->mddev->private;
369 int slot; 399 int slot, repl;
400 struct md_rdev *rdev = NULL;
370 401
371 dev = find_bio_disk(conf, r10_bio, bio, &slot); 402 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
372 403
404 if (repl)
405 rdev = conf->mirrors[dev].replacement;
406 if (!rdev) {
407 smp_rmb();
408 repl = 0;
409 rdev = conf->mirrors[dev].rdev;
410 }
373 /* 411 /*
374 * this branch is our 'one mirror IO has finished' event handler: 412 * this branch is our 'one mirror IO has finished' event handler:
375 */ 413 */
376 if (!uptodate) { 414 if (!uptodate) {
377 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); 415 if (repl)
378 set_bit(R10BIO_WriteError, &r10_bio->state); 416 /* Never record new bad blocks to replacement,
379 dec_rdev = 0; 417 * just fail it.
418 */
419 md_error(rdev->mddev, rdev);
420 else {
421 set_bit(WriteErrorSeen, &rdev->flags);
422 if (!test_and_set_bit(WantReplacement, &rdev->flags))
423 set_bit(MD_RECOVERY_NEEDED,
424 &rdev->mddev->recovery);
425 set_bit(R10BIO_WriteError, &r10_bio->state);
426 dec_rdev = 0;
427 }
380 } else { 428 } else {
381 /* 429 /*
382 * Set R10BIO_Uptodate in our master bio, so that 430 * Set R10BIO_Uptodate in our master bio, so that
@@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error)
393 set_bit(R10BIO_Uptodate, &r10_bio->state); 441 set_bit(R10BIO_Uptodate, &r10_bio->state);
394 442
395 /* Maybe we can clear some bad blocks. */ 443 /* Maybe we can clear some bad blocks. */
396 if (is_badblock(conf->mirrors[dev].rdev, 444 if (is_badblock(rdev,
397 r10_bio->devs[slot].addr, 445 r10_bio->devs[slot].addr,
398 r10_bio->sectors, 446 r10_bio->sectors,
399 &first_bad, &bad_sectors)) { 447 &first_bad, &bad_sectors)) {
400 bio_put(bio); 448 bio_put(bio);
401 r10_bio->devs[slot].bio = IO_MADE_GOOD; 449 if (repl)
450 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
451 else
452 r10_bio->devs[slot].bio = IO_MADE_GOOD;
402 dec_rdev = 0; 453 dec_rdev = 0;
403 set_bit(R10BIO_MadeGood, &r10_bio->state); 454 set_bit(R10BIO_MadeGood, &r10_bio->state);
404 } 455 }
@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
414 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); 465 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
415} 466}
416 467
417
418/* 468/*
419 * RAID10 layout manager 469 * RAID10 layout manager
420 * As well as the chunksize and raid_disks count, there are two 470 * As well as the chunksize and raid_disks count, there are two
@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q,
562 * FIXME: possibly should rethink readbalancing and do it differently 612 * FIXME: possibly should rethink readbalancing and do it differently
563 * depending on near_copies / far_copies geometry. 613 * depending on near_copies / far_copies geometry.
564 */ 614 */
565static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors) 615static struct md_rdev *read_balance(struct r10conf *conf,
616 struct r10bio *r10_bio,
617 int *max_sectors)
566{ 618{
567 const sector_t this_sector = r10_bio->sector; 619 const sector_t this_sector = r10_bio->sector;
568 int disk, slot; 620 int disk, slot;
569 int sectors = r10_bio->sectors; 621 int sectors = r10_bio->sectors;
570 int best_good_sectors; 622 int best_good_sectors;
571 sector_t new_distance, best_dist; 623 sector_t new_distance, best_dist;
572 struct md_rdev *rdev; 624 struct md_rdev *rdev, *best_rdev;
573 int do_balance; 625 int do_balance;
574 int best_slot; 626 int best_slot;
575 627
@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
578retry: 630retry:
579 sectors = r10_bio->sectors; 631 sectors = r10_bio->sectors;
580 best_slot = -1; 632 best_slot = -1;
633 best_rdev = NULL;
581 best_dist = MaxSector; 634 best_dist = MaxSector;
582 best_good_sectors = 0; 635 best_good_sectors = 0;
583 do_balance = 1; 636 do_balance = 1;
@@ -599,10 +652,16 @@ retry:
599 if (r10_bio->devs[slot].bio == IO_BLOCKED) 652 if (r10_bio->devs[slot].bio == IO_BLOCKED)
600 continue; 653 continue;
601 disk = r10_bio->devs[slot].devnum; 654 disk = r10_bio->devs[slot].devnum;
602 rdev = rcu_dereference(conf->mirrors[disk].rdev); 655 rdev = rcu_dereference(conf->mirrors[disk].replacement);
656 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
657 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
658 rdev = rcu_dereference(conf->mirrors[disk].rdev);
603 if (rdev == NULL) 659 if (rdev == NULL)
604 continue; 660 continue;
605 if (!test_bit(In_sync, &rdev->flags)) 661 if (test_bit(Faulty, &rdev->flags))
662 continue;
663 if (!test_bit(In_sync, &rdev->flags) &&
664 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
606 continue; 665 continue;
607 666
608 dev_sector = r10_bio->devs[slot].addr; 667 dev_sector = r10_bio->devs[slot].addr;
@@ -627,6 +686,7 @@ retry:
627 if (good_sectors > best_good_sectors) { 686 if (good_sectors > best_good_sectors) {
628 best_good_sectors = good_sectors; 687 best_good_sectors = good_sectors;
629 best_slot = slot; 688 best_slot = slot;
689 best_rdev = rdev;
630 } 690 }
631 if (!do_balance) 691 if (!do_balance)
632 /* Must read from here */ 692 /* Must read from here */
@@ -655,16 +715,15 @@ retry:
655 if (new_distance < best_dist) { 715 if (new_distance < best_dist) {
656 best_dist = new_distance; 716 best_dist = new_distance;
657 best_slot = slot; 717 best_slot = slot;
718 best_rdev = rdev;
658 } 719 }
659 } 720 }
660 if (slot == conf->copies) 721 if (slot >= conf->copies) {
661 slot = best_slot; 722 slot = best_slot;
723 rdev = best_rdev;
724 }
662 725
663 if (slot >= 0) { 726 if (slot >= 0) {
664 disk = r10_bio->devs[slot].devnum;
665 rdev = rcu_dereference(conf->mirrors[disk].rdev);
666 if (!rdev)
667 goto retry;
668 atomic_inc(&rdev->nr_pending); 727 atomic_inc(&rdev->nr_pending);
669 if (test_bit(Faulty, &rdev->flags)) { 728 if (test_bit(Faulty, &rdev->flags)) {
670 /* Cannot risk returning a device that failed 729 /* Cannot risk returning a device that failed
@@ -675,11 +734,11 @@ retry:
675 } 734 }
676 r10_bio->read_slot = slot; 735 r10_bio->read_slot = slot;
677 } else 736 } else
678 disk = -1; 737 rdev = NULL;
679 rcu_read_unlock(); 738 rcu_read_unlock();
680 *max_sectors = best_good_sectors; 739 *max_sectors = best_good_sectors;
681 740
682 return disk; 741 return rdev;
683} 742}
684 743
685static int raid10_congested(void *data, int bits) 744static int raid10_congested(void *data, int bits)
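
read_balance() now returns the chosen struct md_rdev * directly instead of a disk number, which removes the re-dereference at the end of the old version (and the retry when the rdev had vanished in between). For each slot it considers the replacement first and falls back to the primary unless the replacement is present, non-faulty, and recovered past the sectors being read. A simplified model of the per-slot choice, with stand-in field names rather than the kernel structures:

    #include <stdbool.h>
    #include <stddef.h>

    typedef unsigned long long sector_model_t;

    struct rdev_model {
        bool faulty, in_sync;
        sector_model_t recovery_offset;  /* valid data below this sector */
    };

    static struct rdev_model *choose_slot_rdev(struct rdev_model *replacement,
                                               struct rdev_model *primary,
                                               sector_model_t addr, int sectors)
    {
        struct rdev_model *rdev = replacement;

        if (rdev == NULL || rdev->faulty ||
            addr + sectors > rdev->recovery_offset)
            rdev = primary;              /* fall back to the original */
        if (rdev == NULL || rdev->faulty)
            return NULL;
        if (!rdev->in_sync && addr + sectors > rdev->recovery_offset)
            return NULL;                 /* not recovered this far yet */
        return rdev;
    }
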
@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf)
846static void make_request(struct mddev *mddev, struct bio * bio) 905static void make_request(struct mddev *mddev, struct bio * bio)
847{ 906{
848 struct r10conf *conf = mddev->private; 907 struct r10conf *conf = mddev->private;
849 struct mirror_info *mirror;
850 struct r10bio *r10_bio; 908 struct r10bio *r10_bio;
851 struct bio *read_bio; 909 struct bio *read_bio;
852 int i; 910 int i;
@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
945 /* 1003 /*
946 * read balancing logic: 1004 * read balancing logic:
947 */ 1005 */
948 int disk; 1006 struct md_rdev *rdev;
949 int slot; 1007 int slot;
950 1008
951read_again: 1009read_again:
952 disk = read_balance(conf, r10_bio, &max_sectors); 1010 rdev = read_balance(conf, r10_bio, &max_sectors);
953 slot = r10_bio->read_slot; 1011 if (!rdev) {
954 if (disk < 0) {
955 raid_end_bio_io(r10_bio); 1012 raid_end_bio_io(r10_bio);
956 return; 1013 return;
957 } 1014 }
958 mirror = conf->mirrors + disk; 1015 slot = r10_bio->read_slot;
959 1016
960 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1017 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
961 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, 1018 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
962 max_sectors); 1019 max_sectors);
963 1020
964 r10_bio->devs[slot].bio = read_bio; 1021 r10_bio->devs[slot].bio = read_bio;
1022 r10_bio->devs[slot].rdev = rdev;
965 1023
966 read_bio->bi_sector = r10_bio->devs[slot].addr + 1024 read_bio->bi_sector = r10_bio->devs[slot].addr +
967 mirror->rdev->data_offset; 1025 rdev->data_offset;
968 read_bio->bi_bdev = mirror->rdev->bdev; 1026 read_bio->bi_bdev = rdev->bdev;
969 read_bio->bi_end_io = raid10_end_read_request; 1027 read_bio->bi_end_io = raid10_end_read_request;
970 read_bio->bi_rw = READ | do_sync; 1028 read_bio->bi_rw = READ | do_sync;
971 read_bio->bi_private = r10_bio; 1029 read_bio->bi_private = r10_bio;
@@ -1025,6 +1083,7 @@ read_again:
1025 */ 1083 */
1026 plugged = mddev_check_plugged(mddev); 1084 plugged = mddev_check_plugged(mddev);
1027 1085
1086 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1028 raid10_find_phys(conf, r10_bio); 1087 raid10_find_phys(conf, r10_bio);
1029retry_write: 1088retry_write:
1030 blocked_rdev = NULL; 1089 blocked_rdev = NULL;
@@ -1034,12 +1093,25 @@ retry_write:
1034 for (i = 0; i < conf->copies; i++) { 1093 for (i = 0; i < conf->copies; i++) {
1035 int d = r10_bio->devs[i].devnum; 1094 int d = r10_bio->devs[i].devnum;
1036 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1095 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1096 struct md_rdev *rrdev = rcu_dereference(
1097 conf->mirrors[d].replacement);
1098 if (rdev == rrdev)
1099 rrdev = NULL;
1037 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1100 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1038 atomic_inc(&rdev->nr_pending); 1101 atomic_inc(&rdev->nr_pending);
1039 blocked_rdev = rdev; 1102 blocked_rdev = rdev;
1040 break; 1103 break;
1041 } 1104 }
1105 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1106 atomic_inc(&rrdev->nr_pending);
1107 blocked_rdev = rrdev;
1108 break;
1109 }
1110 if (rrdev && test_bit(Faulty, &rrdev->flags))
1111 rrdev = NULL;
1112
1042 r10_bio->devs[i].bio = NULL; 1113 r10_bio->devs[i].bio = NULL;
1114 r10_bio->devs[i].repl_bio = NULL;
1043 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1115 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1044 set_bit(R10BIO_Degraded, &r10_bio->state); 1116 set_bit(R10BIO_Degraded, &r10_bio->state);
1045 continue; 1117 continue;
@@ -1088,6 +1160,10 @@ retry_write:
1088 } 1160 }
1089 r10_bio->devs[i].bio = bio; 1161 r10_bio->devs[i].bio = bio;
1090 atomic_inc(&rdev->nr_pending); 1162 atomic_inc(&rdev->nr_pending);
1163 if (rrdev) {
1164 r10_bio->devs[i].repl_bio = bio;
1165 atomic_inc(&rrdev->nr_pending);
1166 }
1091 } 1167 }
1092 rcu_read_unlock(); 1168 rcu_read_unlock();
1093 1169
@@ -1096,11 +1172,23 @@ retry_write:
1096 int j; 1172 int j;
1097 int d; 1173 int d;
1098 1174
1099 for (j = 0; j < i; j++) 1175 for (j = 0; j < i; j++) {
1100 if (r10_bio->devs[j].bio) { 1176 if (r10_bio->devs[j].bio) {
1101 d = r10_bio->devs[j].devnum; 1177 d = r10_bio->devs[j].devnum;
1102 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1178 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1103 } 1179 }
1180 if (r10_bio->devs[j].repl_bio) {
1181 struct md_rdev *rdev;
1182 d = r10_bio->devs[j].devnum;
1183 rdev = conf->mirrors[d].replacement;
1184 if (!rdev) {
1185 /* Race with remove_disk */
1186 smp_mb();
1187 rdev = conf->mirrors[d].rdev;
1188 }
1189 rdev_dec_pending(rdev, mddev);
1190 }
1191 }
1104 allow_barrier(conf); 1192 allow_barrier(conf);
1105 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1193 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1106 wait_barrier(conf); 1194 wait_barrier(conf);
@@ -1147,6 +1235,31 @@ retry_write:
1147 bio_list_add(&conf->pending_bio_list, mbio); 1235 bio_list_add(&conf->pending_bio_list, mbio);
1148 conf->pending_count++; 1236 conf->pending_count++;
1149 spin_unlock_irqrestore(&conf->device_lock, flags); 1237 spin_unlock_irqrestore(&conf->device_lock, flags);
1238
1239 if (!r10_bio->devs[i].repl_bio)
1240 continue;
1241
1242 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1243 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1244 max_sectors);
1245 r10_bio->devs[i].repl_bio = mbio;
1246
1247 /* We are actively writing to the original device
1248 * so it cannot disappear, so the replacement cannot
1249 * become NULL here
1250 */
1251 mbio->bi_sector = (r10_bio->devs[i].addr+
1252 conf->mirrors[d].replacement->data_offset);
1253 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1254 mbio->bi_end_io = raid10_end_write_request;
1255 mbio->bi_rw = WRITE | do_sync | do_fua;
1256 mbio->bi_private = r10_bio;
1257
1258 atomic_inc(&r10_bio->remaining);
1259 spin_lock_irqsave(&conf->device_lock, flags);
1260 bio_list_add(&conf->pending_bio_list, mbio);
1261 conf->pending_count++;
1262 spin_unlock_irqrestore(&conf->device_lock, flags);
1150 } 1263 }
1151 1264
1152 /* Don't remove the bias on 'remaining' (one_write_done) until 1265 /* Don't remove the bias on 'remaining' (one_write_done) until
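
On the write path a replacement is simply one more destination: after the clone aimed at the primary is queued, a second clone of the same master bio is queued against the replacement's bdev, and each queued clone bumps r10_bio->remaining so one_write_done() waits for both copies. The comment in the hunk explains why mirrors[d].replacement cannot go NULL mid-loop: the in-flight write pins the primary, and remove_disk() only clears the replacement pointer after promoting it to rdev. The bookkeeping, reduced to a sketch with illustrative names:

    /* One clone per primary, plus one per active replacement; each queued
     * write increments the count that one_write_done() later drains. */
    struct r10_model { int remaining; };
    struct mirror_model { int has_rdev, has_replacement; };

    static int fan_out_writes(struct r10_model *r10,
                              const struct mirror_model *m, int copies)
    {
        int queued = 0;

        for (int i = 0; i < copies; i++) {
            if (m[i].has_rdev)        { r10->remaining++; queued++; }
            if (m[i].has_replacement) { r10->remaining++; queued++; }
        }
        return queued;
    }
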
@@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev)
1309 */ 1422 */
1310 for (i = 0; i < conf->raid_disks; i++) { 1423 for (i = 0; i < conf->raid_disks; i++) {
1311 tmp = conf->mirrors + i; 1424 tmp = conf->mirrors + i;
1312 if (tmp->rdev 1425 if (tmp->replacement
1313 && !test_bit(Faulty, &tmp->rdev->flags) 1426 && tmp->replacement->recovery_offset == MaxSector
1314 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 1427 && !test_bit(Faulty, &tmp->replacement->flags)
1428 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1429 /* Replacement has just become active */
1430 if (!tmp->rdev
1431 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1432 count++;
1433 if (tmp->rdev) {
1434 /* Replaced device not technically faulty,
1435 * but we need to be sure it gets removed
1436 * and never re-added.
1437 */
1438 set_bit(Faulty, &tmp->rdev->flags);
1439 sysfs_notify_dirent_safe(
1440 tmp->rdev->sysfs_state);
1441 }
1442 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1443 } else if (tmp->rdev
1444 && !test_bit(Faulty, &tmp->rdev->flags)
1445 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1315 count++; 1446 count++;
1316 sysfs_notify_dirent(tmp->rdev->sysfs_state); 1447 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1317 } 1448 }
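
raid10_spare_active() gains a branch for a replacement that has finished recovery (recovery_offset == MaxSector): the replacement is marked In_sync, the working-device count rises only if the old device was not itself in_sync, and the replaced device is forced Faulty so it is removed and never re-added. The transition, modelled on plain flags (names are stand-ins):

    #include <stdbool.h>

    struct member_model { bool present, faulty, in_sync; };

    /* Returns how many newly-active devices to credit for this mirror. */
    static int promote_finished_replacement(struct member_model *repl,
                                            struct member_model *old)
    {
        int count = 0;

        repl->in_sync = true;          /* replacement just became active */
        if (!old->present || !old->in_sync)
            count++;                   /* net gain of one working device */
        else
            old->in_sync = false;      /* one-for-one swap, no net gain */
        if (old->present)
            old->faulty = true;        /* must be removed, never re-added */
        return count;
    }
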
@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1353 struct mirror_info *p = &conf->mirrors[mirror]; 1484 struct mirror_info *p = &conf->mirrors[mirror];
1354 if (p->recovery_disabled == mddev->recovery_disabled) 1485 if (p->recovery_disabled == mddev->recovery_disabled)
1355 continue; 1486 continue;
1356 if (p->rdev) 1487 if (p->rdev) {
1357 continue; 1488 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1489 p->replacement != NULL)
1490 continue;
1491 clear_bit(In_sync, &rdev->flags);
1492 set_bit(Replacement, &rdev->flags);
1493 rdev->raid_disk = mirror;
1494 err = 0;
1495 disk_stack_limits(mddev->gendisk, rdev->bdev,
1496 rdev->data_offset << 9);
1497 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1498 blk_queue_max_segments(mddev->queue, 1);
1499 blk_queue_segment_boundary(mddev->queue,
1500 PAGE_CACHE_SIZE - 1);
1501 }
1502 conf->fullsync = 1;
1503 rcu_assign_pointer(p->replacement, rdev);
1504 break;
1505 }
1358 1506
1359 disk_stack_limits(mddev->gendisk, rdev->bdev, 1507 disk_stack_limits(mddev->gendisk, rdev->bdev,
1360 rdev->data_offset << 9); 1508 rdev->data_offset << 9);
@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1385 return err; 1533 return err;
1386} 1534}
1387 1535
1388static int raid10_remove_disk(struct mddev *mddev, int number) 1536static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1389{ 1537{
1390 struct r10conf *conf = mddev->private; 1538 struct r10conf *conf = mddev->private;
1391 int err = 0; 1539 int err = 0;
1392 struct md_rdev *rdev; 1540 int number = rdev->raid_disk;
1393 struct mirror_info *p = conf->mirrors+ number; 1541 struct md_rdev **rdevp;
1542 struct mirror_info *p = conf->mirrors + number;
1394 1543
1395 print_conf(conf); 1544 print_conf(conf);
1396 rdev = p->rdev; 1545 if (rdev == p->rdev)
1397 if (rdev) { 1546 rdevp = &p->rdev;
1398 if (test_bit(In_sync, &rdev->flags) || 1547 else if (rdev == p->replacement)
1399 atomic_read(&rdev->nr_pending)) { 1548 rdevp = &p->replacement;
1400 err = -EBUSY; 1549 else
1401 goto abort; 1550 return 0;
1402 } 1551
1403 /* Only remove faulty devices in recovery 1552 if (test_bit(In_sync, &rdev->flags) ||
1404 * is not possible. 1553 atomic_read(&rdev->nr_pending)) {
1405 */ 1554 err = -EBUSY;
1406 if (!test_bit(Faulty, &rdev->flags) && 1555 goto abort;
1407 mddev->recovery_disabled != p->recovery_disabled &&
1408 enough(conf, -1)) {
1409 err = -EBUSY;
1410 goto abort;
1411 }
1412 p->rdev = NULL;
1413 synchronize_rcu();
1414 if (atomic_read(&rdev->nr_pending)) {
1415 /* lost the race, try later */
1416 err = -EBUSY;
1417 p->rdev = rdev;
1418 goto abort;
1419 }
1420 err = md_integrity_register(mddev);
1421 } 1556 }
1557 /* Only remove faulty devices if recovery
1558 * is not possible.
1559 */
1560 if (!test_bit(Faulty, &rdev->flags) &&
1561 mddev->recovery_disabled != p->recovery_disabled &&
1562 (!p->replacement || p->replacement == rdev) &&
1563 enough(conf, -1)) {
1564 err = -EBUSY;
1565 goto abort;
1566 }
1567 *rdevp = NULL;
1568 synchronize_rcu();
1569 if (atomic_read(&rdev->nr_pending)) {
1570 /* lost the race, try later */
1571 err = -EBUSY;
1572 *rdevp = rdev;
1573 goto abort;
1574 } else if (p->replacement) {
1575 /* We must have just cleared 'rdev' */
1576 p->rdev = p->replacement;
1577 clear_bit(Replacement, &p->replacement->flags);
1578 smp_mb(); /* Make sure other CPUs may see both as identical
1579 * but will never see neither -- if they are careful.
1580 */
1581 p->replacement = NULL;
1582 clear_bit(WantReplacement, &rdev->flags);
1583 } else
 1584 /* We might have just removed the Replacement as faulty.
 1585 * Clear the flag just in case.
1586 */
1587 clear_bit(WantReplacement, &rdev->flags);
1588
1589 err = md_integrity_register(mddev);
1590
1422abort: 1591abort:
1423 1592
1424 print_conf(conf); 1593 print_conf(conf);
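
The promotion at the end of raid10_remove_disk() is ordering-sensitive: p->rdev is pointed at the replacement first, then the smp_mb(), then p->replacement is cleared. A lock-free reader that checks replacement before rdev (as end_sync_write does above) may therefore see the same device through both pointers, but never through neither. A userspace analogue with C11 atomics, assuming the same reader discipline; all names are illustrative:

    #include <stdatomic.h>
    #include <stddef.h>

    struct rdev_model { int id; };

    static _Atomic(struct rdev_model *) rdev_ptr, replacement_ptr;

    /* Writer side: mirrors the promotion in raid10_remove_disk(). */
    static void promote(void)
    {
        struct rdev_model *r = atomic_load(&replacement_ptr);

        atomic_store(&rdev_ptr, r);           /* publish as primary first */
        atomic_store(&replacement_ptr, NULL); /* seq_cst stores stay ordered,
                                               * like the kernel's smp_mb() */
    }

    /* Reader side: replacement first, then rdev -- never sees neither. */
    static struct rdev_model *lookup(void)
    {
        struct rdev_model *r = atomic_load(&replacement_ptr);

        return r ? r : atomic_load(&rdev_ptr);
    }
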
@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error)
1432 struct r10conf *conf = r10_bio->mddev->private; 1601 struct r10conf *conf = r10_bio->mddev->private;
1433 int d; 1602 int d;
1434 1603
1435 d = find_bio_disk(conf, r10_bio, bio, NULL); 1604 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1436 1605
1437 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1606 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1438 set_bit(R10BIO_Uptodate, &r10_bio->state); 1607 set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error)
1493 sector_t first_bad; 1662 sector_t first_bad;
1494 int bad_sectors; 1663 int bad_sectors;
1495 int slot; 1664 int slot;
1496 1665 int repl;
1497 d = find_bio_disk(conf, r10_bio, bio, &slot); 1666 struct md_rdev *rdev = NULL;
1667
1668 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1669 if (repl)
1670 rdev = conf->mirrors[d].replacement;
1671 if (!rdev) {
1672 smp_mb();
1673 rdev = conf->mirrors[d].rdev;
1674 }
1498 1675
1499 if (!uptodate) { 1676 if (!uptodate) {
1500 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); 1677 if (repl)
1501 set_bit(R10BIO_WriteError, &r10_bio->state); 1678 md_error(mddev, rdev);
1502 } else if (is_badblock(conf->mirrors[d].rdev, 1679 else {
1680 set_bit(WriteErrorSeen, &rdev->flags);
1681 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1682 set_bit(MD_RECOVERY_NEEDED,
1683 &rdev->mddev->recovery);
1684 set_bit(R10BIO_WriteError, &r10_bio->state);
1685 }
1686 } else if (is_badblock(rdev,
1503 r10_bio->devs[slot].addr, 1687 r10_bio->devs[slot].addr,
1504 r10_bio->sectors, 1688 r10_bio->sectors,
1505 &first_bad, &bad_sectors)) 1689 &first_bad, &bad_sectors))
1506 set_bit(R10BIO_MadeGood, &r10_bio->state); 1690 set_bit(R10BIO_MadeGood, &r10_bio->state);
1507 1691
1508 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1692 rdev_dec_pending(rdev, mddev);
1509 1693
1510 end_sync_request(r10_bio); 1694 end_sync_request(r10_bio);
1511} 1695}
@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1609 generic_make_request(tbio); 1793 generic_make_request(tbio);
1610 } 1794 }
1611 1795
1796 /* Now write out to any replacement devices
1797 * that are active
1798 */
1799 for (i = 0; i < conf->copies; i++) {
1800 int j, d;
1801 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1802
1803 tbio = r10_bio->devs[i].repl_bio;
1804 if (!tbio || !tbio->bi_end_io)
1805 continue;
1806 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
1807 && r10_bio->devs[i].bio != fbio)
1808 for (j = 0; j < vcnt; j++)
1809 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1810 page_address(fbio->bi_io_vec[j].bv_page),
1811 PAGE_SIZE);
1812 d = r10_bio->devs[i].devnum;
1813 atomic_inc(&r10_bio->remaining);
1814 md_sync_acct(conf->mirrors[d].replacement->bdev,
1815 tbio->bi_size >> 9);
1816 generic_make_request(tbio);
1817 }
1818
1612done: 1819done:
1613 if (atomic_dec_and_test(&r10_bio->remaining)) { 1820 if (atomic_dec_and_test(&r10_bio->remaining)) {
1614 md_done_sync(mddev, r10_bio->sectors, 1); 1821 md_done_sync(mddev, r10_bio->sectors, 1);
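
The new loop in sync_request_write() extends the repair pass to replacements: once fbio holds known-good data, any replacement bio whose slot was not already read from gets the pages copied in before submission, and 'remaining' is bumped per write just as for the primaries. The copy step in isolation, with a stand-in page size:

    #include <string.h>

    #define MODEL_PAGE_SIZE 4096

    /* vcnt pages of repaired data are duplicated into the replacement's
     * buffers before its write bio is issued. */
    static void fill_replacement_pages(void *repl_pages[],
                                       void * const good_pages[], int vcnt)
    {
        for (int j = 0; j < vcnt; j++)
            memcpy(repl_pages[j], good_pages[j], MODEL_PAGE_SIZE);
    }
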
@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
1668 s << 9, 1875 s << 9,
1669 bio->bi_io_vec[idx].bv_page, 1876 bio->bi_io_vec[idx].bv_page,
1670 WRITE, false); 1877 WRITE, false);
1671 if (!ok) 1878 if (!ok) {
1672 set_bit(WriteErrorSeen, &rdev->flags); 1879 set_bit(WriteErrorSeen, &rdev->flags);
1880 if (!test_and_set_bit(WantReplacement,
1881 &rdev->flags))
1882 set_bit(MD_RECOVERY_NEEDED,
1883 &rdev->mddev->recovery);
1884 }
1673 } 1885 }
1674 if (!ok) { 1886 if (!ok) {
1675 /* We don't worry if we cannot set a bad block - 1887 /* We don't worry if we cannot set a bad block -
@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1709{ 1921{
1710 struct r10conf *conf = mddev->private; 1922 struct r10conf *conf = mddev->private;
1711 int d; 1923 int d;
1712 struct bio *wbio; 1924 struct bio *wbio, *wbio2;
1713 1925
1714 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 1926 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1715 fix_recovery_read_error(r10_bio); 1927 fix_recovery_read_error(r10_bio);
@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1721 * share the pages with the first bio 1933 * share the pages with the first bio
1722 * and submit the write request 1934 * and submit the write request
1723 */ 1935 */
1724 wbio = r10_bio->devs[1].bio;
1725 d = r10_bio->devs[1].devnum; 1936 d = r10_bio->devs[1].devnum;
1726 1937 wbio = r10_bio->devs[1].bio;
1727 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1938 wbio2 = r10_bio->devs[1].repl_bio;
1728 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1939 if (wbio->bi_end_io) {
1729 generic_make_request(wbio); 1940 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1941 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1942 generic_make_request(wbio);
1943 }
1944 if (wbio2 && wbio2->bi_end_io) {
1945 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
1946 md_sync_acct(conf->mirrors[d].replacement->bdev,
1947 wbio2->bi_size >> 9);
1948 generic_make_request(wbio2);
1949 }
1730} 1950}
1731 1951
1732 1952
@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
1779 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 1999 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1780 /* success */ 2000 /* success */
1781 return 1; 2001 return 1;
1782 if (rw == WRITE) 2002 if (rw == WRITE) {
1783 set_bit(WriteErrorSeen, &rdev->flags); 2003 set_bit(WriteErrorSeen, &rdev->flags);
2004 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2005 set_bit(MD_RECOVERY_NEEDED,
2006 &rdev->mddev->recovery);
2007 }
1784 /* need to record an error - either for the block or the device */ 2008 /* need to record an error - either for the block or the device */
1785 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2009 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1786 md_error(rdev->mddev, rdev); 2010 md_error(rdev->mddev, rdev);
@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2060static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2284static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2061{ 2285{
2062 int slot = r10_bio->read_slot; 2286 int slot = r10_bio->read_slot;
2063 int mirror = r10_bio->devs[slot].devnum;
2064 struct bio *bio; 2287 struct bio *bio;
2065 struct r10conf *conf = mddev->private; 2288 struct r10conf *conf = mddev->private;
2066 struct md_rdev *rdev; 2289 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2067 char b[BDEVNAME_SIZE]; 2290 char b[BDEVNAME_SIZE];
2068 unsigned long do_sync; 2291 unsigned long do_sync;
2069 int max_sectors; 2292 int max_sectors;
@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2081 fix_read_error(conf, mddev, r10_bio); 2304 fix_read_error(conf, mddev, r10_bio);
2082 unfreeze_array(conf); 2305 unfreeze_array(conf);
2083 } 2306 }
2084 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); 2307 rdev_dec_pending(rdev, mddev);
2085 2308
2086 bio = r10_bio->devs[slot].bio; 2309 bio = r10_bio->devs[slot].bio;
2087 bdevname(bio->bi_bdev, b); 2310 bdevname(bio->bi_bdev, b);
2088 r10_bio->devs[slot].bio = 2311 r10_bio->devs[slot].bio =
2089 mddev->ro ? IO_BLOCKED : NULL; 2312 mddev->ro ? IO_BLOCKED : NULL;
2090read_more: 2313read_more:
2091 mirror = read_balance(conf, r10_bio, &max_sectors); 2314 rdev = read_balance(conf, r10_bio, &max_sectors);
2092 if (mirror == -1) { 2315 if (rdev == NULL) {
2093 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 2316 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2094 " read error for block %llu\n", 2317 " read error for block %llu\n",
2095 mdname(mddev), b, 2318 mdname(mddev), b,
@@ -2103,7 +2326,6 @@ read_more:
2103 if (bio) 2326 if (bio)
2104 bio_put(bio); 2327 bio_put(bio);
2105 slot = r10_bio->read_slot; 2328 slot = r10_bio->read_slot;
2106 rdev = conf->mirrors[mirror].rdev;
2107 printk_ratelimited( 2329 printk_ratelimited(
2108 KERN_ERR 2330 KERN_ERR
2109 "md/raid10:%s: %s: redirecting" 2331 "md/raid10:%s: %s: redirecting"
@@ -2117,6 +2339,7 @@ read_more:
2117 r10_bio->sector - bio->bi_sector, 2339 r10_bio->sector - bio->bi_sector,
2118 max_sectors); 2340 max_sectors);
2119 r10_bio->devs[slot].bio = bio; 2341 r10_bio->devs[slot].bio = bio;
2342 r10_bio->devs[slot].rdev = rdev;
2120 bio->bi_sector = r10_bio->devs[slot].addr 2343 bio->bi_sector = r10_bio->devs[slot].addr
2121 + rdev->data_offset; 2344 + rdev->data_offset;
2122 bio->bi_bdev = rdev->bdev; 2345 bio->bi_bdev = rdev->bdev;
@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2187 r10_bio->sectors, 0)) 2410 r10_bio->sectors, 0))
2188 md_error(conf->mddev, rdev); 2411 md_error(conf->mddev, rdev);
2189 } 2412 }
2413 rdev = conf->mirrors[dev].replacement;
2414 if (r10_bio->devs[m].repl_bio == NULL)
2415 continue;
2416 if (test_bit(BIO_UPTODATE,
2417 &r10_bio->devs[m].repl_bio->bi_flags)) {
2418 rdev_clear_badblocks(
2419 rdev,
2420 r10_bio->devs[m].addr,
2421 r10_bio->sectors);
2422 } else {
2423 if (!rdev_set_badblocks(
2424 rdev,
2425 r10_bio->devs[m].addr,
2426 r10_bio->sectors, 0))
2427 md_error(conf->mddev, rdev);
2428 }
2190 } 2429 }
2191 put_buf(r10_bio); 2430 put_buf(r10_bio);
2192 } else { 2431 } else {
@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2209 } 2448 }
2210 rdev_dec_pending(rdev, conf->mddev); 2449 rdev_dec_pending(rdev, conf->mddev);
2211 } 2450 }
2451 bio = r10_bio->devs[m].repl_bio;
2452 rdev = conf->mirrors[dev].replacement;
2453 if (rdev && bio == IO_MADE_GOOD) {
2454 rdev_clear_badblocks(
2455 rdev,
2456 r10_bio->devs[m].addr,
2457 r10_bio->sectors);
2458 rdev_dec_pending(rdev, conf->mddev);
2459 }
2212 } 2460 }
2213 if (test_bit(R10BIO_WriteError, 2461 if (test_bit(R10BIO_WriteError,
2214 &r10_bio->state)) 2462 &r10_bio->state))
@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev)
2272static int init_resync(struct r10conf *conf) 2520static int init_resync(struct r10conf *conf)
2273{ 2521{
2274 int buffs; 2522 int buffs;
2523 int i;
2275 2524
2276 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2525 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2277 BUG_ON(conf->r10buf_pool); 2526 BUG_ON(conf->r10buf_pool);
2527 conf->have_replacement = 0;
2528 for (i = 0; i < conf->raid_disks; i++)
2529 if (conf->mirrors[i].replacement)
2530 conf->have_replacement = 1;
2278 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2531 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2279 if (!conf->r10buf_pool) 2532 if (!conf->r10buf_pool)
2280 return -ENOMEM; 2533 return -ENOMEM;
@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2355 bitmap_end_sync(mddev->bitmap, sect, 2608 bitmap_end_sync(mddev->bitmap, sect,
2356 &sync_blocks, 1); 2609 &sync_blocks, 1);
2357 } 2610 }
2358 } else /* completed sync */ 2611 } else {
2612 /* completed sync */
2613 if ((!mddev->bitmap || conf->fullsync)
2614 && conf->have_replacement
2615 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2616 /* Completed a full sync so the replacements
2617 * are now fully recovered.
2618 */
2619 for (i = 0; i < conf->raid_disks; i++)
2620 if (conf->mirrors[i].replacement)
2621 conf->mirrors[i].replacement
2622 ->recovery_offset
2623 = MaxSector;
2624 }
2359 conf->fullsync = 0; 2625 conf->fullsync = 0;
2360 2626 }
2361 bitmap_close_sync(mddev->bitmap); 2627 bitmap_close_sync(mddev->bitmap);
2362 close_sync(conf); 2628 close_sync(conf);
2363 *skipped = 1; 2629 *skipped = 1;
@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2414 sector_t sect; 2680 sector_t sect;
2415 int must_sync; 2681 int must_sync;
2416 int any_working; 2682 int any_working;
2417 2683 struct mirror_info *mirror = &conf->mirrors[i];
2418 if (conf->mirrors[i].rdev == NULL || 2684
2419 test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 2685 if ((mirror->rdev == NULL ||
2686 test_bit(In_sync, &mirror->rdev->flags))
2687 &&
2688 (mirror->replacement == NULL ||
2689 test_bit(Faulty,
2690 &mirror->replacement->flags)))
2420 continue; 2691 continue;
2421 2692
2422 still_degraded = 0; 2693 still_degraded = 0;
2423 /* want to reconstruct this device */ 2694 /* want to reconstruct this device */
2424 rb2 = r10_bio; 2695 rb2 = r10_bio;
2425 sect = raid10_find_virt(conf, sector_nr, i); 2696 sect = raid10_find_virt(conf, sector_nr, i);
 2426 /* Unless we are doing a full sync, we only need 2697 /* Unless we are doing a full sync, or a replacement,
2427 * to recover the block if it is set in the bitmap 2698 * we only need to recover the block if it is set in
2699 * the bitmap
2428 */ 2700 */
2429 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2701 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2430 &sync_blocks, 1); 2702 &sync_blocks, 1);
2431 if (sync_blocks < max_sync) 2703 if (sync_blocks < max_sync)
2432 max_sync = sync_blocks; 2704 max_sync = sync_blocks;
2433 if (!must_sync && 2705 if (!must_sync &&
2706 mirror->replacement == NULL &&
2434 !conf->fullsync) { 2707 !conf->fullsync) {
2435 /* yep, skip the sync_blocks here, but don't assume 2708 /* yep, skip the sync_blocks here, but don't assume
2436 * that there will never be anything to do here 2709 * that there will never be anything to do here
@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2500 bio->bi_end_io = end_sync_read; 2773 bio->bi_end_io = end_sync_read;
2501 bio->bi_rw = READ; 2774 bio->bi_rw = READ;
2502 from_addr = r10_bio->devs[j].addr; 2775 from_addr = r10_bio->devs[j].addr;
2503 bio->bi_sector = from_addr + 2776 bio->bi_sector = from_addr + rdev->data_offset;
2504 conf->mirrors[d].rdev->data_offset; 2777 bio->bi_bdev = rdev->bdev;
2505 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2778 atomic_inc(&rdev->nr_pending);
2506 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2779 /* and we write to 'i' (if not in_sync) */
2507 atomic_inc(&r10_bio->remaining);
2508 /* and we write to 'i' */
2509 2780
2510 for (k=0; k<conf->copies; k++) 2781 for (k=0; k<conf->copies; k++)
2511 if (r10_bio->devs[k].devnum == i) 2782 if (r10_bio->devs[k].devnum == i)
2512 break; 2783 break;
2513 BUG_ON(k == conf->copies); 2784 BUG_ON(k == conf->copies);
2514 bio = r10_bio->devs[1].bio;
2515 bio->bi_next = biolist;
2516 biolist = bio;
2517 bio->bi_private = r10_bio;
2518 bio->bi_end_io = end_sync_write;
2519 bio->bi_rw = WRITE;
2520 to_addr = r10_bio->devs[k].addr; 2785 to_addr = r10_bio->devs[k].addr;
2521 bio->bi_sector = to_addr +
2522 conf->mirrors[i].rdev->data_offset;
2523 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
2524
2525 r10_bio->devs[0].devnum = d; 2786 r10_bio->devs[0].devnum = d;
2526 r10_bio->devs[0].addr = from_addr; 2787 r10_bio->devs[0].addr = from_addr;
2527 r10_bio->devs[1].devnum = i; 2788 r10_bio->devs[1].devnum = i;
2528 r10_bio->devs[1].addr = to_addr; 2789 r10_bio->devs[1].addr = to_addr;
2529 2790
2791 rdev = mirror->rdev;
2792 if (!test_bit(In_sync, &rdev->flags)) {
2793 bio = r10_bio->devs[1].bio;
2794 bio->bi_next = biolist;
2795 biolist = bio;
2796 bio->bi_private = r10_bio;
2797 bio->bi_end_io = end_sync_write;
2798 bio->bi_rw = WRITE;
2799 bio->bi_sector = to_addr
2800 + rdev->data_offset;
2801 bio->bi_bdev = rdev->bdev;
2802 atomic_inc(&r10_bio->remaining);
2803 } else
2804 r10_bio->devs[1].bio->bi_end_io = NULL;
2805
2806 /* and maybe write to replacement */
2807 bio = r10_bio->devs[1].repl_bio;
2808 if (bio)
2809 bio->bi_end_io = NULL;
2810 rdev = mirror->replacement;
2811 /* Note: if rdev != NULL, then bio
2812 * cannot be NULL as r10buf_pool_alloc will
2813 * have allocated it.
2814 * So the second test here is pointless.
2815 * But it keeps semantic-checkers happy, and
2816 * this comment keeps human reviewers
2817 * happy.
2818 */
2819 if (rdev == NULL || bio == NULL ||
2820 test_bit(Faulty, &rdev->flags))
2821 break;
2822 bio->bi_next = biolist;
2823 biolist = bio;
2824 bio->bi_private = r10_bio;
2825 bio->bi_end_io = end_sync_write;
2826 bio->bi_rw = WRITE;
2827 bio->bi_sector = to_addr + rdev->data_offset;
2828 bio->bi_bdev = rdev->bdev;
2829 atomic_inc(&r10_bio->remaining);
2530 break; 2830 break;
2531 } 2831 }
2532 if (j == conf->copies) { 2832 if (j == conf->copies) {
@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2544 for (k = 0; k < conf->copies; k++) 2844 for (k = 0; k < conf->copies; k++)
2545 if (r10_bio->devs[k].devnum == i) 2845 if (r10_bio->devs[k].devnum == i)
2546 break; 2846 break;
2547 if (!rdev_set_badblocks( 2847 if (!test_bit(In_sync,
2548 conf->mirrors[i].rdev, 2848 &mirror->rdev->flags)
2849 && !rdev_set_badblocks(
2850 mirror->rdev,
2851 r10_bio->devs[k].addr,
2852 max_sync, 0))
2853 any_working = 0;
2854 if (mirror->replacement &&
2855 !rdev_set_badblocks(
2856 mirror->replacement,
2549 r10_bio->devs[k].addr, 2857 r10_bio->devs[k].addr,
2550 max_sync, 0)) 2858 max_sync, 0))
2551 any_working = 0; 2859 any_working = 0;
@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2556 printk(KERN_INFO "md/raid10:%s: insufficient " 2864 printk(KERN_INFO "md/raid10:%s: insufficient "
2557 "working devices for recovery.\n", 2865 "working devices for recovery.\n",
2558 mdname(mddev)); 2866 mdname(mddev));
2559 conf->mirrors[i].recovery_disabled 2867 mirror->recovery_disabled
2560 = mddev->recovery_disabled; 2868 = mddev->recovery_disabled;
2561 } 2869 }
2562 break; 2870 break;
@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2605 sector_t first_bad, sector; 2913 sector_t first_bad, sector;
2606 int bad_sectors; 2914 int bad_sectors;
2607 2915
2916 if (r10_bio->devs[i].repl_bio)
2917 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
2918
2608 bio = r10_bio->devs[i].bio; 2919 bio = r10_bio->devs[i].bio;
2609 bio->bi_end_io = NULL; 2920 bio->bi_end_io = NULL;
2610 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2921 clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2635 conf->mirrors[d].rdev->data_offset; 2946 conf->mirrors[d].rdev->data_offset;
2636 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2947 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
2637 count++; 2948 count++;
2949
2950 if (conf->mirrors[d].replacement == NULL ||
2951 test_bit(Faulty,
2952 &conf->mirrors[d].replacement->flags))
2953 continue;
2954
2955 /* Need to set up for writing to the replacement */
2956 bio = r10_bio->devs[i].repl_bio;
2957 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2958
2959 sector = r10_bio->devs[i].addr;
2960 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2961 bio->bi_next = biolist;
2962 biolist = bio;
2963 bio->bi_private = r10_bio;
2964 bio->bi_end_io = end_sync_write;
2965 bio->bi_rw = WRITE;
2966 bio->bi_sector = sector +
2967 conf->mirrors[d].replacement->data_offset;
2968 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
2969 count++;
2638 } 2970 }
2639 2971
2640 if (count < 2) { 2972 if (count < 2) {
@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2643 if (r10_bio->devs[i].bio->bi_end_io) 2975 if (r10_bio->devs[i].bio->bi_end_io)
2644 rdev_dec_pending(conf->mirrors[d].rdev, 2976 rdev_dec_pending(conf->mirrors[d].rdev,
2645 mddev); 2977 mddev);
2978 if (r10_bio->devs[i].repl_bio &&
2979 r10_bio->devs[i].repl_bio->bi_end_io)
2980 rdev_dec_pending(
2981 conf->mirrors[d].replacement,
2982 mddev);
2646 } 2983 }
2647 put_buf(r10_bio); 2984 put_buf(r10_bio);
2648 biolist = NULL; 2985 biolist = NULL;
@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev)
2896 continue; 3233 continue;
2897 disk = conf->mirrors + disk_idx; 3234 disk = conf->mirrors + disk_idx;
2898 3235
3236 if (test_bit(Replacement, &rdev->flags)) {
3237 if (disk->replacement)
3238 goto out_free_conf;
3239 disk->replacement = rdev;
3240 } else {
3241 if (disk->rdev)
3242 goto out_free_conf;
3243 disk->rdev = rdev;
3244 }
3245
2899 disk->rdev = rdev; 3246 disk->rdev = rdev;
2900 disk_stack_limits(mddev->gendisk, rdev->bdev, 3247 disk_stack_limits(mddev->gendisk, rdev->bdev,
2901 rdev->data_offset << 9); 3248 rdev->data_offset << 9);
@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev)
2923 3270
2924 disk = conf->mirrors + i; 3271 disk = conf->mirrors + i;
2925 3272
3273 if (!disk->rdev && disk->replacement) {
3274 /* The replacement is all we have - use it */
3275 disk->rdev = disk->replacement;
3276 disk->replacement = NULL;
3277 clear_bit(Replacement, &disk->rdev->flags);
3278 }
3279
2926 if (!disk->rdev || 3280 if (!disk->rdev ||
2927 !test_bit(In_sync, &disk->rdev->flags)) { 3281 !test_bit(In_sync, &disk->rdev->flags)) {
2928 disk->head_position = 0; 3282 disk->head_position = 0;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 7facfdf841f4..7c615613c381 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,7 +2,7 @@
2#define _RAID10_H 2#define _RAID10_H
3 3
4struct mirror_info { 4struct mirror_info {
5 struct md_rdev *rdev; 5 struct md_rdev *rdev, *replacement;
6 sector_t head_position; 6 sector_t head_position;
7 int recovery_disabled; /* matches 7 int recovery_disabled; /* matches
8 * mddev->recovery_disabled 8 * mddev->recovery_disabled
@@ -18,12 +18,13 @@ struct r10conf {
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
21 int near_copies; /* number of copies laid out raid0 style */ 21 int near_copies; /* number of copies laid out
22 * raid0 style */
22 int far_copies; /* number of copies laid out 23 int far_copies; /* number of copies laid out
23 * at large strides across drives 24 * at large strides across drives
24 */ 25 */
25 int far_offset; /* far_copies are offset by 1 stripe 26 int far_offset; /* far_copies are offset by 1
26 * instead of many 27 * stripe instead of many
27 */ 28 */
28 int copies; /* near_copies * far_copies. 29 int copies; /* near_copies * far_copies.
29 * must be <= raid_disks 30 * must be <= raid_disks
@@ -34,10 +35,11 @@ struct r10conf {
34 * 1 stripe. 35 * 1 stripe.
35 */ 36 */
36 37
37 sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ 38 sector_t dev_sectors; /* temp copy of
39 * mddev->dev_sectors */
38 40
39 int chunk_shift; /* shift from chunks to sectors */ 41 int chunk_shift; /* shift from chunks to sectors */
40 sector_t chunk_mask; 42 sector_t chunk_mask;
41 43
42 struct list_head retry_list; 44 struct list_head retry_list;
43 /* queue pending writes and submit them on unplug */ 45 /* queue pending writes and submit them on unplug */
@@ -45,20 +47,22 @@ struct r10conf {
45 int pending_count; 47 int pending_count;
46 48
47 spinlock_t resync_lock; 49 spinlock_t resync_lock;
48 int nr_pending; 50 int nr_pending;
49 int nr_waiting; 51 int nr_waiting;
50 int nr_queued; 52 int nr_queued;
51 int barrier; 53 int barrier;
52 sector_t next_resync; 54 sector_t next_resync;
53 int fullsync; /* set to 1 if a full sync is needed, 55 int fullsync; /* set to 1 if a full sync is needed,
54 * (fresh device added). 56 * (fresh device added).
55 * Cleared when a sync completes. 57 * Cleared when a sync completes.
56 */ 58 */
57 59 int have_replacement; /* There is at least one
60 * replacement device.
61 */
58 wait_queue_head_t wait_barrier; 62 wait_queue_head_t wait_barrier;
59 63
60 mempool_t *r10bio_pool; 64 mempool_t *r10bio_pool;
61 mempool_t *r10buf_pool; 65 mempool_t *r10buf_pool;
62 struct page *tmppage; 66 struct page *tmppage;
63 67
64 /* When taking over an array from a different personality, we store 68 /* When taking over an array from a different personality, we store
@@ -98,11 +102,18 @@ struct r10bio {
98 * When resyncing we also use one for each copy. 102 * When resyncing we also use one for each copy.
99 * When reconstructing, we use 2 bios, one for read, one for write. 103 * When reconstructing, we use 2 bios, one for read, one for write.
100 * We choose the number when they are allocated. 104 * We choose the number when they are allocated.
105 * We sometimes need an extra bio to write to the replacement.
101 */ 106 */
102 struct { 107 struct {
103 struct bio *bio; 108 struct bio *bio;
104 sector_t addr; 109 union {
105 int devnum; 110 struct bio *repl_bio; /* used for resync and
111 * writes */
112 struct md_rdev *rdev; /* used for reads
113 * (read_slot >= 0) */
114 };
115 sector_t addr;
116 int devnum;
106 } devs[0]; 117 } devs[0];
107}; 118};
108 119
@@ -121,17 +132,19 @@ struct r10bio {
121#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 132#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
122 133
123/* bits for r10bio.state */ 134/* bits for r10bio.state */
124#define R10BIO_Uptodate 0 135enum r10bio_state {
125#define R10BIO_IsSync 1 136 R10BIO_Uptodate,
126#define R10BIO_IsRecover 2 137 R10BIO_IsSync,
127#define R10BIO_Degraded 3 138 R10BIO_IsRecover,
139 R10BIO_Degraded,
128/* Set ReadError on bios that experience a read error 140/* Set ReadError on bios that experience a read error
129 * so that raid10d knows what to do with them. 141 * so that raid10d knows what to do with them.
130 */ 142 */
131#define R10BIO_ReadError 4 143 R10BIO_ReadError,
132/* If a write for this request means we can clear some 144/* If a write for this request means we can clear some
133 * known-bad-block records, we set this flag. 145 * known-bad-block records, we set this flag.
134 */ 146 */
135#define R10BIO_MadeGood 5 147 R10BIO_MadeGood,
136#define R10BIO_WriteError 6 148 R10BIO_WriteError,
149};
137#endif 150#endif
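
The union in r10bio.devs[] is safe because its two members are never live at the same time: 'rdev' is only meaningful on the read path (read_slot >= 0), while 'repl_bio' is only used for writes and resync, and make_request() sets read_slot = -1 before entering the write path precisely so that the freeing code interprets the union correctly. In miniature, with model names standing in for the kernel types:

    struct bio_model;
    struct rdev_model;

    /* Sketch of the per-copy bookkeeping from struct r10bio. */
    struct r10dev_model {
        struct bio_model *bio;
        union {
            struct bio_model *repl_bio;  /* writes and resync */
            struct rdev_model *rdev;     /* reads (read_slot >= 0) */
        };
        unsigned long long addr;
        int devnum;
    };
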
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 858fdbb7eb07..360f2b98f62b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
370 * of the two sections, and some non-in_sync devices may 370 * of the two sections, and some non-in_sync devices may
371 * be insync in the section most affected by failed devices. 371 * be insync in the section most affected by failed devices.
372 */ 372 */
373static int has_failed(struct r5conf *conf) 373static int calc_degraded(struct r5conf *conf)
374{ 374{
375 int degraded; 375 int degraded, degraded2;
376 int i; 376 int i;
377 if (conf->mddev->reshape_position == MaxSector)
378 return conf->mddev->degraded > conf->max_degraded;
379 377
380 rcu_read_lock(); 378 rcu_read_lock();
381 degraded = 0; 379 degraded = 0;
@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf)
399 degraded++; 397 degraded++;
400 } 398 }
401 rcu_read_unlock(); 399 rcu_read_unlock();
402 if (degraded > conf->max_degraded) 400 if (conf->raid_disks == conf->previous_raid_disks)
403 return 1; 401 return degraded;
404 rcu_read_lock(); 402 rcu_read_lock();
405 degraded = 0; 403 degraded2 = 0;
406 for (i = 0; i < conf->raid_disks; i++) { 404 for (i = 0; i < conf->raid_disks; i++) {
407 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 405 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
408 if (!rdev || test_bit(Faulty, &rdev->flags)) 406 if (!rdev || test_bit(Faulty, &rdev->flags))
409 degraded++; 407 degraded2++;
410 else if (test_bit(In_sync, &rdev->flags)) 408 else if (test_bit(In_sync, &rdev->flags))
411 ; 409 ;
412 else 410 else
@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf)
416 * almost certainly hasn't. 414 * almost certainly hasn't.
417 */ 415 */
418 if (conf->raid_disks <= conf->previous_raid_disks) 416 if (conf->raid_disks <= conf->previous_raid_disks)
419 degraded++; 417 degraded2++;
420 } 418 }
421 rcu_read_unlock(); 419 rcu_read_unlock();
420 if (degraded2 > degraded)
421 return degraded2;
422 return degraded;
423}
424
425static int has_failed(struct r5conf *conf)
426{
427 int degraded;
428
429 if (conf->mddev->reshape_position == MaxSector)
430 return conf->mddev->degraded > conf->max_degraded;
431
432 degraded = calc_degraded(conf);
422 if (degraded > conf->max_degraded) 433 if (degraded > conf->max_degraded)
423 return 1; 434 return 1;
424 return 0; 435 return 0;
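
calc_degraded() factors the counting out of has_failed() so that error() can reuse it: it counts unusable members over the pre-reshape geometry and, while a reshape is in progress, over the new geometry as well, returning the larger of the two. A userspace model over plain arrays, where 'usable' folds the rdev/Faulty/In_sync tests into a single bit:

    static int calc_degraded_model(const int usable[],
                                   int previous_disks, int disks)
    {
        int d1 = 0, d2 = 0, i;

        for (i = 0; i < previous_disks; i++)
            if (!usable[i])
                d1++;
        if (disks == previous_disks)
            return d1;                  /* no reshape in progress */
        for (i = 0; i < disks; i++)
            if (!usable[i])
                d2++;
        return d2 > d1 ? d2 : d1;
    }
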
@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
492 503
493 for (i = disks; i--; ) { 504 for (i = disks; i--; ) {
494 int rw; 505 int rw;
495 struct bio *bi; 506 int replace_only = 0;
496 struct md_rdev *rdev; 507 struct bio *bi, *rbi;
508 struct md_rdev *rdev, *rrdev = NULL;
497 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
498 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
499 rw = WRITE_FUA; 511 rw = WRITE_FUA;
@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
501 rw = WRITE; 513 rw = WRITE;
502 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
503 rw = READ; 515 rw = READ;
504 else 516 else if (test_and_clear_bit(R5_WantReplace,
517 &sh->dev[i].flags)) {
518 rw = WRITE;
519 replace_only = 1;
520 } else
505 continue; 521 continue;
506 522
507 bi = &sh->dev[i].req; 523 bi = &sh->dev[i].req;
524 rbi = &sh->dev[i].rreq; /* For writing to replacement */
508 525
509 bi->bi_rw = rw; 526 bi->bi_rw = rw;
510 if (rw & WRITE) 527 rbi->bi_rw = rw;
528 if (rw & WRITE) {
511 bi->bi_end_io = raid5_end_write_request; 529 bi->bi_end_io = raid5_end_write_request;
512 else 530 rbi->bi_end_io = raid5_end_write_request;
531 } else
513 bi->bi_end_io = raid5_end_read_request; 532 bi->bi_end_io = raid5_end_read_request;
514 533
515 rcu_read_lock(); 534 rcu_read_lock();
535 rrdev = rcu_dereference(conf->disks[i].replacement);
536 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
516 rdev = rcu_dereference(conf->disks[i].rdev); 537 rdev = rcu_dereference(conf->disks[i].rdev);
538 if (!rdev) {
539 rdev = rrdev;
540 rrdev = NULL;
541 }
542 if (rw & WRITE) {
543 if (replace_only)
544 rdev = NULL;
545 if (rdev == rrdev)
546 /* We raced and saw duplicates */
547 rrdev = NULL;
548 } else {
549 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
550 rdev = rrdev;
551 rrdev = NULL;
552 }
553
517 if (rdev && test_bit(Faulty, &rdev->flags)) 554 if (rdev && test_bit(Faulty, &rdev->flags))
518 rdev = NULL; 555 rdev = NULL;
519 if (rdev) 556 if (rdev)
520 atomic_inc(&rdev->nr_pending); 557 atomic_inc(&rdev->nr_pending);
558 if (rrdev && test_bit(Faulty, &rrdev->flags))
559 rrdev = NULL;
560 if (rrdev)
561 atomic_inc(&rrdev->nr_pending);
521 rcu_read_unlock(); 562 rcu_read_unlock();
522 563
523 /* We have already checked bad blocks for reads. Now 564 /* We have already checked bad blocks for reads. Now
524 * need to check for writes. 565 * need to check for writes. We never accept write errors
 566 * on the replacement, so we don't need to check rrdev.
525 */ 567 */
526 while ((rw & WRITE) && rdev && 568 while ((rw & WRITE) && rdev &&
527 test_bit(WriteErrorSeen, &rdev->flags)) { 569 test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
551 } 593 }
552 594
553 if (rdev) { 595 if (rdev) {
554 if (s->syncing || s->expanding || s->expanded) 596 if (s->syncing || s->expanding || s->expanded
597 || s->replacing)
555 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 598 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
556 599
557 set_bit(STRIPE_IO_STARTED, &sh->state); 600 set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
563 atomic_inc(&sh->count); 606 atomic_inc(&sh->count);
564 bi->bi_sector = sh->sector + rdev->data_offset; 607 bi->bi_sector = sh->sector + rdev->data_offset;
565 bi->bi_flags = 1 << BIO_UPTODATE; 608 bi->bi_flags = 1 << BIO_UPTODATE;
566 bi->bi_vcnt = 1;
567 bi->bi_max_vecs = 1;
568 bi->bi_idx = 0; 609 bi->bi_idx = 0;
569 bi->bi_io_vec = &sh->dev[i].vec;
570 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 610 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
571 bi->bi_io_vec[0].bv_offset = 0; 611 bi->bi_io_vec[0].bv_offset = 0;
572 bi->bi_size = STRIPE_SIZE; 612 bi->bi_size = STRIPE_SIZE;
573 bi->bi_next = NULL; 613 bi->bi_next = NULL;
614 if (rrdev)
615 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
574 generic_make_request(bi); 616 generic_make_request(bi);
575 } else { 617 }
618 if (rrdev) {
619 if (s->syncing || s->expanding || s->expanded
620 || s->replacing)
621 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
622
623 set_bit(STRIPE_IO_STARTED, &sh->state);
624
625 rbi->bi_bdev = rrdev->bdev;
626 pr_debug("%s: for %llu schedule op %ld on "
627 "replacement disc %d\n",
628 __func__, (unsigned long long)sh->sector,
629 rbi->bi_rw, i);
630 atomic_inc(&sh->count);
631 rbi->bi_sector = sh->sector + rrdev->data_offset;
632 rbi->bi_flags = 1 << BIO_UPTODATE;
633 rbi->bi_idx = 0;
634 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
635 rbi->bi_io_vec[0].bv_offset = 0;
636 rbi->bi_size = STRIPE_SIZE;
637 rbi->bi_next = NULL;
638 generic_make_request(rbi);
639 }
640 if (!rdev && !rrdev) {
576 if (rw & WRITE) 641 if (rw & WRITE)
577 set_bit(STRIPE_DEGRADED, &sh->state); 642 set_bit(STRIPE_DEGRADED, &sh->state);
578 pr_debug("skip op %ld on disc %d for sector %llu\n", 643 pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1583 int disks = sh->disks, i; 1648 int disks = sh->disks, i;
1584 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1649 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1585 char b[BDEVNAME_SIZE]; 1650 char b[BDEVNAME_SIZE];
1586 struct md_rdev *rdev; 1651 struct md_rdev *rdev = NULL;
1587 1652
1588 1653
1589 for (i=0 ; i<disks; i++) 1654 for (i=0 ; i<disks; i++)
@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error)
1597 BUG(); 1662 BUG();
1598 return; 1663 return;
1599 } 1664 }
1665 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1666 /* If replacement finished while this request was outstanding,
1667 * 'replacement' might be NULL already.
1668 * In that case it moved down to 'rdev'.
1669 * rdev is not removed until all requests are finished.
1670 */
1671 rdev = conf->disks[i].replacement;
1672 if (!rdev)
1673 rdev = conf->disks[i].rdev;
1600 1674
1601 if (uptodate) { 1675 if (uptodate) {
1602 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1676 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1603 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1677 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1604 rdev = conf->disks[i].rdev; 1678 /* Note that this cannot happen on a
1679 * replacement device. We just fail those on
1680 * any error
1681 */
1605 printk_ratelimited( 1682 printk_ratelimited(
1606 KERN_INFO 1683 KERN_INFO
1607 "md/raid:%s: read error corrected" 1684 "md/raid:%s: read error corrected"
@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
1614 clear_bit(R5_ReadError, &sh->dev[i].flags); 1691 clear_bit(R5_ReadError, &sh->dev[i].flags);
1615 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1692 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1616 } 1693 }
1617 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1694 if (atomic_read(&rdev->read_errors))
1618 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1695 atomic_set(&rdev->read_errors, 0);
1619 } else { 1696 } else {
1620 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1697 const char *bdn = bdevname(rdev->bdev, b);
1621 int retry = 0; 1698 int retry = 0;
1622 rdev = conf->disks[i].rdev;
1623 1699
1624 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1700 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1625 atomic_inc(&rdev->read_errors); 1701 atomic_inc(&rdev->read_errors);
1626 if (conf->mddev->degraded >= conf->max_degraded) 1702 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1703 printk_ratelimited(
1704 KERN_WARNING
1705 "md/raid:%s: read error on replacement device "
1706 "(sector %llu on %s).\n",
1707 mdname(conf->mddev),
1708 (unsigned long long)(sh->sector
1709 + rdev->data_offset),
1710 bdn);
1711 else if (conf->mddev->degraded >= conf->max_degraded)
1627 printk_ratelimited( 1712 printk_ratelimited(
1628 KERN_WARNING 1713 KERN_WARNING
1629 "md/raid:%s: read error not correctable " 1714 "md/raid:%s: read error not correctable "
@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1657 md_error(conf->mddev, rdev); 1742 md_error(conf->mddev, rdev);
1658 } 1743 }
1659 } 1744 }
1660 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1745 rdev_dec_pending(rdev, conf->mddev);
1661 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1746 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1662 set_bit(STRIPE_HANDLE, &sh->state); 1747 set_bit(STRIPE_HANDLE, &sh->state);
1663 release_stripe(sh); 1748 release_stripe(sh);
@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error)
1668 struct stripe_head *sh = bi->bi_private; 1753 struct stripe_head *sh = bi->bi_private;
1669 struct r5conf *conf = sh->raid_conf; 1754 struct r5conf *conf = sh->raid_conf;
1670 int disks = sh->disks, i; 1755 int disks = sh->disks, i;
1756 struct md_rdev *uninitialized_var(rdev);
1671 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1757 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1672 sector_t first_bad; 1758 sector_t first_bad;
1673 int bad_sectors; 1759 int bad_sectors;
1760 int replacement = 0;
1674 1761
1675 for (i=0 ; i<disks; i++) 1762 for (i = 0 ; i < disks; i++) {
1676 if (bi == &sh->dev[i].req) 1763 if (bi == &sh->dev[i].req) {
1764 rdev = conf->disks[i].rdev;
1677 break; 1765 break;
1678 1766 }
1767 if (bi == &sh->dev[i].rreq) {
1768 rdev = conf->disks[i].replacement;
1769 if (rdev)
1770 replacement = 1;
1771 else
1772 /* rdev was removed and 'replacement'
1773 * replaced it. rdev is not removed
1774 * until all requests are finished.
1775 */
1776 rdev = conf->disks[i].rdev;
1777 break;
1778 }
1779 }
1679 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1780 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1680 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1781 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1681 uptodate); 1782 uptodate);
@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error)
1684 return; 1785 return;
1685 } 1786 }
1686 1787
1687 if (!uptodate) { 1788 if (replacement) {
1688 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); 1789 if (!uptodate)
1689 set_bit(R5_WriteError, &sh->dev[i].flags); 1790 md_error(conf->mddev, rdev);
1690 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, 1791 else if (is_badblock(rdev, sh->sector,
1691 &first_bad, &bad_sectors)) 1792 STRIPE_SECTORS,
1692 set_bit(R5_MadeGood, &sh->dev[i].flags); 1793 &first_bad, &bad_sectors))
1794 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1795 } else {
1796 if (!uptodate) {
1797 set_bit(WriteErrorSeen, &rdev->flags);
1798 set_bit(R5_WriteError, &sh->dev[i].flags);
1799 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1800 set_bit(MD_RECOVERY_NEEDED,
1801 &rdev->mddev->recovery);
1802 } else if (is_badblock(rdev, sh->sector,
1803 STRIPE_SECTORS,
1804 &first_bad, &bad_sectors))
1805 set_bit(R5_MadeGood, &sh->dev[i].flags);
1806 }
1807 rdev_dec_pending(rdev, conf->mddev);
1693 1808
1694 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1809 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
1695 1810 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1696 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1697 set_bit(STRIPE_HANDLE, &sh->state); 1811 set_bit(STRIPE_HANDLE, &sh->state);
1698 release_stripe(sh); 1812 release_stripe(sh);
1699} 1813}
1700 1814
1701
1702static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1815static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1703 1816
1704static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1817static void raid5_build_block(struct stripe_head *sh, int i, int previous)
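
When ops_run_io() issues both a primary and a replacement write for one stripe member it sets R5_DOUBLE_LOCKED; raid5_end_write_request() then lets the first completion consume that bit and only the second clear R5_LOCKED, so the stripe is not handled until both copies are on stable storage. The handshake in isolation, with model flag names:

    /* First completion clears DOUBLE_LOCKED; the second clears LOCKED
     * and releases the stripe member for handling. */
    enum { LOCKED_MODEL = 1u << 0, DOUBLE_LOCKED_MODEL = 1u << 1 };

    static int write_done(unsigned *flags)
    {
        if (*flags & DOUBLE_LOCKED_MODEL) {
            *flags &= ~DOUBLE_LOCKED_MODEL;  /* one write still in flight */
            return 0;
        }
        *flags &= ~LOCKED_MODEL;             /* both writes finished */
        return 1;
    }
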
@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1709 dev->req.bi_io_vec = &dev->vec; 1822 dev->req.bi_io_vec = &dev->vec;
1710 dev->req.bi_vcnt++; 1823 dev->req.bi_vcnt++;
1711 dev->req.bi_max_vecs++; 1824 dev->req.bi_max_vecs++;
1825 dev->req.bi_private = sh;
1712 dev->vec.bv_page = dev->page; 1826 dev->vec.bv_page = dev->page;
1713 dev->vec.bv_len = STRIPE_SIZE;
1714 dev->vec.bv_offset = 0;
1715 1827
1716 dev->req.bi_sector = sh->sector; 1828 bio_init(&dev->rreq);
1717 dev->req.bi_private = sh; 1829 dev->rreq.bi_io_vec = &dev->rvec;
1830 dev->rreq.bi_vcnt++;
1831 dev->rreq.bi_max_vecs++;
1832 dev->rreq.bi_private = sh;
1833 dev->rvec.bv_page = dev->page;
1718 1834
1719 dev->flags = 0; 1835 dev->flags = 0;
1720 dev->sector = compute_blocknr(sh, i, previous); 1836 dev->sector = compute_blocknr(sh, i, previous);
@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1724{ 1840{
1725 char b[BDEVNAME_SIZE]; 1841 char b[BDEVNAME_SIZE];
1726 struct r5conf *conf = mddev->private; 1842 struct r5conf *conf = mddev->private;
1843 unsigned long flags;
1727 pr_debug("raid456: error called\n"); 1844 pr_debug("raid456: error called\n");
1728 1845
1729 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1846 spin_lock_irqsave(&conf->device_lock, flags);
1730 unsigned long flags; 1847 clear_bit(In_sync, &rdev->flags);
1731 spin_lock_irqsave(&conf->device_lock, flags); 1848 mddev->degraded = calc_degraded(conf);
1732 mddev->degraded++; 1849 spin_unlock_irqrestore(&conf->device_lock, flags);
1733 spin_unlock_irqrestore(&conf->device_lock, flags); 1850 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1734 /* 1851
1735 * if recovery was running, make sure it aborts.
1736 */
1737 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1738 }
1739 set_bit(Blocked, &rdev->flags); 1852 set_bit(Blocked, &rdev->flags);
1740 set_bit(Faulty, &rdev->flags); 1853 set_bit(Faulty, &rdev->flags);
1741 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1854 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2362 md_done_sync(conf->mddev, STRIPE_SECTORS, 0); 2475 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2363 clear_bit(STRIPE_SYNCING, &sh->state); 2476 clear_bit(STRIPE_SYNCING, &sh->state);
2364 s->syncing = 0; 2477 s->syncing = 0;
2478 s->replacing = 0;
2365 /* There is nothing more to do for sync/check/repair. 2479 /* There is nothing more to do for sync/check/repair.
2366 * For recover we need to record a bad block on all 2480 * For recover/replace we need to record a bad block on all
2367 * non-sync devices, or abort the recovery 2481 * non-sync devices, or abort the recovery
2368 */ 2482 */
2369 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2483 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
@@ -2373,12 +2487,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2373 */ 2487 */
2374 for (i = 0; i < conf->raid_disks; i++) { 2488 for (i = 0; i < conf->raid_disks; i++) {
2375 struct md_rdev *rdev = conf->disks[i].rdev; 2489 struct md_rdev *rdev = conf->disks[i].rdev;
2376 if (!rdev 2490 if (rdev
2377 || test_bit(Faulty, &rdev->flags) 2491 && !test_bit(Faulty, &rdev->flags)
2378 || test_bit(In_sync, &rdev->flags)) 2492 && !test_bit(In_sync, &rdev->flags)
2379 continue; 2493 && !rdev_set_badblocks(rdev, sh->sector,
2380 if (!rdev_set_badblocks(rdev, sh->sector, 2494 STRIPE_SECTORS, 0))
2381 STRIPE_SECTORS, 0)) 2495 abort = 1;
2496 rdev = conf->disks[i].replacement;
2497 if (rdev
2498 && !test_bit(Faulty, &rdev->flags)
2499 && !test_bit(In_sync, &rdev->flags)
2500 && !rdev_set_badblocks(rdev, sh->sector,
2501 STRIPE_SECTORS, 0))
2382 abort = 1; 2502 abort = 1;
2383 } 2503 }
2384 if (abort) { 2504 if (abort) {
@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2387 } 2507 }
2388} 2508}
2389 2509
2510static int want_replace(struct stripe_head *sh, int disk_idx)
2511{
2512 struct md_rdev *rdev;
2513 int rv = 0;
2514 /* Doing recovery so rcu locking not required */
2515 rdev = sh->raid_conf->disks[disk_idx].replacement;
2516 if (rdev
2517 && !test_bit(Faulty, &rdev->flags)
2518 && !test_bit(In_sync, &rdev->flags)
2519 && (rdev->recovery_offset <= sh->sector
2520 || rdev->mddev->recovery_cp <= sh->sector))
2521 rv = 1;
2522
2523 return rv;
2524}
2525
2390/* fetch_block - checks the given member device to see if its data needs 2526/* fetch_block - checks the given member device to see if its data needs
2391 * to be read or computed to satisfy a request. 2527 * to be read or computed to satisfy a request.
2392 * 2528 *
@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2406 (dev->toread || 2542 (dev->toread ||
2407 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2543 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2408 s->syncing || s->expanding || 2544 s->syncing || s->expanding ||
2545 (s->replacing && want_replace(sh, disk_idx)) ||
2409 (s->failed >= 1 && fdev[0]->toread) || 2546 (s->failed >= 1 && fdev[0]->toread) ||
2410 (s->failed >= 2 && fdev[1]->toread) || 2547 (s->failed >= 2 && fdev[1]->toread) ||
2411 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2548 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
2959 } 3096 }
2960} 3097}
2961 3098
2962
2963/* 3099/*
2964 * handle_stripe - do things to a stripe. 3100 * handle_stripe - do things to a stripe.
2965 * 3101 *
2966 * We lock the stripe and then examine the state of various bits 3102 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
2967 * to see what needs to be done. 3103 * state of various bits to see what needs to be done.
2968 * Possible results: 3104 * Possible results:
2969 * return some read request which now have data 3105 * return some read requests which now have data
2970 * return some write requests which are safely on disc 3106 * return some write requests which are safely on storage
2971 * schedule a read on some buffers 3107 * schedule a read on some buffers
2972 * schedule a write of some buffers 3108 * schedule a write of some buffers
2973 * return confirmation of parity correctness 3109 * return confirmation of parity correctness
2974 * 3110 *
2975 * buffers are taken off read_list or write_list, and bh_cache buffers
2976 * get BH_Lock set before the stripe lock is released.
2977 *
2978 */ 3111 */
2979 3112
2980static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3113static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
2983 int disks = sh->disks; 3116 int disks = sh->disks;
2984 struct r5dev *dev; 3117 struct r5dev *dev;
2985 int i; 3118 int i;
3119 int do_recovery = 0;
2986 3120
2987 memset(s, 0, sizeof(*s)); 3121 memset(s, 0, sizeof(*s));
2988 3122
2989 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
2990 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3123 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2991 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3124 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2992 s->failed_num[0] = -1; 3125 s->failed_num[0] = -1;
@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3004 dev = &sh->dev[i]; 3137 dev = &sh->dev[i];
3005 3138
3006 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3139 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3007 i, dev->flags, dev->toread, dev->towrite, dev->written); 3140 i, dev->flags,
3141 dev->toread, dev->towrite, dev->written);
3008 /* maybe we can reply to a read 3142 /* maybe we can reply to a read
3009 * 3143 *
3010 * new wantfill requests are only permitted while 3144 * new wantfill requests are only permitted while
@@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3035 } 3169 }
3036 if (dev->written) 3170 if (dev->written)
3037 s->written++; 3171 s->written++;
3038 rdev = rcu_dereference(conf->disks[i].rdev); 3172 /* Prefer to use the replacement for reads, but only
3173 * if it is recovered enough and has no bad blocks.
3174 */
3175 rdev = rcu_dereference(conf->disks[i].replacement);
3176 if (rdev && !test_bit(Faulty, &rdev->flags) &&
3177 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3178 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3179 &first_bad, &bad_sectors))
3180 set_bit(R5_ReadRepl, &dev->flags);
3181 else {
3182 if (rdev)
3183 set_bit(R5_NeedReplace, &dev->flags);
3184 rdev = rcu_dereference(conf->disks[i].rdev);
3185 clear_bit(R5_ReadRepl, &dev->flags);
3186 }
3039 if (rdev && test_bit(Faulty, &rdev->flags)) 3187 if (rdev && test_bit(Faulty, &rdev->flags))
3040 rdev = NULL; 3188 rdev = NULL;
3041 if (rdev) { 3189 if (rdev) {
@@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3077 set_bit(R5_Insync, &dev->flags); 3225 set_bit(R5_Insync, &dev->flags);
3078 3226
3079 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3227 if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3080 clear_bit(R5_Insync, &dev->flags); 3228 /* This flag does not apply to '.replacement',
3081 if (!test_bit(Faulty, &rdev->flags)) { 3229 * only to '.rdev', so make sure to check that. */
3230 struct md_rdev *rdev2 = rcu_dereference(
3231 conf->disks[i].rdev);
3232 if (rdev2 == rdev)
3233 clear_bit(R5_Insync, &dev->flags);
3234 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3082 s->handle_bad_blocks = 1; 3235 s->handle_bad_blocks = 1;
3083 atomic_inc(&rdev->nr_pending); 3236 atomic_inc(&rdev2->nr_pending);
3084 } else 3237 } else
3085 clear_bit(R5_WriteError, &dev->flags); 3238 clear_bit(R5_WriteError, &dev->flags);
3086 } 3239 }
3087 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3240 if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3088 if (!test_bit(Faulty, &rdev->flags)) { 3241 /* This flag does not apply to '.replacement',
3242 * only to '.rdev', so make sure to check that. */
3243 struct md_rdev *rdev2 = rcu_dereference(
3244 conf->disks[i].rdev);
3245 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3089 s->handle_bad_blocks = 1; 3246 s->handle_bad_blocks = 1;
3090 atomic_inc(&rdev->nr_pending); 3247 atomic_inc(&rdev2->nr_pending);
3091 } else 3248 } else
3092 clear_bit(R5_MadeGood, &dev->flags); 3249 clear_bit(R5_MadeGood, &dev->flags);
3093 } 3250 }
3251 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3252 struct md_rdev *rdev2 = rcu_dereference(
3253 conf->disks[i].replacement);
3254 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3255 s->handle_bad_blocks = 1;
3256 atomic_inc(&rdev2->nr_pending);
3257 } else
3258 clear_bit(R5_MadeGoodRepl, &dev->flags);
3259 }
3094 if (!test_bit(R5_Insync, &dev->flags)) { 3260 if (!test_bit(R5_Insync, &dev->flags)) {
3095 /* The ReadError flag will just be confusing now */ 3261 /* The ReadError flag will just be confusing now */
3096 clear_bit(R5_ReadError, &dev->flags); 3262 clear_bit(R5_ReadError, &dev->flags);
@@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3102 if (s->failed < 2) 3268 if (s->failed < 2)
3103 s->failed_num[s->failed] = i; 3269 s->failed_num[s->failed] = i;
3104 s->failed++; 3270 s->failed++;
3271 if (rdev && !test_bit(Faulty, &rdev->flags))
3272 do_recovery = 1;
3105 } 3273 }
3106 } 3274 }
3107 spin_unlock_irq(&conf->device_lock); 3275 spin_unlock_irq(&conf->device_lock);
3276 if (test_bit(STRIPE_SYNCING, &sh->state)) {
3277 /* If there is a failed device being replaced,
3278 * we must be recovering;
3279 * else if we are after recovery_cp, we must be syncing;
3280 * else we can only be replacing.
3281 * Sync and recovery both need to read all devices, and so
3282 * use the same flag.
3283 */
3284 if (do_recovery ||
3285 sh->sector >= conf->mddev->recovery_cp)
3286 s->syncing = 1;
3287 else
3288 s->replacing = 1;
3289 }
3108 rcu_read_unlock(); 3290 rcu_read_unlock();
3109} 3291}
3110 3292
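Restated outside the stripe bookkeeping, the three-way decision at the end of analyse_stripe() collapses to a small predicate. A sketch with invented names, not taken from the kernel:

enum stripe_mode { MODE_SYNCING, MODE_REPLACING };

/* do_recovery: some failed slot still has a working rdev.
 * sector >= recovery_cp: the stripe lies beyond the resync checkpoint. */
static enum stripe_mode classify_syncing_stripe(int do_recovery,
						unsigned long long sector,
						unsigned long long recovery_cp)
{
	if (do_recovery || sector >= recovery_cp)
		return MODE_SYNCING;   /* read every device */
	return MODE_REPLACING;         /* only write replacement targets */
}
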
@@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh)
3146 3328
3147 if (unlikely(s.blocked_rdev)) { 3329 if (unlikely(s.blocked_rdev)) {
3148 if (s.syncing || s.expanding || s.expanded || 3330 if (s.syncing || s.expanding || s.expanded ||
3149 s.to_write || s.written) { 3331 s.replacing || s.to_write || s.written) {
3150 set_bit(STRIPE_HANDLE, &sh->state); 3332 set_bit(STRIPE_HANDLE, &sh->state);
3151 goto finish; 3333 goto finish;
3152 } 3334 }
@@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh)
3172 sh->reconstruct_state = 0; 3354 sh->reconstruct_state = 0;
3173 if (s.to_read+s.to_write+s.written) 3355 if (s.to_read+s.to_write+s.written)
3174 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3356 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3175 if (s.syncing) 3357 if (s.syncing + s.replacing)
3176 handle_failed_sync(conf, sh, &s); 3358 handle_failed_sync(conf, sh, &s);
3177 } 3359 }
3178 3360
@@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh)
3203 */ 3385 */
3204 if (s.to_read || s.non_overwrite 3386 if (s.to_read || s.non_overwrite
3205 || (conf->level == 6 && s.to_write && s.failed) 3387 || (conf->level == 6 && s.to_write && s.failed)
3206 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3388 || (s.syncing && (s.uptodate + s.compute < disks))
3389 || s.replacing
3390 || s.expanding)
3207 handle_stripe_fill(sh, &s, disks); 3391 handle_stripe_fill(sh, &s, disks);
3208 3392
3209 /* Now we check to see if any write operations have recently 3393 /* Now we check to see if any write operations have recently
@@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh)
3265 handle_parity_checks5(conf, sh, &s, disks); 3449 handle_parity_checks5(conf, sh, &s, disks);
3266 } 3450 }
3267 3451
3268 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3452 if (s.replacing && s.locked == 0
3453 && !test_bit(STRIPE_INSYNC, &sh->state)) {
3454 /* Write out to replacement devices where possible */
3455 for (i = 0; i < conf->raid_disks; i++)
3456 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3457 test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3458 set_bit(R5_WantReplace, &sh->dev[i].flags);
3459 set_bit(R5_LOCKED, &sh->dev[i].flags);
3460 s.locked++;
3461 }
3462 set_bit(STRIPE_INSYNC, &sh->state);
3463 }
3464 if ((s.syncing || s.replacing) && s.locked == 0 &&
3465 test_bit(STRIPE_INSYNC, &sh->state)) {
3269 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3466 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3270 clear_bit(STRIPE_SYNCING, &sh->state); 3467 clear_bit(STRIPE_SYNCING, &sh->state);
3271 } 3468 }
@@ -3363,6 +3560,15 @@ finish:
3363 STRIPE_SECTORS); 3560 STRIPE_SECTORS);
3364 rdev_dec_pending(rdev, conf->mddev); 3561 rdev_dec_pending(rdev, conf->mddev);
3365 } 3562 }
3563 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3564 rdev = conf->disks[i].replacement;
3565 if (!rdev)
3566 /* rdev has been moved down */
3567 rdev = conf->disks[i].rdev;
3568 rdev_clear_badblocks(rdev, sh->sector,
3569 STRIPE_SECTORS);
3570 rdev_dec_pending(rdev, conf->mddev);
3571 }
3366 } 3572 }
3367 3573
3368 if (s.ops_request) 3574 if (s.ops_request)
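analyse_stripe() runs under rcu_read_lock() and cannot block, so it only pins a device (nr_pending) and sets handle_bad_blocks; the finish: block above performs the actual rdev_clear_badblocks() calls later, outside the lock. The shape of that defer-and-release pattern, modelled here with C11 atomics (illustrative only):

#include <stdatomic.h>

struct dev_ref {
	atomic_int nr_pending;
};

/* Phase 1: inside the rcu read-side section (must not sleep): pin the
 * device and note that there is work to do. */
static void note_badblock_work(struct dev_ref *d, int *handle_bad_blocks)
{
	atomic_fetch_add(&d->nr_pending, 1);
	*handle_bad_blocks = 1;
}

/* Phase 2: after rcu_read_unlock(), where sleeping is allowed: do the
 * badblock bookkeeping, then drop the pin (rdev_dec_pending analogue). */
static void finish_badblock_work(struct dev_ref *d)
{
	/* ... rdev_clear_badblocks()-style work would happen here ... */
	atomic_fetch_sub(&d->nr_pending, 1);
}
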
@@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3586 int dd_idx; 3792 int dd_idx;
3587 struct bio* align_bi; 3793 struct bio* align_bi;
3588 struct md_rdev *rdev; 3794 struct md_rdev *rdev;
3795 sector_t end_sector;
3589 3796
3590 if (!in_chunk_boundary(mddev, raid_bio)) { 3797 if (!in_chunk_boundary(mddev, raid_bio)) {
3591 pr_debug("chunk_aligned_read : non aligned\n"); 3798 pr_debug("chunk_aligned_read : non aligned\n");
@@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3610 0, 3817 0,
3611 &dd_idx, NULL); 3818 &dd_idx, NULL);
3612 3819
3820 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3613 rcu_read_lock(); 3821 rcu_read_lock();
3614 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3822 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3615 if (rdev && test_bit(In_sync, &rdev->flags)) { 3823 if (!rdev || test_bit(Faulty, &rdev->flags) ||
3824 rdev->recovery_offset < end_sector) {
3825 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3826 if (rdev &&
3827 (test_bit(Faulty, &rdev->flags) ||
3828 !(test_bit(In_sync, &rdev->flags) ||
3829 rdev->recovery_offset >= end_sector)))
3830 rdev = NULL;
3831 }
3832 if (rdev) {
3616 sector_t first_bad; 3833 sector_t first_bad;
3617 int bad_sectors; 3834 int bad_sectors;
3618 3835
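The nested tests above choose which device serves an aligned read. Flattened into a helper, the policy reads as follows (a sketch under invented names, not the kernel code):

#include <stdbool.h>
#include <stddef.h>

struct dev_state {
	bool present, faulty, in_sync;
	unsigned long long recovery_offset;
};

/* Prefer the replacement when it can serve the whole read; otherwise
 * fall back to the original device if that one can. */
static const struct dev_state *
choose_read_dev(const struct dev_state *repl, const struct dev_state *rdev,
		unsigned long long end_sector)
{
	if (repl->present && !repl->faulty &&
	    repl->recovery_offset >= end_sector)
		return repl;
	if (rdev->present && !rdev->faulty &&
	    (rdev->in_sync || rdev->recovery_offset >= end_sector))
		return rdev;
	return NULL; /* no device can serve this read */
}
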
@@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4137 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4354 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4138 } 4355 }
4139 4356
4140
4141 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4357 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4142 4358
4143 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4359 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
@@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4208 return handled; 4424 return handled;
4209 } 4425 }
4210 4426
4211 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4212 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4427 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4213 release_stripe(sh); 4428 release_stripe(sh);
4214 raid5_set_bi_hw_segments(raid_bio, scnt); 4429 raid5_set_bi_hw_segments(raid_bio, scnt);
@@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4635 continue; 4850 continue;
4636 disk = conf->disks + raid_disk; 4851 disk = conf->disks + raid_disk;
4637 4852
4638 disk->rdev = rdev; 4853 if (test_bit(Replacement, &rdev->flags)) {
4854 if (disk->replacement)
4855 goto abort;
4856 disk->replacement = rdev;
4857 } else {
4858 if (disk->rdev)
4859 goto abort;
4860 disk->rdev = rdev;
4861 }
4639 4862
4640 if (test_bit(In_sync, &rdev->flags)) { 4863 if (test_bit(In_sync, &rdev->flags)) {
4641 char b[BDEVNAME_SIZE]; 4864 char b[BDEVNAME_SIZE];
@@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev)
4724 int dirty_parity_disks = 0; 4947 int dirty_parity_disks = 0;
4725 struct md_rdev *rdev; 4948 struct md_rdev *rdev;
4726 sector_t reshape_offset = 0; 4949 sector_t reshape_offset = 0;
4950 int i;
4727 4951
4728 if (mddev->recovery_cp != MaxSector) 4952 if (mddev->recovery_cp != MaxSector)
4729 printk(KERN_NOTICE "md/raid:%s: not clean" 4953 printk(KERN_NOTICE "md/raid:%s: not clean"
@@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev)
4813 conf->thread = NULL; 5037 conf->thread = NULL;
4814 mddev->private = conf; 5038 mddev->private = conf;
4815 5039
4816 /* 5040 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
4817 * 0 for a fully functional array, 1 or 2 for a degraded array. 5041 i++) {
4818 */ 5042 rdev = conf->disks[i].rdev;
4819 list_for_each_entry(rdev, &mddev->disks, same_set) { 5043 if (!rdev && conf->disks[i].replacement) {
4820 if (rdev->raid_disk < 0) 5044 /* The replacement is all we have yet */
5045 rdev = conf->disks[i].replacement;
5046 conf->disks[i].replacement = NULL;
5047 clear_bit(Replacement, &rdev->flags);
5048 conf->disks[i].rdev = rdev;
5049 }
5050 if (!rdev)
4821 continue; 5051 continue;
5052 if (conf->disks[i].replacement &&
5053 conf->reshape_progress != MaxSector) {
5054 /* replacements and reshape simply do not mix. */
5055 printk(KERN_ERR "md: cannot handle concurrent "
5056 "replacement and reshape.\n");
5057 goto abort;
5058 }
4822 if (test_bit(In_sync, &rdev->flags)) { 5059 if (test_bit(In_sync, &rdev->flags)) {
4823 working_disks++; 5060 working_disks++;
4824 continue; 5061 continue;
@@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev)
4852 dirty_parity_disks++; 5089 dirty_parity_disks++;
4853 } 5090 }
4854 5091
4855 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) 5092 /*
4856 - working_disks); 5093 * 0 for a fully functional array, 1 or 2 for a degraded array.
5094 */
5095 mddev->degraded = calc_degraded(conf);
4857 5096
4858 if (has_failed(conf)) { 5097 if (has_failed(conf)) {
4859 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5098 printk(KERN_ERR "md/raid:%s: not enough operational devices"
@@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev)
5016 5255
5017 for (i = 0; i < conf->raid_disks; i++) { 5256 for (i = 0; i < conf->raid_disks; i++) {
5018 tmp = conf->disks + i; 5257 tmp = conf->disks + i;
5019 if (tmp->rdev 5258 if (tmp->replacement
5259 && tmp->replacement->recovery_offset == MaxSector
5260 && !test_bit(Faulty, &tmp->replacement->flags)
5261 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
5262 /* Replacement has just become active. */
5263 if (!tmp->rdev
5264 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
5265 count++;
5266 if (tmp->rdev) {
5267 /* Replaced device not technically faulty,
5268 * but we need to be sure it gets removed
5269 * and never re-added.
5270 */
5271 set_bit(Faulty, &tmp->rdev->flags);
5272 sysfs_notify_dirent_safe(
5273 tmp->rdev->sysfs_state);
5274 }
5275 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
5276 } else if (tmp->rdev
5020 && tmp->rdev->recovery_offset == MaxSector 5277 && tmp->rdev->recovery_offset == MaxSector
5021 && !test_bit(Faulty, &tmp->rdev->flags) 5278 && !test_bit(Faulty, &tmp->rdev->flags)
5022 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5279 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5025,49 +5282,68 @@ static int raid5_spare_active(struct mddev *mddev)
5025 } 5282 }
5026 } 5283 }
5027 spin_lock_irqsave(&conf->device_lock, flags); 5284 spin_lock_irqsave(&conf->device_lock, flags);
5028 mddev->degraded -= count; 5285 mddev->degraded = calc_degraded(conf);
5029 spin_unlock_irqrestore(&conf->device_lock, flags); 5286 spin_unlock_irqrestore(&conf->device_lock, flags);
5030 print_raid5_conf(conf); 5287 print_raid5_conf(conf);
5031 return count; 5288 return count;
5032} 5289}
5033 5290
5034static int raid5_remove_disk(struct mddev *mddev, int number) 5291static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
5035{ 5292{
5036 struct r5conf *conf = mddev->private; 5293 struct r5conf *conf = mddev->private;
5037 int err = 0; 5294 int err = 0;
5038 struct md_rdev *rdev; 5295 int number = rdev->raid_disk;
5296 struct md_rdev **rdevp;
5039 struct disk_info *p = conf->disks + number; 5297 struct disk_info *p = conf->disks + number;
5040 5298
5041 print_raid5_conf(conf); 5299 print_raid5_conf(conf);
5042 rdev = p->rdev; 5300 if (rdev == p->rdev)
5043 if (rdev) { 5301 rdevp = &p->rdev;
5044 if (number >= conf->raid_disks && 5302 else if (rdev == p->replacement)
5045 conf->reshape_progress == MaxSector) 5303 rdevp = &p->replacement;
5046 clear_bit(In_sync, &rdev->flags); 5304 else
5305 return 0;
5047 5306
5048 if (test_bit(In_sync, &rdev->flags) || 5307 if (number >= conf->raid_disks &&
5049 atomic_read(&rdev->nr_pending)) { 5308 conf->reshape_progress == MaxSector)
5050 err = -EBUSY; 5309 clear_bit(In_sync, &rdev->flags);
5051 goto abort; 5310
5052 } 5311 if (test_bit(In_sync, &rdev->flags) ||
5053 /* Only remove non-faulty devices if recovery 5312 atomic_read(&rdev->nr_pending)) {
5054 * isn't possible. 5313 err = -EBUSY;
5055 */ 5314 goto abort;
5056 if (!test_bit(Faulty, &rdev->flags) &&
5057 mddev->recovery_disabled != conf->recovery_disabled &&
5058 !has_failed(conf) &&
5059 number < conf->raid_disks) {
5060 err = -EBUSY;
5061 goto abort;
5062 }
5063 p->rdev = NULL;
5064 synchronize_rcu();
5065 if (atomic_read(&rdev->nr_pending)) {
5066 /* lost the race, try later */
5067 err = -EBUSY;
5068 p->rdev = rdev;
5069 }
5070 } 5315 }
5316 /* Only remove non-faulty devices if recovery
5317 * isn't possible.
5318 */
5319 if (!test_bit(Faulty, &rdev->flags) &&
5320 mddev->recovery_disabled != conf->recovery_disabled &&
5321 !has_failed(conf) &&
5322 (!p->replacement || p->replacement == rdev) &&
5323 number < conf->raid_disks) {
5324 err = -EBUSY;
5325 goto abort;
5326 }
5327 *rdevp = NULL;
5328 synchronize_rcu();
5329 if (atomic_read(&rdev->nr_pending)) {
5330 /* lost the race, try later */
5331 err = -EBUSY;
5332 *rdevp = rdev;
5333 } else if (p->replacement) {
5334 /* We must have just cleared 'rdev' */
5335 p->rdev = p->replacement;
5336 clear_bit(Replacement, &p->replacement->flags);
5337 smp_mb(); /* Make sure other CPUs may see both as identical
5338 * but will never see neither - if they are careful
5339 */
5340 p->replacement = NULL;
5341 clear_bit(WantReplacement, &rdev->flags);
5342 } else
5343 /* We might have just removed the Replacement as faulty;
5344 * clear the bit just in case
5345 */
5346 clear_bit(WantReplacement, &rdev->flags);
5071abort: 5347abort:
5072 5348
5073 print_raid5_conf(conf); 5349 print_raid5_conf(conf);
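The promotion at the end of raid5_remove_disk() is ordered so that a concurrent reader always sees at least one valid pointer: both fields briefly refer to the replacement, then the old field is cleared. A userspace sketch of the same ordering, with C11 atomics standing in for smp_mb() and RCU:

#include <stdatomic.h>

struct slot {
	_Atomic(void *) rdev;
	_Atomic(void *) replacement;
};

/* Promote ->replacement into ->rdev without a window where a reader
 * sees neither: publish first, fence, then retract. */
static void promote_replacement(struct slot *s)
{
	void *repl = atomic_load(&s->replacement);

	atomic_store(&s->rdev, repl);              /* both now point at repl */
	atomic_thread_fence(memory_order_seq_cst); /* smp_mb() analogue      */
	atomic_store(&s->replacement, NULL);       /* retract the old field  */
}
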
@@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5103 disk = rdev->saved_raid_disk; 5379 disk = rdev->saved_raid_disk;
5104 else 5380 else
5105 disk = first; 5381 disk = first;
5106 for ( ; disk <= last ; disk++) 5382 for ( ; disk <= last ; disk++) {
5107 if ((p=conf->disks + disk)->rdev == NULL) { 5383 p = conf->disks + disk;
5384 if (p->rdev == NULL) {
5108 clear_bit(In_sync, &rdev->flags); 5385 clear_bit(In_sync, &rdev->flags);
5109 rdev->raid_disk = disk; 5386 rdev->raid_disk = disk;
5110 err = 0; 5387 err = 0;
@@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5113 rcu_assign_pointer(p->rdev, rdev); 5390 rcu_assign_pointer(p->rdev, rdev);
5114 break; 5391 break;
5115 } 5392 }
5393 if (test_bit(WantReplacement, &p->rdev->flags) &&
5394 p->replacement == NULL) {
5395 clear_bit(In_sync, &rdev->flags);
5396 set_bit(Replacement, &rdev->flags);
5397 rdev->raid_disk = disk;
5398 err = 0;
5399 conf->fullsync = 1;
5400 rcu_assign_pointer(p->replacement, rdev);
5401 break;
5402 }
5403 }
5116 print_raid5_conf(conf); 5404 print_raid5_conf(conf);
5117 return err; 5405 return err;
5118} 5406}
@@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5286 * pre and post number of devices. 5574 * pre and post number of devices.
5287 */ 5575 */
5288 spin_lock_irqsave(&conf->device_lock, flags); 5576 spin_lock_irqsave(&conf->device_lock, flags);
5289 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5577 mddev->degraded = calc_degraded(conf);
5290 - added_devices;
5291 spin_unlock_irqrestore(&conf->device_lock, flags); 5578 spin_unlock_irqrestore(&conf->device_lock, flags);
5292 } 5579 }
5293 mddev->raid_disks = conf->raid_disks; 5580 mddev->raid_disks = conf->raid_disks;
@@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev)
5356 revalidate_disk(mddev->gendisk); 5643 revalidate_disk(mddev->gendisk);
5357 } else { 5644 } else {
5358 int d; 5645 int d;
5359 mddev->degraded = conf->raid_disks; 5646 spin_lock_irq(&conf->device_lock);
5360 for (d = 0; d < conf->raid_disks ; d++) 5647 mddev->degraded = calc_degraded(conf);
5361 if (conf->disks[d].rdev && 5648 spin_unlock_irq(&conf->device_lock);
5362 test_bit(In_sync,
5363 &conf->disks[d].rdev->flags))
5364 mddev->degraded--;
5365 for (d = conf->raid_disks ; 5649 for (d = conf->raid_disks ;
5366 d < conf->raid_disks - mddev->delta_disks; 5650 d < conf->raid_disks - mddev->delta_disks;
5367 d++) { 5651 d++) {
5368 struct md_rdev *rdev = conf->disks[d].rdev; 5652 struct md_rdev *rdev = conf->disks[d].rdev;
5369 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5653 if (rdev &&
5654 raid5_remove_disk(mddev, rdev) == 0) {
5370 sysfs_unlink_rdev(mddev, rdev); 5655 sysfs_unlink_rdev(mddev, rdev);
5371 rdev->raid_disk = -1; 5656 rdev->raid_disk = -1;
5372 } 5657 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e10c5531f9c5..8d8e13934a48 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -27,7 +27,7 @@
27 * The possible state transitions are: 27 * The possible state transitions are:
28 * 28 *
29 * Empty -> Want - on read or write to get old data for parity calc 29 * Empty -> Want - on read or write to get old data for parity calc
30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) 30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.
31 * Empty -> Clean - on compute_block when computing a block for failed drive 31 * Empty -> Clean - on compute_block when computing a block for failed drive
32 * Want -> Empty - on failed read 32 * Want -> Empty - on failed read
33 * Want -> Clean - on successful completion of read request 33 * Want -> Clean - on successful completion of read request
@@ -226,8 +226,11 @@ struct stripe_head {
226 #endif 226 #endif
227 } ops; 227 } ops;
228 struct r5dev { 228 struct r5dev {
229 struct bio req; 229 /* rreq and rvec are used for the replacement device when
230 struct bio_vec vec; 230 * writing data to both devices.
231 */
232 struct bio req, rreq;
233 struct bio_vec vec, rvec;
231 struct page *page; 234 struct page *page;
232 struct bio *toread, *read, *towrite, *written; 235 struct bio *toread, *read, *towrite, *written;
233 sector_t sector; /* sector of this page */ 236 sector_t sector; /* sector of this page */
@@ -239,7 +242,13 @@ struct stripe_head {
239 * for handle_stripe. 242 * for handle_stripe.
240 */ 243 */
241struct stripe_head_state { 244struct stripe_head_state {
242 int syncing, expanding, expanded; 245 /* 'syncing' means that we need to read all devices, either
246 * to check/correct parity, or to reconstruct a missing device.
247 * 'replacing' means we are replacing one or more drives and
248 * the source is valid at this point so we don't need to
249 * read all devices, just the replacement targets.
250 */
251 int syncing, expanding, expanded, replacing;
243 int locked, uptodate, to_read, to_write, failed, written; 252 int locked, uptodate, to_read, to_write, failed, written;
244 int to_fill, compute, req_compute, non_overwrite; 253 int to_fill, compute, req_compute, non_overwrite;
245 int failed_num[2]; 254 int failed_num[2];
@@ -252,38 +261,41 @@ struct stripe_head_state {
252 int handle_bad_blocks; 261 int handle_bad_blocks;
253}; 262};
254 263
255/* Flags */ 264/* Flags for struct r5dev.flags */
256#define R5_UPTODATE 0 /* page contains current data */ 265enum r5dev_flags {
257#define R5_LOCKED 1 /* IO has been submitted on "req" */ 266 R5_UPTODATE, /* page contains current data */
258#define R5_OVERWRITE 2 /* towrite covers whole page */ 267 R5_LOCKED, /* IO has been submitted on "req" */
268 R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
269 R5_OVERWRITE, /* towrite covers whole page */
259/* and some that are internal to handle_stripe */ 270/* and some that are internal to handle_stripe */
260#define R5_Insync 3 /* rdev && rdev->in_sync at start */ 271 R5_Insync, /* rdev && rdev->in_sync at start */
261#define R5_Wantread 4 /* want to schedule a read */ 272 R5_Wantread, /* want to schedule a read */
262#define R5_Wantwrite 5 273 R5_Wantwrite,
263#define R5_Overlap 7 /* There is a pending overlapping request on this block */ 274 R5_Overlap, /* There is a pending overlapping request
264#define R5_ReadError 8 /* seen a read error here recently */ 275 * on this block */
265#define R5_ReWrite 9 /* have tried to over-write the readerror */ 276 R5_ReadError, /* seen a read error here recently */
277 R5_ReWrite, /* have tried to over-write the readerror */
266 278
267#define R5_Expanded 10 /* This block now has post-expand data */ 279 R5_Expanded, /* This block now has post-expand data */
268#define R5_Wantcompute 11 /* compute_block in progress treat as 280 R5_Wantcompute, /* compute_block in progress treat as
269 * uptodate 281 * uptodate
270 */ 282 */
271#define R5_Wantfill 12 /* dev->toread contains a bio that needs 283 R5_Wantfill, /* dev->toread contains a bio that needs
272 * filling 284 * filling
273 */ 285 */
274#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 286 R5_Wantdrain, /* dev->towrite needs to be drained */
275#define R5_WantFUA 14 /* Write should be FUA */ 287 R5_WantFUA, /* Write should be FUA */
276#define R5_WriteError 15 /* got a write error - need to record it */ 288 R5_WriteError, /* got a write error - need to record it */
277#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ 289 R5_MadeGood, /* A bad block has been fixed by writing to it */
278/* 290 R5_ReadRepl, /* Will/did read from replacement rather than orig */
279 * Write method 291 R5_MadeGoodRepl,/* A bad block on the replacement device has been
280 */ 292 * fixed by writing to it */
281#define RECONSTRUCT_WRITE 1 293 R5_NeedReplace, /* This device has a replacement which is not
282#define READ_MODIFY_WRITE 2 294 * up-to-date at this stripe. */
283/* not a write method, but a compute_parity mode */ 295 R5_WantReplace, /* We need to update the replacement: we have read
284#define CHECK_PARITY 3 296 * data in, and now is a good time to write it out.
285/* Additional compute_parity mode -- updates the parity w/o LOCKING */ 297 */
286#define UPDATE_PARITY 4 298};
287 299
288/* 300/*
289 * Stripe state 301 * Stripe state
@@ -311,13 +323,14 @@ enum {
311/* 323/*
312 * Operation request flags 324 * Operation request flags
313 */ 325 */
314#define STRIPE_OP_BIOFILL 0 326enum {
315#define STRIPE_OP_COMPUTE_BLK 1 327 STRIPE_OP_BIOFILL,
316#define STRIPE_OP_PREXOR 2 328 STRIPE_OP_COMPUTE_BLK,
317#define STRIPE_OP_BIODRAIN 3 329 STRIPE_OP_PREXOR,
318#define STRIPE_OP_RECONSTRUCT 4 330 STRIPE_OP_BIODRAIN,
319#define STRIPE_OP_CHECK 5 331 STRIPE_OP_RECONSTRUCT,
320 332 STRIPE_OP_CHECK,
333};
321/* 334/*
322 * Plugging: 335 * Plugging:
323 * 336 *
@@ -344,13 +357,12 @@ enum {
344 357
345 358
346struct disk_info { 359struct disk_info {
347 struct md_rdev *rdev; 360 struct md_rdev *rdev, *replacement;
348}; 361};
349 362
350struct r5conf { 363struct r5conf {
351 struct hlist_head *stripe_hashtbl; 364 struct hlist_head *stripe_hashtbl;
352 struct mddev *mddev; 365 struct mddev *mddev;
353 struct disk_info *spare;
354 int chunk_sectors; 366 int chunk_sectors;
355 int level, algorithm; 367 int level, algorithm;
356 int max_degraded; 368 int max_degraded;
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 9e65d9e20662..6f6df86f1ae5 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -277,7 +277,10 @@ struct mdp_superblock_1 {
277 */ 277 */
278#define MD_FEATURE_RESHAPE_ACTIVE 4 278#define MD_FEATURE_RESHAPE_ACTIVE 4
279#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ 279#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
280 280#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an
281#define MD_FEATURE_ALL (1|2|4|8) 281 * active device with same 'role'.
282 * 'recovery_offset' is also set.
283 */
284#define MD_FEATURE_ALL (1|2|4|8|16)
282 285
283#endif 286#endif
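Feature bits such as MD_FEATURE_REPLACEMENT exist so that an older kernel can refuse an array whose superblock needs semantics it lacks. A sketch of the usual mask test (an assumption about the loader, mirroring the MD_FEATURE_ALL value above, not code from md.c):

/* Mirrors the MD_FEATURE_ALL value defined above. */
#define FEATURES_KNOWN (1|2|4|8|16)

/* Accept a superblock only if every set feature bit is understood. */
static int features_supported(unsigned int feature_map)
{
	return (feature_map & ~(unsigned int)FEATURES_KNOWN) == 0;
}
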
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 2b59cc824395..53272e9860a7 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -132,7 +132,7 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
132 PROT_READ|PROT_WRITE, \ 132 PROT_READ|PROT_WRITE, \
133 MAP_PRIVATE|MAP_ANONYMOUS,\ 133 MAP_PRIVATE|MAP_ANONYMOUS,\
134 0, 0)) 134 0, 0))
135# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) 135# define free_pages(x, y) munmap((void *)(x), PAGE_SIZE << (y))
136 136
137static inline void cpu_relax(void) 137static inline void cpu_relax(void)
138{ 138{
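
The pq.h change corrects the userspace emulation of free_pages(): its second argument is an allocation order, so an order-y region covers 2^y pages and the munmap() length must be PAGE_SIZE << y, not y * PAGE_SIZE. A worked example:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned int order = 2; /* an order-2 region is 2^2 = 4 pages */

	printf("old length: %lu bytes\n", order * PAGE_SIZE);  /*  8192 */
	printf("new length: %lu bytes\n", PAGE_SIZE << order); /* 16384 */
	return 0;
}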