diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-08 16:28:33 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-08 16:28:33 -0500 |
commit | 2943c833222ef87c111ee0c6b7b8519ad2983e99 (patch) | |
tree | 0ef8cc4f72a63b325e7ae858ec68822ec4f3c64f /drivers | |
parent | 98793265b429a3f0b3f1750e74d67cd4d740d162 (diff) | |
parent | 19d671695e1931ebfd75b2b888778201aefe35ca (diff) |
Merge tag 'md-3.3' of git://neil.brown.name/md
md update for 3.3
Big change is new hot-replacement.
A slot in an array can hold 2 devices - one that
wants-replacement and one that is the replacement.
Once the replacement is built - either from the
original or (in the case of errors) from elsewhere,
the wants-replacement device will be removed.
* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
md/raid1: Mark device want_replacement when we see a write error.
md/raid1: If there is a spare and a want_replacement device, start replacement.
md/raid1: recognise replacements when assembling arrays.
md/raid1: handle activation of replacement device when recovery completes.
md/raid1: Allow a failed replacement device to be removed.
md/raid1: Allocate spare to store replacement devices and their bios.
md/raid1: Replace use of mddev->raid_disks with conf->raid_disks.
md/raid10: If there is a spare and a want_replacement device, start replacement.
md/raid10: recognise replacements when assembling array.
md/raid10: Allow replacement device to be replace old drive.
md/raid10: handle recovery of replacement devices.
md/raid10: Handle replacement devices during resync.
md/raid10: writes should get directed to replacement as well as original.
md/raid10: allow removal of failed replacement devices.
md/raid10: preferentially read from replacement device if possible.
md/raid10: change read_balance to return an rdev
md/raid10: prepare data structures for handling replacement.
md/raid5: Mark device want_replacement when we see a write error.
md/raid5: If there is a spare and a want_replacement device, start replacement.
md/raid5: recognise replacements when assembling array.
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/bitmap.c | 12 | ||||
-rw-r--r-- | drivers/md/md.c | 107 | ||||
-rw-r--r-- | drivers/md/md.h | 82 | ||||
-rw-r--r-- | drivers/md/multipath.c | 7 | ||||
-rw-r--r-- | drivers/md/raid1.c | 174 | ||||
-rw-r--r-- | drivers/md/raid1.h | 7 | ||||
-rw-r--r-- | drivers/md/raid10.c | 582 | ||||
-rw-r--r-- | drivers/md/raid10.h | 61 | ||||
-rw-r--r-- | drivers/md/raid5.c | 557 | ||||
-rw-r--r-- | drivers/md/raid5.h | 98 |
10 files changed, 1256 insertions, 431 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6d03774b176e..cdf36b1e9aa6 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1149 | return; | 1149 | return; |
1150 | } | 1150 | } |
1151 | if (time_before(jiffies, bitmap->daemon_lastrun | 1151 | if (time_before(jiffies, bitmap->daemon_lastrun |
1152 | + bitmap->mddev->bitmap_info.daemon_sleep)) | 1152 | + mddev->bitmap_info.daemon_sleep)) |
1153 | goto done; | 1153 | goto done; |
1154 | 1154 | ||
1155 | bitmap->daemon_lastrun = jiffies; | 1155 | bitmap->daemon_lastrun = jiffies; |
1156 | if (bitmap->allclean) { | 1156 | if (bitmap->allclean) { |
1157 | bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | 1157 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; |
1158 | goto done; | 1158 | goto done; |
1159 | } | 1159 | } |
1160 | bitmap->allclean = 1; | 1160 | bitmap->allclean = 1; |
@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1206 | * sure that events_cleared is up-to-date. | 1206 | * sure that events_cleared is up-to-date. |
1207 | */ | 1207 | */ |
1208 | if (bitmap->need_sync && | 1208 | if (bitmap->need_sync && |
1209 | bitmap->mddev->bitmap_info.external == 0) { | 1209 | mddev->bitmap_info.external == 0) { |
1210 | bitmap_super_t *sb; | 1210 | bitmap_super_t *sb; |
1211 | bitmap->need_sync = 0; | 1211 | bitmap->need_sync = 0; |
1212 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 1212 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1270 | 1270 | ||
1271 | done: | 1271 | done: |
1272 | if (bitmap->allclean == 0) | 1272 | if (bitmap->allclean == 0) |
1273 | bitmap->mddev->thread->timeout = | 1273 | mddev->thread->timeout = |
1274 | bitmap->mddev->bitmap_info.daemon_sleep; | 1274 | mddev->bitmap_info.daemon_sleep; |
1275 | mutex_unlock(&mddev->bitmap_info.mutex); | 1275 | mutex_unlock(&mddev->bitmap_info.mutex); |
1276 | } | 1276 | } |
1277 | 1277 | ||
@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
1587 | } | 1587 | } |
1588 | if (!*bmc) { | 1588 | if (!*bmc) { |
1589 | struct page *page; | 1589 | struct page *page; |
1590 | *bmc = 1 | (needed ? NEEDED_MASK : 0); | 1590 | *bmc = 2 | (needed ? NEEDED_MASK : 0); |
1591 | bitmap_count_page(bitmap, offset, 1); | 1591 | bitmap_count_page(bitmap, offset, 1); |
1592 | page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); | 1592 | page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); |
1593 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | 1593 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 5d1b6762f108..ca8527fe77eb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1713 | } | 1713 | } |
1714 | if (sb->devflags & WriteMostly1) | 1714 | if (sb->devflags & WriteMostly1) |
1715 | set_bit(WriteMostly, &rdev->flags); | 1715 | set_bit(WriteMostly, &rdev->flags); |
1716 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) | ||
1717 | set_bit(Replacement, &rdev->flags); | ||
1716 | } else /* MULTIPATH are always insync */ | 1718 | } else /* MULTIPATH are always insync */ |
1717 | set_bit(In_sync, &rdev->flags); | 1719 | set_bit(In_sync, &rdev->flags); |
1718 | 1720 | ||
@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1766 | sb->recovery_offset = | 1768 | sb->recovery_offset = |
1767 | cpu_to_le64(rdev->recovery_offset); | 1769 | cpu_to_le64(rdev->recovery_offset); |
1768 | } | 1770 | } |
1771 | if (test_bit(Replacement, &rdev->flags)) | ||
1772 | sb->feature_map |= | ||
1773 | cpu_to_le32(MD_FEATURE_REPLACEMENT); | ||
1769 | 1774 | ||
1770 | if (mddev->reshape_position != MaxSector) { | 1775 | if (mddev->reshape_position != MaxSector) { |
1771 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); | 1776 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); |
@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page) | |||
2559 | len += sprintf(page+len, "%swrite_error", sep); | 2564 | len += sprintf(page+len, "%swrite_error", sep); |
2560 | sep = ","; | 2565 | sep = ","; |
2561 | } | 2566 | } |
2567 | if (test_bit(WantReplacement, &rdev->flags)) { | ||
2568 | len += sprintf(page+len, "%swant_replacement", sep); | ||
2569 | sep = ","; | ||
2570 | } | ||
2571 | if (test_bit(Replacement, &rdev->flags)) { | ||
2572 | len += sprintf(page+len, "%sreplacement", sep); | ||
2573 | sep = ","; | ||
2574 | } | ||
2575 | |||
2562 | return len+sprintf(page+len, "\n"); | 2576 | return len+sprintf(page+len, "\n"); |
2563 | } | 2577 | } |
2564 | 2578 | ||
@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2627 | } else if (cmd_match(buf, "-write_error")) { | 2641 | } else if (cmd_match(buf, "-write_error")) { |
2628 | clear_bit(WriteErrorSeen, &rdev->flags); | 2642 | clear_bit(WriteErrorSeen, &rdev->flags); |
2629 | err = 0; | 2643 | err = 0; |
2644 | } else if (cmd_match(buf, "want_replacement")) { | ||
2645 | /* Any non-spare device that is not a replacement can | ||
2646 | * become want_replacement at any time, but we then need to | ||
2647 | * check if recovery is needed. | ||
2648 | */ | ||
2649 | if (rdev->raid_disk >= 0 && | ||
2650 | !test_bit(Replacement, &rdev->flags)) | ||
2651 | set_bit(WantReplacement, &rdev->flags); | ||
2652 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | ||
2653 | md_wakeup_thread(rdev->mddev->thread); | ||
2654 | err = 0; | ||
2655 | } else if (cmd_match(buf, "-want_replacement")) { | ||
2656 | /* Clearing 'want_replacement' is always allowed. | ||
2657 | * Once replacements starts it is too late though. | ||
2658 | */ | ||
2659 | err = 0; | ||
2660 | clear_bit(WantReplacement, &rdev->flags); | ||
2661 | } else if (cmd_match(buf, "replacement")) { | ||
2662 | /* Can only set a device as a replacement when array has not | ||
2663 | * yet been started. Once running, replacement is automatic | ||
2664 | * from spares, or by assigning 'slot'. | ||
2665 | */ | ||
2666 | if (rdev->mddev->pers) | ||
2667 | err = -EBUSY; | ||
2668 | else { | ||
2669 | set_bit(Replacement, &rdev->flags); | ||
2670 | err = 0; | ||
2671 | } | ||
2672 | } else if (cmd_match(buf, "-replacement")) { | ||
2673 | /* Similarly, can only clear Replacement before start */ | ||
2674 | if (rdev->mddev->pers) | ||
2675 | err = -EBUSY; | ||
2676 | else { | ||
2677 | clear_bit(Replacement, &rdev->flags); | ||
2678 | err = 0; | ||
2679 | } | ||
2630 | } | 2680 | } |
2631 | if (!err) | 2681 | if (!err) |
2632 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2682 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2688 | if (rdev->mddev->pers->hot_remove_disk == NULL) | 2738 | if (rdev->mddev->pers->hot_remove_disk == NULL) |
2689 | return -EINVAL; | 2739 | return -EINVAL; |
2690 | err = rdev->mddev->pers-> | 2740 | err = rdev->mddev->pers-> |
2691 | hot_remove_disk(rdev->mddev, rdev->raid_disk); | 2741 | hot_remove_disk(rdev->mddev, rdev); |
2692 | if (err) | 2742 | if (err) |
2693 | return err; | 2743 | return err; |
2694 | sysfs_unlink_rdev(rdev->mddev, rdev); | 2744 | sysfs_unlink_rdev(rdev->mddev, rdev); |
@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2696 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2746 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2697 | md_wakeup_thread(rdev->mddev->thread); | 2747 | md_wakeup_thread(rdev->mddev->thread); |
2698 | } else if (rdev->mddev->pers) { | 2748 | } else if (rdev->mddev->pers) { |
2699 | struct md_rdev *rdev2; | ||
2700 | /* Activating a spare .. or possibly reactivating | 2749 | /* Activating a spare .. or possibly reactivating |
2701 | * if we ever get bitmaps working here. | 2750 | * if we ever get bitmaps working here. |
2702 | */ | 2751 | */ |
@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2710 | if (rdev->mddev->pers->hot_add_disk == NULL) | 2759 | if (rdev->mddev->pers->hot_add_disk == NULL) |
2711 | return -EINVAL; | 2760 | return -EINVAL; |
2712 | 2761 | ||
2713 | list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) | ||
2714 | if (rdev2->raid_disk == slot) | ||
2715 | return -EEXIST; | ||
2716 | |||
2717 | if (slot >= rdev->mddev->raid_disks && | 2762 | if (slot >= rdev->mddev->raid_disks && |
2718 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) | 2763 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) |
2719 | return -ENOSPC; | 2764 | return -ENOSPC; |
@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6053 | struct mddev *mddev = NULL; | 6098 | struct mddev *mddev = NULL; |
6054 | int ro; | 6099 | int ro; |
6055 | 6100 | ||
6056 | if (!capable(CAP_SYS_ADMIN)) | 6101 | switch (cmd) { |
6057 | return -EACCES; | 6102 | case RAID_VERSION: |
6103 | case GET_ARRAY_INFO: | ||
6104 | case GET_DISK_INFO: | ||
6105 | break; | ||
6106 | default: | ||
6107 | if (!capable(CAP_SYS_ADMIN)) | ||
6108 | return -EACCES; | ||
6109 | } | ||
6058 | 6110 | ||
6059 | /* | 6111 | /* |
6060 | * Commands dealing with the RAID driver but not any | 6112 | * Commands dealing with the RAID driver but not any |
@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
6714 | if (test_bit(Faulty, &rdev->flags)) { | 6766 | if (test_bit(Faulty, &rdev->flags)) { |
6715 | seq_printf(seq, "(F)"); | 6767 | seq_printf(seq, "(F)"); |
6716 | continue; | 6768 | continue; |
6717 | } else if (rdev->raid_disk < 0) | 6769 | } |
6770 | if (rdev->raid_disk < 0) | ||
6718 | seq_printf(seq, "(S)"); /* spare */ | 6771 | seq_printf(seq, "(S)"); /* spare */ |
6772 | if (test_bit(Replacement, &rdev->flags)) | ||
6773 | seq_printf(seq, "(R)"); | ||
6719 | sectors += rdev->sectors; | 6774 | sectors += rdev->sectors; |
6720 | } | 6775 | } |
6721 | 6776 | ||
@@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev) | |||
7337 | ! test_bit(In_sync, &rdev->flags)) && | 7392 | ! test_bit(In_sync, &rdev->flags)) && |
7338 | atomic_read(&rdev->nr_pending)==0) { | 7393 | atomic_read(&rdev->nr_pending)==0) { |
7339 | if (mddev->pers->hot_remove_disk( | 7394 | if (mddev->pers->hot_remove_disk( |
7340 | mddev, rdev->raid_disk)==0) { | 7395 | mddev, rdev) == 0) { |
7341 | sysfs_unlink_rdev(mddev, rdev); | 7396 | sysfs_unlink_rdev(mddev, rdev); |
7342 | rdev->raid_disk = -1; | 7397 | rdev->raid_disk = -1; |
7343 | } | 7398 | } |
7344 | } | 7399 | } |
7345 | 7400 | ||
7346 | if (mddev->degraded) { | 7401 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7347 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7402 | if (rdev->raid_disk >= 0 && |
7348 | if (rdev->raid_disk >= 0 && | 7403 | !test_bit(In_sync, &rdev->flags) && |
7349 | !test_bit(In_sync, &rdev->flags) && | 7404 | !test_bit(Faulty, &rdev->flags)) |
7350 | !test_bit(Faulty, &rdev->flags)) | 7405 | spares++; |
7406 | if (rdev->raid_disk < 0 | ||
7407 | && !test_bit(Faulty, &rdev->flags)) { | ||
7408 | rdev->recovery_offset = 0; | ||
7409 | if (mddev->pers-> | ||
7410 | hot_add_disk(mddev, rdev) == 0) { | ||
7411 | if (sysfs_link_rdev(mddev, rdev)) | ||
7412 | /* failure here is OK */; | ||
7351 | spares++; | 7413 | spares++; |
7352 | if (rdev->raid_disk < 0 | 7414 | md_new_event(mddev); |
7353 | && !test_bit(Faulty, &rdev->flags)) { | 7415 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
7354 | rdev->recovery_offset = 0; | ||
7355 | if (mddev->pers-> | ||
7356 | hot_add_disk(mddev, rdev) == 0) { | ||
7357 | if (sysfs_link_rdev(mddev, rdev)) | ||
7358 | /* failure here is OK */; | ||
7359 | spares++; | ||
7360 | md_new_event(mddev); | ||
7361 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
7362 | } | ||
7363 | } | 7416 | } |
7364 | } | 7417 | } |
7365 | } | 7418 | } |
@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev) | |||
7474 | test_bit(Faulty, &rdev->flags) && | 7527 | test_bit(Faulty, &rdev->flags) && |
7475 | atomic_read(&rdev->nr_pending)==0) { | 7528 | atomic_read(&rdev->nr_pending)==0) { |
7476 | if (mddev->pers->hot_remove_disk( | 7529 | if (mddev->pers->hot_remove_disk( |
7477 | mddev, rdev->raid_disk)==0) { | 7530 | mddev, rdev) == 0) { |
7478 | sysfs_unlink_rdev(mddev, rdev); | 7531 | sysfs_unlink_rdev(mddev, rdev); |
7479 | rdev->raid_disk = -1; | 7532 | rdev->raid_disk = -1; |
7480 | } | 7533 | } |
diff --git a/drivers/md/md.h b/drivers/md/md.h index cf742d9306ec..44c63dfeeb2b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -72,34 +72,7 @@ struct md_rdev { | |||
72 | * This reduces the burden of testing multiple flags in many cases | 72 | * This reduces the burden of testing multiple flags in many cases |
73 | */ | 73 | */ |
74 | 74 | ||
75 | unsigned long flags; | 75 | unsigned long flags; /* bit set of 'enum flag_bits' bits. */ |
76 | #define Faulty 1 /* device is known to have a fault */ | ||
77 | #define In_sync 2 /* device is in_sync with rest of array */ | ||
78 | #define WriteMostly 4 /* Avoid reading if at all possible */ | ||
79 | #define AutoDetected 7 /* added by auto-detect */ | ||
80 | #define Blocked 8 /* An error occurred but has not yet | ||
81 | * been acknowledged by the metadata | ||
82 | * handler, so don't allow writes | ||
83 | * until it is cleared */ | ||
84 | #define WriteErrorSeen 9 /* A write error has been seen on this | ||
85 | * device | ||
86 | */ | ||
87 | #define FaultRecorded 10 /* Intermediate state for clearing | ||
88 | * Blocked. The Fault is/will-be | ||
89 | * recorded in the metadata, but that | ||
90 | * metadata hasn't been stored safely | ||
91 | * on disk yet. | ||
92 | */ | ||
93 | #define BlockedBadBlocks 11 /* A writer is blocked because they | ||
94 | * found an unacknowledged bad-block. | ||
95 | * This can safely be cleared at any | ||
96 | * time, and the writer will re-check. | ||
97 | * It may be set at any time, and at | ||
98 | * worst the writer will timeout and | ||
99 | * re-check. So setting it as | ||
100 | * accurately as possible is good, but | ||
101 | * not absolutely critical. | ||
102 | */ | ||
103 | wait_queue_head_t blocked_wait; | 76 | wait_queue_head_t blocked_wait; |
104 | 77 | ||
105 | int desc_nr; /* descriptor index in the superblock */ | 78 | int desc_nr; /* descriptor index in the superblock */ |
@@ -152,6 +125,44 @@ struct md_rdev { | |||
152 | sector_t size; /* in sectors */ | 125 | sector_t size; /* in sectors */ |
153 | } badblocks; | 126 | } badblocks; |
154 | }; | 127 | }; |
128 | enum flag_bits { | ||
129 | Faulty, /* device is known to have a fault */ | ||
130 | In_sync, /* device is in_sync with rest of array */ | ||
131 | WriteMostly, /* Avoid reading if at all possible */ | ||
132 | AutoDetected, /* added by auto-detect */ | ||
133 | Blocked, /* An error occurred but has not yet | ||
134 | * been acknowledged by the metadata | ||
135 | * handler, so don't allow writes | ||
136 | * until it is cleared */ | ||
137 | WriteErrorSeen, /* A write error has been seen on this | ||
138 | * device | ||
139 | */ | ||
140 | FaultRecorded, /* Intermediate state for clearing | ||
141 | * Blocked. The Fault is/will-be | ||
142 | * recorded in the metadata, but that | ||
143 | * metadata hasn't been stored safely | ||
144 | * on disk yet. | ||
145 | */ | ||
146 | BlockedBadBlocks, /* A writer is blocked because they | ||
147 | * found an unacknowledged bad-block. | ||
148 | * This can safely be cleared at any | ||
149 | * time, and the writer will re-check. | ||
150 | * It may be set at any time, and at | ||
151 | * worst the writer will timeout and | ||
152 | * re-check. So setting it as | ||
153 | * accurately as possible is good, but | ||
154 | * not absolutely critical. | ||
155 | */ | ||
156 | WantReplacement, /* This device is a candidate to be | ||
157 | * hot-replaced, either because it has | ||
158 | * reported some faults, or because | ||
159 | * of explicit request. | ||
160 | */ | ||
161 | Replacement, /* This device is a replacement for | ||
162 | * a want_replacement device with same | ||
163 | * raid_disk number. | ||
164 | */ | ||
165 | }; | ||
155 | 166 | ||
156 | #define BB_LEN_MASK (0x00000000000001FFULL) | 167 | #define BB_LEN_MASK (0x00000000000001FFULL) |
157 | #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) | 168 | #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) |
@@ -428,7 +439,7 @@ struct md_personality | |||
428 | */ | 439 | */ |
429 | void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); | 440 | void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); |
430 | int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); | 441 | int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); |
431 | int (*hot_remove_disk) (struct mddev *mddev, int number); | 442 | int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); |
432 | int (*spare_active) (struct mddev *mddev); | 443 | int (*spare_active) (struct mddev *mddev); |
433 | sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); | 444 | sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); |
434 | int (*resize) (struct mddev *mddev, sector_t sectors); | 445 | int (*resize) (struct mddev *mddev, sector_t sectors); |
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev) | |||
482 | static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) | 493 | static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) |
483 | { | 494 | { |
484 | char nm[20]; | 495 | char nm[20]; |
485 | sprintf(nm, "rd%d", rdev->raid_disk); | 496 | if (!test_bit(Replacement, &rdev->flags)) { |
486 | return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | 497 | sprintf(nm, "rd%d", rdev->raid_disk); |
498 | return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | ||
499 | } else | ||
500 | return 0; | ||
487 | } | 501 | } |
488 | 502 | ||
489 | static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) | 503 | static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) |
490 | { | 504 | { |
491 | char nm[20]; | 505 | char nm[20]; |
492 | sprintf(nm, "rd%d", rdev->raid_disk); | 506 | if (!test_bit(Replacement, &rdev->flags)) { |
493 | sysfs_remove_link(&mddev->kobj, nm); | 507 | sprintf(nm, "rd%d", rdev->raid_disk); |
508 | sysfs_remove_link(&mddev->kobj, nm); | ||
509 | } | ||
494 | } | 510 | } |
495 | 511 | ||
496 | /* | 512 | /* |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 5899246fa37e..a222f516660e 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
292 | return err; | 292 | return err; |
293 | } | 293 | } |
294 | 294 | ||
295 | static int multipath_remove_disk(struct mddev *mddev, int number) | 295 | static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) |
296 | { | 296 | { |
297 | struct mpconf *conf = mddev->private; | 297 | struct mpconf *conf = mddev->private; |
298 | int err = 0; | 298 | int err = 0; |
299 | struct md_rdev *rdev; | 299 | int number = rdev->raid_disk; |
300 | struct multipath_info *p = conf->multipaths + number; | 300 | struct multipath_info *p = conf->multipaths + number; |
301 | 301 | ||
302 | print_multipath_conf(conf); | 302 | print_multipath_conf(conf); |
303 | 303 | ||
304 | rdev = p->rdev; | 304 | if (rdev == p->rdev) { |
305 | if (rdev) { | ||
306 | if (test_bit(In_sync, &rdev->flags) || | 305 | if (test_bit(In_sync, &rdev->flags) || |
307 | atomic_read(&rdev->nr_pending)) { | 306 | atomic_read(&rdev->nr_pending)) { |
308 | printk(KERN_ERR "hot-remove-disk, slot %d is identified" | 307 | printk(KERN_ERR "hot-remove-disk, slot %d is identified" |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ede2461e79c5..cc24f0cb7ee3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -135,7 +135,7 @@ out_free_pages: | |||
135 | put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); | 135 | put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); |
136 | j = -1; | 136 | j = -1; |
137 | out_free_bio: | 137 | out_free_bio: |
138 | while ( ++j < pi->raid_disks ) | 138 | while (++j < pi->raid_disks) |
139 | bio_put(r1_bio->bios[j]); | 139 | bio_put(r1_bio->bios[j]); |
140 | r1bio_pool_free(r1_bio, data); | 140 | r1bio_pool_free(r1_bio, data); |
141 | return NULL; | 141 | return NULL; |
@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) | |||
164 | { | 164 | { |
165 | int i; | 165 | int i; |
166 | 166 | ||
167 | for (i = 0; i < conf->raid_disks; i++) { | 167 | for (i = 0; i < conf->raid_disks * 2; i++) { |
168 | struct bio **bio = r1_bio->bios + i; | 168 | struct bio **bio = r1_bio->bios + i; |
169 | if (!BIO_SPECIAL(*bio)) | 169 | if (!BIO_SPECIAL(*bio)) |
170 | bio_put(*bio); | 170 | bio_put(*bio); |
@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio) | |||
185 | struct r1conf *conf = r1_bio->mddev->private; | 185 | struct r1conf *conf = r1_bio->mddev->private; |
186 | int i; | 186 | int i; |
187 | 187 | ||
188 | for (i=0; i<conf->raid_disks; i++) { | 188 | for (i = 0; i < conf->raid_disks * 2; i++) { |
189 | struct bio *bio = r1_bio->bios[i]; | 189 | struct bio *bio = r1_bio->bios[i]; |
190 | if (bio->bi_end_io) | 190 | if (bio->bi_end_io) |
191 | rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); | 191 | rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); |
@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio) | |||
277 | static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) | 277 | static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) |
278 | { | 278 | { |
279 | int mirror; | 279 | int mirror; |
280 | int raid_disks = r1_bio->mddev->raid_disks; | 280 | struct r1conf *conf = r1_bio->mddev->private; |
281 | int raid_disks = conf->raid_disks; | ||
281 | 282 | ||
282 | for (mirror = 0; mirror < raid_disks; mirror++) | 283 | for (mirror = 0; mirror < raid_disks * 2; mirror++) |
283 | if (r1_bio->bios[mirror] == bio) | 284 | if (r1_bio->bios[mirror] == bio) |
284 | break; | 285 | break; |
285 | 286 | ||
286 | BUG_ON(mirror == raid_disks); | 287 | BUG_ON(mirror == raid_disks * 2); |
287 | update_head_pos(mirror, r1_bio); | 288 | update_head_pos(mirror, r1_bio); |
288 | 289 | ||
289 | return mirror; | 290 | return mirror; |
@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
390 | if (!uptodate) { | 391 | if (!uptodate) { |
391 | set_bit(WriteErrorSeen, | 392 | set_bit(WriteErrorSeen, |
392 | &conf->mirrors[mirror].rdev->flags); | 393 | &conf->mirrors[mirror].rdev->flags); |
394 | if (!test_and_set_bit(WantReplacement, | ||
395 | &conf->mirrors[mirror].rdev->flags)) | ||
396 | set_bit(MD_RECOVERY_NEEDED, & | ||
397 | conf->mddev->recovery); | ||
398 | |||
393 | set_bit(R1BIO_WriteError, &r1_bio->state); | 399 | set_bit(R1BIO_WriteError, &r1_bio->state); |
394 | } else { | 400 | } else { |
395 | /* | 401 | /* |
@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
505 | start_disk = conf->last_used; | 511 | start_disk = conf->last_used; |
506 | } | 512 | } |
507 | 513 | ||
508 | for (i = 0 ; i < conf->raid_disks ; i++) { | 514 | for (i = 0 ; i < conf->raid_disks * 2 ; i++) { |
509 | sector_t dist; | 515 | sector_t dist; |
510 | sector_t first_bad; | 516 | sector_t first_bad; |
511 | int bad_sectors; | 517 | int bad_sectors; |
@@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits) | |||
609 | return 1; | 615 | return 1; |
610 | 616 | ||
611 | rcu_read_lock(); | 617 | rcu_read_lock(); |
612 | for (i = 0; i < mddev->raid_disks; i++) { | 618 | for (i = 0; i < conf->raid_disks; i++) { |
613 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | 619 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
614 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 620 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
615 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 621 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
@@ -974,7 +980,7 @@ read_again: | |||
974 | */ | 980 | */ |
975 | plugged = mddev_check_plugged(mddev); | 981 | plugged = mddev_check_plugged(mddev); |
976 | 982 | ||
977 | disks = conf->raid_disks; | 983 | disks = conf->raid_disks * 2; |
978 | retry_write: | 984 | retry_write: |
979 | blocked_rdev = NULL; | 985 | blocked_rdev = NULL; |
980 | rcu_read_lock(); | 986 | rcu_read_lock(); |
@@ -988,7 +994,8 @@ read_again: | |||
988 | } | 994 | } |
989 | r1_bio->bios[i] = NULL; | 995 | r1_bio->bios[i] = NULL; |
990 | if (!rdev || test_bit(Faulty, &rdev->flags)) { | 996 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
991 | set_bit(R1BIO_Degraded, &r1_bio->state); | 997 | if (i < conf->raid_disks) |
998 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
992 | continue; | 999 | continue; |
993 | } | 1000 | } |
994 | 1001 | ||
@@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev) | |||
1263 | */ | 1270 | */ |
1264 | for (i = 0; i < conf->raid_disks; i++) { | 1271 | for (i = 0; i < conf->raid_disks; i++) { |
1265 | struct md_rdev *rdev = conf->mirrors[i].rdev; | 1272 | struct md_rdev *rdev = conf->mirrors[i].rdev; |
1273 | struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; | ||
1274 | if (repl | ||
1275 | && repl->recovery_offset == MaxSector | ||
1276 | && !test_bit(Faulty, &repl->flags) | ||
1277 | && !test_and_set_bit(In_sync, &repl->flags)) { | ||
1278 | /* replacement has just become active */ | ||
1279 | if (!rdev || | ||
1280 | !test_and_clear_bit(In_sync, &rdev->flags)) | ||
1281 | count++; | ||
1282 | if (rdev) { | ||
1283 | /* Replaced device not technically | ||
1284 | * faulty, but we need to be sure | ||
1285 | * it gets removed and never re-added | ||
1286 | */ | ||
1287 | set_bit(Faulty, &rdev->flags); | ||
1288 | sysfs_notify_dirent_safe( | ||
1289 | rdev->sysfs_state); | ||
1290 | } | ||
1291 | } | ||
1266 | if (rdev | 1292 | if (rdev |
1267 | && !test_bit(Faulty, &rdev->flags) | 1293 | && !test_bit(Faulty, &rdev->flags) |
1268 | && !test_and_set_bit(In_sync, &rdev->flags)) { | 1294 | && !test_and_set_bit(In_sync, &rdev->flags)) { |
@@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1286 | int mirror = 0; | 1312 | int mirror = 0; |
1287 | struct mirror_info *p; | 1313 | struct mirror_info *p; |
1288 | int first = 0; | 1314 | int first = 0; |
1289 | int last = mddev->raid_disks - 1; | 1315 | int last = conf->raid_disks - 1; |
1290 | 1316 | ||
1291 | if (mddev->recovery_disabled == conf->recovery_disabled) | 1317 | if (mddev->recovery_disabled == conf->recovery_disabled) |
1292 | return -EBUSY; | 1318 | return -EBUSY; |
@@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1294 | if (rdev->raid_disk >= 0) | 1320 | if (rdev->raid_disk >= 0) |
1295 | first = last = rdev->raid_disk; | 1321 | first = last = rdev->raid_disk; |
1296 | 1322 | ||
1297 | for (mirror = first; mirror <= last; mirror++) | 1323 | for (mirror = first; mirror <= last; mirror++) { |
1298 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1324 | p = conf->mirrors+mirror; |
1325 | if (!p->rdev) { | ||
1299 | 1326 | ||
1300 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1327 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1301 | rdev->data_offset << 9); | 1328 | rdev->data_offset << 9); |
@@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1322 | rcu_assign_pointer(p->rdev, rdev); | 1349 | rcu_assign_pointer(p->rdev, rdev); |
1323 | break; | 1350 | break; |
1324 | } | 1351 | } |
1352 | if (test_bit(WantReplacement, &p->rdev->flags) && | ||
1353 | p[conf->raid_disks].rdev == NULL) { | ||
1354 | /* Add this device as a replacement */ | ||
1355 | clear_bit(In_sync, &rdev->flags); | ||
1356 | set_bit(Replacement, &rdev->flags); | ||
1357 | rdev->raid_disk = mirror; | ||
1358 | err = 0; | ||
1359 | conf->fullsync = 1; | ||
1360 | rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); | ||
1361 | break; | ||
1362 | } | ||
1363 | } | ||
1325 | md_integrity_add_rdev(rdev, mddev); | 1364 | md_integrity_add_rdev(rdev, mddev); |
1326 | print_conf(conf); | 1365 | print_conf(conf); |
1327 | return err; | 1366 | return err; |
1328 | } | 1367 | } |
1329 | 1368 | ||
1330 | static int raid1_remove_disk(struct mddev *mddev, int number) | 1369 | static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) |
1331 | { | 1370 | { |
1332 | struct r1conf *conf = mddev->private; | 1371 | struct r1conf *conf = mddev->private; |
1333 | int err = 0; | 1372 | int err = 0; |
1334 | struct md_rdev *rdev; | 1373 | int number = rdev->raid_disk; |
1335 | struct mirror_info *p = conf->mirrors+ number; | 1374 | struct mirror_info *p = conf->mirrors+ number; |
1336 | 1375 | ||
1376 | if (rdev != p->rdev) | ||
1377 | p = conf->mirrors + conf->raid_disks + number; | ||
1378 | |||
1337 | print_conf(conf); | 1379 | print_conf(conf); |
1338 | rdev = p->rdev; | 1380 | if (rdev == p->rdev) { |
1339 | if (rdev) { | ||
1340 | if (test_bit(In_sync, &rdev->flags) || | 1381 | if (test_bit(In_sync, &rdev->flags) || |
1341 | atomic_read(&rdev->nr_pending)) { | 1382 | atomic_read(&rdev->nr_pending)) { |
1342 | err = -EBUSY; | 1383 | err = -EBUSY; |
@@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number) | |||
1358 | err = -EBUSY; | 1399 | err = -EBUSY; |
1359 | p->rdev = rdev; | 1400 | p->rdev = rdev; |
1360 | goto abort; | 1401 | goto abort; |
1361 | } | 1402 | } else if (conf->mirrors[conf->raid_disks + number].rdev) { |
1403 | /* We just removed a device that is being replaced. | ||
1404 | * Move down the replacement. We drain all IO before | ||
1405 | * doing this to avoid confusion. | ||
1406 | */ | ||
1407 | struct md_rdev *repl = | ||
1408 | conf->mirrors[conf->raid_disks + number].rdev; | ||
1409 | raise_barrier(conf); | ||
1410 | clear_bit(Replacement, &repl->flags); | ||
1411 | p->rdev = repl; | ||
1412 | conf->mirrors[conf->raid_disks + number].rdev = NULL; | ||
1413 | lower_barrier(conf); | ||
1414 | clear_bit(WantReplacement, &rdev->flags); | ||
1415 | } else | ||
1416 | clear_bit(WantReplacement, &rdev->flags); | ||
1362 | err = md_integrity_register(mddev); | 1417 | err = md_integrity_register(mddev); |
1363 | } | 1418 | } |
1364 | abort: | 1419 | abort: |
@@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error) | |||
1411 | } while (sectors_to_go > 0); | 1466 | } while (sectors_to_go > 0); |
1412 | set_bit(WriteErrorSeen, | 1467 | set_bit(WriteErrorSeen, |
1413 | &conf->mirrors[mirror].rdev->flags); | 1468 | &conf->mirrors[mirror].rdev->flags); |
1469 | if (!test_and_set_bit(WantReplacement, | ||
1470 | &conf->mirrors[mirror].rdev->flags)) | ||
1471 | set_bit(MD_RECOVERY_NEEDED, & | ||
1472 | mddev->recovery); | ||
1414 | set_bit(R1BIO_WriteError, &r1_bio->state); | 1473 | set_bit(R1BIO_WriteError, &r1_bio->state); |
1415 | } else if (is_badblock(conf->mirrors[mirror].rdev, | 1474 | } else if (is_badblock(conf->mirrors[mirror].rdev, |
1416 | r1_bio->sector, | 1475 | r1_bio->sector, |
@@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, | |||
1441 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | 1500 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) |
1442 | /* success */ | 1501 | /* success */ |
1443 | return 1; | 1502 | return 1; |
1444 | if (rw == WRITE) | 1503 | if (rw == WRITE) { |
1445 | set_bit(WriteErrorSeen, &rdev->flags); | 1504 | set_bit(WriteErrorSeen, &rdev->flags); |
1505 | if (!test_and_set_bit(WantReplacement, | ||
1506 | &rdev->flags)) | ||
1507 | set_bit(MD_RECOVERY_NEEDED, & | ||
1508 | rdev->mddev->recovery); | ||
1509 | } | ||
1446 | /* need to record an error - either for the block or the device */ | 1510 | /* need to record an error - either for the block or the device */ |
1447 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | 1511 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) |
1448 | md_error(rdev->mddev, rdev); | 1512 | md_error(rdev->mddev, rdev); |
@@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
1493 | } | 1557 | } |
1494 | } | 1558 | } |
1495 | d++; | 1559 | d++; |
1496 | if (d == conf->raid_disks) | 1560 | if (d == conf->raid_disks * 2) |
1497 | d = 0; | 1561 | d = 0; |
1498 | } while (!success && d != r1_bio->read_disk); | 1562 | } while (!success && d != r1_bio->read_disk); |
1499 | 1563 | ||
@@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
1510 | mdname(mddev), | 1574 | mdname(mddev), |
1511 | bdevname(bio->bi_bdev, b), | 1575 | bdevname(bio->bi_bdev, b), |
1512 | (unsigned long long)r1_bio->sector); | 1576 | (unsigned long long)r1_bio->sector); |
1513 | for (d = 0; d < conf->raid_disks; d++) { | 1577 | for (d = 0; d < conf->raid_disks * 2; d++) { |
1514 | rdev = conf->mirrors[d].rdev; | 1578 | rdev = conf->mirrors[d].rdev; |
1515 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 1579 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
1516 | continue; | 1580 | continue; |
@@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
1536 | /* write it back and re-read */ | 1600 | /* write it back and re-read */ |
1537 | while (d != r1_bio->read_disk) { | 1601 | while (d != r1_bio->read_disk) { |
1538 | if (d == 0) | 1602 | if (d == 0) |
1539 | d = conf->raid_disks; | 1603 | d = conf->raid_disks * 2; |
1540 | d--; | 1604 | d--; |
1541 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1605 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
1542 | continue; | 1606 | continue; |
@@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
1551 | d = start; | 1615 | d = start; |
1552 | while (d != r1_bio->read_disk) { | 1616 | while (d != r1_bio->read_disk) { |
1553 | if (d == 0) | 1617 | if (d == 0) |
1554 | d = conf->raid_disks; | 1618 | d = conf->raid_disks * 2; |
1555 | d--; | 1619 | d--; |
1556 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1620 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
1557 | continue; | 1621 | continue; |
@@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio) | |||
1584 | int primary; | 1648 | int primary; |
1585 | int i; | 1649 | int i; |
1586 | 1650 | ||
1587 | for (primary = 0; primary < conf->raid_disks; primary++) | 1651 | for (primary = 0; primary < conf->raid_disks * 2; primary++) |
1588 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && | 1652 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && |
1589 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { | 1653 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { |
1590 | r1_bio->bios[primary]->bi_end_io = NULL; | 1654 | r1_bio->bios[primary]->bi_end_io = NULL; |
@@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio) | |||
1592 | break; | 1656 | break; |
1593 | } | 1657 | } |
1594 | r1_bio->read_disk = primary; | 1658 | r1_bio->read_disk = primary; |
1595 | for (i = 0; i < conf->raid_disks; i++) { | 1659 | for (i = 0; i < conf->raid_disks * 2; i++) { |
1596 | int j; | 1660 | int j; |
1597 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); | 1661 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); |
1598 | struct bio *pbio = r1_bio->bios[primary]; | 1662 | struct bio *pbio = r1_bio->bios[primary]; |
@@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) | |||
1656 | { | 1720 | { |
1657 | struct r1conf *conf = mddev->private; | 1721 | struct r1conf *conf = mddev->private; |
1658 | int i; | 1722 | int i; |
1659 | int disks = conf->raid_disks; | 1723 | int disks = conf->raid_disks * 2; |
1660 | struct bio *bio, *wbio; | 1724 | struct bio *bio, *wbio; |
1661 | 1725 | ||
1662 | bio = r1_bio->bios[r1_bio->read_disk]; | 1726 | bio = r1_bio->bios[r1_bio->read_disk]; |
@@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, | |||
1737 | success = 1; | 1801 | success = 1; |
1738 | else { | 1802 | else { |
1739 | d++; | 1803 | d++; |
1740 | if (d == conf->raid_disks) | 1804 | if (d == conf->raid_disks * 2) |
1741 | d = 0; | 1805 | d = 0; |
1742 | } | 1806 | } |
1743 | } while (!success && d != read_disk); | 1807 | } while (!success && d != read_disk); |
@@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, | |||
1753 | start = d; | 1817 | start = d; |
1754 | while (d != read_disk) { | 1818 | while (d != read_disk) { |
1755 | if (d==0) | 1819 | if (d==0) |
1756 | d = conf->raid_disks; | 1820 | d = conf->raid_disks * 2; |
1757 | d--; | 1821 | d--; |
1758 | rdev = conf->mirrors[d].rdev; | 1822 | rdev = conf->mirrors[d].rdev; |
1759 | if (rdev && | 1823 | if (rdev && |
@@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, | |||
1765 | while (d != read_disk) { | 1829 | while (d != read_disk) { |
1766 | char b[BDEVNAME_SIZE]; | 1830 | char b[BDEVNAME_SIZE]; |
1767 | if (d==0) | 1831 | if (d==0) |
1768 | d = conf->raid_disks; | 1832 | d = conf->raid_disks * 2; |
1769 | d--; | 1833 | d--; |
1770 | rdev = conf->mirrors[d].rdev; | 1834 | rdev = conf->mirrors[d].rdev; |
1771 | if (rdev && | 1835 | if (rdev && |
@@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
1887 | { | 1951 | { |
1888 | int m; | 1952 | int m; |
1889 | int s = r1_bio->sectors; | 1953 | int s = r1_bio->sectors; |
1890 | for (m = 0; m < conf->raid_disks ; m++) { | 1954 | for (m = 0; m < conf->raid_disks * 2 ; m++) { |
1891 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 1955 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
1892 | struct bio *bio = r1_bio->bios[m]; | 1956 | struct bio *bio = r1_bio->bios[m]; |
1893 | if (bio->bi_end_io == NULL) | 1957 | if (bio->bi_end_io == NULL) |
@@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
1909 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | 1973 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) |
1910 | { | 1974 | { |
1911 | int m; | 1975 | int m; |
1912 | for (m = 0; m < conf->raid_disks ; m++) | 1976 | for (m = 0; m < conf->raid_disks * 2 ; m++) |
1913 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | 1977 | if (r1_bio->bios[m] == IO_MADE_GOOD) { |
1914 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 1978 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
1915 | rdev_clear_badblocks(rdev, | 1979 | rdev_clear_badblocks(rdev, |
@@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2184 | r1_bio->state = 0; | 2248 | r1_bio->state = 0; |
2185 | set_bit(R1BIO_IsSync, &r1_bio->state); | 2249 | set_bit(R1BIO_IsSync, &r1_bio->state); |
2186 | 2250 | ||
2187 | for (i=0; i < conf->raid_disks; i++) { | 2251 | for (i = 0; i < conf->raid_disks * 2; i++) { |
2188 | struct md_rdev *rdev; | 2252 | struct md_rdev *rdev; |
2189 | bio = r1_bio->bios[i]; | 2253 | bio = r1_bio->bios[i]; |
2190 | 2254 | ||
@@ -2203,7 +2267,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2203 | rdev = rcu_dereference(conf->mirrors[i].rdev); | 2267 | rdev = rcu_dereference(conf->mirrors[i].rdev); |
2204 | if (rdev == NULL || | 2268 | if (rdev == NULL || |
2205 | test_bit(Faulty, &rdev->flags)) { | 2269 | test_bit(Faulty, &rdev->flags)) { |
2206 | still_degraded = 1; | 2270 | if (i < conf->raid_disks) |
2271 | still_degraded = 1; | ||
2207 | } else if (!test_bit(In_sync, &rdev->flags)) { | 2272 | } else if (!test_bit(In_sync, &rdev->flags)) { |
2208 | bio->bi_rw = WRITE; | 2273 | bio->bi_rw = WRITE; |
2209 | bio->bi_end_io = end_sync_write; | 2274 | bio->bi_end_io = end_sync_write; |
@@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2254 | * need to mark them bad on all write targets | 2319 | * need to mark them bad on all write targets |
2255 | */ | 2320 | */ |
2256 | int ok = 1; | 2321 | int ok = 1; |
2257 | for (i = 0 ; i < conf->raid_disks ; i++) | 2322 | for (i = 0 ; i < conf->raid_disks * 2 ; i++) |
2258 | if (r1_bio->bios[i]->bi_end_io == end_sync_write) { | 2323 | if (r1_bio->bios[i]->bi_end_io == end_sync_write) { |
2259 | struct md_rdev *rdev = | 2324 | struct md_rdev *rdev = |
2260 | rcu_dereference(conf->mirrors[i].rdev); | 2325 | rcu_dereference(conf->mirrors[i].rdev); |
@@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2323 | len = sync_blocks<<9; | 2388 | len = sync_blocks<<9; |
2324 | } | 2389 | } |
2325 | 2390 | ||
2326 | for (i=0 ; i < conf->raid_disks; i++) { | 2391 | for (i = 0 ; i < conf->raid_disks * 2; i++) { |
2327 | bio = r1_bio->bios[i]; | 2392 | bio = r1_bio->bios[i]; |
2328 | if (bio->bi_end_io) { | 2393 | if (bio->bi_end_io) { |
2329 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | 2394 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; |
@@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2356 | */ | 2421 | */ |
2357 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 2422 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
2358 | atomic_set(&r1_bio->remaining, read_targets); | 2423 | atomic_set(&r1_bio->remaining, read_targets); |
2359 | for (i=0; i<conf->raid_disks; i++) { | 2424 | for (i = 0; i < conf->raid_disks * 2; i++) { |
2360 | bio = r1_bio->bios[i]; | 2425 | bio = r1_bio->bios[i]; |
2361 | if (bio->bi_end_io == end_sync_read) { | 2426 | if (bio->bi_end_io == end_sync_read) { |
2362 | md_sync_acct(bio->bi_bdev, nr_sectors); | 2427 | md_sync_acct(bio->bi_bdev, nr_sectors); |
@@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2393 | if (!conf) | 2458 | if (!conf) |
2394 | goto abort; | 2459 | goto abort; |
2395 | 2460 | ||
2396 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, | 2461 | conf->mirrors = kzalloc(sizeof(struct mirror_info) |
2462 | * mddev->raid_disks * 2, | ||
2397 | GFP_KERNEL); | 2463 | GFP_KERNEL); |
2398 | if (!conf->mirrors) | 2464 | if (!conf->mirrors) |
2399 | goto abort; | 2465 | goto abort; |
@@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2405 | conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); | 2471 | conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); |
2406 | if (!conf->poolinfo) | 2472 | if (!conf->poolinfo) |
2407 | goto abort; | 2473 | goto abort; |
2408 | conf->poolinfo->raid_disks = mddev->raid_disks; | 2474 | conf->poolinfo->raid_disks = mddev->raid_disks * 2; |
2409 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | 2475 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, |
2410 | r1bio_pool_free, | 2476 | r1bio_pool_free, |
2411 | conf->poolinfo); | 2477 | conf->poolinfo); |
@@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2414 | 2480 | ||
2415 | conf->poolinfo->mddev = mddev; | 2481 | conf->poolinfo->mddev = mddev; |
2416 | 2482 | ||
2483 | err = -EINVAL; | ||
2417 | spin_lock_init(&conf->device_lock); | 2484 | spin_lock_init(&conf->device_lock); |
2418 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2485 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2419 | int disk_idx = rdev->raid_disk; | 2486 | int disk_idx = rdev->raid_disk; |
2420 | if (disk_idx >= mddev->raid_disks | 2487 | if (disk_idx >= mddev->raid_disks |
2421 | || disk_idx < 0) | 2488 | || disk_idx < 0) |
2422 | continue; | 2489 | continue; |
2423 | disk = conf->mirrors + disk_idx; | 2490 | if (test_bit(Replacement, &rdev->flags)) |
2491 | disk = conf->mirrors + conf->raid_disks + disk_idx; | ||
2492 | else | ||
2493 | disk = conf->mirrors + disk_idx; | ||
2424 | 2494 | ||
2495 | if (disk->rdev) | ||
2496 | goto abort; | ||
2425 | disk->rdev = rdev; | 2497 | disk->rdev = rdev; |
2426 | 2498 | ||
2427 | disk->head_position = 0; | 2499 | disk->head_position = 0; |
@@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2437 | conf->pending_count = 0; | 2509 | conf->pending_count = 0; |
2438 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 2510 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
2439 | 2511 | ||
2512 | err = -EIO; | ||
2440 | conf->last_used = -1; | 2513 | conf->last_used = -1; |
2441 | for (i = 0; i < conf->raid_disks; i++) { | 2514 | for (i = 0; i < conf->raid_disks * 2; i++) { |
2442 | 2515 | ||
2443 | disk = conf->mirrors + i; | 2516 | disk = conf->mirrors + i; |
2444 | 2517 | ||
2518 | if (i < conf->raid_disks && | ||
2519 | disk[conf->raid_disks].rdev) { | ||
2520 | /* This slot has a replacement. */ | ||
2521 | if (!disk->rdev) { | ||
2522 | /* No original, just make the replacement | ||
2523 | * a recovering spare | ||
2524 | */ | ||
2525 | disk->rdev = | ||
2526 | disk[conf->raid_disks].rdev; | ||
2527 | disk[conf->raid_disks].rdev = NULL; | ||
2528 | } else if (!test_bit(In_sync, &disk->rdev->flags)) | ||
2529 | /* Original is not in_sync - bad */ | ||
2530 | goto abort; | ||
2531 | } | ||
2532 | |||
2445 | if (!disk->rdev || | 2533 | if (!disk->rdev || |
2446 | !test_bit(In_sync, &disk->rdev->flags)) { | 2534 | !test_bit(In_sync, &disk->rdev->flags)) { |
2447 | disk->head_position = 0; | 2535 | disk->head_position = 0; |
@@ -2455,7 +2543,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2455 | conf->last_used = i; | 2543 | conf->last_used = i; |
2456 | } | 2544 | } |
2457 | 2545 | ||
2458 | err = -EIO; | ||
2459 | if (conf->last_used < 0) { | 2546 | if (conf->last_used < 0) { |
2460 | printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", | 2547 | printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", |
2461 | mdname(mddev)); | 2548 | mdname(mddev)); |
@@ -2665,7 +2752,7 @@ static int raid1_reshape(struct mddev *mddev) | |||
2665 | if (!newpoolinfo) | 2752 | if (!newpoolinfo) |
2666 | return -ENOMEM; | 2753 | return -ENOMEM; |
2667 | newpoolinfo->mddev = mddev; | 2754 | newpoolinfo->mddev = mddev; |
2668 | newpoolinfo->raid_disks = raid_disks; | 2755 | newpoolinfo->raid_disks = raid_disks * 2; |
2669 | 2756 | ||
2670 | newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | 2757 | newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, |
2671 | r1bio_pool_free, newpoolinfo); | 2758 | r1bio_pool_free, newpoolinfo); |
@@ -2673,7 +2760,8 @@ static int raid1_reshape(struct mddev *mddev) | |||
2673 | kfree(newpoolinfo); | 2760 | kfree(newpoolinfo); |
2674 | return -ENOMEM; | 2761 | return -ENOMEM; |
2675 | } | 2762 | } |
2676 | newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); | 2763 | newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, |
2764 | GFP_KERNEL); | ||
2677 | if (!newmirrors) { | 2765 | if (!newmirrors) { |
2678 | kfree(newpoolinfo); | 2766 | kfree(newpoolinfo); |
2679 | mempool_destroy(newpool); | 2767 | mempool_destroy(newpool); |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c732b6cce935..80ded139314c 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -12,6 +12,9 @@ struct mirror_info { | |||
12 | * pool was allocated for, so they know how much to allocate and free. | 12 | * pool was allocated for, so they know how much to allocate and free. |
13 | * mddev->raid_disks cannot be used, as it can change while a pool is active | 13 | * mddev->raid_disks cannot be used, as it can change while a pool is active |
14 | * These two datums are stored in a kmalloced struct. | 14 | * These two datums are stored in a kmalloced struct. |
15 | * The 'raid_disks' here is twice the raid_disks in r1conf. | ||
16 | * This leaves room for each 'real' device to have a replacement in the | ||
17 | * second half of the array. | ||
15 | */ | 18 | */ |
16 | 19 | ||
17 | struct pool_info { | 20 | struct pool_info { |
@@ -21,7 +24,9 @@ struct pool_info { | |||
21 | 24 | ||
22 | struct r1conf { | 25 | struct r1conf { |
23 | struct mddev *mddev; | 26 | struct mddev *mddev; |
24 | struct mirror_info *mirrors; | 27 | struct mirror_info *mirrors; /* twice 'raid_disks' to |
28 | * allow for replacements. | ||
29 | */ | ||
25 | int raid_disks; | 30 | int raid_disks; |
26 | 31 | ||
27 | /* When choose the best device for a read (read_balance()) | 32 | /* When choose the best device for a read (read_balance()) |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 685ddf325ee4..6e8aa213f0d5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | |||
73 | struct r10conf *conf = data; | 73 | struct r10conf *conf = data; |
74 | int size = offsetof(struct r10bio, devs[conf->copies]); | 74 | int size = offsetof(struct r10bio, devs[conf->copies]); |
75 | 75 | ||
76 | /* allocate a r10bio with room for raid_disks entries in the bios array */ | 76 | /* allocate a r10bio with room for raid_disks entries in the |
77 | * bios array */ | ||
77 | return kzalloc(size, gfp_flags); | 78 | return kzalloc(size, gfp_flags); |
78 | } | 79 | } |
79 | 80 | ||
@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
123 | if (!bio) | 124 | if (!bio) |
124 | goto out_free_bio; | 125 | goto out_free_bio; |
125 | r10_bio->devs[j].bio = bio; | 126 | r10_bio->devs[j].bio = bio; |
127 | if (!conf->have_replacement) | ||
128 | continue; | ||
129 | bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); | ||
130 | if (!bio) | ||
131 | goto out_free_bio; | ||
132 | r10_bio->devs[j].repl_bio = bio; | ||
126 | } | 133 | } |
127 | /* | 134 | /* |
128 | * Allocate RESYNC_PAGES data pages and attach them | 135 | * Allocate RESYNC_PAGES data pages and attach them |
129 | * where needed. | 136 | * where needed. |
130 | */ | 137 | */ |
131 | for (j = 0 ; j < nalloc; j++) { | 138 | for (j = 0 ; j < nalloc; j++) { |
139 | struct bio *rbio = r10_bio->devs[j].repl_bio; | ||
132 | bio = r10_bio->devs[j].bio; | 140 | bio = r10_bio->devs[j].bio; |
133 | for (i = 0; i < RESYNC_PAGES; i++) { | 141 | for (i = 0; i < RESYNC_PAGES; i++) { |
134 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, | 142 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, |
@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
143 | goto out_free_pages; | 151 | goto out_free_pages; |
144 | 152 | ||
145 | bio->bi_io_vec[i].bv_page = page; | 153 | bio->bi_io_vec[i].bv_page = page; |
154 | if (rbio) | ||
155 | rbio->bi_io_vec[i].bv_page = page; | ||
146 | } | 156 | } |
147 | } | 157 | } |
148 | 158 | ||
@@ -156,8 +166,11 @@ out_free_pages: | |||
156 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); | 166 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); |
157 | j = -1; | 167 | j = -1; |
158 | out_free_bio: | 168 | out_free_bio: |
159 | while ( ++j < nalloc ) | 169 | while (++j < nalloc) { |
160 | bio_put(r10_bio->devs[j].bio); | 170 | bio_put(r10_bio->devs[j].bio); |
171 | if (r10_bio->devs[j].repl_bio) | ||
172 | bio_put(r10_bio->devs[j].repl_bio); | ||
173 | } | ||
161 | r10bio_pool_free(r10_bio, conf); | 174 | r10bio_pool_free(r10_bio, conf); |
162 | return NULL; | 175 | return NULL; |
163 | } | 176 | } |
@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data) | |||
178 | } | 191 | } |
179 | bio_put(bio); | 192 | bio_put(bio); |
180 | } | 193 | } |
194 | bio = r10bio->devs[j].repl_bio; | ||
195 | if (bio) | ||
196 | bio_put(bio); | ||
181 | } | 197 | } |
182 | r10bio_pool_free(r10bio, conf); | 198 | r10bio_pool_free(r10bio, conf); |
183 | } | 199 | } |
@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) | |||
191 | if (!BIO_SPECIAL(*bio)) | 207 | if (!BIO_SPECIAL(*bio)) |
192 | bio_put(*bio); | 208 | bio_put(*bio); |
193 | *bio = NULL; | 209 | *bio = NULL; |
210 | bio = &r10_bio->devs[i].repl_bio; | ||
211 | if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) | ||
212 | bio_put(*bio); | ||
213 | *bio = NULL; | ||
194 | } | 214 | } |
195 | } | 215 | } |
196 | 216 | ||
@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio) | |||
275 | * Find the disk number which triggered given bio | 295 | * Find the disk number which triggered given bio |
276 | */ | 296 | */ |
277 | static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, | 297 | static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, |
278 | struct bio *bio, int *slotp) | 298 | struct bio *bio, int *slotp, int *replp) |
279 | { | 299 | { |
280 | int slot; | 300 | int slot; |
301 | int repl = 0; | ||
281 | 302 | ||
282 | for (slot = 0; slot < conf->copies; slot++) | 303 | for (slot = 0; slot < conf->copies; slot++) { |
283 | if (r10_bio->devs[slot].bio == bio) | 304 | if (r10_bio->devs[slot].bio == bio) |
284 | break; | 305 | break; |
306 | if (r10_bio->devs[slot].repl_bio == bio) { | ||
307 | repl = 1; | ||
308 | break; | ||
309 | } | ||
310 | } | ||
285 | 311 | ||
286 | BUG_ON(slot == conf->copies); | 312 | BUG_ON(slot == conf->copies); |
287 | update_head_pos(slot, r10_bio); | 313 | update_head_pos(slot, r10_bio); |
288 | 314 | ||
289 | if (slotp) | 315 | if (slotp) |
290 | *slotp = slot; | 316 | *slotp = slot; |
317 | if (replp) | ||
318 | *replp = repl; | ||
291 | return r10_bio->devs[slot].devnum; | 319 | return r10_bio->devs[slot].devnum; |
292 | } | 320 | } |
293 | 321 | ||
@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
296 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 324 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
297 | struct r10bio *r10_bio = bio->bi_private; | 325 | struct r10bio *r10_bio = bio->bi_private; |
298 | int slot, dev; | 326 | int slot, dev; |
327 | struct md_rdev *rdev; | ||
299 | struct r10conf *conf = r10_bio->mddev->private; | 328 | struct r10conf *conf = r10_bio->mddev->private; |
300 | 329 | ||
301 | 330 | ||
302 | slot = r10_bio->read_slot; | 331 | slot = r10_bio->read_slot; |
303 | dev = r10_bio->devs[slot].devnum; | 332 | dev = r10_bio->devs[slot].devnum; |
333 | rdev = r10_bio->devs[slot].rdev; | ||
304 | /* | 334 | /* |
305 | * this branch is our 'one mirror IO has finished' event handler: | 335 | * this branch is our 'one mirror IO has finished' event handler: |
306 | */ | 336 | */ |
@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
318 | */ | 348 | */ |
319 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 349 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
320 | raid_end_bio_io(r10_bio); | 350 | raid_end_bio_io(r10_bio); |
321 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | 351 | rdev_dec_pending(rdev, conf->mddev); |
322 | } else { | 352 | } else { |
323 | /* | 353 | /* |
324 | * oops, read error - keep the refcount on the rdev | 354 | * oops, read error - keep the refcount on the rdev |
@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
327 | printk_ratelimited(KERN_ERR | 357 | printk_ratelimited(KERN_ERR |
328 | "md/raid10:%s: %s: rescheduling sector %llu\n", | 358 | "md/raid10:%s: %s: rescheduling sector %llu\n", |
329 | mdname(conf->mddev), | 359 | mdname(conf->mddev), |
330 | bdevname(conf->mirrors[dev].rdev->bdev, b), | 360 | bdevname(rdev->bdev, b), |
331 | (unsigned long long)r10_bio->sector); | 361 | (unsigned long long)r10_bio->sector); |
332 | set_bit(R10BIO_ReadError, &r10_bio->state); | 362 | set_bit(R10BIO_ReadError, &r10_bio->state); |
333 | reschedule_retry(r10_bio); | 363 | reschedule_retry(r10_bio); |
@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
366 | int dev; | 396 | int dev; |
367 | int dec_rdev = 1; | 397 | int dec_rdev = 1; |
368 | struct r10conf *conf = r10_bio->mddev->private; | 398 | struct r10conf *conf = r10_bio->mddev->private; |
369 | int slot; | 399 | int slot, repl; |
400 | struct md_rdev *rdev = NULL; | ||
370 | 401 | ||
371 | dev = find_bio_disk(conf, r10_bio, bio, &slot); | 402 | dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); |
372 | 403 | ||
404 | if (repl) | ||
405 | rdev = conf->mirrors[dev].replacement; | ||
406 | if (!rdev) { | ||
407 | smp_rmb(); | ||
408 | repl = 0; | ||
409 | rdev = conf->mirrors[dev].rdev; | ||
410 | } | ||
373 | /* | 411 | /* |
374 | * this branch is our 'one mirror IO has finished' event handler: | 412 | * this branch is our 'one mirror IO has finished' event handler: |
375 | */ | 413 | */ |
376 | if (!uptodate) { | 414 | if (!uptodate) { |
377 | set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); | 415 | if (repl) |
378 | set_bit(R10BIO_WriteError, &r10_bio->state); | 416 | /* Never record new bad blocks to replacement, |
379 | dec_rdev = 0; | 417 | * just fail it. |
418 | */ | ||
419 | md_error(rdev->mddev, rdev); | ||
420 | else { | ||
421 | set_bit(WriteErrorSeen, &rdev->flags); | ||
422 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | ||
423 | set_bit(MD_RECOVERY_NEEDED, | ||
424 | &rdev->mddev->recovery); | ||
425 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
426 | dec_rdev = 0; | ||
427 | } | ||
380 | } else { | 428 | } else { |
381 | /* | 429 | /* |
382 | * Set R10BIO_Uptodate in our master bio, so that | 430 | * Set R10BIO_Uptodate in our master bio, so that |
@@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
393 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 441 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
394 | 442 | ||
395 | /* Maybe we can clear some bad blocks. */ | 443 | /* Maybe we can clear some bad blocks. */ |
396 | if (is_badblock(conf->mirrors[dev].rdev, | 444 | if (is_badblock(rdev, |
397 | r10_bio->devs[slot].addr, | 445 | r10_bio->devs[slot].addr, |
398 | r10_bio->sectors, | 446 | r10_bio->sectors, |
399 | &first_bad, &bad_sectors)) { | 447 | &first_bad, &bad_sectors)) { |
400 | bio_put(bio); | 448 | bio_put(bio); |
401 | r10_bio->devs[slot].bio = IO_MADE_GOOD; | 449 | if (repl) |
450 | r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; | ||
451 | else | ||
452 | r10_bio->devs[slot].bio = IO_MADE_GOOD; | ||
402 | dec_rdev = 0; | 453 | dec_rdev = 0; |
403 | set_bit(R10BIO_MadeGood, &r10_bio->state); | 454 | set_bit(R10BIO_MadeGood, &r10_bio->state); |
404 | } | 455 | } |
@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
414 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | 465 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); |
415 | } | 466 | } |
416 | 467 | ||
417 | |||
418 | /* | 468 | /* |
419 | * RAID10 layout manager | 469 | * RAID10 layout manager |
420 | * As well as the chunksize and raid_disks count, there are two | 470 | * As well as the chunksize and raid_disks count, there are two |
@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
562 | * FIXME: possibly should rethink readbalancing and do it differently | 612 | * FIXME: possibly should rethink readbalancing and do it differently |
563 | * depending on near_copies / far_copies geometry. | 613 | * depending on near_copies / far_copies geometry. |
564 | */ | 614 | */ |
565 | static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors) | 615 | static struct md_rdev *read_balance(struct r10conf *conf, |
616 | struct r10bio *r10_bio, | ||
617 | int *max_sectors) | ||
566 | { | 618 | { |
567 | const sector_t this_sector = r10_bio->sector; | 619 | const sector_t this_sector = r10_bio->sector; |
568 | int disk, slot; | 620 | int disk, slot; |
569 | int sectors = r10_bio->sectors; | 621 | int sectors = r10_bio->sectors; |
570 | int best_good_sectors; | 622 | int best_good_sectors; |
571 | sector_t new_distance, best_dist; | 623 | sector_t new_distance, best_dist; |
572 | struct md_rdev *rdev; | 624 | struct md_rdev *rdev, *best_rdev; |
573 | int do_balance; | 625 | int do_balance; |
574 | int best_slot; | 626 | int best_slot; |
575 | 627 | ||
@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s | |||
578 | retry: | 630 | retry: |
579 | sectors = r10_bio->sectors; | 631 | sectors = r10_bio->sectors; |
580 | best_slot = -1; | 632 | best_slot = -1; |
633 | best_rdev = NULL; | ||
581 | best_dist = MaxSector; | 634 | best_dist = MaxSector; |
582 | best_good_sectors = 0; | 635 | best_good_sectors = 0; |
583 | do_balance = 1; | 636 | do_balance = 1; |
@@ -599,10 +652,16 @@ retry: | |||
599 | if (r10_bio->devs[slot].bio == IO_BLOCKED) | 652 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
600 | continue; | 653 | continue; |
601 | disk = r10_bio->devs[slot].devnum; | 654 | disk = r10_bio->devs[slot].devnum; |
602 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 655 | rdev = rcu_dereference(conf->mirrors[disk].replacement); |
656 | if (rdev == NULL || test_bit(Faulty, &rdev->flags) || | ||
657 | r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) | ||
658 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | ||
603 | if (rdev == NULL) | 659 | if (rdev == NULL) |
604 | continue; | 660 | continue; |
605 | if (!test_bit(In_sync, &rdev->flags)) | 661 | if (test_bit(Faulty, &rdev->flags)) |
662 | continue; | ||
663 | if (!test_bit(In_sync, &rdev->flags) && | ||
664 | r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) | ||
606 | continue; | 665 | continue; |
607 | 666 | ||
608 | dev_sector = r10_bio->devs[slot].addr; | 667 | dev_sector = r10_bio->devs[slot].addr; |
@@ -627,6 +686,7 @@ retry: | |||
627 | if (good_sectors > best_good_sectors) { | 686 | if (good_sectors > best_good_sectors) { |
628 | best_good_sectors = good_sectors; | 687 | best_good_sectors = good_sectors; |
629 | best_slot = slot; | 688 | best_slot = slot; |
689 | best_rdev = rdev; | ||
630 | } | 690 | } |
631 | if (!do_balance) | 691 | if (!do_balance) |
632 | /* Must read from here */ | 692 | /* Must read from here */ |
@@ -655,16 +715,15 @@ retry: | |||
655 | if (new_distance < best_dist) { | 715 | if (new_distance < best_dist) { |
656 | best_dist = new_distance; | 716 | best_dist = new_distance; |
657 | best_slot = slot; | 717 | best_slot = slot; |
718 | best_rdev = rdev; | ||
658 | } | 719 | } |
659 | } | 720 | } |
660 | if (slot == conf->copies) | 721 | if (slot >= conf->copies) { |
661 | slot = best_slot; | 722 | slot = best_slot; |
723 | rdev = best_rdev; | ||
724 | } | ||
662 | 725 | ||
663 | if (slot >= 0) { | 726 | if (slot >= 0) { |
664 | disk = r10_bio->devs[slot].devnum; | ||
665 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | ||
666 | if (!rdev) | ||
667 | goto retry; | ||
668 | atomic_inc(&rdev->nr_pending); | 727 | atomic_inc(&rdev->nr_pending); |
669 | if (test_bit(Faulty, &rdev->flags)) { | 728 | if (test_bit(Faulty, &rdev->flags)) { |
670 | /* Cannot risk returning a device that failed | 729 | /* Cannot risk returning a device that failed |
@@ -675,11 +734,11 @@ retry: | |||
675 | } | 734 | } |
676 | r10_bio->read_slot = slot; | 735 | r10_bio->read_slot = slot; |
677 | } else | 736 | } else |
678 | disk = -1; | 737 | rdev = NULL; |
679 | rcu_read_unlock(); | 738 | rcu_read_unlock(); |
680 | *max_sectors = best_good_sectors; | 739 | *max_sectors = best_good_sectors; |
681 | 740 | ||
682 | return disk; | 741 | return rdev; |
683 | } | 742 | } |
684 | 743 | ||
685 | static int raid10_congested(void *data, int bits) | 744 | static int raid10_congested(void *data, int bits) |
@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf) | |||
846 | static void make_request(struct mddev *mddev, struct bio * bio) | 905 | static void make_request(struct mddev *mddev, struct bio * bio) |
847 | { | 906 | { |
848 | struct r10conf *conf = mddev->private; | 907 | struct r10conf *conf = mddev->private; |
849 | struct mirror_info *mirror; | ||
850 | struct r10bio *r10_bio; | 908 | struct r10bio *r10_bio; |
851 | struct bio *read_bio; | 909 | struct bio *read_bio; |
852 | int i; | 910 | int i; |
@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
945 | /* | 1003 | /* |
946 | * read balancing logic: | 1004 | * read balancing logic: |
947 | */ | 1005 | */ |
948 | int disk; | 1006 | struct md_rdev *rdev; |
949 | int slot; | 1007 | int slot; |
950 | 1008 | ||
951 | read_again: | 1009 | read_again: |
952 | disk = read_balance(conf, r10_bio, &max_sectors); | 1010 | rdev = read_balance(conf, r10_bio, &max_sectors); |
953 | slot = r10_bio->read_slot; | 1011 | if (!rdev) { |
954 | if (disk < 0) { | ||
955 | raid_end_bio_io(r10_bio); | 1012 | raid_end_bio_io(r10_bio); |
956 | return; | 1013 | return; |
957 | } | 1014 | } |
958 | mirror = conf->mirrors + disk; | 1015 | slot = r10_bio->read_slot; |
959 | 1016 | ||
960 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1017 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
961 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, | 1018 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, |
962 | max_sectors); | 1019 | max_sectors); |
963 | 1020 | ||
964 | r10_bio->devs[slot].bio = read_bio; | 1021 | r10_bio->devs[slot].bio = read_bio; |
1022 | r10_bio->devs[slot].rdev = rdev; | ||
965 | 1023 | ||
966 | read_bio->bi_sector = r10_bio->devs[slot].addr + | 1024 | read_bio->bi_sector = r10_bio->devs[slot].addr + |
967 | mirror->rdev->data_offset; | 1025 | rdev->data_offset; |
968 | read_bio->bi_bdev = mirror->rdev->bdev; | 1026 | read_bio->bi_bdev = rdev->bdev; |
969 | read_bio->bi_end_io = raid10_end_read_request; | 1027 | read_bio->bi_end_io = raid10_end_read_request; |
970 | read_bio->bi_rw = READ | do_sync; | 1028 | read_bio->bi_rw = READ | do_sync; |
971 | read_bio->bi_private = r10_bio; | 1029 | read_bio->bi_private = r10_bio; |
@@ -1025,6 +1083,7 @@ read_again: | |||
1025 | */ | 1083 | */ |
1026 | plugged = mddev_check_plugged(mddev); | 1084 | plugged = mddev_check_plugged(mddev); |
1027 | 1085 | ||
1086 | r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ | ||
1028 | raid10_find_phys(conf, r10_bio); | 1087 | raid10_find_phys(conf, r10_bio); |
1029 | retry_write: | 1088 | retry_write: |
1030 | blocked_rdev = NULL; | 1089 | blocked_rdev = NULL; |
@@ -1034,12 +1093,25 @@ retry_write: | |||
1034 | for (i = 0; i < conf->copies; i++) { | 1093 | for (i = 0; i < conf->copies; i++) { |
1035 | int d = r10_bio->devs[i].devnum; | 1094 | int d = r10_bio->devs[i].devnum; |
1036 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); | 1095 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); |
1096 | struct md_rdev *rrdev = rcu_dereference( | ||
1097 | conf->mirrors[d].replacement); | ||
1098 | if (rdev == rrdev) | ||
1099 | rrdev = NULL; | ||
1037 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 1100 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { |
1038 | atomic_inc(&rdev->nr_pending); | 1101 | atomic_inc(&rdev->nr_pending); |
1039 | blocked_rdev = rdev; | 1102 | blocked_rdev = rdev; |
1040 | break; | 1103 | break; |
1041 | } | 1104 | } |
1105 | if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { | ||
1106 | atomic_inc(&rrdev->nr_pending); | ||
1107 | blocked_rdev = rrdev; | ||
1108 | break; | ||
1109 | } | ||
1110 | if (rrdev && test_bit(Faulty, &rrdev->flags)) | ||
1111 | rrdev = NULL; | ||
1112 | |||
1042 | r10_bio->devs[i].bio = NULL; | 1113 | r10_bio->devs[i].bio = NULL; |
1114 | r10_bio->devs[i].repl_bio = NULL; | ||
1043 | if (!rdev || test_bit(Faulty, &rdev->flags)) { | 1115 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
1044 | set_bit(R10BIO_Degraded, &r10_bio->state); | 1116 | set_bit(R10BIO_Degraded, &r10_bio->state); |
1045 | continue; | 1117 | continue; |
@@ -1088,6 +1160,10 @@ retry_write: | |||
1088 | } | 1160 | } |
1089 | r10_bio->devs[i].bio = bio; | 1161 | r10_bio->devs[i].bio = bio; |
1090 | atomic_inc(&rdev->nr_pending); | 1162 | atomic_inc(&rdev->nr_pending); |
1163 | if (rrdev) { | ||
1164 | r10_bio->devs[i].repl_bio = bio; | ||
1165 | atomic_inc(&rrdev->nr_pending); | ||
1166 | } | ||
1091 | } | 1167 | } |
1092 | rcu_read_unlock(); | 1168 | rcu_read_unlock(); |
1093 | 1169 | ||
@@ -1096,11 +1172,23 @@ retry_write: | |||
1096 | int j; | 1172 | int j; |
1097 | int d; | 1173 | int d; |
1098 | 1174 | ||
1099 | for (j = 0; j < i; j++) | 1175 | for (j = 0; j < i; j++) { |
1100 | if (r10_bio->devs[j].bio) { | 1176 | if (r10_bio->devs[j].bio) { |
1101 | d = r10_bio->devs[j].devnum; | 1177 | d = r10_bio->devs[j].devnum; |
1102 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | 1178 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); |
1103 | } | 1179 | } |
1180 | if (r10_bio->devs[j].repl_bio) { | ||
1181 | struct md_rdev *rdev; | ||
1182 | d = r10_bio->devs[j].devnum; | ||
1183 | rdev = conf->mirrors[d].replacement; | ||
1184 | if (!rdev) { | ||
1185 | /* Race with remove_disk */ | ||
1186 | smp_mb(); | ||
1187 | rdev = conf->mirrors[d].rdev; | ||
1188 | } | ||
1189 | rdev_dec_pending(rdev, mddev); | ||
1190 | } | ||
1191 | } | ||
1104 | allow_barrier(conf); | 1192 | allow_barrier(conf); |
1105 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1193 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
1106 | wait_barrier(conf); | 1194 | wait_barrier(conf); |
@@ -1147,6 +1235,31 @@ retry_write: | |||
1147 | bio_list_add(&conf->pending_bio_list, mbio); | 1235 | bio_list_add(&conf->pending_bio_list, mbio); |
1148 | conf->pending_count++; | 1236 | conf->pending_count++; |
1149 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1237 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1238 | |||
1239 | if (!r10_bio->devs[i].repl_bio) | ||
1240 | continue; | ||
1241 | |||
1242 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | ||
1243 | md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, | ||
1244 | max_sectors); | ||
1245 | r10_bio->devs[i].repl_bio = mbio; | ||
1246 | |||
1247 | /* We are actively writing to the original device | ||
1248 | * so it cannot disappear, so the replacement cannot | ||
1249 | * become NULL here | ||
1250 | */ | ||
1251 | mbio->bi_sector = (r10_bio->devs[i].addr+ | ||
1252 | conf->mirrors[d].replacement->data_offset); | ||
1253 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; | ||
1254 | mbio->bi_end_io = raid10_end_write_request; | ||
1255 | mbio->bi_rw = WRITE | do_sync | do_fua; | ||
1256 | mbio->bi_private = r10_bio; | ||
1257 | |||
1258 | atomic_inc(&r10_bio->remaining); | ||
1259 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1260 | bio_list_add(&conf->pending_bio_list, mbio); | ||
1261 | conf->pending_count++; | ||
1262 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1150 | } | 1263 | } |
1151 | 1264 | ||
1152 | /* Don't remove the bias on 'remaining' (one_write_done) until | 1265 | /* Don't remove the bias on 'remaining' (one_write_done) until |
@@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev) | |||
1309 | */ | 1422 | */ |
1310 | for (i = 0; i < conf->raid_disks; i++) { | 1423 | for (i = 0; i < conf->raid_disks; i++) { |
1311 | tmp = conf->mirrors + i; | 1424 | tmp = conf->mirrors + i; |
1312 | if (tmp->rdev | 1425 | if (tmp->replacement |
1313 | && !test_bit(Faulty, &tmp->rdev->flags) | 1426 | && tmp->replacement->recovery_offset == MaxSector |
1314 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 1427 | && !test_bit(Faulty, &tmp->replacement->flags) |
1428 | && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { | ||
1429 | /* Replacement has just become active */ | ||
1430 | if (!tmp->rdev | ||
1431 | || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) | ||
1432 | count++; | ||
1433 | if (tmp->rdev) { | ||
1434 | /* Replaced device not technically faulty, | ||
1435 | * but we need to be sure it gets removed | ||
1436 | * and never re-added. | ||
1437 | */ | ||
1438 | set_bit(Faulty, &tmp->rdev->flags); | ||
1439 | sysfs_notify_dirent_safe( | ||
1440 | tmp->rdev->sysfs_state); | ||
1441 | } | ||
1442 | sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); | ||
1443 | } else if (tmp->rdev | ||
1444 | && !test_bit(Faulty, &tmp->rdev->flags) | ||
1445 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | ||
1315 | count++; | 1446 | count++; |
1316 | sysfs_notify_dirent(tmp->rdev->sysfs_state); | 1447 | sysfs_notify_dirent(tmp->rdev->sysfs_state); |
1317 | } | 1448 | } |
@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1353 | struct mirror_info *p = &conf->mirrors[mirror]; | 1484 | struct mirror_info *p = &conf->mirrors[mirror]; |
1354 | if (p->recovery_disabled == mddev->recovery_disabled) | 1485 | if (p->recovery_disabled == mddev->recovery_disabled) |
1355 | continue; | 1486 | continue; |
1356 | if (p->rdev) | 1487 | if (p->rdev) { |
1357 | continue; | 1488 | if (!test_bit(WantReplacement, &p->rdev->flags) || |
1489 | p->replacement != NULL) | ||
1490 | continue; | ||
1491 | clear_bit(In_sync, &rdev->flags); | ||
1492 | set_bit(Replacement, &rdev->flags); | ||
1493 | rdev->raid_disk = mirror; | ||
1494 | err = 0; | ||
1495 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
1496 | rdev->data_offset << 9); | ||
1497 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1498 | blk_queue_max_segments(mddev->queue, 1); | ||
1499 | blk_queue_segment_boundary(mddev->queue, | ||
1500 | PAGE_CACHE_SIZE - 1); | ||
1501 | } | ||
1502 | conf->fullsync = 1; | ||
1503 | rcu_assign_pointer(p->replacement, rdev); | ||
1504 | break; | ||
1505 | } | ||
1358 | 1506 | ||
1359 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1507 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1360 | rdev->data_offset << 9); | 1508 | rdev->data_offset << 9); |
@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1385 | return err; | 1533 | return err; |
1386 | } | 1534 | } |
1387 | 1535 | ||
1388 | static int raid10_remove_disk(struct mddev *mddev, int number) | 1536 | static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) |
1389 | { | 1537 | { |
1390 | struct r10conf *conf = mddev->private; | 1538 | struct r10conf *conf = mddev->private; |
1391 | int err = 0; | 1539 | int err = 0; |
1392 | struct md_rdev *rdev; | 1540 | int number = rdev->raid_disk; |
1393 | struct mirror_info *p = conf->mirrors+ number; | 1541 | struct md_rdev **rdevp; |
1542 | struct mirror_info *p = conf->mirrors + number; | ||
1394 | 1543 | ||
1395 | print_conf(conf); | 1544 | print_conf(conf); |
1396 | rdev = p->rdev; | 1545 | if (rdev == p->rdev) |
1397 | if (rdev) { | 1546 | rdevp = &p->rdev; |
1398 | if (test_bit(In_sync, &rdev->flags) || | 1547 | else if (rdev == p->replacement) |
1399 | atomic_read(&rdev->nr_pending)) { | 1548 | rdevp = &p->replacement; |
1400 | err = -EBUSY; | 1549 | else |
1401 | goto abort; | 1550 | return 0; |
1402 | } | 1551 | |
1403 | /* Only remove faulty devices in recovery | 1552 | if (test_bit(In_sync, &rdev->flags) || |
1404 | * is not possible. | 1553 | atomic_read(&rdev->nr_pending)) { |
1405 | */ | 1554 | err = -EBUSY; |
1406 | if (!test_bit(Faulty, &rdev->flags) && | 1555 | goto abort; |
1407 | mddev->recovery_disabled != p->recovery_disabled && | ||
1408 | enough(conf, -1)) { | ||
1409 | err = -EBUSY; | ||
1410 | goto abort; | ||
1411 | } | ||
1412 | p->rdev = NULL; | ||
1413 | synchronize_rcu(); | ||
1414 | if (atomic_read(&rdev->nr_pending)) { | ||
1415 | /* lost the race, try later */ | ||
1416 | err = -EBUSY; | ||
1417 | p->rdev = rdev; | ||
1418 | goto abort; | ||
1419 | } | ||
1420 | err = md_integrity_register(mddev); | ||
1421 | } | 1556 | } |
1557 | /* Only remove faulty devices if recovery | ||
1558 | * is not possible. | ||
1559 | */ | ||
1560 | if (!test_bit(Faulty, &rdev->flags) && | ||
1561 | mddev->recovery_disabled != p->recovery_disabled && | ||
1562 | (!p->replacement || p->replacement == rdev) && | ||
1563 | enough(conf, -1)) { | ||
1564 | err = -EBUSY; | ||
1565 | goto abort; | ||
1566 | } | ||
1567 | *rdevp = NULL; | ||
1568 | synchronize_rcu(); | ||
1569 | if (atomic_read(&rdev->nr_pending)) { | ||
1570 | /* lost the race, try later */ | ||
1571 | err = -EBUSY; | ||
1572 | *rdevp = rdev; | ||
1573 | goto abort; | ||
1574 | } else if (p->replacement) { | ||
1575 | /* We must have just cleared 'rdev' */ | ||
1576 | p->rdev = p->replacement; | ||
1577 | clear_bit(Replacement, &p->replacement->flags); | ||
1578 | smp_mb(); /* Make sure other CPUs may see both as identical | ||
1579 | * but will never see neither -- if they are careful. | ||
1580 | */ | ||
1581 | p->replacement = NULL; | ||
1582 | clear_bit(WantReplacement, &rdev->flags); | ||
1583 | } else | ||
1584 | /* We might have just remove the Replacement as faulty | ||
1585 | * Clear the flag just in case | ||
1586 | */ | ||
1587 | clear_bit(WantReplacement, &rdev->flags); | ||
1588 | |||
1589 | err = md_integrity_register(mddev); | ||
1590 | |||
1422 | abort: | 1591 | abort: |
1423 | 1592 | ||
1424 | print_conf(conf); | 1593 | print_conf(conf); |
@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error) | |||
1432 | struct r10conf *conf = r10_bio->mddev->private; | 1601 | struct r10conf *conf = r10_bio->mddev->private; |
1433 | int d; | 1602 | int d; |
1434 | 1603 | ||
1435 | d = find_bio_disk(conf, r10_bio, bio, NULL); | 1604 | d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); |
1436 | 1605 | ||
1437 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1606 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1438 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1607 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error) | |||
1493 | sector_t first_bad; | 1662 | sector_t first_bad; |
1494 | int bad_sectors; | 1663 | int bad_sectors; |
1495 | int slot; | 1664 | int slot; |
1496 | 1665 | int repl; | |
1497 | d = find_bio_disk(conf, r10_bio, bio, &slot); | 1666 | struct md_rdev *rdev = NULL; |
1667 | |||
1668 | d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); | ||
1669 | if (repl) | ||
1670 | rdev = conf->mirrors[d].replacement; | ||
1671 | if (!rdev) { | ||
1672 | smp_mb(); | ||
1673 | rdev = conf->mirrors[d].rdev; | ||
1674 | } | ||
1498 | 1675 | ||
1499 | if (!uptodate) { | 1676 | if (!uptodate) { |
1500 | set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); | 1677 | if (repl) |
1501 | set_bit(R10BIO_WriteError, &r10_bio->state); | 1678 | md_error(mddev, rdev); |
1502 | } else if (is_badblock(conf->mirrors[d].rdev, | 1679 | else { |
1680 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1681 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | ||
1682 | set_bit(MD_RECOVERY_NEEDED, | ||
1683 | &rdev->mddev->recovery); | ||
1684 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
1685 | } | ||
1686 | } else if (is_badblock(rdev, | ||
1503 | r10_bio->devs[slot].addr, | 1687 | r10_bio->devs[slot].addr, |
1504 | r10_bio->sectors, | 1688 | r10_bio->sectors, |
1505 | &first_bad, &bad_sectors)) | 1689 | &first_bad, &bad_sectors)) |
1506 | set_bit(R10BIO_MadeGood, &r10_bio->state); | 1690 | set_bit(R10BIO_MadeGood, &r10_bio->state); |
1507 | 1691 | ||
1508 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | 1692 | rdev_dec_pending(rdev, mddev); |
1509 | 1693 | ||
1510 | end_sync_request(r10_bio); | 1694 | end_sync_request(r10_bio); |
1511 | } | 1695 | } |
@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1609 | generic_make_request(tbio); | 1793 | generic_make_request(tbio); |
1610 | } | 1794 | } |
1611 | 1795 | ||
1796 | /* Now write out to any replacement devices | ||
1797 | * that are active | ||
1798 | */ | ||
1799 | for (i = 0; i < conf->copies; i++) { | ||
1800 | int j, d; | ||
1801 | int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); | ||
1802 | |||
1803 | tbio = r10_bio->devs[i].repl_bio; | ||
1804 | if (!tbio || !tbio->bi_end_io) | ||
1805 | continue; | ||
1806 | if (r10_bio->devs[i].bio->bi_end_io != end_sync_write | ||
1807 | && r10_bio->devs[i].bio != fbio) | ||
1808 | for (j = 0; j < vcnt; j++) | ||
1809 | memcpy(page_address(tbio->bi_io_vec[j].bv_page), | ||
1810 | page_address(fbio->bi_io_vec[j].bv_page), | ||
1811 | PAGE_SIZE); | ||
1812 | d = r10_bio->devs[i].devnum; | ||
1813 | atomic_inc(&r10_bio->remaining); | ||
1814 | md_sync_acct(conf->mirrors[d].replacement->bdev, | ||
1815 | tbio->bi_size >> 9); | ||
1816 | generic_make_request(tbio); | ||
1817 | } | ||
1818 | |||
1612 | done: | 1819 | done: |
1613 | if (atomic_dec_and_test(&r10_bio->remaining)) { | 1820 | if (atomic_dec_and_test(&r10_bio->remaining)) { |
1614 | md_done_sync(mddev, r10_bio->sectors, 1); | 1821 | md_done_sync(mddev, r10_bio->sectors, 1); |
@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) | |||
1668 | s << 9, | 1875 | s << 9, |
1669 | bio->bi_io_vec[idx].bv_page, | 1876 | bio->bi_io_vec[idx].bv_page, |
1670 | WRITE, false); | 1877 | WRITE, false); |
1671 | if (!ok) | 1878 | if (!ok) { |
1672 | set_bit(WriteErrorSeen, &rdev->flags); | 1879 | set_bit(WriteErrorSeen, &rdev->flags); |
1880 | if (!test_and_set_bit(WantReplacement, | ||
1881 | &rdev->flags)) | ||
1882 | set_bit(MD_RECOVERY_NEEDED, | ||
1883 | &rdev->mddev->recovery); | ||
1884 | } | ||
1673 | } | 1885 | } |
1674 | if (!ok) { | 1886 | if (!ok) { |
1675 | /* We don't worry if we cannot set a bad block - | 1887 | /* We don't worry if we cannot set a bad block - |
@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1709 | { | 1921 | { |
1710 | struct r10conf *conf = mddev->private; | 1922 | struct r10conf *conf = mddev->private; |
1711 | int d; | 1923 | int d; |
1712 | struct bio *wbio; | 1924 | struct bio *wbio, *wbio2; |
1713 | 1925 | ||
1714 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { | 1926 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { |
1715 | fix_recovery_read_error(r10_bio); | 1927 | fix_recovery_read_error(r10_bio); |
@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1721 | * share the pages with the first bio | 1933 | * share the pages with the first bio |
1722 | * and submit the write request | 1934 | * and submit the write request |
1723 | */ | 1935 | */ |
1724 | wbio = r10_bio->devs[1].bio; | ||
1725 | d = r10_bio->devs[1].devnum; | 1936 | d = r10_bio->devs[1].devnum; |
1726 | 1937 | wbio = r10_bio->devs[1].bio; | |
1727 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 1938 | wbio2 = r10_bio->devs[1].repl_bio; |
1728 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | 1939 | if (wbio->bi_end_io) { |
1729 | generic_make_request(wbio); | 1940 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
1941 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | ||
1942 | generic_make_request(wbio); | ||
1943 | } | ||
1944 | if (wbio2 && wbio2->bi_end_io) { | ||
1945 | atomic_inc(&conf->mirrors[d].replacement->nr_pending); | ||
1946 | md_sync_acct(conf->mirrors[d].replacement->bdev, | ||
1947 | wbio2->bi_size >> 9); | ||
1948 | generic_make_request(wbio2); | ||
1949 | } | ||
1730 | } | 1950 | } |
1731 | 1951 | ||
1732 | 1952 | ||
@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, | |||
1779 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | 1999 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) |
1780 | /* success */ | 2000 | /* success */ |
1781 | return 1; | 2001 | return 1; |
1782 | if (rw == WRITE) | 2002 | if (rw == WRITE) { |
1783 | set_bit(WriteErrorSeen, &rdev->flags); | 2003 | set_bit(WriteErrorSeen, &rdev->flags); |
2004 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | ||
2005 | set_bit(MD_RECOVERY_NEEDED, | ||
2006 | &rdev->mddev->recovery); | ||
2007 | } | ||
1784 | /* need to record an error - either for the block or the device */ | 2008 | /* need to record an error - either for the block or the device */ |
1785 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | 2009 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) |
1786 | md_error(rdev->mddev, rdev); | 2010 | md_error(rdev->mddev, rdev); |
@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) | |||
2060 | static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) | 2284 | static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) |
2061 | { | 2285 | { |
2062 | int slot = r10_bio->read_slot; | 2286 | int slot = r10_bio->read_slot; |
2063 | int mirror = r10_bio->devs[slot].devnum; | ||
2064 | struct bio *bio; | 2287 | struct bio *bio; |
2065 | struct r10conf *conf = mddev->private; | 2288 | struct r10conf *conf = mddev->private; |
2066 | struct md_rdev *rdev; | 2289 | struct md_rdev *rdev = r10_bio->devs[slot].rdev; |
2067 | char b[BDEVNAME_SIZE]; | 2290 | char b[BDEVNAME_SIZE]; |
2068 | unsigned long do_sync; | 2291 | unsigned long do_sync; |
2069 | int max_sectors; | 2292 | int max_sectors; |
@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) | |||
2081 | fix_read_error(conf, mddev, r10_bio); | 2304 | fix_read_error(conf, mddev, r10_bio); |
2082 | unfreeze_array(conf); | 2305 | unfreeze_array(conf); |
2083 | } | 2306 | } |
2084 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | 2307 | rdev_dec_pending(rdev, mddev); |
2085 | 2308 | ||
2086 | bio = r10_bio->devs[slot].bio; | 2309 | bio = r10_bio->devs[slot].bio; |
2087 | bdevname(bio->bi_bdev, b); | 2310 | bdevname(bio->bi_bdev, b); |
2088 | r10_bio->devs[slot].bio = | 2311 | r10_bio->devs[slot].bio = |
2089 | mddev->ro ? IO_BLOCKED : NULL; | 2312 | mddev->ro ? IO_BLOCKED : NULL; |
2090 | read_more: | 2313 | read_more: |
2091 | mirror = read_balance(conf, r10_bio, &max_sectors); | 2314 | rdev = read_balance(conf, r10_bio, &max_sectors); |
2092 | if (mirror == -1) { | 2315 | if (rdev == NULL) { |
2093 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | 2316 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" |
2094 | " read error for block %llu\n", | 2317 | " read error for block %llu\n", |
2095 | mdname(mddev), b, | 2318 | mdname(mddev), b, |
@@ -2103,7 +2326,6 @@ read_more: | |||
2103 | if (bio) | 2326 | if (bio) |
2104 | bio_put(bio); | 2327 | bio_put(bio); |
2105 | slot = r10_bio->read_slot; | 2328 | slot = r10_bio->read_slot; |
2106 | rdev = conf->mirrors[mirror].rdev; | ||
2107 | printk_ratelimited( | 2329 | printk_ratelimited( |
2108 | KERN_ERR | 2330 | KERN_ERR |
2109 | "md/raid10:%s: %s: redirecting" | 2331 | "md/raid10:%s: %s: redirecting" |
@@ -2117,6 +2339,7 @@ read_more: | |||
2117 | r10_bio->sector - bio->bi_sector, | 2339 | r10_bio->sector - bio->bi_sector, |
2118 | max_sectors); | 2340 | max_sectors); |
2119 | r10_bio->devs[slot].bio = bio; | 2341 | r10_bio->devs[slot].bio = bio; |
2342 | r10_bio->devs[slot].rdev = rdev; | ||
2120 | bio->bi_sector = r10_bio->devs[slot].addr | 2343 | bio->bi_sector = r10_bio->devs[slot].addr |
2121 | + rdev->data_offset; | 2344 | + rdev->data_offset; |
2122 | bio->bi_bdev = rdev->bdev; | 2345 | bio->bi_bdev = rdev->bdev; |
@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2187 | r10_bio->sectors, 0)) | 2410 | r10_bio->sectors, 0)) |
2188 | md_error(conf->mddev, rdev); | 2411 | md_error(conf->mddev, rdev); |
2189 | } | 2412 | } |
2413 | rdev = conf->mirrors[dev].replacement; | ||
2414 | if (r10_bio->devs[m].repl_bio == NULL) | ||
2415 | continue; | ||
2416 | if (test_bit(BIO_UPTODATE, | ||
2417 | &r10_bio->devs[m].repl_bio->bi_flags)) { | ||
2418 | rdev_clear_badblocks( | ||
2419 | rdev, | ||
2420 | r10_bio->devs[m].addr, | ||
2421 | r10_bio->sectors); | ||
2422 | } else { | ||
2423 | if (!rdev_set_badblocks( | ||
2424 | rdev, | ||
2425 | r10_bio->devs[m].addr, | ||
2426 | r10_bio->sectors, 0)) | ||
2427 | md_error(conf->mddev, rdev); | ||
2428 | } | ||
2190 | } | 2429 | } |
2191 | put_buf(r10_bio); | 2430 | put_buf(r10_bio); |
2192 | } else { | 2431 | } else { |
@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2209 | } | 2448 | } |
2210 | rdev_dec_pending(rdev, conf->mddev); | 2449 | rdev_dec_pending(rdev, conf->mddev); |
2211 | } | 2450 | } |
2451 | bio = r10_bio->devs[m].repl_bio; | ||
2452 | rdev = conf->mirrors[dev].replacement; | ||
2453 | if (rdev && bio == IO_MADE_GOOD) { | ||
2454 | rdev_clear_badblocks( | ||
2455 | rdev, | ||
2456 | r10_bio->devs[m].addr, | ||
2457 | r10_bio->sectors); | ||
2458 | rdev_dec_pending(rdev, conf->mddev); | ||
2459 | } | ||
2212 | } | 2460 | } |
2213 | if (test_bit(R10BIO_WriteError, | 2461 | if (test_bit(R10BIO_WriteError, |
2214 | &r10_bio->state)) | 2462 | &r10_bio->state)) |
@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev) | |||
2272 | static int init_resync(struct r10conf *conf) | 2520 | static int init_resync(struct r10conf *conf) |
2273 | { | 2521 | { |
2274 | int buffs; | 2522 | int buffs; |
2523 | int i; | ||
2275 | 2524 | ||
2276 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | 2525 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; |
2277 | BUG_ON(conf->r10buf_pool); | 2526 | BUG_ON(conf->r10buf_pool); |
2527 | conf->have_replacement = 0; | ||
2528 | for (i = 0; i < conf->raid_disks; i++) | ||
2529 | if (conf->mirrors[i].replacement) | ||
2530 | conf->have_replacement = 1; | ||
2278 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); | 2531 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); |
2279 | if (!conf->r10buf_pool) | 2532 | if (!conf->r10buf_pool) |
2280 | return -ENOMEM; | 2533 | return -ENOMEM; |
@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2355 | bitmap_end_sync(mddev->bitmap, sect, | 2608 | bitmap_end_sync(mddev->bitmap, sect, |
2356 | &sync_blocks, 1); | 2609 | &sync_blocks, 1); |
2357 | } | 2610 | } |
2358 | } else /* completed sync */ | 2611 | } else { |
2612 | /* completed sync */ | ||
2613 | if ((!mddev->bitmap || conf->fullsync) | ||
2614 | && conf->have_replacement | ||
2615 | && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | ||
2616 | /* Completed a full sync so the replacements | ||
2617 | * are now fully recovered. | ||
2618 | */ | ||
2619 | for (i = 0; i < conf->raid_disks; i++) | ||
2620 | if (conf->mirrors[i].replacement) | ||
2621 | conf->mirrors[i].replacement | ||
2622 | ->recovery_offset | ||
2623 | = MaxSector; | ||
2624 | } | ||
2359 | conf->fullsync = 0; | 2625 | conf->fullsync = 0; |
2360 | 2626 | } | |
2361 | bitmap_close_sync(mddev->bitmap); | 2627 | bitmap_close_sync(mddev->bitmap); |
2362 | close_sync(conf); | 2628 | close_sync(conf); |
2363 | *skipped = 1; | 2629 | *skipped = 1; |
@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2414 | sector_t sect; | 2680 | sector_t sect; |
2415 | int must_sync; | 2681 | int must_sync; |
2416 | int any_working; | 2682 | int any_working; |
2417 | 2683 | struct mirror_info *mirror = &conf->mirrors[i]; | |
2418 | if (conf->mirrors[i].rdev == NULL || | 2684 | |
2419 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) | 2685 | if ((mirror->rdev == NULL || |
2686 | test_bit(In_sync, &mirror->rdev->flags)) | ||
2687 | && | ||
2688 | (mirror->replacement == NULL || | ||
2689 | test_bit(Faulty, | ||
2690 | &mirror->replacement->flags))) | ||
2420 | continue; | 2691 | continue; |
2421 | 2692 | ||
2422 | still_degraded = 0; | 2693 | still_degraded = 0; |
2423 | /* want to reconstruct this device */ | 2694 | /* want to reconstruct this device */ |
2424 | rb2 = r10_bio; | 2695 | rb2 = r10_bio; |
2425 | sect = raid10_find_virt(conf, sector_nr, i); | 2696 | sect = raid10_find_virt(conf, sector_nr, i); |
2426 | /* Unless we are doing a full sync, we only need | 2697 | /* Unless we are doing a full sync, or a replacement |
2427 | * to recover the block if it is set in the bitmap | 2698 | * we only need to recover the block if it is set in |
2699 | * the bitmap | ||
2428 | */ | 2700 | */ |
2429 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | 2701 | must_sync = bitmap_start_sync(mddev->bitmap, sect, |
2430 | &sync_blocks, 1); | 2702 | &sync_blocks, 1); |
2431 | if (sync_blocks < max_sync) | 2703 | if (sync_blocks < max_sync) |
2432 | max_sync = sync_blocks; | 2704 | max_sync = sync_blocks; |
2433 | if (!must_sync && | 2705 | if (!must_sync && |
2706 | mirror->replacement == NULL && | ||
2434 | !conf->fullsync) { | 2707 | !conf->fullsync) { |
2435 | /* yep, skip the sync_blocks here, but don't assume | 2708 | /* yep, skip the sync_blocks here, but don't assume |
2436 | * that there will never be anything to do here | 2709 | * that there will never be anything to do here |
@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2500 | bio->bi_end_io = end_sync_read; | 2773 | bio->bi_end_io = end_sync_read; |
2501 | bio->bi_rw = READ; | 2774 | bio->bi_rw = READ; |
2502 | from_addr = r10_bio->devs[j].addr; | 2775 | from_addr = r10_bio->devs[j].addr; |
2503 | bio->bi_sector = from_addr + | 2776 | bio->bi_sector = from_addr + rdev->data_offset; |
2504 | conf->mirrors[d].rdev->data_offset; | 2777 | bio->bi_bdev = rdev->bdev; |
2505 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2778 | atomic_inc(&rdev->nr_pending); |
2506 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2779 | /* and we write to 'i' (if not in_sync) */ |
2507 | atomic_inc(&r10_bio->remaining); | ||
2508 | /* and we write to 'i' */ | ||
2509 | 2780 | ||
2510 | for (k=0; k<conf->copies; k++) | 2781 | for (k=0; k<conf->copies; k++) |
2511 | if (r10_bio->devs[k].devnum == i) | 2782 | if (r10_bio->devs[k].devnum == i) |
2512 | break; | 2783 | break; |
2513 | BUG_ON(k == conf->copies); | 2784 | BUG_ON(k == conf->copies); |
2514 | bio = r10_bio->devs[1].bio; | ||
2515 | bio->bi_next = biolist; | ||
2516 | biolist = bio; | ||
2517 | bio->bi_private = r10_bio; | ||
2518 | bio->bi_end_io = end_sync_write; | ||
2519 | bio->bi_rw = WRITE; | ||
2520 | to_addr = r10_bio->devs[k].addr; | 2785 | to_addr = r10_bio->devs[k].addr; |
2521 | bio->bi_sector = to_addr + | ||
2522 | conf->mirrors[i].rdev->data_offset; | ||
2523 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
2524 | |||
2525 | r10_bio->devs[0].devnum = d; | 2786 | r10_bio->devs[0].devnum = d; |
2526 | r10_bio->devs[0].addr = from_addr; | 2787 | r10_bio->devs[0].addr = from_addr; |
2527 | r10_bio->devs[1].devnum = i; | 2788 | r10_bio->devs[1].devnum = i; |
2528 | r10_bio->devs[1].addr = to_addr; | 2789 | r10_bio->devs[1].addr = to_addr; |
2529 | 2790 | ||
2791 | rdev = mirror->rdev; | ||
2792 | if (!test_bit(In_sync, &rdev->flags)) { | ||
2793 | bio = r10_bio->devs[1].bio; | ||
2794 | bio->bi_next = biolist; | ||
2795 | biolist = bio; | ||
2796 | bio->bi_private = r10_bio; | ||
2797 | bio->bi_end_io = end_sync_write; | ||
2798 | bio->bi_rw = WRITE; | ||
2799 | bio->bi_sector = to_addr | ||
2800 | + rdev->data_offset; | ||
2801 | bio->bi_bdev = rdev->bdev; | ||
2802 | atomic_inc(&r10_bio->remaining); | ||
2803 | } else | ||
2804 | r10_bio->devs[1].bio->bi_end_io = NULL; | ||
2805 | |||
2806 | /* and maybe write to replacement */ | ||
2807 | bio = r10_bio->devs[1].repl_bio; | ||
2808 | if (bio) | ||
2809 | bio->bi_end_io = NULL; | ||
2810 | rdev = mirror->replacement; | ||
2811 | /* Note: if rdev != NULL, then bio | ||
2812 | * cannot be NULL as r10buf_pool_alloc will | ||
2813 | * have allocated it. | ||
2814 | * So the second test here is pointless. | ||
2815 | * But it keeps semantic-checkers happy, and | ||
2816 | * this comment keeps human reviewers | ||
2817 | * happy. | ||
2818 | */ | ||
2819 | if (rdev == NULL || bio == NULL || | ||
2820 | test_bit(Faulty, &rdev->flags)) | ||
2821 | break; | ||
2822 | bio->bi_next = biolist; | ||
2823 | biolist = bio; | ||
2824 | bio->bi_private = r10_bio; | ||
2825 | bio->bi_end_io = end_sync_write; | ||
2826 | bio->bi_rw = WRITE; | ||
2827 | bio->bi_sector = to_addr + rdev->data_offset; | ||
2828 | bio->bi_bdev = rdev->bdev; | ||
2829 | atomic_inc(&r10_bio->remaining); | ||
2530 | break; | 2830 | break; |
2531 | } | 2831 | } |
2532 | if (j == conf->copies) { | 2832 | if (j == conf->copies) { |
@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2544 | for (k = 0; k < conf->copies; k++) | 2844 | for (k = 0; k < conf->copies; k++) |
2545 | if (r10_bio->devs[k].devnum == i) | 2845 | if (r10_bio->devs[k].devnum == i) |
2546 | break; | 2846 | break; |
2547 | if (!rdev_set_badblocks( | 2847 | if (!test_bit(In_sync, |
2548 | conf->mirrors[i].rdev, | 2848 | &mirror->rdev->flags) |
2849 | && !rdev_set_badblocks( | ||
2850 | mirror->rdev, | ||
2851 | r10_bio->devs[k].addr, | ||
2852 | max_sync, 0)) | ||
2853 | any_working = 0; | ||
2854 | if (mirror->replacement && | ||
2855 | !rdev_set_badblocks( | ||
2856 | mirror->replacement, | ||
2549 | r10_bio->devs[k].addr, | 2857 | r10_bio->devs[k].addr, |
2550 | max_sync, 0)) | 2858 | max_sync, 0)) |
2551 | any_working = 0; | 2859 | any_working = 0; |
@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2556 | printk(KERN_INFO "md/raid10:%s: insufficient " | 2864 | printk(KERN_INFO "md/raid10:%s: insufficient " |
2557 | "working devices for recovery.\n", | 2865 | "working devices for recovery.\n", |
2558 | mdname(mddev)); | 2866 | mdname(mddev)); |
2559 | conf->mirrors[i].recovery_disabled | 2867 | mirror->recovery_disabled |
2560 | = mddev->recovery_disabled; | 2868 | = mddev->recovery_disabled; |
2561 | } | 2869 | } |
2562 | break; | 2870 | break; |
@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2605 | sector_t first_bad, sector; | 2913 | sector_t first_bad, sector; |
2606 | int bad_sectors; | 2914 | int bad_sectors; |
2607 | 2915 | ||
2916 | if (r10_bio->devs[i].repl_bio) | ||
2917 | r10_bio->devs[i].repl_bio->bi_end_io = NULL; | ||
2918 | |||
2608 | bio = r10_bio->devs[i].bio; | 2919 | bio = r10_bio->devs[i].bio; |
2609 | bio->bi_end_io = NULL; | 2920 | bio->bi_end_io = NULL; |
2610 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 2921 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2635 | conf->mirrors[d].rdev->data_offset; | 2946 | conf->mirrors[d].rdev->data_offset; |
2636 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2947 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
2637 | count++; | 2948 | count++; |
2949 | |||
2950 | if (conf->mirrors[d].replacement == NULL || | ||
2951 | test_bit(Faulty, | ||
2952 | &conf->mirrors[d].replacement->flags)) | ||
2953 | continue; | ||
2954 | |||
2955 | /* Need to set up for writing to the replacement */ | ||
2956 | bio = r10_bio->devs[i].repl_bio; | ||
2957 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
2958 | |||
2959 | sector = r10_bio->devs[i].addr; | ||
2960 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
2961 | bio->bi_next = biolist; | ||
2962 | biolist = bio; | ||
2963 | bio->bi_private = r10_bio; | ||
2964 | bio->bi_end_io = end_sync_write; | ||
2965 | bio->bi_rw = WRITE; | ||
2966 | bio->bi_sector = sector + | ||
2967 | conf->mirrors[d].replacement->data_offset; | ||
2968 | bio->bi_bdev = conf->mirrors[d].replacement->bdev; | ||
2969 | count++; | ||
2638 | } | 2970 | } |
2639 | 2971 | ||
2640 | if (count < 2) { | 2972 | if (count < 2) { |
@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2643 | if (r10_bio->devs[i].bio->bi_end_io) | 2975 | if (r10_bio->devs[i].bio->bi_end_io) |
2644 | rdev_dec_pending(conf->mirrors[d].rdev, | 2976 | rdev_dec_pending(conf->mirrors[d].rdev, |
2645 | mddev); | 2977 | mddev); |
2978 | if (r10_bio->devs[i].repl_bio && | ||
2979 | r10_bio->devs[i].repl_bio->bi_end_io) | ||
2980 | rdev_dec_pending( | ||
2981 | conf->mirrors[d].replacement, | ||
2982 | mddev); | ||
2646 | } | 2983 | } |
2647 | put_buf(r10_bio); | 2984 | put_buf(r10_bio); |
2648 | biolist = NULL; | 2985 | biolist = NULL; |
@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev) | |||
2896 | continue; | 3233 | continue; |
2897 | disk = conf->mirrors + disk_idx; | 3234 | disk = conf->mirrors + disk_idx; |
2898 | 3235 | ||
3236 | if (test_bit(Replacement, &rdev->flags)) { | ||
3237 | if (disk->replacement) | ||
3238 | goto out_free_conf; | ||
3239 | disk->replacement = rdev; | ||
3240 | } else { | ||
3241 | if (disk->rdev) | ||
3242 | goto out_free_conf; | ||
3243 | disk->rdev = rdev; | ||
3244 | } | ||
3245 | |||
2899 | disk->rdev = rdev; | 3246 | disk->rdev = rdev; |
2900 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 3247 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2901 | rdev->data_offset << 9); | 3248 | rdev->data_offset << 9); |
@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev) | |||
2923 | 3270 | ||
2924 | disk = conf->mirrors + i; | 3271 | disk = conf->mirrors + i; |
2925 | 3272 | ||
3273 | if (!disk->rdev && disk->replacement) { | ||
3274 | /* The replacement is all we have - use it */ | ||
3275 | disk->rdev = disk->replacement; | ||
3276 | disk->replacement = NULL; | ||
3277 | clear_bit(Replacement, &disk->rdev->flags); | ||
3278 | } | ||
3279 | |||
2926 | if (!disk->rdev || | 3280 | if (!disk->rdev || |
2927 | !test_bit(In_sync, &disk->rdev->flags)) { | 3281 | !test_bit(In_sync, &disk->rdev->flags)) { |
2928 | disk->head_position = 0; | 3282 | disk->head_position = 0; |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 7facfdf841f4..7c615613c381 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -2,7 +2,7 @@ | |||
2 | #define _RAID10_H | 2 | #define _RAID10_H |
3 | 3 | ||
4 | struct mirror_info { | 4 | struct mirror_info { |
5 | struct md_rdev *rdev; | 5 | struct md_rdev *rdev, *replacement; |
6 | sector_t head_position; | 6 | sector_t head_position; |
7 | int recovery_disabled; /* matches | 7 | int recovery_disabled; /* matches |
8 | * mddev->recovery_disabled | 8 | * mddev->recovery_disabled |
@@ -18,12 +18,13 @@ struct r10conf { | |||
18 | spinlock_t device_lock; | 18 | spinlock_t device_lock; |
19 | 19 | ||
20 | /* geometry */ | 20 | /* geometry */ |
21 | int near_copies; /* number of copies laid out raid0 style */ | 21 | int near_copies; /* number of copies laid out |
22 | * raid0 style */ | ||
22 | int far_copies; /* number of copies laid out | 23 | int far_copies; /* number of copies laid out |
23 | * at large strides across drives | 24 | * at large strides across drives |
24 | */ | 25 | */ |
25 | int far_offset; /* far_copies are offset by 1 stripe | 26 | int far_offset; /* far_copies are offset by 1 |
26 | * instead of many | 27 | * stripe instead of many |
27 | */ | 28 | */ |
28 | int copies; /* near_copies * far_copies. | 29 | int copies; /* near_copies * far_copies. |
29 | * must be <= raid_disks | 30 | * must be <= raid_disks |
@@ -34,10 +35,11 @@ struct r10conf { | |||
34 | * 1 stripe. | 35 | * 1 stripe. |
35 | */ | 36 | */ |
36 | 37 | ||
37 | sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ | 38 | sector_t dev_sectors; /* temp copy of |
39 | * mddev->dev_sectors */ | ||
38 | 40 | ||
39 | int chunk_shift; /* shift from chunks to sectors */ | 41 | int chunk_shift; /* shift from chunks to sectors */ |
40 | sector_t chunk_mask; | 42 | sector_t chunk_mask; |
41 | 43 | ||
42 | struct list_head retry_list; | 44 | struct list_head retry_list; |
43 | /* queue pending writes and submit them on unplug */ | 45 | /* queue pending writes and submit them on unplug */ |
@@ -45,20 +47,22 @@ struct r10conf { | |||
45 | int pending_count; | 47 | int pending_count; |
46 | 48 | ||
47 | spinlock_t resync_lock; | 49 | spinlock_t resync_lock; |
48 | int nr_pending; | 50 | int nr_pending; |
49 | int nr_waiting; | 51 | int nr_waiting; |
50 | int nr_queued; | 52 | int nr_queued; |
51 | int barrier; | 53 | int barrier; |
52 | sector_t next_resync; | 54 | sector_t next_resync; |
53 | int fullsync; /* set to 1 if a full sync is needed, | 55 | int fullsync; /* set to 1 if a full sync is needed, |
54 | * (fresh device added). | 56 | * (fresh device added). |
55 | * Cleared when a sync completes. | 57 | * Cleared when a sync completes. |
56 | */ | 58 | */ |
57 | 59 | int have_replacement; /* There is at least one | |
60 | * replacement device. | ||
61 | */ | ||
58 | wait_queue_head_t wait_barrier; | 62 | wait_queue_head_t wait_barrier; |
59 | 63 | ||
60 | mempool_t *r10bio_pool; | 64 | mempool_t *r10bio_pool; |
61 | mempool_t *r10buf_pool; | 65 | mempool_t *r10buf_pool; |
62 | struct page *tmppage; | 66 | struct page *tmppage; |
63 | 67 | ||
64 | /* When taking over an array from a different personality, we store | 68 | /* When taking over an array from a different personality, we store |
@@ -98,11 +102,18 @@ struct r10bio { | |||
98 | * When resyncing we also use one for each copy. | 102 | * When resyncing we also use one for each copy. |
99 | * When reconstructing, we use 2 bios, one for read, one for write. | 103 | * When reconstructing, we use 2 bios, one for read, one for write. |
100 | * We choose the number when they are allocated. | 104 | * We choose the number when they are allocated. |
105 | * We sometimes need an extra bio to write to the replacement. | ||
101 | */ | 106 | */ |
102 | struct { | 107 | struct { |
103 | struct bio *bio; | 108 | struct bio *bio; |
104 | sector_t addr; | 109 | union { |
105 | int devnum; | 110 | struct bio *repl_bio; /* used for resync and |
111 | * writes */ | ||
112 | struct md_rdev *rdev; /* used for reads | ||
113 | * (read_slot >= 0) */ | ||
114 | }; | ||
115 | sector_t addr; | ||
116 | int devnum; | ||
106 | } devs[0]; | 117 | } devs[0]; |
107 | }; | 118 | }; |
108 | 119 | ||
@@ -121,17 +132,19 @@ struct r10bio { | |||
121 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | 132 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) |
122 | 133 | ||
123 | /* bits for r10bio.state */ | 134 | /* bits for r10bio.state */ |
124 | #define R10BIO_Uptodate 0 | 135 | enum r10bio_state { |
125 | #define R10BIO_IsSync 1 | 136 | R10BIO_Uptodate, |
126 | #define R10BIO_IsRecover 2 | 137 | R10BIO_IsSync, |
127 | #define R10BIO_Degraded 3 | 138 | R10BIO_IsRecover, |
139 | R10BIO_Degraded, | ||
128 | /* Set ReadError on bios that experience a read error | 140 | /* Set ReadError on bios that experience a read error |
129 | * so that raid10d knows what to do with them. | 141 | * so that raid10d knows what to do with them. |
130 | */ | 142 | */ |
131 | #define R10BIO_ReadError 4 | 143 | R10BIO_ReadError, |
132 | /* If a write for this request means we can clear some | 144 | /* If a write for this request means we can clear some |
133 | * known-bad-block records, we set this flag. | 145 | * known-bad-block records, we set this flag. |
134 | */ | 146 | */ |
135 | #define R10BIO_MadeGood 5 | 147 | R10BIO_MadeGood, |
136 | #define R10BIO_WriteError 6 | 148 | R10BIO_WriteError, |
149 | }; | ||
137 | #endif | 150 | #endif |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 858fdbb7eb07..360f2b98f62b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, | |||
370 | * of the two sections, and some non-in_sync devices may | 370 | * of the two sections, and some non-in_sync devices may |
371 | * be insync in the section most affected by failed devices. | 371 | * be insync in the section most affected by failed devices. |
372 | */ | 372 | */ |
373 | static int has_failed(struct r5conf *conf) | 373 | static int calc_degraded(struct r5conf *conf) |
374 | { | 374 | { |
375 | int degraded; | 375 | int degraded, degraded2; |
376 | int i; | 376 | int i; |
377 | if (conf->mddev->reshape_position == MaxSector) | ||
378 | return conf->mddev->degraded > conf->max_degraded; | ||
379 | 377 | ||
380 | rcu_read_lock(); | 378 | rcu_read_lock(); |
381 | degraded = 0; | 379 | degraded = 0; |
@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf) | |||
399 | degraded++; | 397 | degraded++; |
400 | } | 398 | } |
401 | rcu_read_unlock(); | 399 | rcu_read_unlock(); |
402 | if (degraded > conf->max_degraded) | 400 | if (conf->raid_disks == conf->previous_raid_disks) |
403 | return 1; | 401 | return degraded; |
404 | rcu_read_lock(); | 402 | rcu_read_lock(); |
405 | degraded = 0; | 403 | degraded2 = 0; |
406 | for (i = 0; i < conf->raid_disks; i++) { | 404 | for (i = 0; i < conf->raid_disks; i++) { |
407 | struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); | 405 | struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); |
408 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 406 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
409 | degraded++; | 407 | degraded2++; |
410 | else if (test_bit(In_sync, &rdev->flags)) | 408 | else if (test_bit(In_sync, &rdev->flags)) |
411 | ; | 409 | ; |
412 | else | 410 | else |
@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf) | |||
416 | * almost certainly hasn't. | 414 | * almost certainly hasn't. |
417 | */ | 415 | */ |
418 | if (conf->raid_disks <= conf->previous_raid_disks) | 416 | if (conf->raid_disks <= conf->previous_raid_disks) |
419 | degraded++; | 417 | degraded2++; |
420 | } | 418 | } |
421 | rcu_read_unlock(); | 419 | rcu_read_unlock(); |
420 | if (degraded2 > degraded) | ||
421 | return degraded2; | ||
422 | return degraded; | ||
423 | } | ||
424 | |||
425 | static int has_failed(struct r5conf *conf) | ||
426 | { | ||
427 | int degraded; | ||
428 | |||
429 | if (conf->mddev->reshape_position == MaxSector) | ||
430 | return conf->mddev->degraded > conf->max_degraded; | ||
431 | |||
432 | degraded = calc_degraded(conf); | ||
422 | if (degraded > conf->max_degraded) | 433 | if (degraded > conf->max_degraded) |
423 | return 1; | 434 | return 1; |
424 | return 0; | 435 | return 0; |
@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
492 | 503 | ||
493 | for (i = disks; i--; ) { | 504 | for (i = disks; i--; ) { |
494 | int rw; | 505 | int rw; |
495 | struct bio *bi; | 506 | int replace_only = 0; |
496 | struct md_rdev *rdev; | 507 | struct bio *bi, *rbi; |
508 | struct md_rdev *rdev, *rrdev = NULL; | ||
497 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { | 509 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { |
498 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) | 510 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) |
499 | rw = WRITE_FUA; | 511 | rw = WRITE_FUA; |
@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
501 | rw = WRITE; | 513 | rw = WRITE; |
502 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | 514 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) |
503 | rw = READ; | 515 | rw = READ; |
504 | else | 516 | else if (test_and_clear_bit(R5_WantReplace, |
517 | &sh->dev[i].flags)) { | ||
518 | rw = WRITE; | ||
519 | replace_only = 1; | ||
520 | } else | ||
505 | continue; | 521 | continue; |
506 | 522 | ||
507 | bi = &sh->dev[i].req; | 523 | bi = &sh->dev[i].req; |
524 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | ||
508 | 525 | ||
509 | bi->bi_rw = rw; | 526 | bi->bi_rw = rw; |
510 | if (rw & WRITE) | 527 | rbi->bi_rw = rw; |
528 | if (rw & WRITE) { | ||
511 | bi->bi_end_io = raid5_end_write_request; | 529 | bi->bi_end_io = raid5_end_write_request; |
512 | else | 530 | rbi->bi_end_io = raid5_end_write_request; |
531 | } else | ||
513 | bi->bi_end_io = raid5_end_read_request; | 532 | bi->bi_end_io = raid5_end_read_request; |
514 | 533 | ||
515 | rcu_read_lock(); | 534 | rcu_read_lock(); |
535 | rrdev = rcu_dereference(conf->disks[i].replacement); | ||
536 | smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ | ||
516 | rdev = rcu_dereference(conf->disks[i].rdev); | 537 | rdev = rcu_dereference(conf->disks[i].rdev); |
538 | if (!rdev) { | ||
539 | rdev = rrdev; | ||
540 | rrdev = NULL; | ||
541 | } | ||
542 | if (rw & WRITE) { | ||
543 | if (replace_only) | ||
544 | rdev = NULL; | ||
545 | if (rdev == rrdev) | ||
546 | /* We raced and saw duplicates */ | ||
547 | rrdev = NULL; | ||
548 | } else { | ||
549 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) | ||
550 | rdev = rrdev; | ||
551 | rrdev = NULL; | ||
552 | } | ||
553 | |||
517 | if (rdev && test_bit(Faulty, &rdev->flags)) | 554 | if (rdev && test_bit(Faulty, &rdev->flags)) |
518 | rdev = NULL; | 555 | rdev = NULL; |
519 | if (rdev) | 556 | if (rdev) |
520 | atomic_inc(&rdev->nr_pending); | 557 | atomic_inc(&rdev->nr_pending); |
558 | if (rrdev && test_bit(Faulty, &rrdev->flags)) | ||
559 | rrdev = NULL; | ||
560 | if (rrdev) | ||
561 | atomic_inc(&rrdev->nr_pending); | ||
521 | rcu_read_unlock(); | 562 | rcu_read_unlock(); |
522 | 563 | ||
523 | /* We have already checked bad blocks for reads. Now | 564 | /* We have already checked bad blocks for reads. Now |
524 | * need to check for writes. | 565 | * need to check for writes. We never accept write errors |
566 | * on the replacement, so we don't need to check rrdev. | ||
525 | */ | 567 | */ |
526 | while ((rw & WRITE) && rdev && | 568 | while ((rw & WRITE) && rdev && |
527 | test_bit(WriteErrorSeen, &rdev->flags)) { | 569 | test_bit(WriteErrorSeen, &rdev->flags)) { |
@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
551 | } | 593 | } |
552 | 594 | ||
553 | if (rdev) { | 595 | if (rdev) { |
554 | if (s->syncing || s->expanding || s->expanded) | 596 | if (s->syncing || s->expanding || s->expanded |
597 | || s->replacing) | ||
555 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 598 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
556 | 599 | ||
557 | set_bit(STRIPE_IO_STARTED, &sh->state); | 600 | set_bit(STRIPE_IO_STARTED, &sh->state); |
@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
563 | atomic_inc(&sh->count); | 606 | atomic_inc(&sh->count); |
564 | bi->bi_sector = sh->sector + rdev->data_offset; | 607 | bi->bi_sector = sh->sector + rdev->data_offset; |
565 | bi->bi_flags = 1 << BIO_UPTODATE; | 608 | bi->bi_flags = 1 << BIO_UPTODATE; |
566 | bi->bi_vcnt = 1; | ||
567 | bi->bi_max_vecs = 1; | ||
568 | bi->bi_idx = 0; | 609 | bi->bi_idx = 0; |
569 | bi->bi_io_vec = &sh->dev[i].vec; | ||
570 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 610 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
571 | bi->bi_io_vec[0].bv_offset = 0; | 611 | bi->bi_io_vec[0].bv_offset = 0; |
572 | bi->bi_size = STRIPE_SIZE; | 612 | bi->bi_size = STRIPE_SIZE; |
573 | bi->bi_next = NULL; | 613 | bi->bi_next = NULL; |
614 | if (rrdev) | ||
615 | set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); | ||
574 | generic_make_request(bi); | 616 | generic_make_request(bi); |
575 | } else { | 617 | } |
618 | if (rrdev) { | ||
619 | if (s->syncing || s->expanding || s->expanded | ||
620 | || s->replacing) | ||
621 | md_sync_acct(rrdev->bdev, STRIPE_SECTORS); | ||
622 | |||
623 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
624 | |||
625 | rbi->bi_bdev = rrdev->bdev; | ||
626 | pr_debug("%s: for %llu schedule op %ld on " | ||
627 | "replacement disc %d\n", | ||
628 | __func__, (unsigned long long)sh->sector, | ||
629 | rbi->bi_rw, i); | ||
630 | atomic_inc(&sh->count); | ||
631 | rbi->bi_sector = sh->sector + rrdev->data_offset; | ||
632 | rbi->bi_flags = 1 << BIO_UPTODATE; | ||
633 | rbi->bi_idx = 0; | ||
634 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
635 | rbi->bi_io_vec[0].bv_offset = 0; | ||
636 | rbi->bi_size = STRIPE_SIZE; | ||
637 | rbi->bi_next = NULL; | ||
638 | generic_make_request(rbi); | ||
639 | } | ||
640 | if (!rdev && !rrdev) { | ||
576 | if (rw & WRITE) | 641 | if (rw & WRITE) |
577 | set_bit(STRIPE_DEGRADED, &sh->state); | 642 | set_bit(STRIPE_DEGRADED, &sh->state); |
578 | pr_debug("skip op %ld on disc %d for sector %llu\n", | 643 | pr_debug("skip op %ld on disc %d for sector %llu\n", |
@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1583 | int disks = sh->disks, i; | 1648 | int disks = sh->disks, i; |
1584 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1649 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1585 | char b[BDEVNAME_SIZE]; | 1650 | char b[BDEVNAME_SIZE]; |
1586 | struct md_rdev *rdev; | 1651 | struct md_rdev *rdev = NULL; |
1587 | 1652 | ||
1588 | 1653 | ||
1589 | for (i=0 ; i<disks; i++) | 1654 | for (i=0 ; i<disks; i++) |
@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1597 | BUG(); | 1662 | BUG(); |
1598 | return; | 1663 | return; |
1599 | } | 1664 | } |
1665 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) | ||
1666 | /* If replacement finished while this request was outstanding, | ||
1667 | * 'replacement' might be NULL already. | ||
1668 | * In that case it moved down to 'rdev'. | ||
1669 | * rdev is not removed until all requests are finished. | ||
1670 | */ | ||
1671 | rdev = conf->disks[i].replacement; | ||
1672 | if (!rdev) | ||
1673 | rdev = conf->disks[i].rdev; | ||
1600 | 1674 | ||
1601 | if (uptodate) { | 1675 | if (uptodate) { |
1602 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1676 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1603 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1677 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
1604 | rdev = conf->disks[i].rdev; | 1678 | /* Note that this cannot happen on a |
1679 | * replacement device. We just fail those on | ||
1680 | * any error | ||
1681 | */ | ||
1605 | printk_ratelimited( | 1682 | printk_ratelimited( |
1606 | KERN_INFO | 1683 | KERN_INFO |
1607 | "md/raid:%s: read error corrected" | 1684 | "md/raid:%s: read error corrected" |
@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1614 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1691 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
1615 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1692 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
1616 | } | 1693 | } |
1617 | if (atomic_read(&conf->disks[i].rdev->read_errors)) | 1694 | if (atomic_read(&rdev->read_errors)) |
1618 | atomic_set(&conf->disks[i].rdev->read_errors, 0); | 1695 | atomic_set(&rdev->read_errors, 0); |
1619 | } else { | 1696 | } else { |
1620 | const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); | 1697 | const char *bdn = bdevname(rdev->bdev, b); |
1621 | int retry = 0; | 1698 | int retry = 0; |
1622 | rdev = conf->disks[i].rdev; | ||
1623 | 1699 | ||
1624 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1700 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
1625 | atomic_inc(&rdev->read_errors); | 1701 | atomic_inc(&rdev->read_errors); |
1626 | if (conf->mddev->degraded >= conf->max_degraded) | 1702 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) |
1703 | printk_ratelimited( | ||
1704 | KERN_WARNING | ||
1705 | "md/raid:%s: read error on replacement device " | ||
1706 | "(sector %llu on %s).\n", | ||
1707 | mdname(conf->mddev), | ||
1708 | (unsigned long long)(sh->sector | ||
1709 | + rdev->data_offset), | ||
1710 | bdn); | ||
1711 | else if (conf->mddev->degraded >= conf->max_degraded) | ||
1627 | printk_ratelimited( | 1712 | printk_ratelimited( |
1628 | KERN_WARNING | 1713 | KERN_WARNING |
1629 | "md/raid:%s: read error not correctable " | 1714 | "md/raid:%s: read error not correctable " |
@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1657 | md_error(conf->mddev, rdev); | 1742 | md_error(conf->mddev, rdev); |
1658 | } | 1743 | } |
1659 | } | 1744 | } |
1660 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1745 | rdev_dec_pending(rdev, conf->mddev); |
1661 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1746 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
1662 | set_bit(STRIPE_HANDLE, &sh->state); | 1747 | set_bit(STRIPE_HANDLE, &sh->state); |
1663 | release_stripe(sh); | 1748 | release_stripe(sh); |
@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1668 | struct stripe_head *sh = bi->bi_private; | 1753 | struct stripe_head *sh = bi->bi_private; |
1669 | struct r5conf *conf = sh->raid_conf; | 1754 | struct r5conf *conf = sh->raid_conf; |
1670 | int disks = sh->disks, i; | 1755 | int disks = sh->disks, i; |
1756 | struct md_rdev *uninitialized_var(rdev); | ||
1671 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1757 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1672 | sector_t first_bad; | 1758 | sector_t first_bad; |
1673 | int bad_sectors; | 1759 | int bad_sectors; |
1760 | int replacement = 0; | ||
1674 | 1761 | ||
1675 | for (i=0 ; i<disks; i++) | 1762 | for (i = 0 ; i < disks; i++) { |
1676 | if (bi == &sh->dev[i].req) | 1763 | if (bi == &sh->dev[i].req) { |
1764 | rdev = conf->disks[i].rdev; | ||
1677 | break; | 1765 | break; |
1678 | 1766 | } | |
1767 | if (bi == &sh->dev[i].rreq) { | ||
1768 | rdev = conf->disks[i].replacement; | ||
1769 | if (rdev) | ||
1770 | replacement = 1; | ||
1771 | else | ||
1772 | /* rdev was removed and 'replacement' | ||
1773 | * replaced it. rdev is not removed | ||
1774 | * until all requests are finished. | ||
1775 | */ | ||
1776 | rdev = conf->disks[i].rdev; | ||
1777 | break; | ||
1778 | } | ||
1779 | } | ||
1679 | pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", | 1780 | pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", |
1680 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | 1781 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), |
1681 | uptodate); | 1782 | uptodate); |
@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1684 | return; | 1785 | return; |
1685 | } | 1786 | } |
1686 | 1787 | ||
1687 | if (!uptodate) { | 1788 | if (replacement) { |
1688 | set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); | 1789 | if (!uptodate) |
1689 | set_bit(R5_WriteError, &sh->dev[i].flags); | 1790 | md_error(conf->mddev, rdev); |
1690 | } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, | 1791 | else if (is_badblock(rdev, sh->sector, |
1691 | &first_bad, &bad_sectors)) | 1792 | STRIPE_SECTORS, |
1692 | set_bit(R5_MadeGood, &sh->dev[i].flags); | 1793 | &first_bad, &bad_sectors)) |
1794 | set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); | ||
1795 | } else { | ||
1796 | if (!uptodate) { | ||
1797 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1798 | set_bit(R5_WriteError, &sh->dev[i].flags); | ||
1799 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | ||
1800 | set_bit(MD_RECOVERY_NEEDED, | ||
1801 | &rdev->mddev->recovery); | ||
1802 | } else if (is_badblock(rdev, sh->sector, | ||
1803 | STRIPE_SECTORS, | ||
1804 | &first_bad, &bad_sectors)) | ||
1805 | set_bit(R5_MadeGood, &sh->dev[i].flags); | ||
1806 | } | ||
1807 | rdev_dec_pending(rdev, conf->mddev); | ||
1693 | 1808 | ||
1694 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1809 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) |
1695 | 1810 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | |
1696 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1697 | set_bit(STRIPE_HANDLE, &sh->state); | 1811 | set_bit(STRIPE_HANDLE, &sh->state); |
1698 | release_stripe(sh); | 1812 | release_stripe(sh); |
1699 | } | 1813 | } |
1700 | 1814 | ||
1701 | |||
1702 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); | 1815 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); |
1703 | 1816 | ||
1704 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) | 1817 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) |
@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) | |||
1709 | dev->req.bi_io_vec = &dev->vec; | 1822 | dev->req.bi_io_vec = &dev->vec; |
1710 | dev->req.bi_vcnt++; | 1823 | dev->req.bi_vcnt++; |
1711 | dev->req.bi_max_vecs++; | 1824 | dev->req.bi_max_vecs++; |
1825 | dev->req.bi_private = sh; | ||
1712 | dev->vec.bv_page = dev->page; | 1826 | dev->vec.bv_page = dev->page; |
1713 | dev->vec.bv_len = STRIPE_SIZE; | ||
1714 | dev->vec.bv_offset = 0; | ||
1715 | 1827 | ||
1716 | dev->req.bi_sector = sh->sector; | 1828 | bio_init(&dev->rreq); |
1717 | dev->req.bi_private = sh; | 1829 | dev->rreq.bi_io_vec = &dev->rvec; |
1830 | dev->rreq.bi_vcnt++; | ||
1831 | dev->rreq.bi_max_vecs++; | ||
1832 | dev->rreq.bi_private = sh; | ||
1833 | dev->rvec.bv_page = dev->page; | ||
1718 | 1834 | ||
1719 | dev->flags = 0; | 1835 | dev->flags = 0; |
1720 | dev->sector = compute_blocknr(sh, i, previous); | 1836 | dev->sector = compute_blocknr(sh, i, previous); |
@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) | |||
1724 | { | 1840 | { |
1725 | char b[BDEVNAME_SIZE]; | 1841 | char b[BDEVNAME_SIZE]; |
1726 | struct r5conf *conf = mddev->private; | 1842 | struct r5conf *conf = mddev->private; |
1843 | unsigned long flags; | ||
1727 | pr_debug("raid456: error called\n"); | 1844 | pr_debug("raid456: error called\n"); |
1728 | 1845 | ||
1729 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1846 | spin_lock_irqsave(&conf->device_lock, flags); |
1730 | unsigned long flags; | 1847 | clear_bit(In_sync, &rdev->flags); |
1731 | spin_lock_irqsave(&conf->device_lock, flags); | 1848 | mddev->degraded = calc_degraded(conf); |
1732 | mddev->degraded++; | 1849 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1733 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1850 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1734 | /* | 1851 | |
1735 | * if recovery was running, make sure it aborts. | ||
1736 | */ | ||
1737 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1738 | } | ||
1739 | set_bit(Blocked, &rdev->flags); | 1852 | set_bit(Blocked, &rdev->flags); |
1740 | set_bit(Faulty, &rdev->flags); | 1853 | set_bit(Faulty, &rdev->flags); |
1741 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1854 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
2362 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); | 2475 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); |
2363 | clear_bit(STRIPE_SYNCING, &sh->state); | 2476 | clear_bit(STRIPE_SYNCING, &sh->state); |
2364 | s->syncing = 0; | 2477 | s->syncing = 0; |
2478 | s->replacing = 0; | ||
2365 | /* There is nothing more to do for sync/check/repair. | 2479 | /* There is nothing more to do for sync/check/repair. |
2366 | * For recover we need to record a bad block on all | 2480 | * For recover/replace we need to record a bad block on all |
2367 | * non-sync devices, or abort the recovery | 2481 | * non-sync devices, or abort the recovery |
2368 | */ | 2482 | */ |
2369 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) | 2483 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) |
@@ -2373,12 +2487,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
2373 | */ | 2487 | */ |
2374 | for (i = 0; i < conf->raid_disks; i++) { | 2488 | for (i = 0; i < conf->raid_disks; i++) { |
2375 | struct md_rdev *rdev = conf->disks[i].rdev; | 2489 | struct md_rdev *rdev = conf->disks[i].rdev; |
2376 | if (!rdev | 2490 | if (rdev |
2377 | || test_bit(Faulty, &rdev->flags) | 2491 | && !test_bit(Faulty, &rdev->flags) |
2378 | || test_bit(In_sync, &rdev->flags)) | 2492 | && !test_bit(In_sync, &rdev->flags) |
2379 | continue; | 2493 | && !rdev_set_badblocks(rdev, sh->sector, |
2380 | if (!rdev_set_badblocks(rdev, sh->sector, | 2494 | STRIPE_SECTORS, 0)) |
2381 | STRIPE_SECTORS, 0)) | 2495 | abort = 1; |
2496 | rdev = conf->disks[i].replacement; | ||
2497 | if (rdev | ||
2498 | && !test_bit(Faulty, &rdev->flags) | ||
2499 | && !test_bit(In_sync, &rdev->flags) | ||
2500 | && !rdev_set_badblocks(rdev, sh->sector, | ||
2501 | STRIPE_SECTORS, 0)) | ||
2382 | abort = 1; | 2502 | abort = 1; |
2383 | } | 2503 | } |
2384 | if (abort) { | 2504 | if (abort) { |
@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
2387 | } | 2507 | } |
2388 | } | 2508 | } |
2389 | 2509 | ||
2510 | static int want_replace(struct stripe_head *sh, int disk_idx) | ||
2511 | { | ||
2512 | struct md_rdev *rdev; | ||
2513 | int rv = 0; | ||
2514 | /* Doing recovery so rcu locking not required */ | ||
2515 | rdev = sh->raid_conf->disks[disk_idx].replacement; | ||
2516 | if (rdev | ||
2517 | && !test_bit(Faulty, &rdev->flags) | ||
2518 | && !test_bit(In_sync, &rdev->flags) | ||
2519 | && (rdev->recovery_offset <= sh->sector | ||
2520 | || rdev->mddev->recovery_cp <= sh->sector)) | ||
2521 | rv = 1; | ||
2522 | |||
2523 | return rv; | ||
2524 | } | ||
2525 | |||
2390 | /* fetch_block - checks the given member device to see if its data needs | 2526 | /* fetch_block - checks the given member device to see if its data needs |
2391 | * to be read or computed to satisfy a request. | 2527 | * to be read or computed to satisfy a request. |
2392 | * | 2528 | * |
@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, | |||
2406 | (dev->toread || | 2542 | (dev->toread || |
2407 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 2543 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2408 | s->syncing || s->expanding || | 2544 | s->syncing || s->expanding || |
2545 | (s->replacing && want_replace(sh, disk_idx)) || | ||
2409 | (s->failed >= 1 && fdev[0]->toread) || | 2546 | (s->failed >= 1 && fdev[0]->toread) || |
2410 | (s->failed >= 2 && fdev[1]->toread) || | 2547 | (s->failed >= 2 && fdev[1]->toread) || |
2411 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && | 2548 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && |
@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) | |||
2959 | } | 3096 | } |
2960 | } | 3097 | } |
2961 | 3098 | ||
2962 | |||
2963 | /* | 3099 | /* |
2964 | * handle_stripe - do things to a stripe. | 3100 | * handle_stripe - do things to a stripe. |
2965 | * | 3101 | * |
2966 | * We lock the stripe and then examine the state of various bits | 3102 | * We lock the stripe by setting STRIPE_ACTIVE and then examine the |
2967 | * to see what needs to be done. | 3103 | * state of various bits to see what needs to be done. |
2968 | * Possible results: | 3104 | * Possible results: |
2969 | * return some read request which now have data | 3105 | * return some read requests which now have data |
2970 | * return some write requests which are safely on disc | 3106 | * return some write requests which are safely on storage |
2971 | * schedule a read on some buffers | 3107 | * schedule a read on some buffers |
2972 | * schedule a write of some buffers | 3108 | * schedule a write of some buffers |
2973 | * return confirmation of parity correctness | 3109 | * return confirmation of parity correctness |
2974 | * | 3110 | * |
2975 | * buffers are taken off read_list or write_list, and bh_cache buffers | ||
2976 | * get BH_Lock set before the stripe lock is released. | ||
2977 | * | ||
2978 | */ | 3111 | */ |
2979 | 3112 | ||
2980 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | 3113 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) |
@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
2983 | int disks = sh->disks; | 3116 | int disks = sh->disks; |
2984 | struct r5dev *dev; | 3117 | struct r5dev *dev; |
2985 | int i; | 3118 | int i; |
3119 | int do_recovery = 0; | ||
2986 | 3120 | ||
2987 | memset(s, 0, sizeof(*s)); | 3121 | memset(s, 0, sizeof(*s)); |
2988 | 3122 | ||
2989 | s->syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
2990 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 3123 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
2991 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 3124 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
2992 | s->failed_num[0] = -1; | 3125 | s->failed_num[0] = -1; |
@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3004 | dev = &sh->dev[i]; | 3137 | dev = &sh->dev[i]; |
3005 | 3138 | ||
3006 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3139 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3007 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3140 | i, dev->flags, |
3141 | dev->toread, dev->towrite, dev->written); | ||
3008 | /* maybe we can reply to a read | 3142 | /* maybe we can reply to a read |
3009 | * | 3143 | * |
3010 | * new wantfill requests are only permitted while | 3144 | * new wantfill requests are only permitted while |
@@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3035 | } | 3169 | } |
3036 | if (dev->written) | 3170 | if (dev->written) |
3037 | s->written++; | 3171 | s->written++; |
3038 | rdev = rcu_dereference(conf->disks[i].rdev); | 3172 | /* Prefer to use the replacement for reads, but only |
3173 | * if it is recovered enough and has no bad blocks. | ||
3174 | */ | ||
3175 | rdev = rcu_dereference(conf->disks[i].replacement); | ||
3176 | if (rdev && !test_bit(Faulty, &rdev->flags) && | ||
3177 | rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && | ||
3178 | !is_badblock(rdev, sh->sector, STRIPE_SECTORS, | ||
3179 | &first_bad, &bad_sectors)) | ||
3180 | set_bit(R5_ReadRepl, &dev->flags); | ||
3181 | else { | ||
3182 | if (rdev) | ||
3183 | set_bit(R5_NeedReplace, &dev->flags); | ||
3184 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
3185 | clear_bit(R5_ReadRepl, &dev->flags); | ||
3186 | } | ||
3039 | if (rdev && test_bit(Faulty, &rdev->flags)) | 3187 | if (rdev && test_bit(Faulty, &rdev->flags)) |
3040 | rdev = NULL; | 3188 | rdev = NULL; |
3041 | if (rdev) { | 3189 | if (rdev) { |
@@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3077 | set_bit(R5_Insync, &dev->flags); | 3225 | set_bit(R5_Insync, &dev->flags); |
3078 | 3226 | ||
3079 | if (rdev && test_bit(R5_WriteError, &dev->flags)) { | 3227 | if (rdev && test_bit(R5_WriteError, &dev->flags)) { |
3080 | clear_bit(R5_Insync, &dev->flags); | 3228 | /* This flag does not apply to '.replacement' |
3081 | if (!test_bit(Faulty, &rdev->flags)) { | 3229 | * only to .rdev, so make sure to check that*/ |
3230 | struct md_rdev *rdev2 = rcu_dereference( | ||
3231 | conf->disks[i].rdev); | ||
3232 | if (rdev2 == rdev) | ||
3233 | clear_bit(R5_Insync, &dev->flags); | ||
3234 | if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { | ||
3082 | s->handle_bad_blocks = 1; | 3235 | s->handle_bad_blocks = 1; |
3083 | atomic_inc(&rdev->nr_pending); | 3236 | atomic_inc(&rdev2->nr_pending); |
3084 | } else | 3237 | } else |
3085 | clear_bit(R5_WriteError, &dev->flags); | 3238 | clear_bit(R5_WriteError, &dev->flags); |
3086 | } | 3239 | } |
3087 | if (rdev && test_bit(R5_MadeGood, &dev->flags)) { | 3240 | if (rdev && test_bit(R5_MadeGood, &dev->flags)) { |
3088 | if (!test_bit(Faulty, &rdev->flags)) { | 3241 | /* This flag does not apply to '.replacement' |
3242 | * only to .rdev, so make sure to check that*/ | ||
3243 | struct md_rdev *rdev2 = rcu_dereference( | ||
3244 | conf->disks[i].rdev); | ||
3245 | if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { | ||
3089 | s->handle_bad_blocks = 1; | 3246 | s->handle_bad_blocks = 1; |
3090 | atomic_inc(&rdev->nr_pending); | 3247 | atomic_inc(&rdev2->nr_pending); |
3091 | } else | 3248 | } else |
3092 | clear_bit(R5_MadeGood, &dev->flags); | 3249 | clear_bit(R5_MadeGood, &dev->flags); |
3093 | } | 3250 | } |
3251 | if (test_bit(R5_MadeGoodRepl, &dev->flags)) { | ||
3252 | struct md_rdev *rdev2 = rcu_dereference( | ||
3253 | conf->disks[i].replacement); | ||
3254 | if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { | ||
3255 | s->handle_bad_blocks = 1; | ||
3256 | atomic_inc(&rdev2->nr_pending); | ||
3257 | } else | ||
3258 | clear_bit(R5_MadeGoodRepl, &dev->flags); | ||
3259 | } | ||
3094 | if (!test_bit(R5_Insync, &dev->flags)) { | 3260 | if (!test_bit(R5_Insync, &dev->flags)) { |
3095 | /* The ReadError flag will just be confusing now */ | 3261 | /* The ReadError flag will just be confusing now */ |
3096 | clear_bit(R5_ReadError, &dev->flags); | 3262 | clear_bit(R5_ReadError, &dev->flags); |
@@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3102 | if (s->failed < 2) | 3268 | if (s->failed < 2) |
3103 | s->failed_num[s->failed] = i; | 3269 | s->failed_num[s->failed] = i; |
3104 | s->failed++; | 3270 | s->failed++; |
3271 | if (rdev && !test_bit(Faulty, &rdev->flags)) | ||
3272 | do_recovery = 1; | ||
3105 | } | 3273 | } |
3106 | } | 3274 | } |
3107 | spin_unlock_irq(&conf->device_lock); | 3275 | spin_unlock_irq(&conf->device_lock); |
3276 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | ||
3277 | /* If there is a failed device being replaced, | ||
3278 | * we must be recovering. | ||
3279 | * else if we are after recovery_cp, we must be syncing | ||
3280 | * else we can only be replacing | ||
3281 | * sync and recovery both need to read all devices, and so | ||
3282 | * use the same flag. | ||
3283 | */ | ||
3284 | if (do_recovery || | ||
3285 | sh->sector >= conf->mddev->recovery_cp) | ||
3286 | s->syncing = 1; | ||
3287 | else | ||
3288 | s->replacing = 1; | ||
3289 | } | ||
3108 | rcu_read_unlock(); | 3290 | rcu_read_unlock(); |
3109 | } | 3291 | } |
3110 | 3292 | ||
@@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
3146 | 3328 | ||
3147 | if (unlikely(s.blocked_rdev)) { | 3329 | if (unlikely(s.blocked_rdev)) { |
3148 | if (s.syncing || s.expanding || s.expanded || | 3330 | if (s.syncing || s.expanding || s.expanded || |
3149 | s.to_write || s.written) { | 3331 | s.replacing || s.to_write || s.written) { |
3150 | set_bit(STRIPE_HANDLE, &sh->state); | 3332 | set_bit(STRIPE_HANDLE, &sh->state); |
3151 | goto finish; | 3333 | goto finish; |
3152 | } | 3334 | } |
@@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
3172 | sh->reconstruct_state = 0; | 3354 | sh->reconstruct_state = 0; |
3173 | if (s.to_read+s.to_write+s.written) | 3355 | if (s.to_read+s.to_write+s.written) |
3174 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); | 3356 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); |
3175 | if (s.syncing) | 3357 | if (s.syncing + s.replacing) |
3176 | handle_failed_sync(conf, sh, &s); | 3358 | handle_failed_sync(conf, sh, &s); |
3177 | } | 3359 | } |
3178 | 3360 | ||
@@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh) | |||
3203 | */ | 3385 | */ |
3204 | if (s.to_read || s.non_overwrite | 3386 | if (s.to_read || s.non_overwrite |
3205 | || (conf->level == 6 && s.to_write && s.failed) | 3387 | || (conf->level == 6 && s.to_write && s.failed) |
3206 | || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | 3388 | || (s.syncing && (s.uptodate + s.compute < disks)) |
3389 | || s.replacing | ||
3390 | || s.expanding) | ||
3207 | handle_stripe_fill(sh, &s, disks); | 3391 | handle_stripe_fill(sh, &s, disks); |
3208 | 3392 | ||
3209 | /* Now we check to see if any write operations have recently | 3393 | /* Now we check to see if any write operations have recently |
@@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh) | |||
3265 | handle_parity_checks5(conf, sh, &s, disks); | 3449 | handle_parity_checks5(conf, sh, &s, disks); |
3266 | } | 3450 | } |
3267 | 3451 | ||
3268 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3452 | if (s.replacing && s.locked == 0 |
3453 | && !test_bit(STRIPE_INSYNC, &sh->state)) { | ||
3454 | /* Write out to replacement devices where possible */ | ||
3455 | for (i = 0; i < conf->raid_disks; i++) | ||
3456 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && | ||
3457 | test_bit(R5_NeedReplace, &sh->dev[i].flags)) { | ||
3458 | set_bit(R5_WantReplace, &sh->dev[i].flags); | ||
3459 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3460 | s.locked++; | ||
3461 | } | ||
3462 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3463 | } | ||
3464 | if ((s.syncing || s.replacing) && s.locked == 0 && | ||
3465 | test_bit(STRIPE_INSYNC, &sh->state)) { | ||
3269 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | 3466 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
3270 | clear_bit(STRIPE_SYNCING, &sh->state); | 3467 | clear_bit(STRIPE_SYNCING, &sh->state); |
3271 | } | 3468 | } |
@@ -3363,6 +3560,15 @@ finish: | |||
3363 | STRIPE_SECTORS); | 3560 | STRIPE_SECTORS); |
3364 | rdev_dec_pending(rdev, conf->mddev); | 3561 | rdev_dec_pending(rdev, conf->mddev); |
3365 | } | 3562 | } |
3563 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { | ||
3564 | rdev = conf->disks[i].replacement; | ||
3565 | if (!rdev) | ||
3566 | /* rdev have been moved down */ | ||
3567 | rdev = conf->disks[i].rdev; | ||
3568 | rdev_clear_badblocks(rdev, sh->sector, | ||
3569 | STRIPE_SECTORS); | ||
3570 | rdev_dec_pending(rdev, conf->mddev); | ||
3571 | } | ||
3366 | } | 3572 | } |
3367 | 3573 | ||
3368 | if (s.ops_request) | 3574 | if (s.ops_request) |
@@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3586 | int dd_idx; | 3792 | int dd_idx; |
3587 | struct bio* align_bi; | 3793 | struct bio* align_bi; |
3588 | struct md_rdev *rdev; | 3794 | struct md_rdev *rdev; |
3795 | sector_t end_sector; | ||
3589 | 3796 | ||
3590 | if (!in_chunk_boundary(mddev, raid_bio)) { | 3797 | if (!in_chunk_boundary(mddev, raid_bio)) { |
3591 | pr_debug("chunk_aligned_read : non aligned\n"); | 3798 | pr_debug("chunk_aligned_read : non aligned\n"); |
@@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3610 | 0, | 3817 | 0, |
3611 | &dd_idx, NULL); | 3818 | &dd_idx, NULL); |
3612 | 3819 | ||
3820 | end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); | ||
3613 | rcu_read_lock(); | 3821 | rcu_read_lock(); |
3614 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 3822 | rdev = rcu_dereference(conf->disks[dd_idx].replacement); |
3615 | if (rdev && test_bit(In_sync, &rdev->flags)) { | 3823 | if (!rdev || test_bit(Faulty, &rdev->flags) || |
3824 | rdev->recovery_offset < end_sector) { | ||
3825 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | ||
3826 | if (rdev && | ||
3827 | (test_bit(Faulty, &rdev->flags) || | ||
3828 | !(test_bit(In_sync, &rdev->flags) || | ||
3829 | rdev->recovery_offset >= end_sector))) | ||
3830 | rdev = NULL; | ||
3831 | } | ||
3832 | if (rdev) { | ||
3616 | sector_t first_bad; | 3833 | sector_t first_bad; |
3617 | int bad_sectors; | 3834 | int bad_sectors; |
3618 | 3835 | ||
@@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int | |||
4137 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ | 4354 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ |
4138 | } | 4355 | } |
4139 | 4356 | ||
4140 | |||
4141 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); | 4357 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); |
4142 | 4358 | ||
4143 | sh = get_active_stripe(conf, sector_nr, 0, 1, 0); | 4359 | sh = get_active_stripe(conf, sector_nr, 0, 1, 0); |
@@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4208 | return handled; | 4424 | return handled; |
4209 | } | 4425 | } |
4210 | 4426 | ||
4211 | set_bit(R5_ReadError, &sh->dev[dd_idx].flags); | ||
4212 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { | 4427 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { |
4213 | release_stripe(sh); | 4428 | release_stripe(sh); |
4214 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4429 | raid5_set_bi_hw_segments(raid_bio, scnt); |
@@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
4635 | continue; | 4850 | continue; |
4636 | disk = conf->disks + raid_disk; | 4851 | disk = conf->disks + raid_disk; |
4637 | 4852 | ||
4638 | disk->rdev = rdev; | 4853 | if (test_bit(Replacement, &rdev->flags)) { |
4854 | if (disk->replacement) | ||
4855 | goto abort; | ||
4856 | disk->replacement = rdev; | ||
4857 | } else { | ||
4858 | if (disk->rdev) | ||
4859 | goto abort; | ||
4860 | disk->rdev = rdev; | ||
4861 | } | ||
4639 | 4862 | ||
4640 | if (test_bit(In_sync, &rdev->flags)) { | 4863 | if (test_bit(In_sync, &rdev->flags)) { |
4641 | char b[BDEVNAME_SIZE]; | 4864 | char b[BDEVNAME_SIZE]; |
@@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev) | |||
4724 | int dirty_parity_disks = 0; | 4947 | int dirty_parity_disks = 0; |
4725 | struct md_rdev *rdev; | 4948 | struct md_rdev *rdev; |
4726 | sector_t reshape_offset = 0; | 4949 | sector_t reshape_offset = 0; |
4950 | int i; | ||
4727 | 4951 | ||
4728 | if (mddev->recovery_cp != MaxSector) | 4952 | if (mddev->recovery_cp != MaxSector) |
4729 | printk(KERN_NOTICE "md/raid:%s: not clean" | 4953 | printk(KERN_NOTICE "md/raid:%s: not clean" |
@@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev) | |||
4813 | conf->thread = NULL; | 5037 | conf->thread = NULL; |
4814 | mddev->private = conf; | 5038 | mddev->private = conf; |
4815 | 5039 | ||
4816 | /* | 5040 | for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; |
4817 | * 0 for a fully functional array, 1 or 2 for a degraded array. | 5041 | i++) { |
4818 | */ | 5042 | rdev = conf->disks[i].rdev; |
4819 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5043 | if (!rdev && conf->disks[i].replacement) { |
4820 | if (rdev->raid_disk < 0) | 5044 | /* The replacement is all we have yet */ |
5045 | rdev = conf->disks[i].replacement; | ||
5046 | conf->disks[i].replacement = NULL; | ||
5047 | clear_bit(Replacement, &rdev->flags); | ||
5048 | conf->disks[i].rdev = rdev; | ||
5049 | } | ||
5050 | if (!rdev) | ||
4821 | continue; | 5051 | continue; |
5052 | if (conf->disks[i].replacement && | ||
5053 | conf->reshape_progress != MaxSector) { | ||
5054 | /* replacements and reshape simply do not mix. */ | ||
5055 | printk(KERN_ERR "md: cannot handle concurrent " | ||
5056 | "replacement and reshape.\n"); | ||
5057 | goto abort; | ||
5058 | } | ||
4822 | if (test_bit(In_sync, &rdev->flags)) { | 5059 | if (test_bit(In_sync, &rdev->flags)) { |
4823 | working_disks++; | 5060 | working_disks++; |
4824 | continue; | 5061 | continue; |
@@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev) | |||
4852 | dirty_parity_disks++; | 5089 | dirty_parity_disks++; |
4853 | } | 5090 | } |
4854 | 5091 | ||
4855 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) | 5092 | /* |
4856 | - working_disks); | 5093 | * 0 for a fully functional array, 1 or 2 for a degraded array. |
5094 | */ | ||
5095 | mddev->degraded = calc_degraded(conf); | ||
4857 | 5096 | ||
4858 | if (has_failed(conf)) { | 5097 | if (has_failed(conf)) { |
4859 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 5098 | printk(KERN_ERR "md/raid:%s: not enough operational devices" |
@@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev) | |||
5016 | 5255 | ||
5017 | for (i = 0; i < conf->raid_disks; i++) { | 5256 | for (i = 0; i < conf->raid_disks; i++) { |
5018 | tmp = conf->disks + i; | 5257 | tmp = conf->disks + i; |
5019 | if (tmp->rdev | 5258 | if (tmp->replacement |
5259 | && tmp->replacement->recovery_offset == MaxSector | ||
5260 | && !test_bit(Faulty, &tmp->replacement->flags) | ||
5261 | && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { | ||
5262 | /* Replacement has just become active. */ | ||
5263 | if (!tmp->rdev | ||
5264 | || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) | ||
5265 | count++; | ||
5266 | if (tmp->rdev) { | ||
5267 | /* Replaced device not technically faulty, | ||
5268 | * but we need to be sure it gets removed | ||
5269 | * and never re-added. | ||
5270 | */ | ||
5271 | set_bit(Faulty, &tmp->rdev->flags); | ||
5272 | sysfs_notify_dirent_safe( | ||
5273 | tmp->rdev->sysfs_state); | ||
5274 | } | ||
5275 | sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); | ||
5276 | } else if (tmp->rdev | ||
5020 | && tmp->rdev->recovery_offset == MaxSector | 5277 | && tmp->rdev->recovery_offset == MaxSector |
5021 | && !test_bit(Faulty, &tmp->rdev->flags) | 5278 | && !test_bit(Faulty, &tmp->rdev->flags) |
5022 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5279 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
@@ -5025,49 +5282,68 @@ static int raid5_spare_active(struct mddev *mddev) | |||
5025 | } | 5282 | } |
5026 | } | 5283 | } |
5027 | spin_lock_irqsave(&conf->device_lock, flags); | 5284 | spin_lock_irqsave(&conf->device_lock, flags); |
5028 | mddev->degraded -= count; | 5285 | mddev->degraded = calc_degraded(conf); |
5029 | spin_unlock_irqrestore(&conf->device_lock, flags); | 5286 | spin_unlock_irqrestore(&conf->device_lock, flags); |
5030 | print_raid5_conf(conf); | 5287 | print_raid5_conf(conf); |
5031 | return count; | 5288 | return count; |
5032 | } | 5289 | } |
5033 | 5290 | ||
5034 | static int raid5_remove_disk(struct mddev *mddev, int number) | 5291 | static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) |
5035 | { | 5292 | { |
5036 | struct r5conf *conf = mddev->private; | 5293 | struct r5conf *conf = mddev->private; |
5037 | int err = 0; | 5294 | int err = 0; |
5038 | struct md_rdev *rdev; | 5295 | int number = rdev->raid_disk; |
5296 | struct md_rdev **rdevp; | ||
5039 | struct disk_info *p = conf->disks + number; | 5297 | struct disk_info *p = conf->disks + number; |
5040 | 5298 | ||
5041 | print_raid5_conf(conf); | 5299 | print_raid5_conf(conf); |
5042 | rdev = p->rdev; | 5300 | if (rdev == p->rdev) |
5043 | if (rdev) { | 5301 | rdevp = &p->rdev; |
5044 | if (number >= conf->raid_disks && | 5302 | else if (rdev == p->replacement) |
5045 | conf->reshape_progress == MaxSector) | 5303 | rdevp = &p->replacement; |
5046 | clear_bit(In_sync, &rdev->flags); | 5304 | else |
5305 | return 0; | ||
5047 | 5306 | ||
5048 | if (test_bit(In_sync, &rdev->flags) || | 5307 | if (number >= conf->raid_disks && |
5049 | atomic_read(&rdev->nr_pending)) { | 5308 | conf->reshape_progress == MaxSector) |
5050 | err = -EBUSY; | 5309 | clear_bit(In_sync, &rdev->flags); |
5051 | goto abort; | 5310 | |
5052 | } | 5311 | if (test_bit(In_sync, &rdev->flags) || |
5053 | /* Only remove non-faulty devices if recovery | 5312 | atomic_read(&rdev->nr_pending)) { |
5054 | * isn't possible. | 5313 | err = -EBUSY; |
5055 | */ | 5314 | goto abort; |
5056 | if (!test_bit(Faulty, &rdev->flags) && | ||
5057 | mddev->recovery_disabled != conf->recovery_disabled && | ||
5058 | !has_failed(conf) && | ||
5059 | number < conf->raid_disks) { | ||
5060 | err = -EBUSY; | ||
5061 | goto abort; | ||
5062 | } | ||
5063 | p->rdev = NULL; | ||
5064 | synchronize_rcu(); | ||
5065 | if (atomic_read(&rdev->nr_pending)) { | ||
5066 | /* lost the race, try later */ | ||
5067 | err = -EBUSY; | ||
5068 | p->rdev = rdev; | ||
5069 | } | ||
5070 | } | 5315 | } |
5316 | /* Only remove non-faulty devices if recovery | ||
5317 | * isn't possible. | ||
5318 | */ | ||
5319 | if (!test_bit(Faulty, &rdev->flags) && | ||
5320 | mddev->recovery_disabled != conf->recovery_disabled && | ||
5321 | !has_failed(conf) && | ||
5322 | (!p->replacement || p->replacement == rdev) && | ||
5323 | number < conf->raid_disks) { | ||
5324 | err = -EBUSY; | ||
5325 | goto abort; | ||
5326 | } | ||
5327 | *rdevp = NULL; | ||
5328 | synchronize_rcu(); | ||
5329 | if (atomic_read(&rdev->nr_pending)) { | ||
5330 | /* lost the race, try later */ | ||
5331 | err = -EBUSY; | ||
5332 | *rdevp = rdev; | ||
5333 | } else if (p->replacement) { | ||
5334 | /* We must have just cleared 'rdev' */ | ||
5335 | p->rdev = p->replacement; | ||
5336 | clear_bit(Replacement, &p->replacement->flags); | ||
5337 | smp_mb(); /* Make sure other CPUs may see both as identical | ||
5338 | * but will never see neither - if they are careful | ||
5339 | */ | ||
5340 | p->replacement = NULL; | ||
5341 | clear_bit(WantReplacement, &rdev->flags); | ||
5342 | } else | ||
5343 | /* We might have just removed the Replacement as faulty- | ||
5344 | * clear the bit just in case | ||
5345 | */ | ||
5346 | clear_bit(WantReplacement, &rdev->flags); | ||
5071 | abort: | 5347 | abort: |
5072 | 5348 | ||
5073 | print_raid5_conf(conf); | 5349 | print_raid5_conf(conf); |
@@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5103 | disk = rdev->saved_raid_disk; | 5379 | disk = rdev->saved_raid_disk; |
5104 | else | 5380 | else |
5105 | disk = first; | 5381 | disk = first; |
5106 | for ( ; disk <= last ; disk++) | 5382 | for ( ; disk <= last ; disk++) { |
5107 | if ((p=conf->disks + disk)->rdev == NULL) { | 5383 | p = conf->disks + disk; |
5384 | if (p->rdev == NULL) { | ||
5108 | clear_bit(In_sync, &rdev->flags); | 5385 | clear_bit(In_sync, &rdev->flags); |
5109 | rdev->raid_disk = disk; | 5386 | rdev->raid_disk = disk; |
5110 | err = 0; | 5387 | err = 0; |
@@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5113 | rcu_assign_pointer(p->rdev, rdev); | 5390 | rcu_assign_pointer(p->rdev, rdev); |
5114 | break; | 5391 | break; |
5115 | } | 5392 | } |
5393 | if (test_bit(WantReplacement, &p->rdev->flags) && | ||
5394 | p->replacement == NULL) { | ||
5395 | clear_bit(In_sync, &rdev->flags); | ||
5396 | set_bit(Replacement, &rdev->flags); | ||
5397 | rdev->raid_disk = disk; | ||
5398 | err = 0; | ||
5399 | conf->fullsync = 1; | ||
5400 | rcu_assign_pointer(p->replacement, rdev); | ||
5401 | break; | ||
5402 | } | ||
5403 | } | ||
5116 | print_raid5_conf(conf); | 5404 | print_raid5_conf(conf); |
5117 | return err; | 5405 | return err; |
5118 | } | 5406 | } |
@@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5286 | * pre and post number of devices. | 5574 | * pre and post number of devices. |
5287 | */ | 5575 | */ |
5288 | spin_lock_irqsave(&conf->device_lock, flags); | 5576 | spin_lock_irqsave(&conf->device_lock, flags); |
5289 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) | 5577 | mddev->degraded = calc_degraded(conf); |
5290 | - added_devices; | ||
5291 | spin_unlock_irqrestore(&conf->device_lock, flags); | 5578 | spin_unlock_irqrestore(&conf->device_lock, flags); |
5292 | } | 5579 | } |
5293 | mddev->raid_disks = conf->raid_disks; | 5580 | mddev->raid_disks = conf->raid_disks; |
@@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev) | |||
5356 | revalidate_disk(mddev->gendisk); | 5643 | revalidate_disk(mddev->gendisk); |
5357 | } else { | 5644 | } else { |
5358 | int d; | 5645 | int d; |
5359 | mddev->degraded = conf->raid_disks; | 5646 | spin_lock_irq(&conf->device_lock); |
5360 | for (d = 0; d < conf->raid_disks ; d++) | 5647 | mddev->degraded = calc_degraded(conf); |
5361 | if (conf->disks[d].rdev && | 5648 | spin_unlock_irq(&conf->device_lock); |
5362 | test_bit(In_sync, | ||
5363 | &conf->disks[d].rdev->flags)) | ||
5364 | mddev->degraded--; | ||
5365 | for (d = conf->raid_disks ; | 5649 | for (d = conf->raid_disks ; |
5366 | d < conf->raid_disks - mddev->delta_disks; | 5650 | d < conf->raid_disks - mddev->delta_disks; |
5367 | d++) { | 5651 | d++) { |
5368 | struct md_rdev *rdev = conf->disks[d].rdev; | 5652 | struct md_rdev *rdev = conf->disks[d].rdev; |
5369 | if (rdev && raid5_remove_disk(mddev, d) == 0) { | 5653 | if (rdev && |
5654 | raid5_remove_disk(mddev, rdev) == 0) { | ||
5370 | sysfs_unlink_rdev(mddev, rdev); | 5655 | sysfs_unlink_rdev(mddev, rdev); |
5371 | rdev->raid_disk = -1; | 5656 | rdev->raid_disk = -1; |
5372 | } | 5657 | } |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index e10c5531f9c5..8d8e13934a48 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -27,7 +27,7 @@ | |||
27 | * The possible state transitions are: | 27 | * The possible state transitions are: |
28 | * | 28 | * |
29 | * Empty -> Want - on read or write to get old data for parity calc | 29 | * Empty -> Want - on read or write to get old data for parity calc |
30 | * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) | 30 | * Empty -> Dirty - on compute_parity to satisfy write/sync request. |
31 | * Empty -> Clean - on compute_block when computing a block for failed drive | 31 | * Empty -> Clean - on compute_block when computing a block for failed drive |
32 | * Want -> Empty - on failed read | 32 | * Want -> Empty - on failed read |
33 | * Want -> Clean - on successful completion of read request | 33 | * Want -> Clean - on successful completion of read request |
@@ -226,8 +226,11 @@ struct stripe_head { | |||
226 | #endif | 226 | #endif |
227 | } ops; | 227 | } ops; |
228 | struct r5dev { | 228 | struct r5dev { |
229 | struct bio req; | 229 | /* rreq and rvec are used for the replacement device when |
230 | struct bio_vec vec; | 230 | * writing data to both devices. |
231 | */ | ||
232 | struct bio req, rreq; | ||
233 | struct bio_vec vec, rvec; | ||
231 | struct page *page; | 234 | struct page *page; |
232 | struct bio *toread, *read, *towrite, *written; | 235 | struct bio *toread, *read, *towrite, *written; |
233 | sector_t sector; /* sector of this page */ | 236 | sector_t sector; /* sector of this page */ |
@@ -239,7 +242,13 @@ struct stripe_head { | |||
239 | * for handle_stripe. | 242 | * for handle_stripe. |
240 | */ | 243 | */ |
241 | struct stripe_head_state { | 244 | struct stripe_head_state { |
242 | int syncing, expanding, expanded; | 245 | /* 'syncing' means that we need to read all devices, either |
246 | * to check/correct parity, or to reconstruct a missing device. | ||
247 | * 'replacing' means we are replacing one or more drives and | ||
248 | * the source is valid at this point so we don't need to | ||
249 | * read all devices, just the replacement targets. | ||
250 | */ | ||
251 | int syncing, expanding, expanded, replacing; | ||
243 | int locked, uptodate, to_read, to_write, failed, written; | 252 | int locked, uptodate, to_read, to_write, failed, written; |
244 | int to_fill, compute, req_compute, non_overwrite; | 253 | int to_fill, compute, req_compute, non_overwrite; |
245 | int failed_num[2]; | 254 | int failed_num[2]; |
@@ -252,38 +261,41 @@ struct stripe_head_state { | |||
252 | int handle_bad_blocks; | 261 | int handle_bad_blocks; |
253 | }; | 262 | }; |
254 | 263 | ||
255 | /* Flags */ | 264 | /* Flags for struct r5dev.flags */ |
256 | #define R5_UPTODATE 0 /* page contains current data */ | 265 | enum r5dev_flags { |
257 | #define R5_LOCKED 1 /* IO has been submitted on "req" */ | 266 | R5_UPTODATE, /* page contains current data */ |
258 | #define R5_OVERWRITE 2 /* towrite covers whole page */ | 267 | R5_LOCKED, /* IO has been submitted on "req" */ |
268 | R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */ | ||
269 | R5_OVERWRITE, /* towrite covers whole page */ | ||
259 | /* and some that are internal to handle_stripe */ | 270 | /* and some that are internal to handle_stripe */ |
260 | #define R5_Insync 3 /* rdev && rdev->in_sync at start */ | 271 | R5_Insync, /* rdev && rdev->in_sync at start */ |
261 | #define R5_Wantread 4 /* want to schedule a read */ | 272 | R5_Wantread, /* want to schedule a read */ |
262 | #define R5_Wantwrite 5 | 273 | R5_Wantwrite, |
263 | #define R5_Overlap 7 /* There is a pending overlapping request on this block */ | 274 | R5_Overlap, /* There is a pending overlapping request |
264 | #define R5_ReadError 8 /* seen a read error here recently */ | 275 | * on this block */ |
265 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ | 276 | R5_ReadError, /* seen a read error here recently */ |
277 | R5_ReWrite, /* have tried to over-write the readerror */ | ||
266 | 278 | ||
267 | #define R5_Expanded 10 /* This block now has post-expand data */ | 279 | R5_Expanded, /* This block now has post-expand data */ |
268 | #define R5_Wantcompute 11 /* compute_block in progress treat as | 280 | R5_Wantcompute, /* compute_block in progress treat as |
269 | * uptodate | 281 | * uptodate |
270 | */ | 282 | */ |
271 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | 283 | R5_Wantfill, /* dev->toread contains a bio that needs |
272 | * filling | 284 | * filling |
273 | */ | 285 | */ |
274 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ | 286 | R5_Wantdrain, /* dev->towrite needs to be drained */ |
275 | #define R5_WantFUA 14 /* Write should be FUA */ | 287 | R5_WantFUA, /* Write should be FUA */ |
276 | #define R5_WriteError 15 /* got a write error - need to record it */ | 288 | R5_WriteError, /* got a write error - need to record it */ |
277 | #define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ | 289 | R5_MadeGood, /* A bad block has been fixed by writing to it */ |
278 | /* | 290 | R5_ReadRepl, /* Will/did read from replacement rather than orig */ |
279 | * Write method | 291 | R5_MadeGoodRepl,/* A bad block on the replacement device has been |
280 | */ | 292 | * fixed by writing to it */ |
281 | #define RECONSTRUCT_WRITE 1 | 293 | R5_NeedReplace, /* This device has a replacement which is not |
282 | #define READ_MODIFY_WRITE 2 | 294 | * up-to-date at this stripe. */ |
283 | /* not a write method, but a compute_parity mode */ | 295 | R5_WantReplace, /* We need to update the replacement, we have read |
284 | #define CHECK_PARITY 3 | 296 | * data in, and now is a good time to write it out. |
285 | /* Additional compute_parity mode -- updates the parity w/o LOCKING */ | 297 | */ |
286 | #define UPDATE_PARITY 4 | 298 | }; |
287 | 299 | ||
288 | /* | 300 | /* |
289 | * Stripe state | 301 | * Stripe state |
@@ -311,13 +323,14 @@ enum { | |||
311 | /* | 323 | /* |
312 | * Operation request flags | 324 | * Operation request flags |
313 | */ | 325 | */ |
314 | #define STRIPE_OP_BIOFILL 0 | 326 | enum { |
315 | #define STRIPE_OP_COMPUTE_BLK 1 | 327 | STRIPE_OP_BIOFILL, |
316 | #define STRIPE_OP_PREXOR 2 | 328 | STRIPE_OP_COMPUTE_BLK, |
317 | #define STRIPE_OP_BIODRAIN 3 | 329 | STRIPE_OP_PREXOR, |
318 | #define STRIPE_OP_RECONSTRUCT 4 | 330 | STRIPE_OP_BIODRAIN, |
319 | #define STRIPE_OP_CHECK 5 | 331 | STRIPE_OP_RECONSTRUCT, |
320 | 332 | STRIPE_OP_CHECK, | |
333 | }; | ||
321 | /* | 334 | /* |
322 | * Plugging: | 335 | * Plugging: |
323 | * | 336 | * |
@@ -344,13 +357,12 @@ enum { | |||
344 | 357 | ||
345 | 358 | ||
346 | struct disk_info { | 359 | struct disk_info { |
347 | struct md_rdev *rdev; | 360 | struct md_rdev *rdev, *replacement; |
348 | }; | 361 | }; |
349 | 362 | ||
350 | struct r5conf { | 363 | struct r5conf { |
351 | struct hlist_head *stripe_hashtbl; | 364 | struct hlist_head *stripe_hashtbl; |
352 | struct mddev *mddev; | 365 | struct mddev *mddev; |
353 | struct disk_info *spare; | ||
354 | int chunk_sectors; | 366 | int chunk_sectors; |
355 | int level, algorithm; | 367 | int level, algorithm; |
356 | int max_degraded; | 368 | int max_degraded; |