path: root/drivers/md
author    Linus Torvalds <torvalds@linux-foundation.org>  2011-07-28 08:50:27 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-07-28 08:50:27 -0400
commit    6140333d3656f62ac7e6a5af87e7fe92cfb8d655 (patch)
tree      d96f7ad2196b4383f5ca4396c956e24c82b2952c /drivers/md
parent    6f56c218666b5c7eff354364357307d18c10058b (diff)
parent    58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (75 commits)
  md/raid10: handle further errors during fix_read_error better.
  md/raid10: Handle read errors during recovery better.
  md/raid10: simplify read error handling during recovery.
  md/raid10: record bad blocks due to write errors during resync/recovery.
  md/raid10: attempt to fix read errors during resync/check
  md/raid10: Handle write errors by updating badblock log.
  md/raid10: clear bad-block record when write succeeds.
  md/raid10: avoid writing to known bad blocks on known bad drives.
  md/raid10 record bad blocks as needed during recovery.
  md/raid10: avoid reading known bad blocks during resync/recovery.
  md/raid10 - avoid reading from known bad blocks - part 3
  md/raid10: avoid reading from known bad blocks - part 2
  md/raid10: avoid reading from known bad blocks - part 1
  md/raid10: Split handle_read_error out from raid10d.
  md/raid10: simplify/reindent some loops.
  md/raid5: Clear bad blocks on successful write.
  md/raid5. Don't write to known bad block on doubtful devices.
  md/raid5: write errors should be recorded as bad blocks if possible.
  md/raid5: use bad-block log to improve handling of uncorrectable read errors.
  md/raid5: avoid reading from known bad blocks.
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c   137
-rw-r--r--  drivers/md/bitmap.h     5
-rw-r--r--  drivers/md/md.c       871
-rw-r--r--  drivers/md/md.h       110
-rw-r--r--  drivers/md/raid1.c    962
-rw-r--r--  drivers/md/raid1.h     26
-rw-r--r--  drivers/md/raid10.c  1183
-rw-r--r--  drivers/md/raid10.h    21
-rw-r--r--  drivers/md/raid5.c   1015
-rw-r--r--  drivers/md/raid5.h     99
10 files changed, 3060 insertions, 1369 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd3..0dc6546b77a8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@
29#include "md.h" 29#include "md.h"
30#include "bitmap.h" 30#include "bitmap.h"
31 31
32#include <linux/dm-dirty-log.h>
33/* debug macros */ 32/* debug macros */
34 33
35#define DEBUG 0 34#define DEBUG 0
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
775 * 0 or page 1 774 * 0 or page 1
776 */ 775 */
777static inline struct page *filemap_get_page(struct bitmap *bitmap, 776static inline struct page *filemap_get_page(struct bitmap *bitmap,
778 unsigned long chunk) 777 unsigned long chunk)
779{ 778{
780 if (bitmap->filemap == NULL)
781 return NULL;
782 if (file_page_index(bitmap, chunk) >= bitmap->file_pages) 779 if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
783 return NULL; 780 return NULL;
784 return bitmap->filemap[file_page_index(bitmap, chunk) 781 return bitmap->filemap[file_page_index(bitmap, chunk)
@@ -878,28 +875,19 @@ enum bitmap_page_attr {
878static inline void set_page_attr(struct bitmap *bitmap, struct page *page, 875static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
879 enum bitmap_page_attr attr) 876 enum bitmap_page_attr attr)
880{ 877{
881 if (page) 878 __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
882 __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
883 else
884 __set_bit(attr, &bitmap->logattrs);
885} 879}
886 880
887static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, 881static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
888 enum bitmap_page_attr attr) 882 enum bitmap_page_attr attr)
889{ 883{
890 if (page) 884 __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
891 __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
892 else
893 __clear_bit(attr, &bitmap->logattrs);
894} 885}
895 886
896static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, 887static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
897 enum bitmap_page_attr attr) 888 enum bitmap_page_attr attr)
898{ 889{
899 if (page) 890 return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
900 return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
901 else
902 return test_bit(attr, &bitmap->logattrs);
903} 891}
904 892
905/* 893/*
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
912static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) 900static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
913{ 901{
914 unsigned long bit; 902 unsigned long bit;
915 struct page *page = NULL; 903 struct page *page;
916 void *kaddr; 904 void *kaddr;
917 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); 905 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
918 906
919 if (!bitmap->filemap) { 907 if (!bitmap->filemap)
920 struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; 908 return;
921 if (log)
922 log->type->mark_region(log, chunk);
923 } else {
924 909
925 page = filemap_get_page(bitmap, chunk); 910 page = filemap_get_page(bitmap, chunk);
926 if (!page) 911 if (!page)
927 return; 912 return;
928 bit = file_page_offset(bitmap, chunk); 913 bit = file_page_offset(bitmap, chunk);
929 914
930 /* set the bit */ 915 /* set the bit */
931 kaddr = kmap_atomic(page, KM_USER0); 916 kaddr = kmap_atomic(page, KM_USER0);
932 if (bitmap->flags & BITMAP_HOSTENDIAN) 917 if (bitmap->flags & BITMAP_HOSTENDIAN)
933 set_bit(bit, kaddr); 918 set_bit(bit, kaddr);
934 else 919 else
935 __test_and_set_bit_le(bit, kaddr); 920 __set_bit_le(bit, kaddr);
936 kunmap_atomic(kaddr, KM_USER0); 921 kunmap_atomic(kaddr, KM_USER0);
937 PRINTK("set file bit %lu page %lu\n", bit, page->index); 922 PRINTK("set file bit %lu page %lu\n", bit, page->index);
938 }
939 /* record page number so it gets flushed to disk when unplug occurs */ 923 /* record page number so it gets flushed to disk when unplug occurs */
940 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 924 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
941} 925}
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap)
952 936
953 if (!bitmap) 937 if (!bitmap)
954 return; 938 return;
955 if (!bitmap->filemap) {
956 /* Must be using a dirty_log */
957 struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
958 dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
959 need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
960 if (dirty || need_write)
961 if (log->type->flush(log))
962 bitmap->flags |= BITMAP_WRITE_ERROR;
963 goto out;
964 }
965 939
966 /* look at each page to see if there are any set bits that need to be 940 /* look at each page to see if there are any set bits that need to be
967 * flushed out to disk */ 941 * flushed out to disk */
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap)
990 else 964 else
991 md_super_wait(bitmap->mddev); 965 md_super_wait(bitmap->mddev);
992 } 966 }
993out:
994 if (bitmap->flags & BITMAP_WRITE_ERROR) 967 if (bitmap->flags & BITMAP_WRITE_ERROR)
995 bitmap_file_kick(bitmap); 968 bitmap_file_kick(bitmap);
996} 969}
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev)
1199 struct page *page = NULL, *lastpage = NULL; 1172 struct page *page = NULL, *lastpage = NULL;
1200 sector_t blocks; 1173 sector_t blocks;
1201 void *paddr; 1174 void *paddr;
1202 struct dm_dirty_log *log = mddev->bitmap_info.log;
1203 1175
1204 /* Use a mutex to guard daemon_work against 1176 /* Use a mutex to guard daemon_work against
1205 * bitmap_destroy. 1177 * bitmap_destroy.
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev)
1224 spin_lock_irqsave(&bitmap->lock, flags); 1196 spin_lock_irqsave(&bitmap->lock, flags);
1225 for (j = 0; j < bitmap->chunks; j++) { 1197 for (j = 0; j < bitmap->chunks; j++) {
1226 bitmap_counter_t *bmc; 1198 bitmap_counter_t *bmc;
1227 if (!bitmap->filemap) { 1199 if (!bitmap->filemap)
1228 if (!log) 1200 /* error or shutdown */
1229 /* error or shutdown */ 1201 break;
1230 break; 1202
1231 } else 1203 page = filemap_get_page(bitmap, j);
1232 page = filemap_get_page(bitmap, j);
1233 1204
1234 if (page != lastpage) { 1205 if (page != lastpage) {
1235 /* skip this page unless it's marked as needing cleaning */ 1206 /* skip this page unless it's marked as needing cleaning */
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev)
1298 -1); 1269 -1);
1299 1270
1300 /* clear the bit */ 1271 /* clear the bit */
1301 if (page) { 1272 paddr = kmap_atomic(page, KM_USER0);
1302 paddr = kmap_atomic(page, KM_USER0); 1273 if (bitmap->flags & BITMAP_HOSTENDIAN)
1303 if (bitmap->flags & BITMAP_HOSTENDIAN) 1274 clear_bit(file_page_offset(bitmap, j),
1304 clear_bit(file_page_offset(bitmap, j), 1275 paddr);
1305 paddr); 1276 else
1306 else 1277 __clear_bit_le(
1307 __test_and_clear_bit_le(file_page_offset(bitmap, j), 1278 file_page_offset(bitmap,
1308 paddr); 1279 j),
1309 kunmap_atomic(paddr, KM_USER0); 1280 paddr);
1310 } else 1281 kunmap_atomic(paddr, KM_USER0);
1311 log->type->clear_region(log, j);
1312 } 1282 }
1313 } else 1283 } else
1314 j |= PAGE_COUNTER_MASK; 1284 j |= PAGE_COUNTER_MASK;
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev)
1316 spin_unlock_irqrestore(&bitmap->lock, flags); 1286 spin_unlock_irqrestore(&bitmap->lock, flags);
1317 1287
1318 /* now sync the final page */ 1288 /* now sync the final page */
1319 if (lastpage != NULL || log != NULL) { 1289 if (lastpage != NULL) {
1320 spin_lock_irqsave(&bitmap->lock, flags); 1290 spin_lock_irqsave(&bitmap->lock, flags);
1321 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { 1291 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
1322 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1292 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1323 spin_unlock_irqrestore(&bitmap->lock, flags); 1293 spin_unlock_irqrestore(&bitmap->lock, flags);
1324 if (lastpage) 1294 write_page(bitmap, lastpage, 0);
1325 write_page(bitmap, lastpage, 0);
1326 else
1327 if (log->type->flush(log))
1328 bitmap->flags |= BITMAP_WRITE_ERROR;
1329 } else { 1295 } else {
1330 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1296 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1331 spin_unlock_irqrestore(&bitmap->lock, flags); 1297 spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev)
1767 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1733 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1768 1734
1769 if (!file 1735 if (!file
1770 && !mddev->bitmap_info.offset 1736 && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1771 && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
1772 return 0; 1737 return 0;
1773 1738
1774 BUG_ON(file && mddev->bitmap_info.offset); 1739 BUG_ON(file && mddev->bitmap_info.offset);
1775 BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);
1776 1740
1777 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1741 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1778 if (!bitmap) 1742 if (!bitmap)
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev)
1863int bitmap_load(mddev_t *mddev) 1827int bitmap_load(mddev_t *mddev)
1864{ 1828{
1865 int err = 0; 1829 int err = 0;
1830 sector_t start = 0;
1866 sector_t sector = 0; 1831 sector_t sector = 0;
1867 struct bitmap *bitmap = mddev->bitmap; 1832 struct bitmap *bitmap = mddev->bitmap;
1868 1833
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev)
1881 } 1846 }
1882 bitmap_close_sync(bitmap); 1847 bitmap_close_sync(bitmap);
1883 1848
1884 if (mddev->bitmap_info.log) { 1849 if (mddev->degraded == 0
1885 unsigned long i; 1850 || bitmap->events_cleared == mddev->events)
1886 struct dm_dirty_log *log = mddev->bitmap_info.log; 1851 /* no need to keep dirty bits to optimise a
1887 for (i = 0; i < bitmap->chunks; i++) 1852 * re-add of a missing device */
1888 if (!log->type->in_sync(log, i, 1)) 1853 start = mddev->recovery_cp;
1889 bitmap_set_memory_bits(bitmap, 1854
1890 (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), 1855 err = bitmap_init_from_disk(bitmap, start);
1891 1); 1856
1892 } else {
1893 sector_t start = 0;
1894 if (mddev->degraded == 0
1895 || bitmap->events_cleared == mddev->events)
1896 /* no need to keep dirty bits to optimise a
1897 * re-add of a missing device */
1898 start = mddev->recovery_cp;
1899
1900 err = bitmap_init_from_disk(bitmap, start);
1901 }
1902 if (err) 1857 if (err)
1903 goto out; 1858 goto out;
1904 1859
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891ac..a28f2e5588c6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap {
212 unsigned long file_pages; /* number of pages in the file */ 212 unsigned long file_pages; /* number of pages in the file */
213 int last_page_size; /* bytes in the last page */ 213 int last_page_size; /* bytes in the last page */
214 214
215 unsigned long logattrs; /* used when filemap_attr doesn't exist
216 * because we are working with a dirty_log
217 */
218
219 unsigned long flags; 215 unsigned long flags;
220 216
221 int allclean; 217 int allclean;
@@ -237,7 +233,6 @@ struct bitmap {
237 wait_queue_head_t behind_wait; 233 wait_queue_head_t behind_wait;
238 234
239 struct sysfs_dirent *sysfs_can_clear; 235 struct sysfs_dirent *sysfs_can_clear;
240
241}; 236};
242 237
243/* the bitmap API */ 238/* the bitmap API */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70b..8e221a20f5d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
215} 215}
216EXPORT_SYMBOL_GPL(bio_clone_mddev); 216EXPORT_SYMBOL_GPL(bio_clone_mddev);
217 217
218void md_trim_bio(struct bio *bio, int offset, int size)
219{
220 /* 'bio' is a cloned bio which we need to trim to match
221 * the given offset and size.
222 * This requires adjusting bi_sector, bi_size, and bi_io_vec
223 */
224 int i;
225 struct bio_vec *bvec;
226 int sofar = 0;
227
228 size <<= 9;
229 if (offset == 0 && size == bio->bi_size)
230 return;
231
232 bio->bi_sector += offset;
233 bio->bi_size = size;
234 offset <<= 9;
235 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
236
237 while (bio->bi_idx < bio->bi_vcnt &&
238 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
239 /* remove this whole bio_vec */
240 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
241 bio->bi_idx++;
242 }
243 if (bio->bi_idx < bio->bi_vcnt) {
244 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
245 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
246 }
247 /* avoid any complications with bi_idx being non-zero*/
248 if (bio->bi_idx) {
249 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
250 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
251 bio->bi_vcnt -= bio->bi_idx;
252 bio->bi_idx = 0;
253 }
254 /* Make sure vcnt and last bv are not too big */
255 bio_for_each_segment(bvec, bio, i) {
256 if (sofar + bvec->bv_len > size)
257 bvec->bv_len = size - sofar;
258 if (bvec->bv_len == 0) {
259 bio->bi_vcnt = i;
260 break;
261 }
262 sofar += bvec->bv_len;
263 }
264}
265EXPORT_SYMBOL_GPL(md_trim_bio);
266
218/* 267/*
219 * We have a system wide 'event count' that is incremented 268 * We have a system wide 'event count' that is incremented
220 * on any 'interesting' event, and readers of /proc/mdstat 269 * on any 'interesting' event, and readers of /proc/mdstat
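md_trim_bio() above is the new helper the RAID personalities use to clip a cloned bio so a read or write stops short of a known bad block. A minimal sketch of the intended call pattern, loosely modelled on the raid1 changes elsewhere in this merge -- 'mddev', 'master_bio' and 'good_sectors' are illustrative names assumed to come from the surrounding request handling:

	struct bio *read_bio = bio_clone_mddev(master_bio, GFP_NOIO, mddev);

	if (good_sectors < bio_sectors(master_bio))
		/* keep only the leading part of the request that stays
		 * clear of the bad range; offset 0 keeps the original start */
		md_trim_bio(read_bio, 0, good_sectors);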
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
757 rdev->sb_start = 0; 806 rdev->sb_start = 0;
758 rdev->sectors = 0; 807 rdev->sectors = 0;
759 } 808 }
809 if (rdev->bb_page) {
810 put_page(rdev->bb_page);
811 rdev->bb_page = NULL;
812 }
760} 813}
761 814
762 815
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1025 ret = -EINVAL; 1078 ret = -EINVAL;
1026 1079
1027 bdevname(rdev->bdev, b); 1080 bdevname(rdev->bdev, b);
1028 sb = (mdp_super_t*)page_address(rdev->sb_page); 1081 sb = page_address(rdev->sb_page);
1029 1082
1030 if (sb->md_magic != MD_SB_MAGIC) { 1083 if (sb->md_magic != MD_SB_MAGIC) {
1031 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 1084 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1054 rdev->preferred_minor = sb->md_minor; 1107 rdev->preferred_minor = sb->md_minor;
1055 rdev->data_offset = 0; 1108 rdev->data_offset = 0;
1056 rdev->sb_size = MD_SB_BYTES; 1109 rdev->sb_size = MD_SB_BYTES;
1110 rdev->badblocks.shift = -1;
1057 1111
1058 if (sb->level == LEVEL_MULTIPATH) 1112 if (sb->level == LEVEL_MULTIPATH)
1059 rdev->desc_nr = -1; 1113 rdev->desc_nr = -1;
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1064 ret = 1; 1118 ret = 1;
1065 } else { 1119 } else {
1066 __u64 ev1, ev2; 1120 __u64 ev1, ev2;
1067 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 1121 mdp_super_t *refsb = page_address(refdev->sb_page);
1068 if (!uuid_equal(refsb, sb)) { 1122 if (!uuid_equal(refsb, sb)) {
1069 printk(KERN_WARNING "md: %s has different UUID to %s\n", 1123 printk(KERN_WARNING "md: %s has different UUID to %s\n",
1070 b, bdevname(refdev->bdev,b2)); 1124 b, bdevname(refdev->bdev,b2));
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1099static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1153static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1100{ 1154{
1101 mdp_disk_t *desc; 1155 mdp_disk_t *desc;
1102 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 1156 mdp_super_t *sb = page_address(rdev->sb_page);
1103 __u64 ev1 = md_event(sb); 1157 __u64 ev1 = md_event(sb);
1104 1158
1105 rdev->raid_disk = -1; 1159 rdev->raid_disk = -1;
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1230 1284
1231 rdev->sb_size = MD_SB_BYTES; 1285 rdev->sb_size = MD_SB_BYTES;
1232 1286
1233 sb = (mdp_super_t*)page_address(rdev->sb_page); 1287 sb = page_address(rdev->sb_page);
1234 1288
1235 memset(sb, 0, sizeof(*sb)); 1289 memset(sb, 0, sizeof(*sb));
1236 1290
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1395 return cpu_to_le32(csum); 1449 return cpu_to_le32(csum);
1396} 1450}
1397 1451
1452static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1453 int acknowledged);
1398static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1454static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1399{ 1455{
1400 struct mdp_superblock_1 *sb; 1456 struct mdp_superblock_1 *sb;
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1435 if (ret) return ret; 1491 if (ret) return ret;
1436 1492
1437 1493
1438 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1494 sb = page_address(rdev->sb_page);
1439 1495
1440 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1496 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1441 sb->major_version != cpu_to_le32(1) || 1497 sb->major_version != cpu_to_le32(1) ||
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1473 else 1529 else
1474 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1530 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1475 1531
1532 if (!rdev->bb_page) {
1533 rdev->bb_page = alloc_page(GFP_KERNEL);
1534 if (!rdev->bb_page)
1535 return -ENOMEM;
1536 }
1537 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1538 rdev->badblocks.count == 0) {
1539 /* need to load the bad block list.
1540 * Currently we limit it to one page.
1541 */
1542 s32 offset;
1543 sector_t bb_sector;
1544 u64 *bbp;
1545 int i;
1546 int sectors = le16_to_cpu(sb->bblog_size);
1547 if (sectors > (PAGE_SIZE / 512))
1548 return -EINVAL;
1549 offset = le32_to_cpu(sb->bblog_offset);
1550 if (offset == 0)
1551 return -EINVAL;
1552 bb_sector = (long long)offset;
1553 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1554 rdev->bb_page, READ, true))
1555 return -EIO;
1556 bbp = (u64 *)page_address(rdev->bb_page);
1557 rdev->badblocks.shift = sb->bblog_shift;
1558 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1559 u64 bb = le64_to_cpu(*bbp);
1560 int count = bb & (0x3ff);
1561 u64 sector = bb >> 10;
1562 sector <<= sb->bblog_shift;
1563 count <<= sb->bblog_shift;
1564 if (bb + 1 == 0)
1565 break;
1566 if (md_set_badblocks(&rdev->badblocks,
1567 sector, count, 1) == 0)
1568 return -EINVAL;
1569 }
1570 } else if (sb->bblog_offset == 0)
1571 rdev->badblocks.shift = -1;
1572
1476 if (!refdev) { 1573 if (!refdev) {
1477 ret = 1; 1574 ret = 1;
1478 } else { 1575 } else {
1479 __u64 ev1, ev2; 1576 __u64 ev1, ev2;
1480 struct mdp_superblock_1 *refsb = 1577 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1481 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1482 1578
1483 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1579 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1484 sb->level != refsb->level || 1580 sb->level != refsb->level ||
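The block added above reads the on-disk bad-block log when a v1.x superblock advertises MD_FEATURE_BAD_BLOCKS. A sketch of the entry layout it parses -- the helper name is illustrative, only the bit layout is taken from the loop above (low 10 bits hold the length, the remaining bits the start sector, both still scaled by sb->bblog_shift; an all-ones word ends the log):

	static bool decode_bblog_entry(__le64 raw, int shift,
				       sector_t *sector, int *count)
	{
		u64 bb = le64_to_cpu(raw);

		if (bb + 1 == 0)	/* 0xffff... fill marks the end */
			return false;
		*count  = (int)(bb & 0x3ff) << shift;
		*sector = (sector_t)(bb >> 10) << shift;
		return true;
	}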
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1513 1609
1514static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1610static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1515{ 1611{
1516 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1612 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1517 __u64 ev1 = le64_to_cpu(sb->events); 1613 __u64 ev1 = le64_to_cpu(sb->events);
1518 1614
1519 rdev->raid_disk = -1; 1615 rdev->raid_disk = -1;
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1619 int max_dev, i; 1715 int max_dev, i;
1620 /* make rdev->sb match mddev and rdev data. */ 1716 /* make rdev->sb match mddev and rdev data. */
1621 1717
1622 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1718 sb = page_address(rdev->sb_page);
1623 1719
1624 sb->feature_map = 0; 1720 sb->feature_map = 0;
1625 sb->pad0 = 0; 1721 sb->pad0 = 0;
1626 sb->recovery_offset = cpu_to_le64(0); 1722 sb->recovery_offset = cpu_to_le64(0);
1627 memset(sb->pad1, 0, sizeof(sb->pad1)); 1723 memset(sb->pad1, 0, sizeof(sb->pad1));
1628 memset(sb->pad2, 0, sizeof(sb->pad2));
1629 memset(sb->pad3, 0, sizeof(sb->pad3)); 1724 memset(sb->pad3, 0, sizeof(sb->pad3));
1630 1725
1631 sb->utime = cpu_to_le64((__u64)mddev->utime); 1726 sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1665 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1760 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1666 } 1761 }
1667 1762
1763 if (rdev->badblocks.count == 0)
1764 /* Nothing to do for bad blocks*/ ;
1765 else if (sb->bblog_offset == 0)
1766 /* Cannot record bad blocks on this device */
1767 md_error(mddev, rdev);
1768 else {
1769 struct badblocks *bb = &rdev->badblocks;
1770 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1771 u64 *p = bb->page;
1772 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1773 if (bb->changed) {
1774 unsigned seq;
1775
1776retry:
1777 seq = read_seqbegin(&bb->lock);
1778
1779 memset(bbp, 0xff, PAGE_SIZE);
1780
1781 for (i = 0 ; i < bb->count ; i++) {
1782 u64 internal_bb = *p++;
1783 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1784 | BB_LEN(internal_bb));
1785 *bbp++ = cpu_to_le64(store_bb);
1786 }
1787 if (read_seqretry(&bb->lock, seq))
1788 goto retry;
1789
1790 bb->sector = (rdev->sb_start +
1791 (int)le32_to_cpu(sb->bblog_offset));
1792 bb->size = le16_to_cpu(sb->bblog_size);
1793 bb->changed = 0;
1794 }
1795 }
1796
1668 max_dev = 0; 1797 max_dev = 0;
1669 list_for_each_entry(rdev2, &mddev->disks, same_set) 1798 list_for_each_entry(rdev2, &mddev->disks, same_set)
1670 if (rdev2->desc_nr+1 > max_dev) 1799 if (rdev2->desc_nr+1 > max_dev)
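The write-out above takes a consistent snapshot of the in-memory table with the usual seqlock reader idiom: copy, then retry if md_set_badblocks() or md_clear_badblocks() (which take write_seqlock_irq on bb->lock, see further down in this file) ran in the meantime. Reduced to its skeleton, the pattern is:

	unsigned seq;

	do {
		seq = read_seqbegin(&bb->lock);
		/* copy bb->page / bb->count into the superblock page */
	} while (read_seqretry(&bb->lock, seq));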
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1724 num_sectors = max_sectors; 1853 num_sectors = max_sectors;
1725 rdev->sb_start = sb_start; 1854 rdev->sb_start = sb_start;
1726 } 1855 }
1727 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1856 sb = page_address(rdev->sb_page);
1728 sb->data_size = cpu_to_le64(num_sectors); 1857 sb->data_size = cpu_to_le64(num_sectors);
1729 sb->super_offset = rdev->sb_start; 1858 sb->super_offset = rdev->sb_start;
1730 sb->sb_csum = calc_sb_1_csum(sb); 1859 sb->sb_csum = calc_sb_1_csum(sb);
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1922 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2051 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
1923 2052
1924 /* May as well allow recovery to be retried once */ 2053 /* May as well allow recovery to be retried once */
1925 mddev->recovery_disabled = 0; 2054 mddev->recovery_disabled++;
1926 2055
1927 return 0; 2056 return 0;
1928 2057
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1953 sysfs_remove_link(&rdev->kobj, "block"); 2082 sysfs_remove_link(&rdev->kobj, "block");
1954 sysfs_put(rdev->sysfs_state); 2083 sysfs_put(rdev->sysfs_state);
1955 rdev->sysfs_state = NULL; 2084 rdev->sysfs_state = NULL;
2085 kfree(rdev->badblocks.page);
2086 rdev->badblocks.count = 0;
2087 rdev->badblocks.page = NULL;
1956 /* We need to delay this, otherwise we can deadlock when 2088 /* We need to delay this, otherwise we can deadlock when
1957 * writing to 'remove' to "dev/state". We also need 2089 * writing to 'remove' to "dev/state". We also need
1958 * to delay it due to rcu usage. 2090 * to delay it due to rcu usage.
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
2127 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 2259 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2128 switch (major_version) { 2260 switch (major_version) {
2129 case 0: 2261 case 0:
2130 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 2262 print_sb_90(page_address(rdev->sb_page));
2131 break; 2263 break;
2132 case 1: 2264 case 1:
2133 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 2265 print_sb_1(page_address(rdev->sb_page));
2134 break; 2266 break;
2135 } 2267 }
2136 } else 2268 } else
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
2194 mdk_rdev_t *rdev; 2326 mdk_rdev_t *rdev;
2195 int sync_req; 2327 int sync_req;
2196 int nospares = 0; 2328 int nospares = 0;
2329 int any_badblocks_changed = 0;
2197 2330
2198repeat: 2331repeat:
2199 /* First make sure individual recovery_offsets are correct */ 2332 /* First make sure individual recovery_offsets are correct */
@@ -2208,8 +2341,18 @@ repeat:
2208 if (!mddev->persistent) { 2341 if (!mddev->persistent) {
2209 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2342 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2210 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2343 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2211 if (!mddev->external) 2344 if (!mddev->external) {
2212 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2345 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2346 list_for_each_entry(rdev, &mddev->disks, same_set) {
2347 if (rdev->badblocks.changed) {
2348 md_ack_all_badblocks(&rdev->badblocks);
2349 md_error(mddev, rdev);
2350 }
2351 clear_bit(Blocked, &rdev->flags);
2352 clear_bit(BlockedBadBlocks, &rdev->flags);
2353 wake_up(&rdev->blocked_wait);
2354 }
2355 }
2213 wake_up(&mddev->sb_wait); 2356 wake_up(&mddev->sb_wait);
2214 return; 2357 return;
2215 } 2358 }
@@ -2265,6 +2408,14 @@ repeat:
2265 MD_BUG(); 2408 MD_BUG();
2266 mddev->events --; 2409 mddev->events --;
2267 } 2410 }
2411
2412 list_for_each_entry(rdev, &mddev->disks, same_set) {
2413 if (rdev->badblocks.changed)
2414 any_badblocks_changed++;
2415 if (test_bit(Faulty, &rdev->flags))
2416 set_bit(FaultRecorded, &rdev->flags);
2417 }
2418
2268 sync_sbs(mddev, nospares); 2419 sync_sbs(mddev, nospares);
2269 spin_unlock_irq(&mddev->write_lock); 2420 spin_unlock_irq(&mddev->write_lock);
2270 2421
@@ -2290,6 +2441,13 @@ repeat:
2290 bdevname(rdev->bdev,b), 2441 bdevname(rdev->bdev,b),
2291 (unsigned long long)rdev->sb_start); 2442 (unsigned long long)rdev->sb_start);
2292 rdev->sb_events = mddev->events; 2443 rdev->sb_events = mddev->events;
2444 if (rdev->badblocks.size) {
2445 md_super_write(mddev, rdev,
2446 rdev->badblocks.sector,
2447 rdev->badblocks.size << 9,
2448 rdev->bb_page);
2449 rdev->badblocks.size = 0;
2450 }
2293 2451
2294 } else 2452 } else
2295 dprintk(")\n"); 2453 dprintk(")\n");
@@ -2313,6 +2471,15 @@ repeat:
2313 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2471 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2314 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2472 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2315 2473
2474 list_for_each_entry(rdev, &mddev->disks, same_set) {
2475 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2476 clear_bit(Blocked, &rdev->flags);
2477
2478 if (any_badblocks_changed)
2479 md_ack_all_badblocks(&rdev->badblocks);
2480 clear_bit(BlockedBadBlocks, &rdev->flags);
2481 wake_up(&rdev->blocked_wait);
2482 }
2316} 2483}
2317 2484
2318/* words written to sysfs files may, or may not, be \n terminated. 2485/* words written to sysfs files may, or may not, be \n terminated.
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2347 char *sep = ""; 2514 char *sep = "";
2348 size_t len = 0; 2515 size_t len = 0;
2349 2516
2350 if (test_bit(Faulty, &rdev->flags)) { 2517 if (test_bit(Faulty, &rdev->flags) ||
2518 rdev->badblocks.unacked_exist) {
2351 len+= sprintf(page+len, "%sfaulty",sep); 2519 len+= sprintf(page+len, "%sfaulty",sep);
2352 sep = ","; 2520 sep = ",";
2353 } 2521 }
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2359 len += sprintf(page+len, "%swrite_mostly",sep); 2527 len += sprintf(page+len, "%swrite_mostly",sep);
2360 sep = ","; 2528 sep = ",";
2361 } 2529 }
2362 if (test_bit(Blocked, &rdev->flags)) { 2530 if (test_bit(Blocked, &rdev->flags) ||
2531 rdev->badblocks.unacked_exist) {
2363 len += sprintf(page+len, "%sblocked", sep); 2532 len += sprintf(page+len, "%sblocked", sep);
2364 sep = ","; 2533 sep = ",";
2365 } 2534 }
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page)
2368 len += sprintf(page+len, "%sspare", sep); 2537 len += sprintf(page+len, "%sspare", sep);
2369 sep = ","; 2538 sep = ",";
2370 } 2539 }
2540 if (test_bit(WriteErrorSeen, &rdev->flags)) {
2541 len += sprintf(page+len, "%swrite_error", sep);
2542 sep = ",";
2543 }
2371 return len+sprintf(page+len, "\n"); 2544 return len+sprintf(page+len, "\n");
2372} 2545}
2373 2546
@@ -2375,13 +2548,15 @@ static ssize_t
2375state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2548state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2376{ 2549{
2377 /* can write 2550 /* can write
2378 * faulty - simulates and error 2551 * faulty - simulates an error
2379 * remove - disconnects the device 2552 * remove - disconnects the device
2380 * writemostly - sets write_mostly 2553 * writemostly - sets write_mostly
2381 * -writemostly - clears write_mostly 2554 * -writemostly - clears write_mostly
2382 * blocked - sets the Blocked flag 2555 * blocked - sets the Blocked flags
2383 * -blocked - clears the Blocked flag 2556 * -blocked - clears the Blocked and possibly simulates an error
2384 * insync - sets Insync providing device isn't active 2557 * insync - sets Insync providing device isn't active
2558 * write_error - sets WriteErrorSeen
2559 * -write_error - clears WriteErrorSeen
2385 */ 2560 */
2386 int err = -EINVAL; 2561 int err = -EINVAL;
2387 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2562 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2408 set_bit(Blocked, &rdev->flags); 2583 set_bit(Blocked, &rdev->flags);
2409 err = 0; 2584 err = 0;
2410 } else if (cmd_match(buf, "-blocked")) { 2585 } else if (cmd_match(buf, "-blocked")) {
2586 if (!test_bit(Faulty, &rdev->flags) &&
2587 test_bit(BlockedBadBlocks, &rdev->flags)) {
2588 /* metadata handler doesn't understand badblocks,
2589 * so we need to fail the device
2590 */
2591 md_error(rdev->mddev, rdev);
2592 }
2411 clear_bit(Blocked, &rdev->flags); 2593 clear_bit(Blocked, &rdev->flags);
2594 clear_bit(BlockedBadBlocks, &rdev->flags);
2412 wake_up(&rdev->blocked_wait); 2595 wake_up(&rdev->blocked_wait);
2413 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2596 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2414 md_wakeup_thread(rdev->mddev->thread); 2597 md_wakeup_thread(rdev->mddev->thread);
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2417 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2600 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2418 set_bit(In_sync, &rdev->flags); 2601 set_bit(In_sync, &rdev->flags);
2419 err = 0; 2602 err = 0;
2603 } else if (cmd_match(buf, "write_error")) {
2604 set_bit(WriteErrorSeen, &rdev->flags);
2605 err = 0;
2606 } else if (cmd_match(buf, "-write_error")) {
2607 clear_bit(WriteErrorSeen, &rdev->flags);
2608 err = 0;
2420 } 2609 }
2421 if (!err) 2610 if (!err)
2422 sysfs_notify_dirent_safe(rdev->sysfs_state); 2611 sysfs_notify_dirent_safe(rdev->sysfs_state);
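These keywords are written to the per-device 'state' attribute (the dev-*/state files under an array's md/ sysfs directory). The new write_error/-write_error pair simply toggles the WriteErrorSeen test flag, while "-blocked" now also fails a device that was only blocked by unacknowledged bad blocks: a metadata handler that clears "blocked" without acknowledging the bad-block log evidently cannot record it, so the device cannot safely stay in the array.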
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2459{ 2648{
2460 char *e; 2649 char *e;
2461 int err; 2650 int err;
2462 char nm[20];
2463 int slot = simple_strtoul(buf, &e, 10); 2651 int slot = simple_strtoul(buf, &e, 10);
2464 if (strncmp(buf, "none", 4)==0) 2652 if (strncmp(buf, "none", 4)==0)
2465 slot = -1; 2653 slot = -1;
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2482 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2670 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2483 if (err) 2671 if (err)
2484 return err; 2672 return err;
2485 sprintf(nm, "rd%d", rdev->raid_disk); 2673 sysfs_unlink_rdev(rdev->mddev, rdev);
2486 sysfs_remove_link(&rdev->mddev->kobj, nm);
2487 rdev->raid_disk = -1; 2674 rdev->raid_disk = -1;
2488 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2675 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2489 md_wakeup_thread(rdev->mddev->thread); 2676 md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2522 return err; 2709 return err;
2523 } else 2710 } else
2524 sysfs_notify_dirent_safe(rdev->sysfs_state); 2711 sysfs_notify_dirent_safe(rdev->sysfs_state);
2525 sprintf(nm, "rd%d", rdev->raid_disk); 2712 if (sysfs_link_rdev(rdev->mddev, rdev))
2526 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2527 /* failure here is OK */; 2713 /* failure here is OK */;
2528 /* don't wakeup anyone, leave that to userspace. */ 2714 /* don't wakeup anyone, leave that to userspace. */
2529 } else { 2715 } else {
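sysfs_link_rdev()/sysfs_unlink_rdev() replace the open-coded sprintf("rd%d") plus sysfs_create_link()/sysfs_remove_link() pattern in this and several other hunks in md.c. The helpers themselves land in md.h (listed in the diffstat but not shown in this section); reconstructed from the call sites they presumably look roughly like:

	static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
	{
		char nm[20];
		sprintf(nm, "rd%d", rdev->raid_disk);
		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
	}

	static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
	{
		char nm[20];
		sprintf(nm, "rd%d", rdev->raid_disk);
		sysfs_remove_link(&mddev->kobj, nm);
	}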
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
2712static struct rdev_sysfs_entry rdev_recovery_start = 2898static struct rdev_sysfs_entry rdev_recovery_start =
2713__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 2899__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2714 2900
2901
2902static ssize_t
2903badblocks_show(struct badblocks *bb, char *page, int unack);
2904static ssize_t
2905badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2906
2907static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
2908{
2909 return badblocks_show(&rdev->badblocks, page, 0);
2910}
2911static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2912{
2913 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2914 /* Maybe that ack was all we needed */
2915 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2916 wake_up(&rdev->blocked_wait);
2917 return rv;
2918}
2919static struct rdev_sysfs_entry rdev_bad_blocks =
2920__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2921
2922
2923static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
2924{
2925 return badblocks_show(&rdev->badblocks, page, 1);
2926}
2927static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2928{
2929 return badblocks_store(&rdev->badblocks, page, len, 1);
2930}
2931static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2932__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2933
2715static struct attribute *rdev_default_attrs[] = { 2934static struct attribute *rdev_default_attrs[] = {
2716 &rdev_state.attr, 2935 &rdev_state.attr,
2717 &rdev_errors.attr, 2936 &rdev_errors.attr,
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = {
2719 &rdev_offset.attr, 2938 &rdev_offset.attr,
2720 &rdev_size.attr, 2939 &rdev_size.attr,
2721 &rdev_recovery_start.attr, 2940 &rdev_recovery_start.attr,
2941 &rdev_bad_blocks.attr,
2942 &rdev_unack_bad_blocks.attr,
2722 NULL, 2943 NULL,
2723}; 2944};
2724static ssize_t 2945static ssize_t
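The two new attributes expose the per-device bad-block table through sysfs. Judging by the names and the unack flag passed in, bad_blocks works on the whole table while unacknowledged_bad_blocks covers only entries not yet acknowledged in the metadata (badblocks_show() and badblocks_store() are only forward-declared here; their bodies come later in md.c, beyond the hunks shown). Note that a successful store via bb_store() doubles as an acknowledgement: it clears BlockedBadBlocks and wakes anyone parked in md_wait_for_blocked_rdev().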
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = {
2782 .default_attrs = rdev_default_attrs, 3003 .default_attrs = rdev_default_attrs,
2783}; 3004};
2784 3005
2785void md_rdev_init(mdk_rdev_t *rdev) 3006int md_rdev_init(mdk_rdev_t *rdev)
2786{ 3007{
2787 rdev->desc_nr = -1; 3008 rdev->desc_nr = -1;
2788 rdev->saved_raid_disk = -1; 3009 rdev->saved_raid_disk = -1;
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
2792 rdev->sb_events = 0; 3013 rdev->sb_events = 0;
2793 rdev->last_read_error.tv_sec = 0; 3014 rdev->last_read_error.tv_sec = 0;
2794 rdev->last_read_error.tv_nsec = 0; 3015 rdev->last_read_error.tv_nsec = 0;
3016 rdev->sb_loaded = 0;
3017 rdev->bb_page = NULL;
2795 atomic_set(&rdev->nr_pending, 0); 3018 atomic_set(&rdev->nr_pending, 0);
2796 atomic_set(&rdev->read_errors, 0); 3019 atomic_set(&rdev->read_errors, 0);
2797 atomic_set(&rdev->corrected_errors, 0); 3020 atomic_set(&rdev->corrected_errors, 0);
2798 3021
2799 INIT_LIST_HEAD(&rdev->same_set); 3022 INIT_LIST_HEAD(&rdev->same_set);
2800 init_waitqueue_head(&rdev->blocked_wait); 3023 init_waitqueue_head(&rdev->blocked_wait);
3024
3025 /* Add space to store bad block list.
3026 * This reserves the space even on arrays where it cannot
3027 * be used - I wonder if that matters
3028 */
3029 rdev->badblocks.count = 0;
3030 rdev->badblocks.shift = 0;
3031 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3032 seqlock_init(&rdev->badblocks.lock);
3033 if (rdev->badblocks.page == NULL)
3034 return -ENOMEM;
3035
3036 return 0;
2801} 3037}
2802EXPORT_SYMBOL_GPL(md_rdev_init); 3038EXPORT_SYMBOL_GPL(md_rdev_init);
2803/* 3039/*
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2823 return ERR_PTR(-ENOMEM); 3059 return ERR_PTR(-ENOMEM);
2824 } 3060 }
2825 3061
2826 md_rdev_init(rdev); 3062 err = md_rdev_init(rdev);
2827 if ((err = alloc_disk_sb(rdev))) 3063 if (err)
3064 goto abort_free;
3065 err = alloc_disk_sb(rdev);
3066 if (err)
2828 goto abort_free; 3067 goto abort_free;
2829 3068
2830 err = lock_rdev(rdev, newdev, super_format == -2); 3069 err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2860 goto abort_free; 3099 goto abort_free;
2861 } 3100 }
2862 } 3101 }
3102 if (super_format == -1)
3103 /* hot-add for 0.90, or non-persistent: so no badblocks */
3104 rdev->badblocks.shift = -1;
2863 3105
2864 return rdev; 3106 return rdev;
2865 3107
2866abort_free: 3108abort_free:
2867 if (rdev->sb_page) { 3109 if (rdev->bdev)
2868 if (rdev->bdev) 3110 unlock_rdev(rdev);
2869 unlock_rdev(rdev); 3111 free_disk_sb(rdev);
2870 free_disk_sb(rdev); 3112 kfree(rdev->badblocks.page);
2871 }
2872 kfree(rdev); 3113 kfree(rdev);
2873 return ERR_PTR(err); 3114 return ERR_PTR(err);
2874} 3115}
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3149 } 3390 }
3150 3391
3151 list_for_each_entry(rdev, &mddev->disks, same_set) { 3392 list_for_each_entry(rdev, &mddev->disks, same_set) {
3152 char nm[20];
3153 if (rdev->raid_disk < 0) 3393 if (rdev->raid_disk < 0)
3154 continue; 3394 continue;
3155 if (rdev->new_raid_disk >= mddev->raid_disks) 3395 if (rdev->new_raid_disk >= mddev->raid_disks)
3156 rdev->new_raid_disk = -1; 3396 rdev->new_raid_disk = -1;
3157 if (rdev->new_raid_disk == rdev->raid_disk) 3397 if (rdev->new_raid_disk == rdev->raid_disk)
3158 continue; 3398 continue;
3159 sprintf(nm, "rd%d", rdev->raid_disk); 3399 sysfs_unlink_rdev(mddev, rdev);
3160 sysfs_remove_link(&mddev->kobj, nm);
3161 } 3400 }
3162 list_for_each_entry(rdev, &mddev->disks, same_set) { 3401 list_for_each_entry(rdev, &mddev->disks, same_set) {
3163 if (rdev->raid_disk < 0) 3402 if (rdev->raid_disk < 0)
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3168 if (rdev->raid_disk < 0) 3407 if (rdev->raid_disk < 0)
3169 clear_bit(In_sync, &rdev->flags); 3408 clear_bit(In_sync, &rdev->flags);
3170 else { 3409 else {
3171 char nm[20]; 3410 if (sysfs_link_rdev(mddev, rdev))
3172 sprintf(nm, "rd%d", rdev->raid_disk); 3411 printk(KERN_WARNING "md: cannot register rd%d"
3173 if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3412 " for %s after level change\n",
3174 printk("md: cannot register %s for %s after level change\n", 3413 rdev->raid_disk, mdname(mddev));
3175 nm, mdname(mddev));
3176 } 3414 }
3177 } 3415 }
3178 3416
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev)
4504 } 4742 }
4505 4743
4506 if (mddev->bio_set == NULL) 4744 if (mddev->bio_set == NULL)
4507 mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); 4745 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
4746 sizeof(mddev_t *));
4508 4747
4509 spin_lock(&pers_lock); 4748 spin_lock(&pers_lock);
4510 pers = find_pers(mddev->level, mddev->clevel); 4749 pers = find_pers(mddev->level, mddev->clevel);
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev)
4621 smp_wmb(); 4860 smp_wmb();
4622 mddev->ready = 1; 4861 mddev->ready = 1;
4623 list_for_each_entry(rdev, &mddev->disks, same_set) 4862 list_for_each_entry(rdev, &mddev->disks, same_set)
4624 if (rdev->raid_disk >= 0) { 4863 if (rdev->raid_disk >= 0)
4625 char nm[20]; 4864 if (sysfs_link_rdev(mddev, rdev))
4626 sprintf(nm, "rd%d", rdev->raid_disk);
4627 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4628 /* failure here is OK */; 4865 /* failure here is OK */;
4629 }
4630 4866
4631 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4867 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4632 4868
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4854 sysfs_notify_dirent_safe(mddev->sysfs_state); 5090 sysfs_notify_dirent_safe(mddev->sysfs_state);
4855 5091
4856 list_for_each_entry(rdev, &mddev->disks, same_set) 5092 list_for_each_entry(rdev, &mddev->disks, same_set)
4857 if (rdev->raid_disk >= 0) { 5093 if (rdev->raid_disk >= 0)
4858 char nm[20]; 5094 sysfs_unlink_rdev(mddev, rdev);
4859 sprintf(nm, "rd%d", rdev->raid_disk);
4860 sysfs_remove_link(&mddev->kobj, nm);
4861 }
4862 5095
4863 set_capacity(disk, 0); 5096 set_capacity(disk, 0);
4864 mutex_unlock(&mddev->open_mutex); 5097 mutex_unlock(&mddev->open_mutex);
@@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6198 if (!rdev || test_bit(Faulty, &rdev->flags)) 6431 if (!rdev || test_bit(Faulty, &rdev->flags))
6199 return; 6432 return;
6200 6433
6201 if (mddev->external) 6434 if (!mddev->pers || !mddev->pers->error_handler)
6202 set_bit(Blocked, &rdev->flags);
6203/*
6204 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
6205 mdname(mddev),
6206 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
6207 __builtin_return_address(0),__builtin_return_address(1),
6208 __builtin_return_address(2),__builtin_return_address(3));
6209*/
6210 if (!mddev->pers)
6211 return;
6212 if (!mddev->pers->error_handler)
6213 return; 6435 return;
6214 mddev->pers->error_handler(mddev,rdev); 6436 mddev->pers->error_handler(mddev,rdev);
6215 if (mddev->degraded) 6437 if (mddev->degraded)
@@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev)
6933 atomic_add(sectors, &mddev->recovery_active); 7155 atomic_add(sectors, &mddev->recovery_active);
6934 } 7156 }
6935 7157
7158 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7159 break;
7160
6936 j += sectors; 7161 j += sectors;
6937 if (j>1) mddev->curr_resync = j; 7162 if (j>1) mddev->curr_resync = j;
6938 mddev->curr_mark_cnt = io_sectors; 7163 mddev->curr_mark_cnt = io_sectors;
6939 if (last_check == 0) 7164 if (last_check == 0)
6940 /* this is the earliers that rebuilt will be 7165 /* this is the earliest that rebuild will be
6941 * visible in /proc/mdstat 7166 * visible in /proc/mdstat
6942 */ 7167 */
6943 md_new_event(mddev); 7168 md_new_event(mddev);
@@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev)
6946 continue; 7171 continue;
6947 7172
6948 last_check = io_sectors; 7173 last_check = io_sectors;
6949
6950 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6951 break;
6952
6953 repeat: 7174 repeat:
6954 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 7175 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6955 /* step marks */ 7176 /* step marks */
@@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev)
7067 atomic_read(&rdev->nr_pending)==0) { 7288 atomic_read(&rdev->nr_pending)==0) {
7068 if (mddev->pers->hot_remove_disk( 7289 if (mddev->pers->hot_remove_disk(
7069 mddev, rdev->raid_disk)==0) { 7290 mddev, rdev->raid_disk)==0) {
7070 char nm[20]; 7291 sysfs_unlink_rdev(mddev, rdev);
7071 sprintf(nm,"rd%d", rdev->raid_disk);
7072 sysfs_remove_link(&mddev->kobj, nm);
7073 rdev->raid_disk = -1; 7292 rdev->raid_disk = -1;
7074 } 7293 }
7075 } 7294 }
7076 7295
7077 if (mddev->degraded && !mddev->recovery_disabled) { 7296 if (mddev->degraded) {
7078 list_for_each_entry(rdev, &mddev->disks, same_set) { 7297 list_for_each_entry(rdev, &mddev->disks, same_set) {
7079 if (rdev->raid_disk >= 0 && 7298 if (rdev->raid_disk >= 0 &&
7080 !test_bit(In_sync, &rdev->flags) && 7299 !test_bit(In_sync, &rdev->flags) &&
7081 !test_bit(Faulty, &rdev->flags) && 7300 !test_bit(Faulty, &rdev->flags))
7082 !test_bit(Blocked, &rdev->flags))
7083 spares++; 7301 spares++;
7084 if (rdev->raid_disk < 0 7302 if (rdev->raid_disk < 0
7085 && !test_bit(Faulty, &rdev->flags)) { 7303 && !test_bit(Faulty, &rdev->flags)) {
7086 rdev->recovery_offset = 0; 7304 rdev->recovery_offset = 0;
7087 if (mddev->pers-> 7305 if (mddev->pers->
7088 hot_add_disk(mddev, rdev) == 0) { 7306 hot_add_disk(mddev, rdev) == 0) {
7089 char nm[20]; 7307 if (sysfs_link_rdev(mddev, rdev))
7090 sprintf(nm, "rd%d", rdev->raid_disk);
7091 if (sysfs_create_link(&mddev->kobj,
7092 &rdev->kobj, nm))
7093 /* failure here is OK */; 7308 /* failure here is OK */;
7094 spares++; 7309 spares++;
7095 md_new_event(mddev); 7310 md_new_event(mddev);
@@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev)
7138 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7353 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7139 sysfs_notify_dirent_safe(mddev->sysfs_action); 7354 sysfs_notify_dirent_safe(mddev->sysfs_action);
7140 md_new_event(mddev); 7355 md_new_event(mddev);
7356 if (mddev->event_work.func)
7357 queue_work(md_misc_wq, &mddev->event_work);
7141} 7358}
7142 7359
7143/* 7360/*
@@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev)
7170 if (mddev->bitmap) 7387 if (mddev->bitmap)
7171 bitmap_daemon_work(mddev); 7388 bitmap_daemon_work(mddev);
7172 7389
7173 if (mddev->ro)
7174 return;
7175
7176 if (signal_pending(current)) { 7390 if (signal_pending(current)) {
7177 if (mddev->pers->sync_request && !mddev->external) { 7391 if (mddev->pers->sync_request && !mddev->external) {
7178 printk(KERN_INFO "md: %s in immediate safe mode\n", 7392 printk(KERN_INFO "md: %s in immediate safe mode\n",
@@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev)
7209 atomic_read(&rdev->nr_pending)==0) { 7423 atomic_read(&rdev->nr_pending)==0) {
7210 if (mddev->pers->hot_remove_disk( 7424 if (mddev->pers->hot_remove_disk(
7211 mddev, rdev->raid_disk)==0) { 7425 mddev, rdev->raid_disk)==0) {
7212 char nm[20]; 7426 sysfs_unlink_rdev(mddev, rdev);
7213 sprintf(nm,"rd%d", rdev->raid_disk);
7214 sysfs_remove_link(&mddev->kobj, nm);
7215 rdev->raid_disk = -1; 7427 rdev->raid_disk = -1;
7216 } 7428 }
7217 } 7429 }
@@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7331{ 7543{
7332 sysfs_notify_dirent_safe(rdev->sysfs_state); 7544 sysfs_notify_dirent_safe(rdev->sysfs_state);
7333 wait_event_timeout(rdev->blocked_wait, 7545 wait_event_timeout(rdev->blocked_wait,
7334 !test_bit(Blocked, &rdev->flags), 7546 !test_bit(Blocked, &rdev->flags) &&
7547 !test_bit(BlockedBadBlocks, &rdev->flags),
7335 msecs_to_jiffies(5000)); 7548 msecs_to_jiffies(5000));
7336 rdev_dec_pending(rdev, mddev); 7549 rdev_dec_pending(rdev, mddev);
7337} 7550}
7338EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7551EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7339 7552
7553
7554/* Bad block management.
7555 * We can record which blocks on each device are 'bad' and so just
7556 * fail those blocks, or that stripe, rather than the whole device.
7557 * Entries in the bad-block table are 64bits wide. This comprises:
7558 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7559 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7560 * A 'shift' can be set so that larger blocks are tracked and
7561 * consequently larger devices can be covered.
7562 * 'Acknowledged' flag - 1 bit. - the most significant bit.
7563 *
7564 * Locking of the bad-block table uses a seqlock so md_is_badblock
7565 * might need to retry if it is very unlucky.
7566 * We will sometimes want to check for bad blocks in a bi_end_io function,
7567 * so we use the write_seqlock_irq variant.
7568 *
7569 * When looking for a bad block we specify a range and want to
7570 * know if any block in the range is bad. So we binary-search
7571 * to the last range that starts at-or-before the given endpoint,
7572 * (or "before the sector after the target range")
7573 * then see if it ends after the given start.
7574 * We return
7575 * 0 if there are no known bad blocks in the range
7576 * 1 if there are known bad block which are all acknowledged
7577 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7578 * plus the start/length of the first bad section we overlap.
7579 */
7580int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7581 sector_t *first_bad, int *bad_sectors)
7582{
7583 int hi;
7584 int lo = 0;
7585 u64 *p = bb->page;
7586 int rv = 0;
7587 sector_t target = s + sectors;
7588 unsigned seq;
7589
7590 if (bb->shift > 0) {
7591 /* round the start down, and the end up */
7592 s >>= bb->shift;
7593 target += (1<<bb->shift) - 1;
7594 target >>= bb->shift;
7595 sectors = target - s;
7596 }
7597 /* 'target' is now the first block after the bad range */
7598
7599retry:
7600 seq = read_seqbegin(&bb->lock);
7601
7602 hi = bb->count;
7603
7604 /* Binary search between lo and hi for 'target'
7605 * i.e. for the last range that starts before 'target'
7606 */
7607 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7608 * are known not to be the last range before target.
7609 * VARIANT: hi-lo is the number of possible
7610 * ranges, and decreases until it reaches 1
7611 */
7612 while (hi - lo > 1) {
7613 int mid = (lo + hi) / 2;
7614 sector_t a = BB_OFFSET(p[mid]);
7615 if (a < target)
7616 /* This could still be the one, earlier ranges
7617 * could not. */
7618 lo = mid;
7619 else
7620 /* This and later ranges are definitely out. */
7621 hi = mid;
7622 }
7623 /* 'lo' might be the last that started before target, but 'hi' isn't */
7624 if (hi > lo) {
7625 /* need to check all range that end after 's' to see if
7626 * any are unacknowledged.
7627 */
7628 while (lo >= 0 &&
7629 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7630 if (BB_OFFSET(p[lo]) < target) {
7631 /* starts before the end, and finishes after
7632 * the start, so they must overlap
7633 */
7634 if (rv != -1 && BB_ACK(p[lo]))
7635 rv = 1;
7636 else
7637 rv = -1;
7638 *first_bad = BB_OFFSET(p[lo]);
7639 *bad_sectors = BB_LEN(p[lo]);
7640 }
7641 lo--;
7642 }
7643 }
7644
7645 if (read_seqretry(&bb->lock, seq))
7646 goto retry;
7647
7648 return rv;
7649}
7650EXPORT_SYMBOL_GPL(md_is_badblock);
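A sketch of how a caller is expected to use the lookup, loosely modelled on the raid1/raid10 read paths added elsewhere in this merge (the function name and variables are illustrative, not part of the patch):

	/* How many of 'sectors' starting at 'sector' can be read from
	 * 'rdev' before hitting a known bad block?  0 means the first
	 * sector is already bad and another device should be tried. */
	static int good_sectors_at(mdk_rdev_t *rdev, sector_t sector, int sectors)
	{
		sector_t first_bad;
		int bad_sectors;

		if (!md_is_badblock(&rdev->badblocks, sector, sectors,
				    &first_bad, &bad_sectors))
			return sectors;			/* range is clean */
		if (first_bad <= sector)
			return 0;			/* starts in a bad range */
		return first_bad - sector;		/* stop before it */
	}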
7651
7652/*
7653 * Add a range of bad blocks to the table.
7654 * This might extend the table, or might contract it
7655 * if two adjacent ranges can be merged.
7656 * We binary-search to find the 'insertion' point, then
7657 * decide how best to handle it.
7658 */
7659static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7660 int acknowledged)
7661{
7662 u64 *p;
7663 int lo, hi;
7664 int rv = 1;
7665
7666 if (bb->shift < 0)
7667 /* badblocks are disabled */
7668 return 0;
7669
7670 if (bb->shift) {
7671 /* round the start down, and the end up */
7672 sector_t next = s + sectors;
7673 s >>= bb->shift;
7674 next += (1<<bb->shift) - 1;
7675 next >>= bb->shift;
7676 sectors = next - s;
7677 }
7678
7679 write_seqlock_irq(&bb->lock);
7680
7681 p = bb->page;
7682 lo = 0;
7683 hi = bb->count;
7684 /* Find the last range that starts at-or-before 's' */
7685 while (hi - lo > 1) {
7686 int mid = (lo + hi) / 2;
7687 sector_t a = BB_OFFSET(p[mid]);
7688 if (a <= s)
7689 lo = mid;
7690 else
7691 hi = mid;
7692 }
7693 if (hi > lo && BB_OFFSET(p[lo]) > s)
7694 hi = lo;
7695
7696 if (hi > lo) {
7697 /* we found a range that might merge with the start
7698 * of our new range
7699 */
7700 sector_t a = BB_OFFSET(p[lo]);
7701 sector_t e = a + BB_LEN(p[lo]);
7702 int ack = BB_ACK(p[lo]);
7703 if (e >= s) {
7704 /* Yes, we can merge with a previous range */
7705 if (s == a && s + sectors >= e)
7706 /* new range covers old */
7707 ack = acknowledged;
7708 else
7709 ack = ack && acknowledged;
7710
7711 if (e < s + sectors)
7712 e = s + sectors;
7713 if (e - a <= BB_MAX_LEN) {
7714 p[lo] = BB_MAKE(a, e-a, ack);
7715 s = e;
7716 } else {
7717 /* does not all fit in one range,
7718 * make p[lo] maximal
7719 */
7720 if (BB_LEN(p[lo]) != BB_MAX_LEN)
7721 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
7722 s = a + BB_MAX_LEN;
7723 }
7724 sectors = e - s;
7725 }
7726 }
7727 if (sectors && hi < bb->count) {
7728 /* 'hi' points to the first range that starts after 's'.
7729 * Maybe we can merge with the start of that range */
7730 sector_t a = BB_OFFSET(p[hi]);
7731 sector_t e = a + BB_LEN(p[hi]);
7732 int ack = BB_ACK(p[hi]);
7733 if (a <= s + sectors) {
7734 /* merging is possible */
7735 if (e <= s + sectors) {
7736 /* full overlap */
7737 e = s + sectors;
7738 ack = acknowledged;
7739 } else
7740 ack = ack && acknowledged;
7741
7742 a = s;
7743 if (e - a <= BB_MAX_LEN) {
7744 p[hi] = BB_MAKE(a, e-a, ack);
7745 s = e;
7746 } else {
7747 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
7748 s = a + BB_MAX_LEN;
7749 }
7750 sectors = e - s;
7751 lo = hi;
7752 hi++;
7753 }
7754 }
7755 if (sectors == 0 && hi < bb->count) {
7756 /* we might be able to combine lo and hi */
7757 /* Note: 's' is at the end of 'lo' */
7758 sector_t a = BB_OFFSET(p[hi]);
7759 int lolen = BB_LEN(p[lo]);
7760 int hilen = BB_LEN(p[hi]);
7761 int newlen = lolen + hilen - (s - a);
7762 if (s >= a && newlen < BB_MAX_LEN) {
7763 /* yes, we can combine them */
7764 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
7765 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
7766 memmove(p + hi, p + hi + 1,
7767 (bb->count - hi - 1) * 8);
7768 bb->count--;
7769 }
7770 }
7771 while (sectors) {
7772 /* didn't merge (it all).
7773 * Need to add a range just before 'hi' */
7774 if (bb->count >= MD_MAX_BADBLOCKS) {
7775 /* No room for more */
7776 rv = 0;
7777 break;
7778 } else {
7779 int this_sectors = sectors;
7780 memmove(p + hi + 1, p + hi,
7781 (bb->count - hi) * 8);
7782 bb->count++;
7783
7784 if (this_sectors > BB_MAX_LEN)
7785 this_sectors = BB_MAX_LEN;
7786 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
7787 sectors -= this_sectors;
7788 s += this_sectors;
7789 }
7790 }
7791
7792 bb->changed = 1;
7793 if (!acknowledged)
7794 bb->unacked_exist = 1;
7795 write_sequnlock_irq(&bb->lock);
7796
7797 return rv;
7798}
7799
7800int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
7801 int acknowledged)
7802{
7803 int rv = md_set_badblocks(&rdev->badblocks,
7804 s + rdev->data_offset, sectors, acknowledged);
7805 if (rv) {
7806 /* Make sure they get written out promptly */
7807 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
7808 md_wakeup_thread(rdev->mddev->thread);
7809 }
7810 return rv;
7811}
7812EXPORT_SYMBOL_GPL(rdev_set_badblocks);
7813
7814/*
7815 * Remove a range of bad blocks from the table.
7816 * This may involve extending the table if we split a region,
7817 * but it must not fail. So if the table becomes full, we just
7818 * drop the remove request.
7819 */
7820static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
7821{
7822 u64 *p;
7823 int lo, hi;
7824 sector_t target = s + sectors;
7825 int rv = 0;
7826
7827 if (bb->shift > 0) {
7828 /* When clearing we round the start up and the end down.
7829 * This should not matter as the shift should align with
7830 * the block size and no rounding should ever be needed.
7831 * However it is better to think a block is bad when it
7832 * isn't than to think a block is not bad when it is.
7833 */
7834 s += (1<<bb->shift) - 1;
7835 s >>= bb->shift;
7836 target >>= bb->shift;
7837 sectors = target - s;
7838 }
7839
7840 write_seqlock_irq(&bb->lock);
7841
7842 p = bb->page;
7843 lo = 0;
7844 hi = bb->count;
7845 /* Find the last range that starts before 'target' */
7846 while (hi - lo > 1) {
7847 int mid = (lo + hi) / 2;
7848 sector_t a = BB_OFFSET(p[mid]);
7849 if (a < target)
7850 lo = mid;
7851 else
7852 hi = mid;
7853 }
7854 if (hi > lo) {
7855 /* p[lo] is the last range that could overlap the
7856 * current range. Earlier ranges could also overlap,
7857 * but only this one can overlap the end of the range.
7858 */
7859 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
7860 /* Partial overlap, leave the tail of this range */
7861 int ack = BB_ACK(p[lo]);
7862 sector_t a = BB_OFFSET(p[lo]);
7863 sector_t end = a + BB_LEN(p[lo]);
7864
7865 if (a < s) {
7866 /* we need to split this range */
7867 if (bb->count >= MD_MAX_BADBLOCKS) {
7868 rv = 0;
7869 goto out;
7870 }
7871 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
7872 bb->count++;
7873 p[lo] = BB_MAKE(a, s-a, ack);
7874 lo++;
7875 }
7876 p[lo] = BB_MAKE(target, end - target, ack);
7877 /* there is no longer an overlap */
7878 hi = lo;
7879 lo--;
7880 }
7881 while (lo >= 0 &&
7882 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7883 /* This range does overlap */
7884 if (BB_OFFSET(p[lo]) < s) {
7885 /* Keep the early parts of this range. */
7886 int ack = BB_ACK(p[lo]);
7887 sector_t start = BB_OFFSET(p[lo]);
7888 p[lo] = BB_MAKE(start, s - start, ack);
7889 /* now low doesn't overlap, so.. */
7890 break;
7891 }
7892 lo--;
7893 }
7894 /* 'lo' is strictly before, 'hi' is strictly after,
7895 * anything between needs to be discarded
7896 */
7897 if (hi - lo > 1) {
7898 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
7899 bb->count -= (hi - lo - 1);
7900 }
7901 }
7902
7903 bb->changed = 1;
7904out:
7905 write_sequnlock_irq(&bb->lock);
7906 return rv;
7907}
7908
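A small sketch of the rounding used when clearing with a non-zero shift, as described in the comment above: the start is rounded up and the end down, so a block that is only partially covered by the request stays marked bad. Plain user-space C with assumed sample values, not kernel code:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Conservative rounding of a clear request onto bb->shift-sized blocks. */
static void round_for_clear(sector_t s, int sectors, int shift)
{
	sector_t target = s + sectors;

	s += (1ULL << shift) - 1;	/* round start up */
	s >>= shift;
	target >>= shift;		/* round end down */
	printf("clear blocks [%llu, %llu)\n", s, target);
}

int main(void)
{
	round_for_clear(1001, 14, 3);	/* sectors 1001..1014 -> empty block range, nothing cleared */
	return 0;
}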
7909int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
7910{
7911 return md_clear_badblocks(&rdev->badblocks,
7912 s + rdev->data_offset,
7913 sectors);
7914}
7915EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
7916
7917/*
7918 * Acknowledge all bad blocks in a list.
7919 * This only succeeds if ->changed is clear. It is used by
7920 * in-kernel metadata updates
7921 */
7922void md_ack_all_badblocks(struct badblocks *bb)
7923{
7924 if (bb->page == NULL || bb->changed)
7925 /* no point even trying */
7926 return;
7927 write_seqlock_irq(&bb->lock);
7928
7929 if (bb->changed == 0) {
7930 u64 *p = bb->page;
7931 int i;
7932 for (i = 0; i < bb->count ; i++) {
7933 if (!BB_ACK(p[i])) {
7934 sector_t start = BB_OFFSET(p[i]);
7935 int len = BB_LEN(p[i]);
7936 p[i] = BB_MAKE(start, len, 1);
7937 }
7938 }
7939 bb->unacked_exist = 0;
7940 }
7941 write_sequnlock_irq(&bb->lock);
7942}
7943EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
7944
7945/* sysfs access to bad-blocks list.
7946 * We present two files.
7947 * 'bad-blocks' lists sector numbers and lengths of ranges that
7948 * are recorded as bad. The list is truncated to fit within
7949 * the one-page limit of sysfs.
7950 * Writing "sector length" to this file adds an acknowledged
7951 * bad block to the list.
7952 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
7953 * been acknowledged. Writing to this file adds bad blocks
7954 * without acknowledging them. This is largely for testing.
7955 */
7956
7957static ssize_t
7958badblocks_show(struct badblocks *bb, char *page, int unack)
7959{
7960 size_t len;
7961 int i;
7962 u64 *p = bb->page;
7963 unsigned seq;
7964
7965 if (bb->shift < 0)
7966 return 0;
7967
7968retry:
7969 seq = read_seqbegin(&bb->lock);
7970
7971 len = 0;
7972 i = 0;
7973
7974 while (len < PAGE_SIZE && i < bb->count) {
7975 sector_t s = BB_OFFSET(p[i]);
7976 unsigned int length = BB_LEN(p[i]);
7977 int ack = BB_ACK(p[i]);
7978 i++;
7979
7980 if (unack && ack)
7981 continue;
7982
7983 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
7984 (unsigned long long)s << bb->shift,
7985 length << bb->shift);
7986 }
7987 if (unack && len == 0)
7988 bb->unacked_exist = 0;
7989
7990 if (read_seqretry(&bb->lock, seq))
7991 goto retry;
7992
7993 return len;
7994}
7995
7996#define DO_DEBUG 1
7997
7998static ssize_t
7999badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8000{
8001 unsigned long long sector;
8002 int length;
8003 char newline;
8004#ifdef DO_DEBUG
8005 /* Allow clearing via sysfs *only* for testing/debugging.
8006 * Normally only a successful write may clear a badblock
8007 */
8008 int clear = 0;
8009 if (page[0] == '-') {
8010 clear = 1;
8011 page++;
8012 }
8013#endif /* DO_DEBUG */
8014
8015 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8016 case 3:
8017 if (newline != '\n')
8018 return -EINVAL;
8019 case 2:
8020 if (length <= 0)
8021 return -EINVAL;
8022 break;
8023 default:
8024 return -EINVAL;
8025 }
8026
8027#ifdef DO_DEBUG
8028 if (clear) {
8029 md_clear_badblocks(bb, sector, length);
8030 return len;
8031 }
8032#endif /* DO_DEBUG */
8033 if (md_set_badblocks(bb, sector, length, !unack))
8034 return len;
8035 else
8036 return -ENOSPC;
8037}
8038
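Both sysfs files above exchange one "sector length" pair per line, in units of bb->shift-sized blocks. A minimal user-space round-trip of that text format, matching the sscanf() pattern in badblocks_store() (the sample entry is hypothetical):

#include <stdio.h>

int main(void)
{
	const char *line = "123456 8\n";	/* hypothetical "sector length" entry */
	unsigned long long sector;
	int length;
	char newline;

	if (sscanf(line, "%llu %d%c", &sector, &length, &newline) >= 2 && length > 0)
		printf("bad range: start=%llu len=%d\n", sector, length);
	return 0;
}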
7340static int md_notify_reboot(struct notifier_block *this, 8039static int md_notify_reboot(struct notifier_block *this,
7341 unsigned long code, void *x) 8040 unsigned long code, void *x)
7342{ 8041{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c26c7a08ae6..1e586bb4452e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,13 @@
29typedef struct mddev_s mddev_t; 29typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t; 30typedef struct mdk_rdev_s mdk_rdev_t;
31 31
32/* Bad block numbers are stored sorted in a single page.
33 * 64bits is used for each block or extent.
34 * 54 bits are sector number, 9 bits are extent size,
35 * 1 bit is an 'acknowledged' flag.
36 */
37#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
38
32/* 39/*
33 * MD's 'extended' device 40 * MD's 'extended' device
34 */ 41 */
@@ -48,7 +55,7 @@ struct mdk_rdev_s
48 struct block_device *meta_bdev; 55 struct block_device *meta_bdev;
49 struct block_device *bdev; /* block device handle */ 56 struct block_device *bdev; /* block device handle */
50 57
51 struct page *sb_page; 58 struct page *sb_page, *bb_page;
52 int sb_loaded; 59 int sb_loaded;
53 __u64 sb_events; 60 __u64 sb_events;
54 sector_t data_offset; /* start of data in array */ 61 sector_t data_offset; /* start of data in array */
@@ -74,9 +81,29 @@ struct mdk_rdev_s
74#define In_sync 2 /* device is in_sync with rest of array */ 81#define In_sync 2 /* device is in_sync with rest of array */
75#define WriteMostly 4 /* Avoid reading if at all possible */ 82#define WriteMostly 4 /* Avoid reading if at all possible */
76#define AutoDetected 7 /* added by auto-detect */ 83#define AutoDetected 7 /* added by auto-detect */
77#define Blocked 8 /* An error occurred on an externally 84#define Blocked 8 /* An error occurred but has not yet
78 * managed array, don't allow writes 85 * been acknowledged by the metadata
86 * handler, so don't allow writes
79 * until it is cleared */ 87 * until it is cleared */
88#define WriteErrorSeen 9 /* A write error has been seen on this
89 * device
90 */
91#define FaultRecorded 10 /* Intermediate state for clearing
92 * Blocked. The Fault is/will-be
93 * recorded in the metadata, but that
94 * metadata hasn't been stored safely
95 * on disk yet.
96 */
97#define BlockedBadBlocks 11 /* A writer is blocked because they
98 * found an unacknowledged bad-block.
99 * This can safely be cleared at any
100 * time, and the writer will re-check.
101 * It may be set at any time, and at
102 * worst the writer will timeout and
103 * re-check. So setting it as
104 * accurately as possible is good, but
105 * not absolutely critical.
106 */
80 wait_queue_head_t blocked_wait; 107 wait_queue_head_t blocked_wait;
81 108
82 int desc_nr; /* descriptor index in the superblock */ 109 int desc_nr; /* descriptor index in the superblock */
@@ -111,8 +138,54 @@ struct mdk_rdev_s
111 138
112 struct sysfs_dirent *sysfs_state; /* handle for 'state' 139 struct sysfs_dirent *sysfs_state; /* handle for 'state'
113 * sysfs entry */ 140 * sysfs entry */
141
142 struct badblocks {
143 int count; /* count of bad blocks */
144 int unacked_exist; /* there probably are unacknowledged
145 * bad blocks. This is only cleared
146 * when a read discovers none
147 */
148 int shift; /* shift from sectors to block size
149 * a -ve shift means badblocks are
150 * disabled.*/
151 u64 *page; /* badblock list */
152 int changed;
153 seqlock_t lock;
154
155 sector_t sector;
156 sector_t size; /* in sectors */
157 } badblocks;
114}; 158};
115 159
160#define BB_LEN_MASK (0x00000000000001FFULL)
161#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
162#define BB_ACK_MASK (0x8000000000000000ULL)
163#define BB_MAX_LEN 512
164#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
165#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
166#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
167#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
168
169extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
170 sector_t *first_bad, int *bad_sectors);
171static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
172 sector_t *first_bad, int *bad_sectors)
173{
174 if (unlikely(rdev->badblocks.count)) {
175 int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
176 sectors,
177 first_bad, bad_sectors);
178 if (rv)
179 *first_bad -= rdev->data_offset;
180 return rv;
181 }
182 return 0;
183}
184extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
185 int acknowledged);
186extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
187extern void md_ack_all_badblocks(struct badblocks *bb);
188
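To illustrate the 64-bit packing described above (54-bit sector, 9-bit length-minus-one, 1 acknowledged bit), here is a stand-alone copy of the BB_* macros with a tiny round-trip. This is illustration only, compiled in user space with an assumed sample range:

#include <stdio.h>
#include <stdint.h>

/* Stand-alone copies of the packing macros above, for illustration only. */
#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	/* 8 bad sectors starting at sector 123456, already acknowledged */
	uint64_t entry = BB_MAKE(123456ULL, 8, 1);

	printf("offset=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(entry),
	       (unsigned long long)BB_LEN(entry),
	       BB_ACK(entry));
	return 0;
}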
116struct mddev_s 189struct mddev_s
117{ 190{
118 void *private; 191 void *private;
@@ -239,9 +312,12 @@ struct mddev_s
239#define MD_RECOVERY_FROZEN 9 312#define MD_RECOVERY_FROZEN 9
240 313
241 unsigned long recovery; 314 unsigned long recovery;
242 int recovery_disabled; /* if we detect that recovery 315 /* If a RAID personality determines that recovery (of a particular
243 * will always fail, set this 316 * device) will fail due to a read error on the source device, it
244 * so we don't loop trying */ 317 * takes a copy of this number and does not attempt recovery again
318 * until this number changes.
319 */
320 int recovery_disabled;
245 321
246 int in_sync; /* know to not need resync */ 322 int in_sync; /* know to not need resync */
247 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so 323 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
@@ -304,11 +380,6 @@ struct mddev_s
304 * hot-adding a bitmap. It should 380 * hot-adding a bitmap. It should
305 * eventually be settable by sysfs. 381 * eventually be settable by sysfs.
306 */ 382 */
307 /* When md is serving under dm, it might use a
308 * dirty_log to store the bits.
309 */
310 struct dm_dirty_log *log;
311
312 struct mutex mutex; 383 struct mutex mutex;
313 unsigned long chunksize; 384 unsigned long chunksize;
314 unsigned long daemon_sleep; /* how many jiffies between updates? */ 385 unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev)
413 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; 484 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
414} 485}
415 486
487static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
488{
489 char nm[20];
490 sprintf(nm, "rd%d", rdev->raid_disk);
491 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
492}
493
494static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
495{
496 char nm[20];
497 sprintf(nm, "rd%d", rdev->raid_disk);
498 sysfs_remove_link(&mddev->kobj, nm);
499}
500
416/* 501/*
417 * iterates through some rdev ringlist. It's safe to remove the 502 * iterates through some rdev ringlist. It's safe to remove the
418 * current 'rdev'. Don't touch 'tmp' though. 503
@@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev);
505extern int md_run(mddev_t *mddev); 590extern int md_run(mddev_t *mddev);
506extern void md_stop(mddev_t *mddev); 591extern void md_stop(mddev_t *mddev);
507extern void md_stop_writes(mddev_t *mddev); 592extern void md_stop_writes(mddev_t *mddev);
508extern void md_rdev_init(mdk_rdev_t *rdev); 593extern int md_rdev_init(mdk_rdev_t *rdev);
509 594
510extern void mddev_suspend(mddev_t *mddev); 595extern void mddev_suspend(mddev_t *mddev);
511extern void mddev_resume(mddev_t *mddev); 596extern void mddev_resume(mddev_t *mddev);
@@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
514extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 599extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
515 mddev_t *mddev); 600 mddev_t *mddev);
516extern int mddev_check_plugged(mddev_t *mddev); 601extern int mddev_check_plugged(mddev_t *mddev);
602extern void md_trim_bio(struct bio *bio, int offset, int size);
517#endif /* _MD_MD_H */ 603#endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7431b6d8447..32323f0afd89 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,16 +35,13 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/ratelimit.h>
38#include "md.h" 39#include "md.h"
39#include "raid1.h" 40#include "raid1.h"
40#include "bitmap.h" 41#include "bitmap.h"
41 42
42#define DEBUG 0 43#define DEBUG 0
43#if DEBUG 44#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
44#define PRINTK(x...) printk(x)
45#else
46#define PRINTK(x...)
47#endif
48 45
49/* 46/*
50 * Number of guaranteed r1bios in case of extreme VM load: 47 * Number of guaranteed r1bios in case of extreme VM load:
@@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
166 163
167 for (i = 0; i < conf->raid_disks; i++) { 164 for (i = 0; i < conf->raid_disks; i++) {
168 struct bio **bio = r1_bio->bios + i; 165 struct bio **bio = r1_bio->bios + i;
169 if (*bio && *bio != IO_BLOCKED) 166 if (!BIO_SPECIAL(*bio))
170 bio_put(*bio); 167 bio_put(*bio);
171 *bio = NULL; 168 *bio = NULL;
172 } 169 }
@@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio)
176{ 173{
177 conf_t *conf = r1_bio->mddev->private; 174 conf_t *conf = r1_bio->mddev->private;
178 175
179 /*
180 * Wake up any possible resync thread that waits for the device
181 * to go idle.
182 */
183 allow_barrier(conf);
184
185 put_all_bios(conf, r1_bio); 176 put_all_bios(conf, r1_bio);
186 mempool_free(r1_bio, conf->r1bio_pool); 177 mempool_free(r1_bio, conf->r1bio_pool);
187} 178}
@@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio)
222 * operation and are ready to return a success/failure code to the buffer 213 * operation and are ready to return a success/failure code to the buffer
223 * cache layer. 214 * cache layer.
224 */ 215 */
216static void call_bio_endio(r1bio_t *r1_bio)
217{
218 struct bio *bio = r1_bio->master_bio;
219 int done;
220 conf_t *conf = r1_bio->mddev->private;
221
222 if (bio->bi_phys_segments) {
223 unsigned long flags;
224 spin_lock_irqsave(&conf->device_lock, flags);
225 bio->bi_phys_segments--;
226 done = (bio->bi_phys_segments == 0);
227 spin_unlock_irqrestore(&conf->device_lock, flags);
228 } else
229 done = 1;
230
231 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
232 clear_bit(BIO_UPTODATE, &bio->bi_flags);
233 if (done) {
234 bio_endio(bio, 0);
235 /*
236 * Wake up any possible resync thread that waits for the device
237 * to go idle.
238 */
239 allow_barrier(conf);
240 }
241}
242
225static void raid_end_bio_io(r1bio_t *r1_bio) 243static void raid_end_bio_io(r1bio_t *r1_bio)
226{ 244{
227 struct bio *bio = r1_bio->master_bio; 245 struct bio *bio = r1_bio->master_bio;
@@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
234 (unsigned long long) bio->bi_sector + 252 (unsigned long long) bio->bi_sector +
235 (bio->bi_size >> 9) - 1); 253 (bio->bi_size >> 9) - 1);
236 254
237 bio_endio(bio, 255 call_bio_endio(r1_bio);
238 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
239 } 256 }
240 free_r1bio(r1_bio); 257 free_r1bio(r1_bio);
241} 258}
@@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error)
287 * oops, read error: 304 * oops, read error:
288 */ 305 */
289 char b[BDEVNAME_SIZE]; 306 char b[BDEVNAME_SIZE];
290 if (printk_ratelimit()) 307 printk_ratelimited(
291 printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", 308 KERN_ERR "md/raid1:%s: %s: "
292 mdname(conf->mddev), 309 "rescheduling sector %llu\n",
293 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 310 mdname(conf->mddev),
311 bdevname(conf->mirrors[mirror].rdev->bdev,
312 b),
313 (unsigned long long)r1_bio->sector);
314 set_bit(R1BIO_ReadError, &r1_bio->state);
294 reschedule_retry(r1_bio); 315 reschedule_retry(r1_bio);
295 } 316 }
296 317
297 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 318 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
298} 319}
299 320
321static void close_write(r1bio_t *r1_bio)
322{
323 /* it really is the end of this request */
324 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
325 /* free extra copy of the data pages */
326 int i = r1_bio->behind_page_count;
327 while (i--)
328 safe_put_page(r1_bio->behind_bvecs[i].bv_page);
329 kfree(r1_bio->behind_bvecs);
330 r1_bio->behind_bvecs = NULL;
331 }
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state),
336 test_bit(R1BIO_BehindIO, &r1_bio->state));
337 md_write_end(r1_bio->mddev);
338}
339
300static void r1_bio_write_done(r1bio_t *r1_bio) 340static void r1_bio_write_done(r1bio_t *r1_bio)
301{ 341{
302 if (atomic_dec_and_test(&r1_bio->remaining)) 342 if (!atomic_dec_and_test(&r1_bio->remaining))
303 { 343 return;
304 /* it really is the end of this request */ 344
305 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 345 if (test_bit(R1BIO_WriteError, &r1_bio->state))
306 /* free extra copy of the data pages */ 346 reschedule_retry(r1_bio);
307 int i = r1_bio->behind_page_count; 347 else {
308 while (i--) 348 close_write(r1_bio);
309 safe_put_page(r1_bio->behind_pages[i]); 349 if (test_bit(R1BIO_MadeGood, &r1_bio->state))
310 kfree(r1_bio->behind_pages); 350 reschedule_retry(r1_bio);
311 r1_bio->behind_pages = NULL; 351 else
312 } 352 raid_end_bio_io(r1_bio);
313 /* clear the bitmap if all writes complete successfully */
314 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
315 r1_bio->sectors,
316 !test_bit(R1BIO_Degraded, &r1_bio->state),
317 test_bit(R1BIO_BehindIO, &r1_bio->state));
318 md_write_end(r1_bio->mddev);
319 raid_end_bio_io(r1_bio);
320 } 353 }
321} 354}
322 355
@@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
336 /* 369 /*
337 * 'one mirror IO has finished' event handler: 370 * 'one mirror IO has finished' event handler:
338 */ 371 */
339 r1_bio->bios[mirror] = NULL;
340 to_put = bio;
341 if (!uptodate) { 372 if (!uptodate) {
342 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 373 set_bit(WriteErrorSeen,
343 /* an I/O failed, we can't clear the bitmap */ 374 &conf->mirrors[mirror].rdev->flags);
344 set_bit(R1BIO_Degraded, &r1_bio->state); 375 set_bit(R1BIO_WriteError, &r1_bio->state);
345 } else 376 } else {
346 /* 377 /*
347 * Set R1BIO_Uptodate in our master bio, so that we 378 * Set R1BIO_Uptodate in our master bio, so that we
348 * will return a good error code to the higher 379
@@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error)
353 * to user-side. So if something waits for IO, then it 384 * to user-side. So if something waits for IO, then it
354 * will wait for the 'master' bio. 385 * will wait for the 'master' bio.
355 */ 386 */
387 sector_t first_bad;
388 int bad_sectors;
389
390 r1_bio->bios[mirror] = NULL;
391 to_put = bio;
356 set_bit(R1BIO_Uptodate, &r1_bio->state); 392 set_bit(R1BIO_Uptodate, &r1_bio->state);
357 393
394 /* Maybe we can clear some bad blocks. */
395 if (is_badblock(conf->mirrors[mirror].rdev,
396 r1_bio->sector, r1_bio->sectors,
397 &first_bad, &bad_sectors)) {
398 r1_bio->bios[mirror] = IO_MADE_GOOD;
399 set_bit(R1BIO_MadeGood, &r1_bio->state);
400 }
401 }
402
358 update_head_pos(mirror, r1_bio); 403 update_head_pos(mirror, r1_bio);
359 404
360 if (behind) { 405 if (behind) {
@@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
377 (unsigned long long) mbio->bi_sector, 422 (unsigned long long) mbio->bi_sector,
378 (unsigned long long) mbio->bi_sector + 423 (unsigned long long) mbio->bi_sector +
379 (mbio->bi_size >> 9) - 1); 424 (mbio->bi_size >> 9) - 1);
380 bio_endio(mbio, 0); 425 call_bio_endio(r1_bio);
381 } 426 }
382 } 427 }
383 } 428 }
384 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 429 if (r1_bio->bios[mirror] == NULL)
430 rdev_dec_pending(conf->mirrors[mirror].rdev,
431 conf->mddev);
385 432
386 /* 433 /*
387 * Let's see if all mirrored write operations have finished 434 * Let's see if all mirrored write operations have finished
@@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
408 * 455 *
409 * The rdev for the device selected will have nr_pending incremented. 456 * The rdev for the device selected will have nr_pending incremented.
410 */ 457 */
411static int read_balance(conf_t *conf, r1bio_t *r1_bio) 458static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
412{ 459{
413 const sector_t this_sector = r1_bio->sector; 460 const sector_t this_sector = r1_bio->sector;
414 const int sectors = r1_bio->sectors; 461 int sectors;
462 int best_good_sectors;
415 int start_disk; 463 int start_disk;
416 int best_disk; 464 int best_disk;
417 int i; 465 int i;
@@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
426 * We take the first readable disk when above the resync window. 474 * We take the first readable disk when above the resync window.
427 */ 475 */
428 retry: 476 retry:
477 sectors = r1_bio->sectors;
429 best_disk = -1; 478 best_disk = -1;
430 best_dist = MaxSector; 479 best_dist = MaxSector;
480 best_good_sectors = 0;
481
431 if (conf->mddev->recovery_cp < MaxSector && 482 if (conf->mddev->recovery_cp < MaxSector &&
432 (this_sector + sectors >= conf->next_resync)) { 483 (this_sector + sectors >= conf->next_resync)) {
433 choose_first = 1; 484 choose_first = 1;
@@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
439 490
440 for (i = 0 ; i < conf->raid_disks ; i++) { 491 for (i = 0 ; i < conf->raid_disks ; i++) {
441 sector_t dist; 492 sector_t dist;
493 sector_t first_bad;
494 int bad_sectors;
495
442 int disk = start_disk + i; 496 int disk = start_disk + i;
443 if (disk >= conf->raid_disks) 497 if (disk >= conf->raid_disks)
444 disk -= conf->raid_disks; 498 disk -= conf->raid_disks;
@@ -461,6 +515,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
461 /* This is a reasonable device to use. It might 515 /* This is a reasonable device to use. It might
462 * even be best. 516 * even be best.
463 */ 517 */
518 if (is_badblock(rdev, this_sector, sectors,
519 &first_bad, &bad_sectors)) {
520 if (best_dist < MaxSector)
521 /* already have a better device */
522 continue;
523 if (first_bad <= this_sector) {
524 /* cannot read here. If this is the 'primary'
525 * device, then we must not read beyond
526 * bad_sectors from another device..
527 */
528 bad_sectors -= (this_sector - first_bad);
529 if (choose_first && sectors > bad_sectors)
530 sectors = bad_sectors;
531 if (best_good_sectors > sectors)
532 best_good_sectors = sectors;
533
534 } else {
535 sector_t good_sectors = first_bad - this_sector;
536 if (good_sectors > best_good_sectors) {
537 best_good_sectors = good_sectors;
538 best_disk = disk;
539 }
540 if (choose_first)
541 break;
542 }
543 continue;
544 } else
545 best_good_sectors = sectors;
546
464 dist = abs(this_sector - conf->mirrors[disk].head_position); 547 dist = abs(this_sector - conf->mirrors[disk].head_position);
465 if (choose_first 548 if (choose_first
466 /* Don't change to another disk for sequential reads */ 549 /* Don't change to another disk for sequential reads */
@@ -489,10 +572,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
489 rdev_dec_pending(rdev, conf->mddev); 572 rdev_dec_pending(rdev, conf->mddev);
490 goto retry; 573 goto retry;
491 } 574 }
575 sectors = best_good_sectors;
492 conf->next_seq_sect = this_sector + sectors; 576 conf->next_seq_sect = this_sector + sectors;
493 conf->last_used = best_disk; 577 conf->last_used = best_disk;
494 } 578 }
495 rcu_read_unlock(); 579 rcu_read_unlock();
580 *max_sectors = sectors;
496 581
497 return best_disk; 582 return best_disk;
498} 583}
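The bad-block handling added to read_balance() above shortens a read so that it stops before a known bad range on the chosen device. A simplified sketch of that arithmetic (the choose_first/primary-device case is omitted, values are assumed):

#include <stdio.h>

typedef unsigned long long sector_t;

/* How many sectors of a request can a candidate device serve, given the
 * bad range reported for it? Simplified from read_balance() above. */
static int good_sectors(sector_t this_sector, int sectors, sector_t first_bad)
{
	if (first_bad <= this_sector)
		return 0;			/* request starts inside the bad range */
	if (first_bad - this_sector < (sector_t)sectors)
		return (int)(first_bad - this_sector);
	return sectors;				/* bad range lies beyond the request */
}

int main(void)
{
	printf("%d\n", good_sectors(1000, 64, 1040));	/* 40 readable sectors */
	printf("%d\n", good_sectors(1000, 64, 990));	/* 0, must use another mirror */
	return 0;
}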
@@ -672,30 +757,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
672{ 757{
673 int i; 758 int i;
674 struct bio_vec *bvec; 759 struct bio_vec *bvec;
675 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), 760 struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
676 GFP_NOIO); 761 GFP_NOIO);
677 if (unlikely(!pages)) 762 if (unlikely(!bvecs))
678 return; 763 return;
679 764
680 bio_for_each_segment(bvec, bio, i) { 765 bio_for_each_segment(bvec, bio, i) {
681 pages[i] = alloc_page(GFP_NOIO); 766 bvecs[i] = *bvec;
682 if (unlikely(!pages[i])) 767 bvecs[i].bv_page = alloc_page(GFP_NOIO);
768 if (unlikely(!bvecs[i].bv_page))
683 goto do_sync_io; 769 goto do_sync_io;
684 memcpy(kmap(pages[i]) + bvec->bv_offset, 770 memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
685 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); 771 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
686 kunmap(pages[i]); 772 kunmap(bvecs[i].bv_page);
687 kunmap(bvec->bv_page); 773 kunmap(bvec->bv_page);
688 } 774 }
689 r1_bio->behind_pages = pages; 775 r1_bio->behind_bvecs = bvecs;
690 r1_bio->behind_page_count = bio->bi_vcnt; 776 r1_bio->behind_page_count = bio->bi_vcnt;
691 set_bit(R1BIO_BehindIO, &r1_bio->state); 777 set_bit(R1BIO_BehindIO, &r1_bio->state);
692 return; 778 return;
693 779
694do_sync_io: 780do_sync_io:
695 for (i = 0; i < bio->bi_vcnt; i++) 781 for (i = 0; i < bio->bi_vcnt; i++)
696 if (pages[i]) 782 if (bvecs[i].bv_page)
697 put_page(pages[i]); 783 put_page(bvecs[i].bv_page);
698 kfree(pages); 784 kfree(bvecs);
699 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 785 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
700} 786}
701 787
@@ -705,7 +791,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
705 mirror_info_t *mirror; 791 mirror_info_t *mirror;
706 r1bio_t *r1_bio; 792 r1bio_t *r1_bio;
707 struct bio *read_bio; 793 struct bio *read_bio;
708 int i, targets = 0, disks; 794 int i, disks;
709 struct bitmap *bitmap; 795 struct bitmap *bitmap;
710 unsigned long flags; 796 unsigned long flags;
711 const int rw = bio_data_dir(bio); 797 const int rw = bio_data_dir(bio);
@@ -713,6 +799,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
713 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 799 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
714 mdk_rdev_t *blocked_rdev; 800 mdk_rdev_t *blocked_rdev;
715 int plugged; 801 int plugged;
802 int first_clone;
803 int sectors_handled;
804 int max_sectors;
716 805
717 /* 806 /*
718 * Register the new request and wait if the reconstruction 807 * Register the new request and wait if the reconstruction
@@ -759,11 +848,24 @@ static int make_request(mddev_t *mddev, struct bio * bio)
759 r1_bio->mddev = mddev; 848 r1_bio->mddev = mddev;
760 r1_bio->sector = bio->bi_sector; 849 r1_bio->sector = bio->bi_sector;
761 850
851 /* We might need to issue multiple reads to different
852 * devices if there are bad blocks around, so we keep
853 * track of the number of reads in bio->bi_phys_segments.
854 * If this is 0, there is only one r1_bio and no locking
855 * will be needed when requests complete. If it is
856 * non-zero, then it is the number of not-completed requests.
857 */
858 bio->bi_phys_segments = 0;
859 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
860
762 if (rw == READ) { 861 if (rw == READ) {
763 /* 862 /*
764 * read balancing logic: 863 * read balancing logic:
765 */ 864 */
766 int rdisk = read_balance(conf, r1_bio); 865 int rdisk;
866
867read_again:
868 rdisk = read_balance(conf, r1_bio, &max_sectors);
767 869
768 if (rdisk < 0) { 870 if (rdisk < 0) {
769 /* couldn't find anywhere to read from */ 871 /* couldn't find anywhere to read from */
@@ -784,6 +886,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
784 r1_bio->read_disk = rdisk; 886 r1_bio->read_disk = rdisk;
785 887
786 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 888 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
889 md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
890 max_sectors);
787 891
788 r1_bio->bios[rdisk] = read_bio; 892 r1_bio->bios[rdisk] = read_bio;
789 893
@@ -793,16 +897,52 @@ static int make_request(mddev_t *mddev, struct bio * bio)
793 read_bio->bi_rw = READ | do_sync; 897 read_bio->bi_rw = READ | do_sync;
794 read_bio->bi_private = r1_bio; 898 read_bio->bi_private = r1_bio;
795 899
796 generic_make_request(read_bio); 900 if (max_sectors < r1_bio->sectors) {
901 /* could not read all from this device, so we will
902 * need another r1_bio.
903 */
904
905 sectors_handled = (r1_bio->sector + max_sectors
906 - bio->bi_sector);
907 r1_bio->sectors = max_sectors;
908 spin_lock_irq(&conf->device_lock);
909 if (bio->bi_phys_segments == 0)
910 bio->bi_phys_segments = 2;
911 else
912 bio->bi_phys_segments++;
913 spin_unlock_irq(&conf->device_lock);
914 /* Cannot call generic_make_request directly
915 * as that will be queued in __make_request
916 * and subsequent mempool_alloc might block waiting
917 * for it. So hand bio over to raid1d.
918 */
919 reschedule_retry(r1_bio);
920
921 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
922
923 r1_bio->master_bio = bio;
924 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
925 r1_bio->state = 0;
926 r1_bio->mddev = mddev;
927 r1_bio->sector = bio->bi_sector + sectors_handled;
928 goto read_again;
929 } else
930 generic_make_request(read_bio);
797 return 0; 931 return 0;
798 } 932 }
799 933
800 /* 934 /*
801 * WRITE: 935 * WRITE:
802 */ 936 */
803 /* first select target devices under spinlock and 937 /* first select target devices under rcu_lock and
804 * inc refcount on their rdev. Record them by setting 938 * inc refcount on their rdev. Record them by setting
805 * bios[x] to bio 939 * bios[x] to bio
940 * If there are known/acknowledged bad blocks on any device on
941 * which we have seen a write error, we want to avoid writing those
942 * blocks.
943 * This potentially requires several writes to write around
944 * the bad blocks. Each set of writes gets its own r1bio
945 * with a set of bios attached.
806 */ 946 */
807 plugged = mddev_check_plugged(mddev); 947 plugged = mddev_check_plugged(mddev);
808 948
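A plain-C sketch of the bi_phys_segments bookkeeping used by the split read and write paths above: zero means a single r1_bio covers the whole request, otherwise the field counts the r1_bios still outstanding for the master bio. This is only the counter logic, with no kernel locking, and the number of parts is assumed:

#include <stdio.h>

int main(void)
{
	int parts = 3;			/* the request ends up split into 3 r1_bios */
	int bi_phys_segments = 0;

	/* submission side: the first split sets the counter to 2, later splits increment */
	for (int split = 1; split < parts; split++)
		bi_phys_segments = bi_phys_segments ? bi_phys_segments + 1 : 2;
	printf("after submission: %d\n", bi_phys_segments);	/* equals 'parts' */

	/* completion side: each part decrements; reaching zero ends the master bio */
	for (int done = 0; done < parts; done++)
		if (--bi_phys_segments == 0)
			printf("bio_endio(master_bio)\n");
	return 0;
}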
@@ -810,6 +950,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
810 retry_write: 950 retry_write:
811 blocked_rdev = NULL; 951 blocked_rdev = NULL;
812 rcu_read_lock(); 952 rcu_read_lock();
953 max_sectors = r1_bio->sectors;
813 for (i = 0; i < disks; i++) { 954 for (i = 0; i < disks; i++) {
814 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 955 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
815 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 956 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -817,17 +958,56 @@ static int make_request(mddev_t *mddev, struct bio * bio)
817 blocked_rdev = rdev; 958 blocked_rdev = rdev;
818 break; 959 break;
819 } 960 }
820 if (rdev && !test_bit(Faulty, &rdev->flags)) { 961 r1_bio->bios[i] = NULL;
821 atomic_inc(&rdev->nr_pending); 962 if (!rdev || test_bit(Faulty, &rdev->flags)) {
822 if (test_bit(Faulty, &rdev->flags)) { 963 set_bit(R1BIO_Degraded, &r1_bio->state);
964 continue;
965 }
966
967 atomic_inc(&rdev->nr_pending);
968 if (test_bit(WriteErrorSeen, &rdev->flags)) {
969 sector_t first_bad;
970 int bad_sectors;
971 int is_bad;
972
973 is_bad = is_badblock(rdev, r1_bio->sector,
974 max_sectors,
975 &first_bad, &bad_sectors);
976 if (is_bad < 0) {
977 /* mustn't write here until the bad block is
978 * acknowledged*/
979 set_bit(BlockedBadBlocks, &rdev->flags);
980 blocked_rdev = rdev;
981 break;
982 }
983 if (is_bad && first_bad <= r1_bio->sector) {
984 /* Cannot write here at all */
985 bad_sectors -= (r1_bio->sector - first_bad);
986 if (bad_sectors < max_sectors)
987 /* mustn't write more than bad_sectors
988 * to other devices yet
989 */
990 max_sectors = bad_sectors;
823 rdev_dec_pending(rdev, mddev); 991 rdev_dec_pending(rdev, mddev);
824 r1_bio->bios[i] = NULL; 992 /* We don't set R1BIO_Degraded as that
825 } else { 993 * only applies if the disk is
826 r1_bio->bios[i] = bio; 994 * missing, so it might be re-added,
827 targets++; 995 * and we want to know to recover this
996 * chunk.
997 * In this case the device is here,
998 * and the fact that this chunk is not
999 * in-sync is recorded in the bad
1000 * block log
1001 */
1002 continue;
828 } 1003 }
829 } else 1004 if (is_bad) {
830 r1_bio->bios[i] = NULL; 1005 int good_sectors = first_bad - r1_bio->sector;
1006 if (good_sectors < max_sectors)
1007 max_sectors = good_sectors;
1008 }
1009 }
1010 r1_bio->bios[i] = bio;
831 } 1011 }
832 rcu_read_unlock(); 1012 rcu_read_unlock();
833 1013
@@ -838,51 +1018,57 @@ static int make_request(mddev_t *mddev, struct bio * bio)
838 for (j = 0; j < i; j++) 1018 for (j = 0; j < i; j++)
839 if (r1_bio->bios[j]) 1019 if (r1_bio->bios[j])
840 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1020 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
841 1021 r1_bio->state = 0;
842 allow_barrier(conf); 1022 allow_barrier(conf);
843 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1023 md_wait_for_blocked_rdev(blocked_rdev, mddev);
844 wait_barrier(conf); 1024 wait_barrier(conf);
845 goto retry_write; 1025 goto retry_write;
846 } 1026 }
847 1027
848 BUG_ON(targets == 0); /* we never fail the last device */ 1028 if (max_sectors < r1_bio->sectors) {
849 1029 /* We are splitting this write into multiple parts, so
850 if (targets < conf->raid_disks) { 1030 * we need to prepare for allocating another r1_bio.
851 /* array is degraded, we will not clear the bitmap 1031 */
852 * on I/O completion (see raid1_end_write_request) */ 1032 r1_bio->sectors = max_sectors;
853 set_bit(R1BIO_Degraded, &r1_bio->state); 1033 spin_lock_irq(&conf->device_lock);
1034 if (bio->bi_phys_segments == 0)
1035 bio->bi_phys_segments = 2;
1036 else
1037 bio->bi_phys_segments++;
1038 spin_unlock_irq(&conf->device_lock);
854 } 1039 }
855 1040 sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
856 /* do behind I/O ?
857 * Not if there are too many, or cannot allocate memory,
858 * or a reader on WriteMostly is waiting for behind writes
859 * to flush */
860 if (bitmap &&
861 (atomic_read(&bitmap->behind_writes)
862 < mddev->bitmap_info.max_write_behind) &&
863 !waitqueue_active(&bitmap->behind_wait))
864 alloc_behind_pages(bio, r1_bio);
865 1041
866 atomic_set(&r1_bio->remaining, 1); 1042 atomic_set(&r1_bio->remaining, 1);
867 atomic_set(&r1_bio->behind_remaining, 0); 1043 atomic_set(&r1_bio->behind_remaining, 0);
868 1044
869 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, 1045 first_clone = 1;
870 test_bit(R1BIO_BehindIO, &r1_bio->state));
871 for (i = 0; i < disks; i++) { 1046 for (i = 0; i < disks; i++) {
872 struct bio *mbio; 1047 struct bio *mbio;
873 if (!r1_bio->bios[i]) 1048 if (!r1_bio->bios[i])
874 continue; 1049 continue;
875 1050
876 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1051 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
877 r1_bio->bios[i] = mbio; 1052 md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
878 1053
879 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 1054 if (first_clone) {
880 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1055 /* do behind I/O ?
881 mbio->bi_end_io = raid1_end_write_request; 1056 * Not if there are too many, or cannot
882 mbio->bi_rw = WRITE | do_flush_fua | do_sync; 1057 * allocate memory, or a reader on WriteMostly
883 mbio->bi_private = r1_bio; 1058 * is waiting for behind writes to flush */
884 1059 if (bitmap &&
885 if (r1_bio->behind_pages) { 1060 (atomic_read(&bitmap->behind_writes)
1061 < mddev->bitmap_info.max_write_behind) &&
1062 !waitqueue_active(&bitmap->behind_wait))
1063 alloc_behind_pages(mbio, r1_bio);
1064
1065 bitmap_startwrite(bitmap, r1_bio->sector,
1066 r1_bio->sectors,
1067 test_bit(R1BIO_BehindIO,
1068 &r1_bio->state));
1069 first_clone = 0;
1070 }
1071 if (r1_bio->behind_bvecs) {
886 struct bio_vec *bvec; 1072 struct bio_vec *bvec;
887 int j; 1073 int j;
888 1074
@@ -894,11 +1080,20 @@ static int make_request(mddev_t *mddev, struct bio * bio)
894 * them all 1080 * them all
895 */ 1081 */
896 __bio_for_each_segment(bvec, mbio, j, 0) 1082 __bio_for_each_segment(bvec, mbio, j, 0)
897 bvec->bv_page = r1_bio->behind_pages[j]; 1083 bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
898 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 1084 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
899 atomic_inc(&r1_bio->behind_remaining); 1085 atomic_inc(&r1_bio->behind_remaining);
900 } 1086 }
901 1087
1088 r1_bio->bios[i] = mbio;
1089
1090 mbio->bi_sector = (r1_bio->sector +
1091 conf->mirrors[i].rdev->data_offset);
1092 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1093 mbio->bi_end_io = raid1_end_write_request;
1094 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
1095 mbio->bi_private = r1_bio;
1096
902 atomic_inc(&r1_bio->remaining); 1097 atomic_inc(&r1_bio->remaining);
903 spin_lock_irqsave(&conf->device_lock, flags); 1098 spin_lock_irqsave(&conf->device_lock, flags);
904 bio_list_add(&conf->pending_bio_list, mbio); 1099 bio_list_add(&conf->pending_bio_list, mbio);
@@ -909,6 +1104,19 @@ static int make_request(mddev_t *mddev, struct bio * bio)
909 /* In case raid1d snuck in to freeze_array */ 1104 /* In case raid1d snuck in to freeze_array */
910 wake_up(&conf->wait_barrier); 1105 wake_up(&conf->wait_barrier);
911 1106
1107 if (sectors_handled < (bio->bi_size >> 9)) {
1108 /* We need another r1_bio. It has already been counted
1109 * in bio->bi_phys_segments
1110 */
1111 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1112 r1_bio->master_bio = bio;
1113 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1114 r1_bio->state = 0;
1115 r1_bio->mddev = mddev;
1116 r1_bio->sector = bio->bi_sector + sectors_handled;
1117 goto retry_write;
1118 }
1119
912 if (do_sync || !bitmap || !plugged) 1120 if (do_sync || !bitmap || !plugged)
913 md_wakeup_thread(mddev->thread); 1121 md_wakeup_thread(mddev->thread);
914 1122
@@ -952,9 +1160,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
952 * However don't try a recovery from this drive as 1160 * However don't try a recovery from this drive as
953 * it is very likely to fail. 1161 * it is very likely to fail.
954 */ 1162 */
955 mddev->recovery_disabled = 1; 1163 conf->recovery_disabled = mddev->recovery_disabled;
956 return; 1164 return;
957 } 1165 }
1166 set_bit(Blocked, &rdev->flags);
958 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1167 if (test_and_clear_bit(In_sync, &rdev->flags)) {
959 unsigned long flags; 1168 unsigned long flags;
960 spin_lock_irqsave(&conf->device_lock, flags); 1169 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1027,7 +1236,7 @@ static int raid1_spare_active(mddev_t *mddev)
1027 && !test_bit(Faulty, &rdev->flags) 1236 && !test_bit(Faulty, &rdev->flags)
1028 && !test_and_set_bit(In_sync, &rdev->flags)) { 1237 && !test_and_set_bit(In_sync, &rdev->flags)) {
1029 count++; 1238 count++;
1030 sysfs_notify_dirent(rdev->sysfs_state); 1239 sysfs_notify_dirent_safe(rdev->sysfs_state);
1031 } 1240 }
1032 } 1241 }
1033 spin_lock_irqsave(&conf->device_lock, flags); 1242 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1048,6 +1257,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1048 int first = 0; 1257 int first = 0;
1049 int last = mddev->raid_disks - 1; 1258 int last = mddev->raid_disks - 1;
1050 1259
1260 if (mddev->recovery_disabled == conf->recovery_disabled)
1261 return -EBUSY;
1262
1051 if (rdev->raid_disk >= 0) 1263 if (rdev->raid_disk >= 0)
1052 first = last = rdev->raid_disk; 1264 first = last = rdev->raid_disk;
1053 1265
@@ -1103,7 +1315,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1103 * is not possible. 1315 * is not possible.
1104 */ 1316 */
1105 if (!test_bit(Faulty, &rdev->flags) && 1317 if (!test_bit(Faulty, &rdev->flags) &&
1106 !mddev->recovery_disabled && 1318 mddev->recovery_disabled != conf->recovery_disabled &&
1107 mddev->degraded < conf->raid_disks) { 1319 mddev->degraded < conf->raid_disks) {
1108 err = -EBUSY; 1320 err = -EBUSY;
1109 goto abort; 1321 goto abort;
@@ -1155,6 +1367,8 @@ static void end_sync_write(struct bio *bio, int error)
1155 conf_t *conf = mddev->private; 1367 conf_t *conf = mddev->private;
1156 int i; 1368 int i;
1157 int mirror=0; 1369 int mirror=0;
1370 sector_t first_bad;
1371 int bad_sectors;
1158 1372
1159 for (i = 0; i < conf->raid_disks; i++) 1373 for (i = 0; i < conf->raid_disks; i++)
1160 if (r1_bio->bios[i] == bio) { 1374 if (r1_bio->bios[i] == bio) {
@@ -1172,18 +1386,48 @@ static void end_sync_write(struct bio *bio, int error)
1172 s += sync_blocks; 1386 s += sync_blocks;
1173 sectors_to_go -= sync_blocks; 1387 sectors_to_go -= sync_blocks;
1174 } while (sectors_to_go > 0); 1388 } while (sectors_to_go > 0);
1175 md_error(mddev, conf->mirrors[mirror].rdev); 1389 set_bit(WriteErrorSeen,
1176 } 1390 &conf->mirrors[mirror].rdev->flags);
1391 set_bit(R1BIO_WriteError, &r1_bio->state);
1392 } else if (is_badblock(conf->mirrors[mirror].rdev,
1393 r1_bio->sector,
1394 r1_bio->sectors,
1395 &first_bad, &bad_sectors) &&
1396 !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
1397 r1_bio->sector,
1398 r1_bio->sectors,
1399 &first_bad, &bad_sectors)
1400 )
1401 set_bit(R1BIO_MadeGood, &r1_bio->state);
1177 1402
1178 update_head_pos(mirror, r1_bio); 1403 update_head_pos(mirror, r1_bio);
1179 1404
1180 if (atomic_dec_and_test(&r1_bio->remaining)) { 1405 if (atomic_dec_and_test(&r1_bio->remaining)) {
1181 sector_t s = r1_bio->sectors; 1406 int s = r1_bio->sectors;
1182 put_buf(r1_bio); 1407 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1183 md_done_sync(mddev, s, uptodate); 1408 test_bit(R1BIO_WriteError, &r1_bio->state))
1409 reschedule_retry(r1_bio);
1410 else {
1411 put_buf(r1_bio);
1412 md_done_sync(mddev, s, uptodate);
1413 }
1184 } 1414 }
1185} 1415}
1186 1416
1417static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1418 int sectors, struct page *page, int rw)
1419{
1420 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1421 /* success */
1422 return 1;
1423 if (rw == WRITE)
1424 set_bit(WriteErrorSeen, &rdev->flags);
1425 /* need to record an error - either for the block or the device */
1426 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1427 md_error(rdev->mddev, rdev);
1428 return 0;
1429}
1430
1187static int fix_sync_read_error(r1bio_t *r1_bio) 1431static int fix_sync_read_error(r1bio_t *r1_bio)
1188{ 1432{
1189 /* Try some synchronous reads of other devices to get 1433 /* Try some synchronous reads of other devices to get
@@ -1193,6 +1437,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1193 * We don't need to freeze the array, because being in an 1437 * We don't need to freeze the array, because being in an
1194 * active sync request, there is no normal IO, and 1438 * active sync request, there is no normal IO, and
1195 * no overlapping syncs. 1439 * no overlapping syncs.
1440 * We don't need to check is_badblock() again as we
1441 * made sure that anything with a bad block in range
1442 * will have bi_end_io clear.
1196 */ 1443 */
1197 mddev_t *mddev = r1_bio->mddev; 1444 mddev_t *mddev = r1_bio->mddev;
1198 conf_t *conf = mddev->private; 1445 conf_t *conf = mddev->private;
@@ -1217,9 +1464,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1217 * active, and resync is currently active 1464 * active, and resync is currently active
1218 */ 1465 */
1219 rdev = conf->mirrors[d].rdev; 1466 rdev = conf->mirrors[d].rdev;
1220 if (sync_page_io(rdev, 1467 if (sync_page_io(rdev, sect, s<<9,
1221 sect,
1222 s<<9,
1223 bio->bi_io_vec[idx].bv_page, 1468 bio->bi_io_vec[idx].bv_page,
1224 READ, false)) { 1469 READ, false)) {
1225 success = 1; 1470 success = 1;
@@ -1233,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1233 1478
1234 if (!success) { 1479 if (!success) {
1235 char b[BDEVNAME_SIZE]; 1480 char b[BDEVNAME_SIZE];
1236 /* Cannot read from anywhere, array is toast */ 1481 int abort = 0;
1237 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1482 /* Cannot read from anywhere, this block is lost.
1483 * Record a bad block on each device. If that doesn't
1484 * work just disable and interrupt the recovery.
1485 * Don't fail devices as that won't really help.
1486 */
1238 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" 1487 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1239 " for block %llu\n", 1488 " for block %llu\n",
1240 mdname(mddev), 1489 mdname(mddev),
1241 bdevname(bio->bi_bdev, b), 1490 bdevname(bio->bi_bdev, b),
1242 (unsigned long long)r1_bio->sector); 1491 (unsigned long long)r1_bio->sector);
1243 md_done_sync(mddev, r1_bio->sectors, 0); 1492 for (d = 0; d < conf->raid_disks; d++) {
1244 put_buf(r1_bio); 1493 rdev = conf->mirrors[d].rdev;
1245 return 0; 1494 if (!rdev || test_bit(Faulty, &rdev->flags))
1495 continue;
1496 if (!rdev_set_badblocks(rdev, sect, s, 0))
1497 abort = 1;
1498 }
1499 if (abort) {
1500 mddev->recovery_disabled = 1;
1501 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1502 md_done_sync(mddev, r1_bio->sectors, 0);
1503 put_buf(r1_bio);
1504 return 0;
1505 }
1506 /* Try next page */
1507 sectors -= s;
1508 sect += s;
1509 idx++;
1510 continue;
1246 } 1511 }
1247 1512
1248 start = d; 1513 start = d;
@@ -1254,16 +1519,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1254 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1519 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1255 continue; 1520 continue;
1256 rdev = conf->mirrors[d].rdev; 1521 rdev = conf->mirrors[d].rdev;
1257 if (sync_page_io(rdev, 1522 if (r1_sync_page_io(rdev, sect, s,
1258 sect, 1523 bio->bi_io_vec[idx].bv_page,
1259 s<<9, 1524 WRITE) == 0) {
1260 bio->bi_io_vec[idx].bv_page,
1261 WRITE, false) == 0) {
1262 r1_bio->bios[d]->bi_end_io = NULL; 1525 r1_bio->bios[d]->bi_end_io = NULL;
1263 rdev_dec_pending(rdev, mddev); 1526 rdev_dec_pending(rdev, mddev);
1264 md_error(mddev, rdev); 1527 }
1265 } else
1266 atomic_add(s, &rdev->corrected_errors);
1267 } 1528 }
1268 d = start; 1529 d = start;
1269 while (d != r1_bio->read_disk) { 1530 while (d != r1_bio->read_disk) {
@@ -1273,12 +1534,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1273 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1534 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1274 continue; 1535 continue;
1275 rdev = conf->mirrors[d].rdev; 1536 rdev = conf->mirrors[d].rdev;
1276 if (sync_page_io(rdev, 1537 if (r1_sync_page_io(rdev, sect, s,
1277 sect, 1538 bio->bi_io_vec[idx].bv_page,
1278 s<<9, 1539 READ) != 0)
1279 bio->bi_io_vec[idx].bv_page, 1540 atomic_add(s, &rdev->corrected_errors);
1280 READ, false) == 0)
1281 md_error(mddev, rdev);
1282 } 1541 }
1283 sectors -= s; 1542 sectors -= s;
1284 sect += s; 1543 sect += s;
@@ -1420,7 +1679,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1420 * 1679 *
1421 * 1. Retries failed read operations on working mirrors. 1680 * 1. Retries failed read operations on working mirrors.
1422 * 2. Updates the raid superblock when problems encounter. 1681 * 2. Updates the raid superblock when problems encounter.
1423 * 3. Performs writes following reads for array syncronising. 1682 * 3. Performs writes following reads for array synchronising.
1424 */ 1683 */
1425 1684
1426static void fix_read_error(conf_t *conf, int read_disk, 1685static void fix_read_error(conf_t *conf, int read_disk,
@@ -1443,9 +1702,14 @@ static void fix_read_error(conf_t *conf, int read_disk,
1443 * which is the thread that might remove 1702 * which is the thread that might remove
1444 * a device. If raid1d ever becomes multi-threaded.... 1703 * a device. If raid1d ever becomes multi-threaded....
1445 */ 1704 */
1705 sector_t first_bad;
1706 int bad_sectors;
1707
1446 rdev = conf->mirrors[d].rdev; 1708 rdev = conf->mirrors[d].rdev;
1447 if (rdev && 1709 if (rdev &&
1448 test_bit(In_sync, &rdev->flags) && 1710 test_bit(In_sync, &rdev->flags) &&
1711 is_badblock(rdev, sect, s,
1712 &first_bad, &bad_sectors) == 0 &&
1449 sync_page_io(rdev, sect, s<<9, 1713 sync_page_io(rdev, sect, s<<9,
1450 conf->tmppage, READ, false)) 1714 conf->tmppage, READ, false))
1451 success = 1; 1715 success = 1;
@@ -1457,8 +1721,10 @@ static void fix_read_error(conf_t *conf, int read_disk,
1457 } while (!success && d != read_disk); 1721 } while (!success && d != read_disk);
1458 1722
1459 if (!success) { 1723 if (!success) {
1460 /* Cannot read from anywhere -- bye bye array */ 1724 /* Cannot read from anywhere - mark it bad */
1461 md_error(mddev, conf->mirrors[read_disk].rdev); 1725 mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
1726 if (!rdev_set_badblocks(rdev, sect, s, 0))
1727 md_error(mddev, rdev);
1462 break; 1728 break;
1463 } 1729 }
1464 /* write it back and re-read */ 1730 /* write it back and re-read */
@@ -1469,13 +1735,9 @@ static void fix_read_error(conf_t *conf, int read_disk,
1469 d--; 1735 d--;
1470 rdev = conf->mirrors[d].rdev; 1736 rdev = conf->mirrors[d].rdev;
1471 if (rdev && 1737 if (rdev &&
1472 test_bit(In_sync, &rdev->flags)) { 1738 test_bit(In_sync, &rdev->flags))
1473 if (sync_page_io(rdev, sect, s<<9, 1739 r1_sync_page_io(rdev, sect, s,
1474 conf->tmppage, WRITE, false) 1740 conf->tmppage, WRITE);
1475 == 0)
1476 /* Well, this device is dead */
1477 md_error(mddev, rdev);
1478 }
1479 } 1741 }
1480 d = start; 1742 d = start;
1481 while (d != read_disk) { 1743 while (d != read_disk) {
@@ -1486,12 +1748,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1486 rdev = conf->mirrors[d].rdev; 1748 rdev = conf->mirrors[d].rdev;
1487 if (rdev && 1749 if (rdev &&
1488 test_bit(In_sync, &rdev->flags)) { 1750 test_bit(In_sync, &rdev->flags)) {
1489 if (sync_page_io(rdev, sect, s<<9, 1751 if (r1_sync_page_io(rdev, sect, s,
1490 conf->tmppage, READ, false) 1752 conf->tmppage, READ)) {
1491 == 0)
1492 /* Well, this device is dead */
1493 md_error(mddev, rdev);
1494 else {
1495 atomic_add(s, &rdev->corrected_errors); 1753 atomic_add(s, &rdev->corrected_errors);
1496 printk(KERN_INFO 1754 printk(KERN_INFO
1497 "md/raid1:%s: read error corrected " 1755 "md/raid1:%s: read error corrected "
@@ -1508,21 +1766,255 @@ static void fix_read_error(conf_t *conf, int read_disk,
1508 } 1766 }
1509} 1767}
1510 1768
1769static void bi_complete(struct bio *bio, int error)
1770{
1771 complete((struct completion *)bio->bi_private);
1772}
1773
1774static int submit_bio_wait(int rw, struct bio *bio)
1775{
1776 struct completion event;
1777 rw |= REQ_SYNC;
1778
1779 init_completion(&event);
1780 bio->bi_private = &event;
1781 bio->bi_end_io = bi_complete;
1782 submit_bio(rw, bio);
1783 wait_for_completion(&event);
1784
1785 return test_bit(BIO_UPTODATE, &bio->bi_flags);
1786}
1787
1788static int narrow_write_error(r1bio_t *r1_bio, int i)
1789{
1790 mddev_t *mddev = r1_bio->mddev;
1791 conf_t *conf = mddev->private;
1792 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1793 int vcnt, idx;
1794 struct bio_vec *vec;
1795
1796 /* bio has the data to be written to device 'i' where
1797 * we just recently had a write error.
1798 * We repeatedly clone the bio and trim down to one block,
1799 * then try the write. Where the write fails we record
1800 * a bad block.
1801 * It is conceivable that the bio doesn't exactly align with
1802 * blocks. We must handle this somehow.
1803 *
1804 * We currently own a reference on the rdev.
1805 */
1806
1807 int block_sectors;
1808 sector_t sector;
1809 int sectors;
1810 int sect_to_write = r1_bio->sectors;
1811 int ok = 1;
1812
1813 if (rdev->badblocks.shift < 0)
1814 return 0;
1815
1816 block_sectors = 1 << rdev->badblocks.shift;
1817 sector = r1_bio->sector;
1818 sectors = ((sector + block_sectors)
1819 & ~(sector_t)(block_sectors - 1))
1820 - sector;
1821
1822 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
1823 vcnt = r1_bio->behind_page_count;
1824 vec = r1_bio->behind_bvecs;
1825 idx = 0;
1826 while (vec[idx].bv_page == NULL)
1827 idx++;
1828 } else {
1829 vcnt = r1_bio->master_bio->bi_vcnt;
1830 vec = r1_bio->master_bio->bi_io_vec;
1831 idx = r1_bio->master_bio->bi_idx;
1832 }
1833 while (sect_to_write) {
1834 struct bio *wbio;
1835 if (sectors > sect_to_write)
1836 sectors = sect_to_write;
1837 /* Write at 'sector' for 'sectors'*/
1838
1839 wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
1840 memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
1841 wbio->bi_sector = r1_bio->sector;
1842 wbio->bi_rw = WRITE;
1843 wbio->bi_vcnt = vcnt;
1844 wbio->bi_size = r1_bio->sectors << 9;
1845 wbio->bi_idx = idx;
1846
1847 md_trim_bio(wbio, sector - r1_bio->sector, sectors);
1848 wbio->bi_sector += rdev->data_offset;
1849 wbio->bi_bdev = rdev->bdev;
1850 if (submit_bio_wait(WRITE, wbio) == 0)
1851 /* failure! */
1852 ok = rdev_set_badblocks(rdev, sector,
1853 sectors, 0)
1854 && ok;
1855
1856 bio_put(wbio);
1857 sect_to_write -= sectors;
1858 sector += sectors;
1859 sectors = block_sectors;
1860 }
1861 return ok;
1862}
1863
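The first chunk retried by narrow_write_error() above ends at the next block_sectors boundary; later chunks are whole blocks. A small sketch of that alignment step, with assumed sample values:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Align the first retried chunk to the next bad-block-sized boundary,
 * as in narrow_write_error() above. Illustration only. */
int main(void)
{
	int block_sectors = 8;			/* 1 << rdev->badblocks.shift */
	sector_t sector = 1003;			/* r1_bio->sector */
	int sectors = (int)(((sector + block_sectors)
			     & ~(sector_t)(block_sectors - 1)) - sector);

	printf("first chunk: %d sectors, ending at sector %llu\n",
	       sectors, sector + sectors);	/* 5 sectors, ending at 1008 */
	return 0;
}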
1864static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
1865{
1866 int m;
1867 int s = r1_bio->sectors;
1868 for (m = 0; m < conf->raid_disks ; m++) {
1869 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1870 struct bio *bio = r1_bio->bios[m];
1871 if (bio->bi_end_io == NULL)
1872 continue;
1873 if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1874 test_bit(R1BIO_MadeGood, &r1_bio->state)) {
1875 rdev_clear_badblocks(rdev, r1_bio->sector, s);
1876 }
1877 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1878 test_bit(R1BIO_WriteError, &r1_bio->state)) {
1879 if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
1880 md_error(conf->mddev, rdev);
1881 }
1882 }
1883 put_buf(r1_bio);
1884 md_done_sync(conf->mddev, s, 1);
1885}
1886
1887static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
1888{
1889 int m;
1890 for (m = 0; m < conf->raid_disks ; m++)
1891 if (r1_bio->bios[m] == IO_MADE_GOOD) {
1892 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1893 rdev_clear_badblocks(rdev,
1894 r1_bio->sector,
1895 r1_bio->sectors);
1896 rdev_dec_pending(rdev, conf->mddev);
1897 } else if (r1_bio->bios[m] != NULL) {
1898 /* This drive got a write error. We need to
1899 * narrow down and record precise write
1900 * errors.
1901 */
1902 if (!narrow_write_error(r1_bio, m)) {
1903 md_error(conf->mddev,
1904 conf->mirrors[m].rdev);
1905 /* an I/O failed, we can't clear the bitmap */
1906 set_bit(R1BIO_Degraded, &r1_bio->state);
1907 }
1908 rdev_dec_pending(conf->mirrors[m].rdev,
1909 conf->mddev);
1910 }
1911 if (test_bit(R1BIO_WriteError, &r1_bio->state))
1912 close_write(r1_bio);
1913 raid_end_bio_io(r1_bio);
1914}
1915
1916static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
1917{
1918 int disk;
1919 int max_sectors;
1920 mddev_t *mddev = conf->mddev;
1921 struct bio *bio;
1922 char b[BDEVNAME_SIZE];
1923 mdk_rdev_t *rdev;
1924
1925 clear_bit(R1BIO_ReadError, &r1_bio->state);
1926 /* we got a read error. Maybe the drive is bad. Maybe just
1927	 * the block is bad, and we can fix it.
1928	 * We freeze all other IO, and try reading the block from
1929	 * other devices. When we find one, we re-write
1930	 * and check whether that fixes the read error.
1931	 * This is all done synchronously while the array is
1932	 * frozen.
1933 */
1934 if (mddev->ro == 0) {
1935 freeze_array(conf);
1936 fix_read_error(conf, r1_bio->read_disk,
1937 r1_bio->sector, r1_bio->sectors);
1938 unfreeze_array(conf);
1939 } else
1940 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1941
1942 bio = r1_bio->bios[r1_bio->read_disk];
1943 bdevname(bio->bi_bdev, b);
1944read_more:
1945 disk = read_balance(conf, r1_bio, &max_sectors);
1946 if (disk == -1) {
1947 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1948 " read error for block %llu\n",
1949 mdname(mddev), b, (unsigned long long)r1_bio->sector);
1950 raid_end_bio_io(r1_bio);
1951 } else {
1952 const unsigned long do_sync
1953 = r1_bio->master_bio->bi_rw & REQ_SYNC;
1954 if (bio) {
1955 r1_bio->bios[r1_bio->read_disk] =
1956 mddev->ro ? IO_BLOCKED : NULL;
1957 bio_put(bio);
1958 }
1959 r1_bio->read_disk = disk;
1960 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
1961 md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
1962 r1_bio->bios[r1_bio->read_disk] = bio;
1963 rdev = conf->mirrors[disk].rdev;
1964 printk_ratelimited(KERN_ERR
1965 "md/raid1:%s: redirecting sector %llu"
1966 " to other mirror: %s\n",
1967 mdname(mddev),
1968 (unsigned long long)r1_bio->sector,
1969 bdevname(rdev->bdev, b));
1970 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1971 bio->bi_bdev = rdev->bdev;
1972 bio->bi_end_io = raid1_end_read_request;
1973 bio->bi_rw = READ | do_sync;
1974 bio->bi_private = r1_bio;
1975 if (max_sectors < r1_bio->sectors) {
1976 /* Drat - have to split this up more */
1977 struct bio *mbio = r1_bio->master_bio;
1978 int sectors_handled = (r1_bio->sector + max_sectors
1979 - mbio->bi_sector);
1980 r1_bio->sectors = max_sectors;
1981 spin_lock_irq(&conf->device_lock);
1982 if (mbio->bi_phys_segments == 0)
1983 mbio->bi_phys_segments = 2;
1984 else
1985 mbio->bi_phys_segments++;
1986 spin_unlock_irq(&conf->device_lock);
1987 generic_make_request(bio);
1988 bio = NULL;
1989
1990 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1991
1992 r1_bio->master_bio = mbio;
1993 r1_bio->sectors = (mbio->bi_size >> 9)
1994 - sectors_handled;
1995 r1_bio->state = 0;
1996 set_bit(R1BIO_ReadError, &r1_bio->state);
1997 r1_bio->mddev = mddev;
1998 r1_bio->sector = mbio->bi_sector + sectors_handled;
1999
2000 goto read_more;
2001 } else
2002 generic_make_request(bio);
2003 }
2004}
2005
1511static void raid1d(mddev_t *mddev) 2006static void raid1d(mddev_t *mddev)
1512{ 2007{
1513 r1bio_t *r1_bio; 2008 r1bio_t *r1_bio;
1514 struct bio *bio;
1515 unsigned long flags; 2009 unsigned long flags;
1516 conf_t *conf = mddev->private; 2010 conf_t *conf = mddev->private;
1517 struct list_head *head = &conf->retry_list; 2011 struct list_head *head = &conf->retry_list;
1518 mdk_rdev_t *rdev;
1519 struct blk_plug plug; 2012 struct blk_plug plug;
1520 2013
1521 md_check_recovery(mddev); 2014 md_check_recovery(mddev);
1522 2015
1523 blk_start_plug(&plug); 2016 blk_start_plug(&plug);
1524 for (;;) { 2017 for (;;) {
1525 char b[BDEVNAME_SIZE];
1526 2018
1527 if (atomic_read(&mddev->plug_cnt) == 0) 2019 if (atomic_read(&mddev->plug_cnt) == 0)
1528 flush_pending_writes(conf); 2020 flush_pending_writes(conf);
@@ -1539,62 +2031,26 @@ static void raid1d(mddev_t *mddev)
1539 2031
1540 mddev = r1_bio->mddev; 2032 mddev = r1_bio->mddev;
1541 conf = mddev->private; 2033 conf = mddev->private;
1542 if (test_bit(R1BIO_IsSync, &r1_bio->state)) 2034 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1543 sync_request_write(mddev, r1_bio); 2035 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1544 else { 2036 test_bit(R1BIO_WriteError, &r1_bio->state))
1545 int disk; 2037 handle_sync_write_finished(conf, r1_bio);
1546 2038 else
1547 /* we got a read error. Maybe the drive is bad. Maybe just 2039 sync_request_write(mddev, r1_bio);
1548 * the block and we can fix it. 2040 } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1549 * We freeze all other IO, and try reading the block from 2041 test_bit(R1BIO_WriteError, &r1_bio->state))
1550 * other devices. When we find one, we re-write 2042 handle_write_finished(conf, r1_bio);
1551 * and check it that fixes the read error. 2043 else if (test_bit(R1BIO_ReadError, &r1_bio->state))
1552 * This is all done synchronously while the array is 2044 handle_read_error(conf, r1_bio);
1553 * frozen 2045 else
2046 /* just a partial read to be scheduled from separate
2047 * context
1554 */ 2048 */
1555 if (mddev->ro == 0) { 2049 generic_make_request(r1_bio->bios[r1_bio->read_disk]);
1556 freeze_array(conf); 2050
1557 fix_read_error(conf, r1_bio->read_disk,
1558 r1_bio->sector,
1559 r1_bio->sectors);
1560 unfreeze_array(conf);
1561 } else
1562 md_error(mddev,
1563 conf->mirrors[r1_bio->read_disk].rdev);
1564
1565 bio = r1_bio->bios[r1_bio->read_disk];
1566 if ((disk=read_balance(conf, r1_bio)) == -1) {
1567 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1568 " read error for block %llu\n",
1569 mdname(mddev),
1570 bdevname(bio->bi_bdev,b),
1571 (unsigned long long)r1_bio->sector);
1572 raid_end_bio_io(r1_bio);
1573 } else {
1574 const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
1575 r1_bio->bios[r1_bio->read_disk] =
1576 mddev->ro ? IO_BLOCKED : NULL;
1577 r1_bio->read_disk = disk;
1578 bio_put(bio);
1579 bio = bio_clone_mddev(r1_bio->master_bio,
1580 GFP_NOIO, mddev);
1581 r1_bio->bios[r1_bio->read_disk] = bio;
1582 rdev = conf->mirrors[disk].rdev;
1583 if (printk_ratelimit())
1584 printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
1585 " other mirror: %s\n",
1586 mdname(mddev),
1587 (unsigned long long)r1_bio->sector,
1588 bdevname(rdev->bdev,b));
1589 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1590 bio->bi_bdev = rdev->bdev;
1591 bio->bi_end_io = raid1_end_read_request;
1592 bio->bi_rw = READ | do_sync;
1593 bio->bi_private = r1_bio;
1594 generic_make_request(bio);
1595 }
1596 }
1597 cond_resched(); 2051 cond_resched();
2052 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2053 md_check_recovery(mddev);
1598 } 2054 }
1599 blk_finish_plug(&plug); 2055 blk_finish_plug(&plug);
1600} 2056}
@@ -1636,6 +2092,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1636 int write_targets = 0, read_targets = 0; 2092 int write_targets = 0, read_targets = 0;
1637 sector_t sync_blocks; 2093 sector_t sync_blocks;
1638 int still_degraded = 0; 2094 int still_degraded = 0;
2095 int good_sectors = RESYNC_SECTORS;
2096 int min_bad = 0; /* number of sectors that are bad in all devices */
1639 2097
1640 if (!conf->r1buf_pool) 2098 if (!conf->r1buf_pool)
1641 if (init_resync(conf)) 2099 if (init_resync(conf))
@@ -1723,36 +2181,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1723 2181
1724 rdev = rcu_dereference(conf->mirrors[i].rdev); 2182 rdev = rcu_dereference(conf->mirrors[i].rdev);
1725 if (rdev == NULL || 2183 if (rdev == NULL ||
1726 test_bit(Faulty, &rdev->flags)) { 2184 test_bit(Faulty, &rdev->flags)) {
1727 still_degraded = 1; 2185 still_degraded = 1;
1728 continue;
1729 } else if (!test_bit(In_sync, &rdev->flags)) { 2186 } else if (!test_bit(In_sync, &rdev->flags)) {
1730 bio->bi_rw = WRITE; 2187 bio->bi_rw = WRITE;
1731 bio->bi_end_io = end_sync_write; 2188 bio->bi_end_io = end_sync_write;
1732 write_targets ++; 2189 write_targets ++;
1733 } else { 2190 } else {
1734 /* may need to read from here */ 2191 /* may need to read from here */
1735 bio->bi_rw = READ; 2192 sector_t first_bad = MaxSector;
1736 bio->bi_end_io = end_sync_read; 2193 int bad_sectors;
1737 if (test_bit(WriteMostly, &rdev->flags)) { 2194
1738 if (wonly < 0) 2195 if (is_badblock(rdev, sector_nr, good_sectors,
1739 wonly = i; 2196 &first_bad, &bad_sectors)) {
1740 } else { 2197 if (first_bad > sector_nr)
1741 if (disk < 0) 2198 good_sectors = first_bad - sector_nr;
1742 disk = i; 2199 else {
2200 bad_sectors -= (sector_nr - first_bad);
2201 if (min_bad == 0 ||
2202 min_bad > bad_sectors)
2203 min_bad = bad_sectors;
2204 }
2205 }
2206 if (sector_nr < first_bad) {
2207 if (test_bit(WriteMostly, &rdev->flags)) {
2208 if (wonly < 0)
2209 wonly = i;
2210 } else {
2211 if (disk < 0)
2212 disk = i;
2213 }
2214 bio->bi_rw = READ;
2215 bio->bi_end_io = end_sync_read;
2216 read_targets++;
1743 } 2217 }
1744 read_targets++;
1745 } 2218 }
1746 atomic_inc(&rdev->nr_pending); 2219 if (bio->bi_end_io) {
1747 bio->bi_sector = sector_nr + rdev->data_offset; 2220 atomic_inc(&rdev->nr_pending);
1748 bio->bi_bdev = rdev->bdev; 2221 bio->bi_sector = sector_nr + rdev->data_offset;
1749 bio->bi_private = r1_bio; 2222 bio->bi_bdev = rdev->bdev;
2223 bio->bi_private = r1_bio;
2224 }
1750 } 2225 }
1751 rcu_read_unlock(); 2226 rcu_read_unlock();
1752 if (disk < 0) 2227 if (disk < 0)
1753 disk = wonly; 2228 disk = wonly;
1754 r1_bio->read_disk = disk; 2229 r1_bio->read_disk = disk;
1755 2230
2231 if (read_targets == 0 && min_bad > 0) {
2232 /* These sectors are bad on all InSync devices, so we
2233 * need to mark them bad on all write targets
2234 */
2235 int ok = 1;
2236 for (i = 0 ; i < conf->raid_disks ; i++)
2237 if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2238 mdk_rdev_t *rdev =
2239 rcu_dereference(conf->mirrors[i].rdev);
2240 ok = rdev_set_badblocks(rdev, sector_nr,
2241 min_bad, 0
2242 ) && ok;
2243 }
2244 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2245 *skipped = 1;
2246 put_buf(r1_bio);
2247
2248 if (!ok) {
2249 /* Cannot record the badblocks, so need to
2250 * abort the resync.
2251 * If there are multiple read targets, could just
2252 * fail the really bad ones ???
2253 */
2254 conf->recovery_disabled = mddev->recovery_disabled;
2255 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2256 return 0;
2257 } else
2258 return min_bad;
2259
2260 }
2261 if (min_bad > 0 && min_bad < good_sectors) {
2262 /* only resync enough to reach the next bad->good
2263 * transition */
2264 good_sectors = min_bad;
2265 }
2266
1756 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) 2267 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1757 /* extra read targets are also write targets */ 2268 /* extra read targets are also write targets */
1758 write_targets += read_targets-1; 2269 write_targets += read_targets-1;
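
As a rough illustration of the good_sectors/min_bad bookkeeping in the hunk above: for each in-sync device, a bad range either shortens the readable resync window or, when it already covers sector_nr, contributes to min_bad, the number of sectors that are bad on every read candidate. The stand-alone sketch below uses hypothetical types and names and is not the kernel code.

/* Sketch of how a resync window is clamped by per-device bad ranges.
 * first_bad/bad_len describe one bad range per device; a device with
 * no nearby bad range uses first_bad = (sector_t)-1.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

struct dev_bad {
	sector_t first_bad;	/* start of bad range, or -1 for none */
	int bad_len;		/* length of that bad range */
};

static int clamp_window(sector_t sector_nr, int window,
			const struct dev_bad *devs, int ndevs, int *min_bad_out)
{
	int good_sectors = window;
	int min_bad = 0;
	int i;

	for (i = 0; i < ndevs; i++) {
		sector_t first_bad = devs[i].first_bad;
		int bad = devs[i].bad_len;

		if (first_bad == (sector_t)-1 ||
		    first_bad >= sector_nr + good_sectors)
			continue;		/* no overlap with the window */
		if (first_bad > sector_nr) {
			/* bad range starts later: shrink the window */
			good_sectors = first_bad - sector_nr;
		} else {
			/* window starts inside a bad range on this device */
			bad -= sector_nr - first_bad;
			if (min_bad == 0 || min_bad > bad)
				min_bad = bad;
		}
	}
	*min_bad_out = min_bad;
	return good_sectors;
}

int main(void)
{
	struct dev_bad devs[2] = {
		{ .first_bad = 1040, .bad_len = 16 },
		{ .first_bad = (sector_t)-1, .bad_len = 0 },
	};
	int min_bad;
	int good = clamp_window(1024, 128, devs, 2, &min_bad);

	printf("good_sectors=%d min_bad=%d\n", good, min_bad);	/* 16, 0 */
	return 0;
}

When min_bad is non-zero and no device can be read, the patch marks those sectors bad on the write targets and skips them rather than failing the whole array.
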
@@ -1769,6 +2280,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1769 2280
1770 if (max_sector > mddev->resync_max) 2281 if (max_sector > mddev->resync_max)
1771 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 2282 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2283 if (max_sector > sector_nr + good_sectors)
2284 max_sector = sector_nr + good_sectors;
1772 nr_sectors = 0; 2285 nr_sectors = 0;
1773 sync_blocks = 0; 2286 sync_blocks = 0;
1774 do { 2287 do {
@@ -2154,18 +2667,13 @@ static int raid1_reshape(mddev_t *mddev)
2154 for (d = d2 = 0; d < conf->raid_disks; d++) { 2667 for (d = d2 = 0; d < conf->raid_disks; d++) {
2155 mdk_rdev_t *rdev = conf->mirrors[d].rdev; 2668 mdk_rdev_t *rdev = conf->mirrors[d].rdev;
2156 if (rdev && rdev->raid_disk != d2) { 2669 if (rdev && rdev->raid_disk != d2) {
2157 char nm[20]; 2670 sysfs_unlink_rdev(mddev, rdev);
2158 sprintf(nm, "rd%d", rdev->raid_disk);
2159 sysfs_remove_link(&mddev->kobj, nm);
2160 rdev->raid_disk = d2; 2671 rdev->raid_disk = d2;
2161 sprintf(nm, "rd%d", rdev->raid_disk); 2672 sysfs_unlink_rdev(mddev, rdev);
2162 sysfs_remove_link(&mddev->kobj, nm); 2673 if (sysfs_link_rdev(mddev, rdev))
2163 if (sysfs_create_link(&mddev->kobj,
2164 &rdev->kobj, nm))
2165 printk(KERN_WARNING 2674 printk(KERN_WARNING
2166 "md/raid1:%s: cannot register " 2675 "md/raid1:%s: cannot register rd%d\n",
2167 "%s\n", 2676 mdname(mddev), rdev->raid_disk);
2168 mdname(mddev), nm);
2169 } 2677 }
2170 if (rdev) 2678 if (rdev)
2171 newmirrors[d2++].rdev = rdev; 2679 newmirrors[d2++].rdev = rdev;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e743a64fac4f..e0d676b48974 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -48,6 +48,12 @@ struct r1_private_data_s {
48 * (fresh device added). 48 * (fresh device added).
49 * Cleared when a sync completes. 49 * Cleared when a sync completes.
50 */ 50 */
51 int recovery_disabled; /* when the same as
52 * mddev->recovery_disabled
53 * we don't allow recovery
54 * to be attempted as we
55 * expect a read error
56 */
51 57
52 wait_queue_head_t wait_barrier; 58 wait_queue_head_t wait_barrier;
53 59
@@ -95,7 +101,7 @@ struct r1bio_s {
95 101
96 struct list_head retry_list; 102 struct list_head retry_list;
97 /* Next two are only valid when R1BIO_BehindIO is set */ 103 /* Next two are only valid when R1BIO_BehindIO is set */
98 struct page **behind_pages; 104 struct bio_vec *behind_bvecs;
99 int behind_page_count; 105 int behind_page_count;
100 /* 106 /*
101 * if the IO is in WRITE direction, then multiple bios are used. 107 * if the IO is in WRITE direction, then multiple bios are used.
@@ -110,13 +116,24 @@ struct r1bio_s {
110 * correct the read error. To keep track of bad blocks on a per-bio 116 * correct the read error. To keep track of bad blocks on a per-bio
111 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 117 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
112 */ 118 */
113#define IO_BLOCKED ((struct bio*)1) 119#define IO_BLOCKED ((struct bio *)1)
120/* When we successfully write to a known bad-block, we need to remove the
121 * bad-block marking which must be done from process context. So we record
122 * the success by setting bios[n] to IO_MADE_GOOD
123 */
124#define IO_MADE_GOOD ((struct bio *)2)
125
126#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
114 127
115/* bits for r1bio.state */ 128/* bits for r1bio.state */
116#define R1BIO_Uptodate 0 129#define R1BIO_Uptodate 0
117#define R1BIO_IsSync 1 130#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 131#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 132#define R1BIO_BehindIO 3
133/* Set ReadError on bios that experience a readerror so that
134 * raid1d knows what to do with them.
135 */
136#define R1BIO_ReadError 4
120/* For write-behind requests, we call bi_end_io when 137/* For write-behind requests, we call bi_end_io when
121 * the last non-write-behind device completes, providing 138 * the last non-write-behind device completes, providing
122 * any write was successful. Otherwise we call when 139 * any write was successful. Otherwise we call when
@@ -125,6 +142,11 @@ struct r1bio_s {
125 * Record that bi_end_io was called with this flag... 142 * Record that bi_end_io was called with this flag...
126 */ 143 */
127#define R1BIO_Returned 6 144#define R1BIO_Returned 6
145/* If a write for this request means we can clear some
146 * known-bad-block records, we set this flag
147 */
148#define R1BIO_MadeGood 7
149#define R1BIO_WriteError 8
128 150
129extern int md_raid1_congested(mddev_t *mddev, int bits); 151extern int md_raid1_congested(mddev_t *mddev, int bits);
130 152
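
The IO_BLOCKED / IO_MADE_GOOD definitions above overload a struct bio pointer slot with two small sentinel values, and BIO_SPECIAL() must catch both (NULL is handled separately by callers). A minimal user-space sketch of the same pointer-tagging trick, with hypothetical names and a fake struct bio:

/* Sketch of tagging a pointer slot with small integer sentinels, as the
 * raid1 bios[] array does: 1 and 2 can never be valid pointers, so they
 * can carry per-slot state without extra storage.
 */
#include <stdio.h>
#include <stdlib.h>

struct bio { int dummy; };

#define IO_BLOCKED   ((struct bio *)1)	/* slot deliberately skipped */
#define IO_MADE_GOOD ((struct bio *)2)	/* write fixed a known bad block */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void put_slot(struct bio *b)
{
	/* only genuine bios may be freed; sentinels and NULL are not */
	if (b && !BIO_SPECIAL(b))
		free(b);
}

int main(void)
{
	struct bio *slots[3] = { malloc(sizeof(struct bio)), IO_BLOCKED, IO_MADE_GOOD };
	int i;

	for (i = 0; i < 3; i++) {
		printf("slot %d: %s\n", i,
		       BIO_SPECIAL(slots[i]) ? "sentinel" : "real bio");
		put_slot(slots[i]);
	}
	return 0;
}
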
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e846688962f..8b29cd4f01c8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/seq_file.h> 24#include <linux/seq_file.h>
25#include <linux/ratelimit.h>
25#include "md.h" 26#include "md.h"
26#include "raid10.h" 27#include "raid10.h"
27#include "raid0.h" 28#include "raid0.h"
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 for (j = 0 ; j < nalloc; j++) { 124 for (j = 0 ; j < nalloc; j++) {
124 bio = r10_bio->devs[j].bio; 125 bio = r10_bio->devs[j].bio;
125 for (i = 0; i < RESYNC_PAGES; i++) { 126 for (i = 0; i < RESYNC_PAGES; i++) {
126 page = alloc_page(gfp_flags); 127 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
128 &conf->mddev->recovery)) {
129 /* we can share bv_page's during recovery */
130 struct bio *rbio = r10_bio->devs[0].bio;
131 page = rbio->bi_io_vec[i].bv_page;
132 get_page(page);
133 } else
134 page = alloc_page(gfp_flags);
127 if (unlikely(!page)) 135 if (unlikely(!page))
128 goto out_free_pages; 136 goto out_free_pages;
129 137
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
173 181
174 for (i = 0; i < conf->copies; i++) { 182 for (i = 0; i < conf->copies; i++) {
175 struct bio **bio = & r10_bio->devs[i].bio; 183 struct bio **bio = & r10_bio->devs[i].bio;
176 if (*bio && *bio != IO_BLOCKED) 184 if (!BIO_SPECIAL(*bio))
177 bio_put(*bio); 185 bio_put(*bio);
178 *bio = NULL; 186 *bio = NULL;
179 } 187 }
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
183{ 191{
184 conf_t *conf = r10_bio->mddev->private; 192 conf_t *conf = r10_bio->mddev->private;
185 193
186 /*
187 * Wake up any possible resync thread that waits for the device
188 * to go idle.
189 */
190 allow_barrier(conf);
191
192 put_all_bios(conf, r10_bio); 194 put_all_bios(conf, r10_bio);
193 mempool_free(r10_bio, conf->r10bio_pool); 195 mempool_free(r10_bio, conf->r10bio_pool);
194} 196}
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
227static void raid_end_bio_io(r10bio_t *r10_bio) 229static void raid_end_bio_io(r10bio_t *r10_bio)
228{ 230{
229 struct bio *bio = r10_bio->master_bio; 231 struct bio *bio = r10_bio->master_bio;
232 int done;
233 conf_t *conf = r10_bio->mddev->private;
230 234
231 bio_endio(bio, 235 if (bio->bi_phys_segments) {
232 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); 236 unsigned long flags;
237 spin_lock_irqsave(&conf->device_lock, flags);
238 bio->bi_phys_segments--;
239 done = (bio->bi_phys_segments == 0);
240 spin_unlock_irqrestore(&conf->device_lock, flags);
241 } else
242 done = 1;
243 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
244 clear_bit(BIO_UPTODATE, &bio->bi_flags);
245 if (done) {
246 bio_endio(bio, 0);
247 /*
248 * Wake up any possible resync thread that waits for the device
249 * to go idle.
250 */
251 allow_barrier(conf);
252 }
233 free_r10bio(r10_bio); 253 free_r10bio(r10_bio);
234} 254}
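
The bi_phys_segments handling introduced above is a reference count in disguise: when a request has to be split into several r10_bios, the count is raised, and only the sub-request that drops it to zero completes the original bio. A small stand-alone sketch of that pattern follows (hypothetical names; the kernel additionally protects the counter with conf->device_lock, which is only noted in a comment here).

/* Sketch of the "count the pieces, finish on the last one" pattern used
 * via bio->bi_phys_segments.  A count of 0 means "never split".
 */
#include <stdio.h>

struct parent_req {
	int pieces;		/* 0 means the request was never split */
	int done;
};

static void split_once(struct parent_req *p)
{
	/* first split accounts for both halves, later splits add one
	 * (the real code does this under a spinlock) */
	if (p->pieces == 0)
		p->pieces = 2;
	else
		p->pieces++;
}

static void piece_finished(struct parent_req *p)
{
	int last;

	if (p->pieces) {
		p->pieces--;
		last = (p->pieces == 0);
	} else
		last = 1;	/* never split: this completion is the only one */

	if (last) {
		p->done = 1;
		printf("parent request completed\n");
	}
}

int main(void)
{
	struct parent_req p = { 0, 0 };

	split_once(&p);		/* request broken into 2 pieces */
	split_once(&p);		/* ...and then into 3 */
	piece_finished(&p);
	piece_finished(&p);
	piece_finished(&p);	/* last piece ends the parent */
	return 0;
}
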
235 255
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
244 r10_bio->devs[slot].addr + (r10_bio->sectors); 264 r10_bio->devs[slot].addr + (r10_bio->sectors);
245} 265}
246 266
267/*
268 * Find the disk number which triggered the given bio
269 */
270static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
271 struct bio *bio, int *slotp)
272{
273 int slot;
274
275 for (slot = 0; slot < conf->copies; slot++)
276 if (r10_bio->devs[slot].bio == bio)
277 break;
278
279 BUG_ON(slot == conf->copies);
280 update_head_pos(slot, r10_bio);
281
282 if (slotp)
283 *slotp = slot;
284 return r10_bio->devs[slot].devnum;
285}
286
247static void raid10_end_read_request(struct bio *bio, int error) 287static void raid10_end_read_request(struct bio *bio, int error)
248{ 288{
249 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 289 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -277,34 +317,45 @@ static void raid10_end_read_request(struct bio *bio, int error)
277 * oops, read error - keep the refcount on the rdev 317 * oops, read error - keep the refcount on the rdev
278 */ 318 */
279 char b[BDEVNAME_SIZE]; 319 char b[BDEVNAME_SIZE];
280 if (printk_ratelimit()) 320 printk_ratelimited(KERN_ERR
281 printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", 321 "md/raid10:%s: %s: rescheduling sector %llu\n",
282 mdname(conf->mddev), 322 mdname(conf->mddev),
283 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 323 bdevname(conf->mirrors[dev].rdev->bdev, b),
324 (unsigned long long)r10_bio->sector);
325 set_bit(R10BIO_ReadError, &r10_bio->state);
284 reschedule_retry(r10_bio); 326 reschedule_retry(r10_bio);
285 } 327 }
286} 328}
287 329
330static void close_write(r10bio_t *r10_bio)
331{
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334 r10_bio->sectors,
335 !test_bit(R10BIO_Degraded, &r10_bio->state),
336 0);
337 md_write_end(r10_bio->mddev);
338}
339
288static void raid10_end_write_request(struct bio *bio, int error) 340static void raid10_end_write_request(struct bio *bio, int error)
289{ 341{
290 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 342 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
291 r10bio_t *r10_bio = bio->bi_private; 343 r10bio_t *r10_bio = bio->bi_private;
292 int slot, dev; 344 int dev;
345 int dec_rdev = 1;
293 conf_t *conf = r10_bio->mddev->private; 346 conf_t *conf = r10_bio->mddev->private;
347 int slot;
294 348
295 for (slot = 0; slot < conf->copies; slot++) 349 dev = find_bio_disk(conf, r10_bio, bio, &slot);
296 if (r10_bio->devs[slot].bio == bio)
297 break;
298 dev = r10_bio->devs[slot].devnum;
299 350
300 /* 351 /*
301 * this branch is our 'one mirror IO has finished' event handler: 352 * this branch is our 'one mirror IO has finished' event handler:
302 */ 353 */
303 if (!uptodate) { 354 if (!uptodate) {
304 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 355 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
305 /* an I/O failed, we can't clear the bitmap */ 356 set_bit(R10BIO_WriteError, &r10_bio->state);
306 set_bit(R10BIO_Degraded, &r10_bio->state); 357 dec_rdev = 0;
307 } else 358 } else {
308 /* 359 /*
309 * Set R10BIO_Uptodate in our master bio, so that 360 * Set R10BIO_Uptodate in our master bio, so that
310 * we will return a good error code for to the higher 361 * we will return a good error code for to the higher
@@ -314,9 +365,22 @@ static void raid10_end_write_request(struct bio *bio, int error)
314 * user-side. So if something waits for IO, then it will 365 * user-side. So if something waits for IO, then it will
315 * wait for the 'master' bio. 366 * wait for the 'master' bio.
316 */ 367 */
368 sector_t first_bad;
369 int bad_sectors;
370
317 set_bit(R10BIO_Uptodate, &r10_bio->state); 371 set_bit(R10BIO_Uptodate, &r10_bio->state);
318 372
319 update_head_pos(slot, r10_bio); 373 /* Maybe we can clear some bad blocks. */
374 if (is_badblock(conf->mirrors[dev].rdev,
375 r10_bio->devs[slot].addr,
376 r10_bio->sectors,
377 &first_bad, &bad_sectors)) {
378 bio_put(bio);
379 r10_bio->devs[slot].bio = IO_MADE_GOOD;
380 dec_rdev = 0;
381 set_bit(R10BIO_MadeGood, &r10_bio->state);
382 }
383 }
320 384
321 /* 385 /*
322 * 386 *
@@ -324,16 +388,18 @@ static void raid10_end_write_request(struct bio *bio, int error)
324 * already. 388 * already.
325 */ 389 */
326 if (atomic_dec_and_test(&r10_bio->remaining)) { 390 if (atomic_dec_and_test(&r10_bio->remaining)) {
327 /* clear the bitmap if all writes complete successfully */ 391 if (test_bit(R10BIO_WriteError, &r10_bio->state))
328 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 392 reschedule_retry(r10_bio);
329 r10_bio->sectors, 393 else {
330 !test_bit(R10BIO_Degraded, &r10_bio->state), 394 close_write(r10_bio);
331 0); 395 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
332 md_write_end(r10_bio->mddev); 396 reschedule_retry(r10_bio);
333 raid_end_bio_io(r10_bio); 397 else
398 raid_end_bio_io(r10_bio);
399 }
334 } 400 }
335 401 if (dec_rdev)
336 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); 402 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
337} 403}
338 404
339 405
@@ -484,11 +550,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
484 * FIXME: possibly should rethink readbalancing and do it differently 550 * FIXME: possibly should rethink readbalancing and do it differently
485 * depending on near_copies / far_copies geometry. 551 * depending on near_copies / far_copies geometry.
486 */ 552 */
487static int read_balance(conf_t *conf, r10bio_t *r10_bio) 553static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
488{ 554{
489 const sector_t this_sector = r10_bio->sector; 555 const sector_t this_sector = r10_bio->sector;
490 int disk, slot; 556 int disk, slot;
491 const int sectors = r10_bio->sectors; 557 int sectors = r10_bio->sectors;
558 int best_good_sectors;
492 sector_t new_distance, best_dist; 559 sector_t new_distance, best_dist;
493 mdk_rdev_t *rdev; 560 mdk_rdev_t *rdev;
494 int do_balance; 561 int do_balance;
@@ -497,8 +564,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
497 raid10_find_phys(conf, r10_bio); 564 raid10_find_phys(conf, r10_bio);
498 rcu_read_lock(); 565 rcu_read_lock();
499retry: 566retry:
567 sectors = r10_bio->sectors;
500 best_slot = -1; 568 best_slot = -1;
501 best_dist = MaxSector; 569 best_dist = MaxSector;
570 best_good_sectors = 0;
502 do_balance = 1; 571 do_balance = 1;
503 /* 572 /*
504 * Check if we can balance. We can balance on the whole 573 * Check if we can balance. We can balance on the whole
@@ -511,6 +580,10 @@ retry:
511 do_balance = 0; 580 do_balance = 0;
512 581
513 for (slot = 0; slot < conf->copies ; slot++) { 582 for (slot = 0; slot < conf->copies ; slot++) {
583 sector_t first_bad;
584 int bad_sectors;
585 sector_t dev_sector;
586
514 if (r10_bio->devs[slot].bio == IO_BLOCKED) 587 if (r10_bio->devs[slot].bio == IO_BLOCKED)
515 continue; 588 continue;
516 disk = r10_bio->devs[slot].devnum; 589 disk = r10_bio->devs[slot].devnum;
@@ -520,6 +593,37 @@ retry:
520 if (!test_bit(In_sync, &rdev->flags)) 593 if (!test_bit(In_sync, &rdev->flags))
521 continue; 594 continue;
522 595
596 dev_sector = r10_bio->devs[slot].addr;
597 if (is_badblock(rdev, dev_sector, sectors,
598 &first_bad, &bad_sectors)) {
599 if (best_dist < MaxSector)
600 /* Already have a better slot */
601 continue;
602 if (first_bad <= dev_sector) {
603 /* Cannot read here. If this is the
604 * 'primary' device, then we must not read
605 * beyond 'bad_sectors' from another device.
606 */
607 bad_sectors -= (dev_sector - first_bad);
608 if (!do_balance && sectors > bad_sectors)
609 sectors = bad_sectors;
610 if (best_good_sectors > sectors)
611 best_good_sectors = sectors;
612 } else {
613 sector_t good_sectors =
614 first_bad - dev_sector;
615 if (good_sectors > best_good_sectors) {
616 best_good_sectors = good_sectors;
617 best_slot = slot;
618 }
619 if (!do_balance)
620 /* Must read from here */
621 break;
622 }
623 continue;
624 } else
625 best_good_sectors = sectors;
626
523 if (!do_balance) 627 if (!do_balance)
524 break; 628 break;
525 629
@@ -561,6 +665,7 @@ retry:
561 } else 665 } else
562 disk = -1; 666 disk = -1;
563 rcu_read_unlock(); 667 rcu_read_unlock();
668 *max_sectors = best_good_sectors;
564 669
565 return disk; 670 return disk;
566} 671}
@@ -734,6 +839,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
734 unsigned long flags; 839 unsigned long flags;
735 mdk_rdev_t *blocked_rdev; 840 mdk_rdev_t *blocked_rdev;
736 int plugged; 841 int plugged;
842 int sectors_handled;
843 int max_sectors;
737 844
738 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 845 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
739 md_flush_request(mddev, bio); 846 md_flush_request(mddev, bio);
@@ -808,12 +915,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
808 r10_bio->sector = bio->bi_sector; 915 r10_bio->sector = bio->bi_sector;
809 r10_bio->state = 0; 916 r10_bio->state = 0;
810 917
918 /* We might need to issue multiple reads to different
919 * devices if there are bad blocks around, so we keep
920 * track of the number of reads in bio->bi_phys_segments.
921 * If this is 0, there is only one r10_bio and no locking
922 * will be needed when the request completes. If it is
923 * non-zero, then it is the number of not-completed requests.
924 */
925 bio->bi_phys_segments = 0;
926 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
927
811 if (rw == READ) { 928 if (rw == READ) {
812 /* 929 /*
813 * read balancing logic: 930 * read balancing logic:
814 */ 931 */
815 int disk = read_balance(conf, r10_bio); 932 int disk;
816 int slot = r10_bio->read_slot; 933 int slot;
934
935read_again:
936 disk = read_balance(conf, r10_bio, &max_sectors);
937 slot = r10_bio->read_slot;
817 if (disk < 0) { 938 if (disk < 0) {
818 raid_end_bio_io(r10_bio); 939 raid_end_bio_io(r10_bio);
819 return 0; 940 return 0;
@@ -821,6 +942,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 mirror = conf->mirrors + disk; 942 mirror = conf->mirrors + disk;
822 943
823 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 944 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
945 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
946 max_sectors);
824 947
825 r10_bio->devs[slot].bio = read_bio; 948 r10_bio->devs[slot].bio = read_bio;
826 949
@@ -831,7 +954,37 @@ static int make_request(mddev_t *mddev, struct bio * bio)
831 read_bio->bi_rw = READ | do_sync; 954 read_bio->bi_rw = READ | do_sync;
832 read_bio->bi_private = r10_bio; 955 read_bio->bi_private = r10_bio;
833 956
834 generic_make_request(read_bio); 957 if (max_sectors < r10_bio->sectors) {
958 /* Could not read all from this device, so we will
959 * need another r10_bio.
960 */
961 sectors_handled = (r10_bio->sectors + max_sectors
962 - bio->bi_sector);
963 r10_bio->sectors = max_sectors;
964 spin_lock_irq(&conf->device_lock);
965 if (bio->bi_phys_segments == 0)
966 bio->bi_phys_segments = 2;
967 else
968 bio->bi_phys_segments++;
 969			spin_unlock_irq(&conf->device_lock);
970 /* Cannot call generic_make_request directly
971 * as that will be queued in __generic_make_request
972 * and subsequent mempool_alloc might block
973 * waiting for it. so hand bio over to raid10d.
974 */
975 reschedule_retry(r10_bio);
976
977 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
978
979 r10_bio->master_bio = bio;
980 r10_bio->sectors = ((bio->bi_size >> 9)
981 - sectors_handled);
982 r10_bio->state = 0;
983 r10_bio->mddev = mddev;
984 r10_bio->sector = bio->bi_sector + sectors_handled;
985 goto read_again;
986 } else
987 generic_make_request(read_bio);
835 return 0; 988 return 0;
836 } 989 }
837 990
@@ -841,13 +994,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
841 /* first select target devices under rcu_lock and 994 /* first select target devices under rcu_lock and
842 * inc refcount on their rdev. Record them by setting 995 * inc refcount on their rdev. Record them by setting
843 * bios[x] to bio 996 * bios[x] to bio
997 * If there are known/acknowledged bad blocks on any device
998 * on which we have seen a write error, we want to avoid
999 * writing to those blocks. This potentially requires several
1000 * writes to write around the bad blocks. Each set of writes
1001 * gets its own r10_bio with a set of bios attached. The number
1002 * of r10_bios is recored in bio->bi_phys_segments just as with
1002	 * of r10_bios is recorded in bio->bi_phys_segments just as with
844 */ 1004 */
845 plugged = mddev_check_plugged(mddev); 1005 plugged = mddev_check_plugged(mddev);
846 1006
847 raid10_find_phys(conf, r10_bio); 1007 raid10_find_phys(conf, r10_bio);
848 retry_write: 1008retry_write:
849 blocked_rdev = NULL; 1009 blocked_rdev = NULL;
850 rcu_read_lock(); 1010 rcu_read_lock();
1011 max_sectors = r10_bio->sectors;
1012
851 for (i = 0; i < conf->copies; i++) { 1013 for (i = 0; i < conf->copies; i++) {
852 int d = r10_bio->devs[i].devnum; 1014 int d = r10_bio->devs[i].devnum;
853 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); 1015 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -856,13 +1018,55 @@ static int make_request(mddev_t *mddev, struct bio * bio)
856 blocked_rdev = rdev; 1018 blocked_rdev = rdev;
857 break; 1019 break;
858 } 1020 }
859 if (rdev && !test_bit(Faulty, &rdev->flags)) { 1021 r10_bio->devs[i].bio = NULL;
860 atomic_inc(&rdev->nr_pending); 1022 if (!rdev || test_bit(Faulty, &rdev->flags)) {
861 r10_bio->devs[i].bio = bio;
862 } else {
863 r10_bio->devs[i].bio = NULL;
864 set_bit(R10BIO_Degraded, &r10_bio->state); 1023 set_bit(R10BIO_Degraded, &r10_bio->state);
1024 continue;
865 } 1025 }
1026 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1027 sector_t first_bad;
1028 sector_t dev_sector = r10_bio->devs[i].addr;
1029 int bad_sectors;
1030 int is_bad;
1031
1032 is_bad = is_badblock(rdev, dev_sector,
1033 max_sectors,
1034 &first_bad, &bad_sectors);
1035 if (is_bad < 0) {
1036 /* Mustn't write here until the bad block
1037 * is acknowledged
1038 */
1039 atomic_inc(&rdev->nr_pending);
1040 set_bit(BlockedBadBlocks, &rdev->flags);
1041 blocked_rdev = rdev;
1042 break;
1043 }
1044 if (is_bad && first_bad <= dev_sector) {
1045 /* Cannot write here at all */
1046 bad_sectors -= (dev_sector - first_bad);
1047 if (bad_sectors < max_sectors)
1048 /* Mustn't write more than bad_sectors
1049 * to other devices yet
1050 */
1051 max_sectors = bad_sectors;
1052 /* We don't set R10BIO_Degraded as that
1053 * only applies if the disk is missing,
1054 * so it might be re-added, and we want to
1055 * know to recover this chunk.
1056 * In this case the device is here, and the
1057 * fact that this chunk is not in-sync is
1058 * recorded in the bad block log.
1059 */
1060 continue;
1061 }
1062 if (is_bad) {
1063 int good_sectors = first_bad - dev_sector;
1064 if (good_sectors < max_sectors)
1065 max_sectors = good_sectors;
1066 }
1067 }
1068 r10_bio->devs[i].bio = bio;
1069 atomic_inc(&rdev->nr_pending);
866 } 1070 }
867 rcu_read_unlock(); 1071 rcu_read_unlock();
868 1072
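
The loop above makes a three-way decision per device: an unacknowledged bad block forces the whole request to wait, a bad range that already covers the target start means this device is skipped while the other writes are limited to the end of that range, and a bad range further ahead merely shortens how much can be written in this pass. A compact stand-alone sketch of that decision, with invented names:

/* Sketch of the per-device decision raid10's write path makes against a
 * known bad range.  is_bad < 0 models an unacknowledged bad block.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

enum write_action { WRITE_WAIT, WRITE_SKIP_DEV, WRITE_OK };

static enum write_action check_dev(int is_bad, sector_t dev_sector,
				   sector_t first_bad, int bad_sectors,
				   int *max_sectors)
{
	if (is_bad < 0)
		return WRITE_WAIT;	/* wait until the bad block is acknowledged */
	if (is_bad && first_bad <= dev_sector) {
		/* cannot write here at all; other devices must not run
		 * ahead of the end of this bad range */
		int remaining = bad_sectors - (int)(dev_sector - first_bad);
		if (remaining < *max_sectors)
			*max_sectors = remaining;
		return WRITE_SKIP_DEV;
	}
	if (is_bad) {
		/* bad range starts later: only write up to it */
		int good = (int)(first_bad - dev_sector);
		if (good < *max_sectors)
			*max_sectors = good;
	}
	return WRITE_OK;
}

int main(void)
{
	int max_sectors = 64;
	enum write_action a = check_dev(1, 1000, 1024, 16, &max_sectors);

	printf("action=%d max_sectors=%d\n", (int)a, max_sectors);	/* OK, 24 sectors */
	return 0;
}
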
@@ -882,8 +1086,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
882 goto retry_write; 1086 goto retry_write;
883 } 1087 }
884 1088
1089 if (max_sectors < r10_bio->sectors) {
1090 /* We are splitting this into multiple parts, so
1091 * we need to prepare for allocating another r10_bio.
1092 */
1093 r10_bio->sectors = max_sectors;
1094 spin_lock_irq(&conf->device_lock);
1095 if (bio->bi_phys_segments == 0)
1096 bio->bi_phys_segments = 2;
1097 else
1098 bio->bi_phys_segments++;
1099 spin_unlock_irq(&conf->device_lock);
1100 }
1101 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1102
885 atomic_set(&r10_bio->remaining, 1); 1103 atomic_set(&r10_bio->remaining, 1);
886 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); 1104 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
887 1105
888 for (i = 0; i < conf->copies; i++) { 1106 for (i = 0; i < conf->copies; i++) {
889 struct bio *mbio; 1107 struct bio *mbio;
@@ -892,10 +1110,12 @@ static int make_request(mddev_t *mddev, struct bio * bio)
892 continue; 1110 continue;
893 1111
894 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1112 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1113 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1114 max_sectors);
895 r10_bio->devs[i].bio = mbio; 1115 r10_bio->devs[i].bio = mbio;
896 1116
897 mbio->bi_sector = r10_bio->devs[i].addr+ 1117 mbio->bi_sector = (r10_bio->devs[i].addr+
898 conf->mirrors[d].rdev->data_offset; 1118 conf->mirrors[d].rdev->data_offset);
899 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1119 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
900 mbio->bi_end_io = raid10_end_write_request; 1120 mbio->bi_end_io = raid10_end_write_request;
901 mbio->bi_rw = WRITE | do_sync | do_fua; 1121 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -920,6 +1140,21 @@ static int make_request(mddev_t *mddev, struct bio * bio)
920 /* In case raid10d snuck in to freeze_array */ 1140 /* In case raid10d snuck in to freeze_array */
921 wake_up(&conf->wait_barrier); 1141 wake_up(&conf->wait_barrier);
922 1142
1143 if (sectors_handled < (bio->bi_size >> 9)) {
1144 /* We need another r10_bio. It has already been counted
1145 * in bio->bi_phys_segments.
1146 */
1147 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1148
1149 r10_bio->master_bio = bio;
1150 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1151
1152 r10_bio->mddev = mddev;
1153 r10_bio->sector = bio->bi_sector + sectors_handled;
1154 r10_bio->state = 0;
1155 goto retry_write;
1156 }
1157
923 if (do_sync || !mddev->bitmap || !plugged) 1158 if (do_sync || !mddev->bitmap || !plugged)
924 md_wakeup_thread(mddev->thread); 1159 md_wakeup_thread(mddev->thread);
925 return 0; 1160 return 0;
@@ -949,6 +1184,30 @@ static void status(struct seq_file *seq, mddev_t *mddev)
949 seq_printf(seq, "]"); 1184 seq_printf(seq, "]");
950} 1185}
951 1186
1187/* check if there are enough drives for
1188 * every block to appear on at least one.
1189 * Don't consider the device numbered 'ignore'
1190 * as we might be about to remove it.
1191 */
1192static int enough(conf_t *conf, int ignore)
1193{
1194 int first = 0;
1195
1196 do {
1197 int n = conf->copies;
1198 int cnt = 0;
1199 while (n--) {
1200 if (conf->mirrors[first].rdev &&
1201 first != ignore)
1202 cnt++;
1203 first = (first+1) % conf->raid_disks;
1204 }
1205 if (cnt == 0)
1206 return 0;
1207 } while (first != 0);
1208 return 1;
1209}
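
enough() walks the device array in runs of 'copies' consecutive slots, which is how near-copies raid10 lays out mirrors of the same block; if any run has no working member other than the one being ignored, some data would become unreadable. The following is a user-space rendition of the same walk that can be compiled and experimented with directly (the present[] setup is invented for illustration):

/* Stand-alone version of the enough() walk: raid_disks slots, and every
 * run of 'copies' consecutive slots (wrapping) must contain at least one
 * present device that is not 'ignore'.
 */
#include <stdio.h>

static int enough(const int *present, int raid_disks, int copies, int ignore)
{
	int first = 0;

	do {
		int n = copies;
		int cnt = 0;

		while (n--) {
			if (present[first] && first != ignore)
				cnt++;
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;
	} while (first != 0);
	return 1;
}

int main(void)
{
	int present[4] = { 1, 0, 1, 1 };	/* device 1 already failed */

	printf("%d\n", enough(present, 4, 2, -1));	/* 1: still readable */
	printf("%d\n", enough(present, 4, 2, 0));	/* 0: losing dev 0 too is fatal */
	return 0;
}

Passing the disk about to be removed as 'ignore' is what lets error() and raid10_remove_disk above refuse an operation that would leave a copy-group empty.
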
1210
952static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1211static void error(mddev_t *mddev, mdk_rdev_t *rdev)
953{ 1212{
954 char b[BDEVNAME_SIZE]; 1213 char b[BDEVNAME_SIZE];
@@ -961,13 +1220,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
961 * else mark the drive as failed 1220 * else mark the drive as failed
962 */ 1221 */
963 if (test_bit(In_sync, &rdev->flags) 1222 if (test_bit(In_sync, &rdev->flags)
964 && conf->raid_disks-mddev->degraded == 1) 1223 && !enough(conf, rdev->raid_disk))
965 /* 1224 /*
966 * Don't fail the drive, just return an IO error. 1225 * Don't fail the drive, just return an IO error.
967 * The test should really be more sophisticated than
968 * "working_disks == 1", but it isn't critical, and
969 * can wait until we do more sophisticated "is the drive
970 * really dead" tests...
971 */ 1226 */
972 return; 1227 return;
973 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1228 if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -980,6 +1235,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
980 */ 1235 */
981 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1236 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
982 } 1237 }
1238 set_bit(Blocked, &rdev->flags);
983 set_bit(Faulty, &rdev->flags); 1239 set_bit(Faulty, &rdev->flags);
984 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1240 set_bit(MD_CHANGE_DEVS, &mddev->flags);
985 printk(KERN_ALERT 1241 printk(KERN_ALERT
@@ -1022,27 +1278,6 @@ static void close_sync(conf_t *conf)
1022 conf->r10buf_pool = NULL; 1278 conf->r10buf_pool = NULL;
1023} 1279}
1024 1280
1025/* check if there are enough drives for
1026 * every block to appear on atleast one
1027 */
1028static int enough(conf_t *conf)
1029{
1030 int first = 0;
1031
1032 do {
1033 int n = conf->copies;
1034 int cnt = 0;
1035 while (n--) {
1036 if (conf->mirrors[first].rdev)
1037 cnt++;
1038 first = (first+1) % conf->raid_disks;
1039 }
1040 if (cnt == 0)
1041 return 0;
1042 } while (first != 0);
1043 return 1;
1044}
1045
1046static int raid10_spare_active(mddev_t *mddev) 1281static int raid10_spare_active(mddev_t *mddev)
1047{ 1282{
1048 int i; 1283 int i;
@@ -1078,7 +1313,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1078 conf_t *conf = mddev->private; 1313 conf_t *conf = mddev->private;
1079 int err = -EEXIST; 1314 int err = -EEXIST;
1080 int mirror; 1315 int mirror;
1081 mirror_info_t *p;
1082 int first = 0; 1316 int first = 0;
1083 int last = conf->raid_disks - 1; 1317 int last = conf->raid_disks - 1;
1084 1318
@@ -1087,44 +1321,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1087 * very different from resync 1321 * very different from resync
1088 */ 1322 */
1089 return -EBUSY; 1323 return -EBUSY;
1090 if (!enough(conf)) 1324 if (!enough(conf, -1))
1091 return -EINVAL; 1325 return -EINVAL;
1092 1326
1093 if (rdev->raid_disk >= 0) 1327 if (rdev->raid_disk >= 0)
1094 first = last = rdev->raid_disk; 1328 first = last = rdev->raid_disk;
1095 1329
1096 if (rdev->saved_raid_disk >= 0 && 1330 if (rdev->saved_raid_disk >= first &&
1097 rdev->saved_raid_disk >= first &&
1098 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1331 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1099 mirror = rdev->saved_raid_disk; 1332 mirror = rdev->saved_raid_disk;
1100 else 1333 else
1101 mirror = first; 1334 mirror = first;
1102 for ( ; mirror <= last ; mirror++) 1335 for ( ; mirror <= last ; mirror++) {
1103 if ( !(p=conf->mirrors+mirror)->rdev) { 1336 mirror_info_t *p = &conf->mirrors[mirror];
1104 1337 if (p->recovery_disabled == mddev->recovery_disabled)
1105 disk_stack_limits(mddev->gendisk, rdev->bdev, 1338 continue;
1106 rdev->data_offset << 9); 1339 if (!p->rdev)
1107 /* as we don't honour merge_bvec_fn, we must 1340 continue;
1108 * never risk violating it, so limit
1109 * ->max_segments to one lying with a single
1110 * page, as a one page request is never in
1111 * violation.
1112 */
1113 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1114 blk_queue_max_segments(mddev->queue, 1);
1115 blk_queue_segment_boundary(mddev->queue,
1116 PAGE_CACHE_SIZE - 1);
1117 }
1118 1341
1119 p->head_position = 0; 1342 disk_stack_limits(mddev->gendisk, rdev->bdev,
1120 rdev->raid_disk = mirror; 1343 rdev->data_offset << 9);
1121 err = 0; 1344 /* as we don't honour merge_bvec_fn, we must
1122 if (rdev->saved_raid_disk != mirror) 1345 * never risk violating it, so limit
1123 conf->fullsync = 1; 1346 * ->max_segments to one lying with a single
1124 rcu_assign_pointer(p->rdev, rdev); 1347 * page, as a one page request is never in
1125 break; 1348 * violation.
1349 */
1350 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1351 blk_queue_max_segments(mddev->queue, 1);
1352 blk_queue_segment_boundary(mddev->queue,
1353 PAGE_CACHE_SIZE - 1);
1126 } 1354 }
1127 1355
1356 p->head_position = 0;
1357 rdev->raid_disk = mirror;
1358 err = 0;
1359 if (rdev->saved_raid_disk != mirror)
1360 conf->fullsync = 1;
1361 rcu_assign_pointer(p->rdev, rdev);
1362 break;
1363 }
1364
1128 md_integrity_add_rdev(rdev, mddev); 1365 md_integrity_add_rdev(rdev, mddev);
1129 print_conf(conf); 1366 print_conf(conf);
1130 return err; 1367 return err;
@@ -1149,7 +1386,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1149 * is not possible. 1386 * is not possible.
1150 */ 1387 */
1151 if (!test_bit(Faulty, &rdev->flags) && 1388 if (!test_bit(Faulty, &rdev->flags) &&
1152 enough(conf)) { 1389 mddev->recovery_disabled != p->recovery_disabled &&
1390 enough(conf, -1)) {
1153 err = -EBUSY; 1391 err = -EBUSY;
1154 goto abort; 1392 goto abort;
1155 } 1393 }
@@ -1174,24 +1412,18 @@ static void end_sync_read(struct bio *bio, int error)
1174{ 1412{
1175 r10bio_t *r10_bio = bio->bi_private; 1413 r10bio_t *r10_bio = bio->bi_private;
1176 conf_t *conf = r10_bio->mddev->private; 1414 conf_t *conf = r10_bio->mddev->private;
1177 int i,d; 1415 int d;
1178 1416
1179 for (i=0; i<conf->copies; i++) 1417 d = find_bio_disk(conf, r10_bio, bio, NULL);
1180 if (r10_bio->devs[i].bio == bio)
1181 break;
1182 BUG_ON(i == conf->copies);
1183 update_head_pos(i, r10_bio);
1184 d = r10_bio->devs[i].devnum;
1185 1418
1186 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1419 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1187 set_bit(R10BIO_Uptodate, &r10_bio->state); 1420 set_bit(R10BIO_Uptodate, &r10_bio->state);
1188 else { 1421 else
1422 /* The write handler will notice the lack of
1423 * R10BIO_Uptodate and record any errors etc
1424 */
1189 atomic_add(r10_bio->sectors, 1425 atomic_add(r10_bio->sectors,
1190 &conf->mirrors[d].rdev->corrected_errors); 1426 &conf->mirrors[d].rdev->corrected_errors);
1191 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1192 md_error(r10_bio->mddev,
1193 conf->mirrors[d].rdev);
1194 }
1195 1427
1196 /* for reconstruct, we always reschedule after a read. 1428 /* for reconstruct, we always reschedule after a read.
1197 * for resync, only after all reads 1429 * for resync, only after all reads
@@ -1206,40 +1438,60 @@ static void end_sync_read(struct bio *bio, int error)
1206 } 1438 }
1207} 1439}
1208 1440
1209static void end_sync_write(struct bio *bio, int error) 1441static void end_sync_request(r10bio_t *r10_bio)
1210{ 1442{
1211 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1212 r10bio_t *r10_bio = bio->bi_private;
1213 mddev_t *mddev = r10_bio->mddev; 1443 mddev_t *mddev = r10_bio->mddev;
1214 conf_t *conf = mddev->private;
1215 int i,d;
1216
1217 for (i = 0; i < conf->copies; i++)
1218 if (r10_bio->devs[i].bio == bio)
1219 break;
1220 d = r10_bio->devs[i].devnum;
1221 1444
1222 if (!uptodate)
1223 md_error(mddev, conf->mirrors[d].rdev);
1224
1225 update_head_pos(i, r10_bio);
1226
1227 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1228 while (atomic_dec_and_test(&r10_bio->remaining)) { 1445 while (atomic_dec_and_test(&r10_bio->remaining)) {
1229 if (r10_bio->master_bio == NULL) { 1446 if (r10_bio->master_bio == NULL) {
1230 /* the primary of several recovery bios */ 1447 /* the primary of several recovery bios */
1231 sector_t s = r10_bio->sectors; 1448 sector_t s = r10_bio->sectors;
1232 put_buf(r10_bio); 1449 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1450 test_bit(R10BIO_WriteError, &r10_bio->state))
1451 reschedule_retry(r10_bio);
1452 else
1453 put_buf(r10_bio);
1233 md_done_sync(mddev, s, 1); 1454 md_done_sync(mddev, s, 1);
1234 break; 1455 break;
1235 } else { 1456 } else {
1236 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; 1457 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1237 put_buf(r10_bio); 1458 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1459 test_bit(R10BIO_WriteError, &r10_bio->state))
1460 reschedule_retry(r10_bio);
1461 else
1462 put_buf(r10_bio);
1238 r10_bio = r10_bio2; 1463 r10_bio = r10_bio2;
1239 } 1464 }
1240 } 1465 }
1241} 1466}
1242 1467
1468static void end_sync_write(struct bio *bio, int error)
1469{
1470 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1471 r10bio_t *r10_bio = bio->bi_private;
1472 mddev_t *mddev = r10_bio->mddev;
1473 conf_t *conf = mddev->private;
1474 int d;
1475 sector_t first_bad;
1476 int bad_sectors;
1477 int slot;
1478
1479 d = find_bio_disk(conf, r10_bio, bio, &slot);
1480
1481 if (!uptodate) {
1482 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
1483 set_bit(R10BIO_WriteError, &r10_bio->state);
1484 } else if (is_badblock(conf->mirrors[d].rdev,
1485 r10_bio->devs[slot].addr,
1486 r10_bio->sectors,
1487 &first_bad, &bad_sectors))
1488 set_bit(R10BIO_MadeGood, &r10_bio->state);
1489
1490 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1491
1492 end_sync_request(r10_bio);
1493}
1494
1243/* 1495/*
1244 * Note: sync and recover and handled very differently for raid10 1496 * Note: sync and recover and handled very differently for raid10
1245 * This code is for resync. 1497 * This code is for resync.
@@ -1299,11 +1551,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1299 if (j == vcnt) 1551 if (j == vcnt)
1300 continue; 1552 continue;
1301 mddev->resync_mismatches += r10_bio->sectors; 1553 mddev->resync_mismatches += r10_bio->sectors;
1554 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1555 /* Don't fix anything. */
1556 continue;
1302 } 1557 }
1303 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1558 /* Ok, we need to write this bio, either to correct an
1304 /* Don't fix anything. */ 1559 * inconsistency or to correct an unreadable block.
1305 continue;
1306 /* Ok, we need to write this bio
1307 * First we need to fixup bv_offset, bv_len and 1560 * First we need to fixup bv_offset, bv_len and
1308 * bi_vecs, as the read request might have corrupted these 1561 * bi_vecs, as the read request might have corrupted these
1309 */ 1562 */
@@ -1355,32 +1608,107 @@ done:
1355 * The second for writing. 1608 * The second for writing.
1356 * 1609 *
1357 */ 1610 */
1611static void fix_recovery_read_error(r10bio_t *r10_bio)
1612{
1613 /* We got a read error during recovery.
1614 * We repeat the read in smaller page-sized sections.
1615 * If a read succeeds, write it to the new device or record
1616 * a bad block if we cannot.
1617 * If a read fails, record a bad block on both old and
1618 * new devices.
1619 */
1620 mddev_t *mddev = r10_bio->mddev;
1621 conf_t *conf = mddev->private;
1622 struct bio *bio = r10_bio->devs[0].bio;
1623 sector_t sect = 0;
1624 int sectors = r10_bio->sectors;
1625 int idx = 0;
1626 int dr = r10_bio->devs[0].devnum;
1627 int dw = r10_bio->devs[1].devnum;
1628
1629 while (sectors) {
1630 int s = sectors;
1631 mdk_rdev_t *rdev;
1632 sector_t addr;
1633 int ok;
1634
1635 if (s > (PAGE_SIZE>>9))
1636 s = PAGE_SIZE >> 9;
1637
1638 rdev = conf->mirrors[dr].rdev;
1639		addr = r10_bio->devs[0].addr + sect;
1640 ok = sync_page_io(rdev,
1641 addr,
1642 s << 9,
1643 bio->bi_io_vec[idx].bv_page,
1644 READ, false);
1645 if (ok) {
1646 rdev = conf->mirrors[dw].rdev;
1647 addr = r10_bio->devs[1].addr + sect;
1648 ok = sync_page_io(rdev,
1649 addr,
1650 s << 9,
1651 bio->bi_io_vec[idx].bv_page,
1652 WRITE, false);
1653 if (!ok)
1654 set_bit(WriteErrorSeen, &rdev->flags);
1655 }
1656 if (!ok) {
1657 /* We don't worry if we cannot set a bad block -
1658 * it really is bad so there is no loss in not
1659 * recording it yet
1660 */
1661 rdev_set_badblocks(rdev, addr, s, 0);
1662
1663 if (rdev != conf->mirrors[dw].rdev) {
1664 /* need bad block on destination too */
1665 mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
1666 addr = r10_bio->devs[1].addr + sect;
1667 ok = rdev_set_badblocks(rdev2, addr, s, 0);
1668 if (!ok) {
1669 /* just abort the recovery */
1670 printk(KERN_NOTICE
1671 "md/raid10:%s: recovery aborted"
1672 " due to read error\n",
1673 mdname(mddev));
1674
1675 conf->mirrors[dw].recovery_disabled
1676 = mddev->recovery_disabled;
1677 set_bit(MD_RECOVERY_INTR,
1678 &mddev->recovery);
1679 break;
1680 }
1681 }
1682 }
1683
1684 sectors -= s;
1685 sect += s;
1686 idx++;
1687 }
1688}
1358 1689
1359static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) 1690static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1360{ 1691{
1361 conf_t *conf = mddev->private; 1692 conf_t *conf = mddev->private;
1362 int i, d; 1693 int d;
1363 struct bio *bio, *wbio; 1694 struct bio *wbio;
1364 1695
1696 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1697 fix_recovery_read_error(r10_bio);
1698 end_sync_request(r10_bio);
1699 return;
1700 }
1365 1701
1366 /* move the pages across to the second bio 1702 /*
1703 * share the pages with the first bio
1367 * and submit the write request 1704 * and submit the write request
1368 */ 1705 */
1369 bio = r10_bio->devs[0].bio;
1370 wbio = r10_bio->devs[1].bio; 1706 wbio = r10_bio->devs[1].bio;
1371 for (i=0; i < wbio->bi_vcnt; i++) {
1372 struct page *p = bio->bi_io_vec[i].bv_page;
1373 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1374 wbio->bi_io_vec[i].bv_page = p;
1375 }
1376 d = r10_bio->devs[1].devnum; 1707 d = r10_bio->devs[1].devnum;
1377 1708
1378 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1709 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1379 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1710 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1380 if (test_bit(R10BIO_Uptodate, &r10_bio->state)) 1711 generic_make_request(wbio);
1381 generic_make_request(wbio);
1382 else
1383 bio_endio(wbio, -EIO);
1384} 1712}
1385 1713
1386 1714
@@ -1421,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1421 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 1749 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1422} 1750}
1423 1751
1752static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1753 int sectors, struct page *page, int rw)
1754{
1755 sector_t first_bad;
1756 int bad_sectors;
1757
1758 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
1759 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
1760 return -1;
1761 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1762 /* success */
1763 return 1;
1764 if (rw == WRITE)
1765 set_bit(WriteErrorSeen, &rdev->flags);
1766 /* need to record an error - either for the block or the device */
1767 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1768 md_error(rdev->mddev, rdev);
1769 return 0;
1770}
1771
1424/* 1772/*
1425 * This is a kernel thread which: 1773 * This is a kernel thread which:
1426 * 1774 *
@@ -1476,10 +1824,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1476 1824
1477 rcu_read_lock(); 1825 rcu_read_lock();
1478 do { 1826 do {
1827 sector_t first_bad;
1828 int bad_sectors;
1829
1479 d = r10_bio->devs[sl].devnum; 1830 d = r10_bio->devs[sl].devnum;
1480 rdev = rcu_dereference(conf->mirrors[d].rdev); 1831 rdev = rcu_dereference(conf->mirrors[d].rdev);
1481 if (rdev && 1832 if (rdev &&
1482 test_bit(In_sync, &rdev->flags)) { 1833 test_bit(In_sync, &rdev->flags) &&
1834 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
1835 &first_bad, &bad_sectors) == 0) {
1483 atomic_inc(&rdev->nr_pending); 1836 atomic_inc(&rdev->nr_pending);
1484 rcu_read_unlock(); 1837 rcu_read_unlock();
1485 success = sync_page_io(rdev, 1838 success = sync_page_io(rdev,
@@ -1499,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1499 rcu_read_unlock(); 1852 rcu_read_unlock();
1500 1853
1501 if (!success) { 1854 if (!success) {
1502 /* Cannot read from anywhere -- bye bye array */ 1855 /* Cannot read from anywhere, just mark the block
1856 * as bad on the first device to discourage future
1857 * reads.
1858 */
1503 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 1859 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1504 md_error(mddev, conf->mirrors[dn].rdev); 1860 rdev = conf->mirrors[dn].rdev;
1861
1862 if (!rdev_set_badblocks(
1863 rdev,
1864 r10_bio->devs[r10_bio->read_slot].addr
1865 + sect,
1866 s, 0))
1867 md_error(mddev, rdev);
1505 break; 1868 break;
1506 } 1869 }
1507 1870
@@ -1516,80 +1879,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1516 sl--; 1879 sl--;
1517 d = r10_bio->devs[sl].devnum; 1880 d = r10_bio->devs[sl].devnum;
1518 rdev = rcu_dereference(conf->mirrors[d].rdev); 1881 rdev = rcu_dereference(conf->mirrors[d].rdev);
1519 if (rdev && 1882 if (!rdev ||
1520 test_bit(In_sync, &rdev->flags)) { 1883 !test_bit(In_sync, &rdev->flags))
1521 atomic_inc(&rdev->nr_pending); 1884 continue;
1522 rcu_read_unlock(); 1885
1523 atomic_add(s, &rdev->corrected_errors); 1886 atomic_inc(&rdev->nr_pending);
1524 if (sync_page_io(rdev, 1887 rcu_read_unlock();
1525 r10_bio->devs[sl].addr + 1888 if (r10_sync_page_io(rdev,
1526 sect, 1889 r10_bio->devs[sl].addr +
1527 s<<9, conf->tmppage, WRITE, false) 1890 sect,
1528 == 0) { 1891 s<<9, conf->tmppage, WRITE)
1529 /* Well, this device is dead */ 1892 == 0) {
1530 printk(KERN_NOTICE 1893 /* Well, this device is dead */
1531 "md/raid10:%s: read correction " 1894 printk(KERN_NOTICE
1532 "write failed" 1895 "md/raid10:%s: read correction "
1533 " (%d sectors at %llu on %s)\n", 1896 "write failed"
1534 mdname(mddev), s, 1897 " (%d sectors at %llu on %s)\n",
1535 (unsigned long long)( 1898 mdname(mddev), s,
1536 sect + rdev->data_offset), 1899 (unsigned long long)(
1537 bdevname(rdev->bdev, b)); 1900 sect + rdev->data_offset),
1538 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1901 bdevname(rdev->bdev, b));
1539 "drive\n", 1902 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1540 mdname(mddev), 1903 "drive\n",
1541 bdevname(rdev->bdev, b)); 1904 mdname(mddev),
1542 md_error(mddev, rdev); 1905 bdevname(rdev->bdev, b));
1543 }
1544 rdev_dec_pending(rdev, mddev);
1545 rcu_read_lock();
1546 } 1906 }
1907 rdev_dec_pending(rdev, mddev);
1908 rcu_read_lock();
1547 } 1909 }
1548 sl = start; 1910 sl = start;
1549 while (sl != r10_bio->read_slot) { 1911 while (sl != r10_bio->read_slot) {
1912 char b[BDEVNAME_SIZE];
1550 1913
1551 if (sl==0) 1914 if (sl==0)
1552 sl = conf->copies; 1915 sl = conf->copies;
1553 sl--; 1916 sl--;
1554 d = r10_bio->devs[sl].devnum; 1917 d = r10_bio->devs[sl].devnum;
1555 rdev = rcu_dereference(conf->mirrors[d].rdev); 1918 rdev = rcu_dereference(conf->mirrors[d].rdev);
1556 if (rdev && 1919 if (!rdev ||
1557 test_bit(In_sync, &rdev->flags)) { 1920 !test_bit(In_sync, &rdev->flags))
1558 char b[BDEVNAME_SIZE]; 1921 continue;
1559 atomic_inc(&rdev->nr_pending);
1560 rcu_read_unlock();
1561 if (sync_page_io(rdev,
1562 r10_bio->devs[sl].addr +
1563 sect,
1564 s<<9, conf->tmppage,
1565 READ, false) == 0) {
1566 /* Well, this device is dead */
1567 printk(KERN_NOTICE
1568 "md/raid10:%s: unable to read back "
1569 "corrected sectors"
1570 " (%d sectors at %llu on %s)\n",
1571 mdname(mddev), s,
1572 (unsigned long long)(
1573 sect + rdev->data_offset),
1574 bdevname(rdev->bdev, b));
1575 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1576 mdname(mddev),
1577 bdevname(rdev->bdev, b));
1578
1579 md_error(mddev, rdev);
1580 } else {
1581 printk(KERN_INFO
1582 "md/raid10:%s: read error corrected"
1583 " (%d sectors at %llu on %s)\n",
1584 mdname(mddev), s,
1585 (unsigned long long)(
1586 sect + rdev->data_offset),
1587 bdevname(rdev->bdev, b));
1588 }
1589 1922
1590 rdev_dec_pending(rdev, mddev); 1923 atomic_inc(&rdev->nr_pending);
1591 rcu_read_lock(); 1924 rcu_read_unlock();
1925 switch (r10_sync_page_io(rdev,
1926 r10_bio->devs[sl].addr +
1927 sect,
1928 s<<9, conf->tmppage,
1929 READ)) {
1930 case 0:
1931 /* Well, this device is dead */
1932 printk(KERN_NOTICE
1933 "md/raid10:%s: unable to read back "
1934 "corrected sectors"
1935 " (%d sectors at %llu on %s)\n",
1936 mdname(mddev), s,
1937 (unsigned long long)(
1938 sect + rdev->data_offset),
1939 bdevname(rdev->bdev, b));
1940 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1941 "drive\n",
1942 mdname(mddev),
1943 bdevname(rdev->bdev, b));
1944 break;
1945 case 1:
1946 printk(KERN_INFO
1947 "md/raid10:%s: read error corrected"
1948 " (%d sectors at %llu on %s)\n",
1949 mdname(mddev), s,
1950 (unsigned long long)(
1951 sect + rdev->data_offset),
1952 bdevname(rdev->bdev, b));
1953 atomic_add(s, &rdev->corrected_errors);
1592 } 1954 }
1955
1956 rdev_dec_pending(rdev, mddev);
1957 rcu_read_lock();
1593 } 1958 }
1594 rcu_read_unlock(); 1959 rcu_read_unlock();
1595 1960
@@ -1598,21 +1963,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1598 } 1963 }
1599} 1964}
1600 1965
1966static void bi_complete(struct bio *bio, int error)
1967{
1968 complete((struct completion *)bio->bi_private);
1969}
1970
1971static int submit_bio_wait(int rw, struct bio *bio)
1972{
1973 struct completion event;
1974 rw |= REQ_SYNC;
1975
1976 init_completion(&event);
1977 bio->bi_private = &event;
1978 bio->bi_end_io = bi_complete;
1979 submit_bio(rw, bio);
1980 wait_for_completion(&event);
1981
1982 return test_bit(BIO_UPTODATE, &bio->bi_flags);
1983}
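
The completion idiom used by submit_bio_wait() above — submit, have the end_io callback signal a completion, and block until it fires — can be illustrated outside the kernel. Below is a minimal userspace sketch (not part of the patch; pthread primitives and the fake_* names stand in for the kernel's struct completion and bio machinery):

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's struct completion. */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  wait;
	int             done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->wait, NULL);
	c->done = 0;
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->wait);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->wait, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* The "bio": carries a private pointer back to the waiter, like bi_private. */
struct fake_bio {
	struct completion *bi_private;
};

/* The "end_io callback": runs asynchronously and signals the submitter. */
static void *fake_io_thread(void *arg)
{
	struct fake_bio *bio = arg;
	/* ... the actual I/O would happen here ... */
	complete(bio->bi_private);
	return NULL;
}

int main(void)
{
	struct completion event;
	struct fake_bio bio;
	pthread_t t;

	init_completion(&event);
	bio.bi_private = &event;
	pthread_create(&t, NULL, fake_io_thread, &bio);  /* "submit_bio()" */
	wait_for_completion(&event);   /* caller blocks until the callback fires */
	pthread_join(t, NULL);
	printf("asynchronous I/O observed synchronously by the caller\n");
	return 0;
}
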
1984
1985static int narrow_write_error(r10bio_t *r10_bio, int i)
1986{
1987 struct bio *bio = r10_bio->master_bio;
1988 mddev_t *mddev = r10_bio->mddev;
1989 conf_t *conf = mddev->private;
1990 mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
1991 /* bio has the data to be written to slot 'i' where
1992 * we just recently had a write error.
1993 * We repeatedly clone the bio and trim down to one block,
1994 * then try the write. Where the write fails we record
1995 * a bad block.
1996 * It is conceivable that the bio doesn't exactly align with
1997 * blocks. We must handle this.
1998 *
1999 * We currently own a reference to the rdev.
2000 */
2001
2002 int block_sectors;
2003 sector_t sector;
2004 int sectors;
2005 int sect_to_write = r10_bio->sectors;
2006 int ok = 1;
2007
2008 if (rdev->badblocks.shift < 0)
2009 return 0;
2010
2011 block_sectors = 1 << rdev->badblocks.shift;
2012 sector = r10_bio->sector;
2013 sectors = ((r10_bio->sector + block_sectors)
2014 & ~(sector_t)(block_sectors - 1))
2015 - sector;
2016
2017 while (sect_to_write) {
2018 struct bio *wbio;
2019 if (sectors > sect_to_write)
2020 sectors = sect_to_write;
2021 /* Write at 'sector' for 'sectors' */
2022 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2023 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2024 wbio->bi_sector = (r10_bio->devs[i].addr+
2025 rdev->data_offset+
2026 (sector - r10_bio->sector));
2027 wbio->bi_bdev = rdev->bdev;
2028 if (submit_bio_wait(WRITE, wbio) == 0)
2029 /* Failure! */
2030 ok = rdev_set_badblocks(rdev, sector,
2031 sectors, 0)
2032 && ok;
2033
2034 bio_put(wbio);
2035 sect_to_write -= sectors;
2036 sector += sectors;
2037 sectors = block_sectors;
2038 }
2039 return ok;
2040}
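
The first-chunk arithmetic in narrow_write_error() above, ((sector + block_sectors) & ~(block_sectors - 1)) - sector, shortens the initial retry so that every later retry starts on a bad-block-table boundary. A standalone sketch of just that arithmetic, with made-up values for the starting sector, the request length, and an assumed badblocks.shift of 3 (8-sector blocks):

#include <stdio.h>

int main(void)
{
	const unsigned long long block_sectors = 8;  /* 1 << badblocks.shift, assumed 3 */
	unsigned long long sector = 1003;            /* hypothetical failed write start */
	unsigned long long sect_to_write = 21;       /* hypothetical failed write length */

	/* round the end of the first chunk up to the next block boundary */
	unsigned long long sectors = ((sector + block_sectors)
				      & ~(block_sectors - 1)) - sector;

	while (sect_to_write) {
		if (sectors > sect_to_write)
			sectors = sect_to_write;
		printf("retry write: sector %llu, %llu sectors\n", sector, sectors);
		sect_to_write -= sectors;
		sector += sectors;
		sectors = block_sectors;   /* all later chunks are whole blocks */
	}
	return 0;
}

With these numbers the retries come out as 5, 8 and 8 sectors, so the 8-sector chunks stay aligned with the granularity of the bad-block log.
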
2041
2042static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
2043{
2044 int slot = r10_bio->read_slot;
2045 int mirror = r10_bio->devs[slot].devnum;
2046 struct bio *bio;
2047 conf_t *conf = mddev->private;
2048 mdk_rdev_t *rdev;
2049 char b[BDEVNAME_SIZE];
2050 unsigned long do_sync;
2051 int max_sectors;
2052
2053 /* we got a read error. Maybe the drive is bad. Maybe just
2054 * the block and we can fix it.
2055 * We freeze all other IO, and try reading the block from
2056 * other devices. When we find one, we re-write
2057 * and check if that fixes the read error.

2058 * This is all done synchronously while the array is
2059 * frozen.
2060 */
2061 if (mddev->ro == 0) {
2062 freeze_array(conf);
2063 fix_read_error(conf, mddev, r10_bio);
2064 unfreeze_array(conf);
2065 }
2066 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
2067
2068 bio = r10_bio->devs[slot].bio;
2069 bdevname(bio->bi_bdev, b);
2070 r10_bio->devs[slot].bio =
2071 mddev->ro ? IO_BLOCKED : NULL;
2072read_more:
2073 mirror = read_balance(conf, r10_bio, &max_sectors);
2074 if (mirror == -1) {
2075 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2076 " read error for block %llu\n",
2077 mdname(mddev), b,
2078 (unsigned long long)r10_bio->sector);
2079 raid_end_bio_io(r10_bio);
2080 bio_put(bio);
2081 return;
2082 }
2083
2084 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2085 if (bio)
2086 bio_put(bio);
2087 slot = r10_bio->read_slot;
2088 rdev = conf->mirrors[mirror].rdev;
2089 printk_ratelimited(
2090 KERN_ERR
2091 "md/raid10:%s: %s: redirecting"
2092 "sector %llu to another mirror\n",
2093 mdname(mddev),
2094 bdevname(rdev->bdev, b),
2095 (unsigned long long)r10_bio->sector);
2096 bio = bio_clone_mddev(r10_bio->master_bio,
2097 GFP_NOIO, mddev);
2098 md_trim_bio(bio,
2099 r10_bio->sector - bio->bi_sector,
2100 max_sectors);
2101 r10_bio->devs[slot].bio = bio;
2102 bio->bi_sector = r10_bio->devs[slot].addr
2103 + rdev->data_offset;
2104 bio->bi_bdev = rdev->bdev;
2105 bio->bi_rw = READ | do_sync;
2106 bio->bi_private = r10_bio;
2107 bio->bi_end_io = raid10_end_read_request;
2108 if (max_sectors < r10_bio->sectors) {
2109 /* Drat - have to split this up more */
2110 struct bio *mbio = r10_bio->master_bio;
2111 int sectors_handled =
2112 r10_bio->sector + max_sectors
2113 - mbio->bi_sector;
2114 r10_bio->sectors = max_sectors;
2115 spin_lock_irq(&conf->device_lock);
2116 if (mbio->bi_phys_segments == 0)
2117 mbio->bi_phys_segments = 2;
2118 else
2119 mbio->bi_phys_segments++;
2120 spin_unlock_irq(&conf->device_lock);
2121 generic_make_request(bio);
2122 bio = NULL;
2123
2124 r10_bio = mempool_alloc(conf->r10bio_pool,
2125 GFP_NOIO);
2126 r10_bio->master_bio = mbio;
2127 r10_bio->sectors = (mbio->bi_size >> 9)
2128 - sectors_handled;
2129 r10_bio->state = 0;
2130 set_bit(R10BIO_ReadError,
2131 &r10_bio->state);
2132 r10_bio->mddev = mddev;
2133 r10_bio->sector = mbio->bi_sector
2134 + sectors_handled;
2135
2136 goto read_more;
2137 } else
2138 generic_make_request(bio);
2139}
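
When read_balance() can only serve part of the request, handle_read_error() above submits what it can and re-queues the tail as a fresh r10bio, looping back to read_more. The sketch below walks only the bookkeeping (sectors_handled and the remaining length) for a hypothetical 64-sector request served in 24-sector pieces; all the numbers are invented:

#include <stdio.h>

int main(void)
{
	unsigned long long master_sector = 1000;  /* mbio->bi_sector (hypothetical) */
	unsigned long long master_sectors = 64;   /* mbio->bi_size >> 9 (hypothetical) */
	unsigned long long cur_sector = master_sector;

	for (;;) {
		unsigned long long max_sectors = 24;  /* what read_balance() allows (made up) */
		unsigned long long remaining = master_sectors
					       - (cur_sector - master_sector);

		if (max_sectors >= remaining) {
			printf("read %llu sectors at %llu - done\n",
			       remaining, cur_sector);
			break;
		}
		/* submit max_sectors now; the rest becomes a new r10bio */
		unsigned long long sectors_handled =
			cur_sector + max_sectors - master_sector;
		printf("read %llu sectors at %llu, %llu still to handle\n",
		       max_sectors, cur_sector, master_sectors - sectors_handled);
		cur_sector = master_sector + sectors_handled;
	}
	return 0;
}
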
2140
2141static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
2142{
2143 /* Some sort of write request has finished and it
2144 * succeeded in writing where we thought there was a
2145 * bad block. So forget the bad block.
2146 * Or possibly it failed and we need to record
2147 * a bad block.
2148 */
2149 int m;
2150 mdk_rdev_t *rdev;
2151
2152 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2153 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2154 for (m = 0; m < conf->copies; m++) {
2155 int dev = r10_bio->devs[m].devnum;
2156 rdev = conf->mirrors[dev].rdev;
2157 if (r10_bio->devs[m].bio == NULL)
2158 continue;
2159 if (test_bit(BIO_UPTODATE,
2160 &r10_bio->devs[m].bio->bi_flags)) {
2161 rdev_clear_badblocks(
2162 rdev,
2163 r10_bio->devs[m].addr,
2164 r10_bio->sectors);
2165 } else {
2166 if (!rdev_set_badblocks(
2167 rdev,
2168 r10_bio->devs[m].addr,
2169 r10_bio->sectors, 0))
2170 md_error(conf->mddev, rdev);
2171 }
2172 }
2173 put_buf(r10_bio);
2174 } else {
2175 for (m = 0; m < conf->copies; m++) {
2176 int dev = r10_bio->devs[m].devnum;
2177 struct bio *bio = r10_bio->devs[m].bio;
2178 rdev = conf->mirrors[dev].rdev;
2179 if (bio == IO_MADE_GOOD) {
2180 rdev_clear_badblocks(
2181 rdev,
2182 r10_bio->devs[m].addr,
2183 r10_bio->sectors);
2184 rdev_dec_pending(rdev, conf->mddev);
2185 } else if (bio != NULL &&
2186 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2187 if (!narrow_write_error(r10_bio, m)) {
2188 md_error(conf->mddev, rdev);
2189 set_bit(R10BIO_Degraded,
2190 &r10_bio->state);
2191 }
2192 rdev_dec_pending(rdev, conf->mddev);
2193 }
2194 }
2195 if (test_bit(R10BIO_WriteError,
2196 &r10_bio->state))
2197 close_write(r10_bio);
2198 raid_end_bio_io(r10_bio);
2199 }
2200}
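
The branches of handle_write_completed() above can be summarised as follows (a restatement of the code, not additional behaviour):

    request type       slot outcome                    action taken
    ----------------   -----------------------------   -------------------------------------------
    resync/recovery    bio uptodate                    rdev_clear_badblocks()
    resync/recovery    bio not uptodate                rdev_set_badblocks(); md_error() if that fails
    regular write      bio == IO_MADE_GOOD             rdev_clear_badblocks(); drop rdev reference
    regular write      bio present, not uptodate       narrow_write_error(); md_error() and
                                                       R10BIO_Degraded if that fails; drop rdev ref
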
2201
1601static void raid10d(mddev_t *mddev) 2202static void raid10d(mddev_t *mddev)
1602{ 2203{
1603 r10bio_t *r10_bio; 2204 r10bio_t *r10_bio;
1604 struct bio *bio;
1605 unsigned long flags; 2205 unsigned long flags;
1606 conf_t *conf = mddev->private; 2206 conf_t *conf = mddev->private;
1607 struct list_head *head = &conf->retry_list; 2207 struct list_head *head = &conf->retry_list;
1608 mdk_rdev_t *rdev;
1609 struct blk_plug plug; 2208 struct blk_plug plug;
1610 2209
1611 md_check_recovery(mddev); 2210 md_check_recovery(mddev);
1612 2211
1613 blk_start_plug(&plug); 2212 blk_start_plug(&plug);
1614 for (;;) { 2213 for (;;) {
1615 char b[BDEVNAME_SIZE];
1616 2214
1617 flush_pending_writes(conf); 2215 flush_pending_writes(conf);
1618 2216
@@ -1628,64 +2226,26 @@ static void raid10d(mddev_t *mddev)
1628 2226
1629 mddev = r10_bio->mddev; 2227 mddev = r10_bio->mddev;
1630 conf = mddev->private; 2228 conf = mddev->private;
1631 if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2229 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2230 test_bit(R10BIO_WriteError, &r10_bio->state))
2231 handle_write_completed(conf, r10_bio);
2232 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
1632 sync_request_write(mddev, r10_bio); 2233 sync_request_write(mddev, r10_bio);
1633 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2234 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1634 recovery_request_write(mddev, r10_bio); 2235 recovery_request_write(mddev, r10_bio);
2236 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2237 handle_read_error(mddev, r10_bio);
1635 else { 2238 else {
1636 int slot = r10_bio->read_slot; 2239 /* just a partial read to be scheduled from a
1637 int mirror = r10_bio->devs[slot].devnum; 2240 * separate context
1638 /* we got a read error. Maybe the drive is bad. Maybe just
1639 * the block and we can fix it.
1640 * We freeze all other IO, and try reading the block from
1641 * other devices. When we find one, we re-write
1642 * and check it that fixes the read error.
1643 * This is all done synchronously while the array is
1644 * frozen.
1645 */ 2241 */
1646 if (mddev->ro == 0) { 2242 int slot = r10_bio->read_slot;
1647 freeze_array(conf); 2243 generic_make_request(r10_bio->devs[slot].bio);
1648 fix_read_error(conf, mddev, r10_bio);
1649 unfreeze_array(conf);
1650 }
1651 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1652
1653 bio = r10_bio->devs[slot].bio;
1654 r10_bio->devs[slot].bio =
1655 mddev->ro ? IO_BLOCKED : NULL;
1656 mirror = read_balance(conf, r10_bio);
1657 if (mirror == -1) {
1658 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1659 " read error for block %llu\n",
1660 mdname(mddev),
1661 bdevname(bio->bi_bdev,b),
1662 (unsigned long long)r10_bio->sector);
1663 raid_end_bio_io(r10_bio);
1664 bio_put(bio);
1665 } else {
1666 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1667 bio_put(bio);
1668 slot = r10_bio->read_slot;
1669 rdev = conf->mirrors[mirror].rdev;
1670 if (printk_ratelimit())
1671 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1672 " another mirror\n",
1673 mdname(mddev),
1674 bdevname(rdev->bdev,b),
1675 (unsigned long long)r10_bio->sector);
1676 bio = bio_clone_mddev(r10_bio->master_bio,
1677 GFP_NOIO, mddev);
1678 r10_bio->devs[slot].bio = bio;
1679 bio->bi_sector = r10_bio->devs[slot].addr
1680 + rdev->data_offset;
1681 bio->bi_bdev = rdev->bdev;
1682 bio->bi_rw = READ | do_sync;
1683 bio->bi_private = r10_bio;
1684 bio->bi_end_io = raid10_end_read_request;
1685 generic_make_request(bio);
1686 }
1687 } 2244 }
2245
1688 cond_resched(); 2246 cond_resched();
2247 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2248 md_check_recovery(mddev);
1689 } 2249 }
1690 blk_finish_plug(&plug); 2250 blk_finish_plug(&plug);
1691} 2251}
@@ -1746,7 +2306,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1746 int i; 2306 int i;
1747 int max_sync; 2307 int max_sync;
1748 sector_t sync_blocks; 2308 sector_t sync_blocks;
1749
1750 sector_t sectors_skipped = 0; 2309 sector_t sectors_skipped = 0;
1751 int chunks_skipped = 0; 2310 int chunks_skipped = 0;
1752 2311
@@ -1828,7 +2387,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1828 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 2387 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1829 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2388 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1830 /* recovery... the complicated one */ 2389 /* recovery... the complicated one */
1831 int j, k; 2390 int j;
1832 r10_bio = NULL; 2391 r10_bio = NULL;
1833 2392
1834 for (i=0 ; i<conf->raid_disks; i++) { 2393 for (i=0 ; i<conf->raid_disks; i++) {
@@ -1836,6 +2395,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1836 r10bio_t *rb2; 2395 r10bio_t *rb2;
1837 sector_t sect; 2396 sector_t sect;
1838 int must_sync; 2397 int must_sync;
2398 int any_working;
1839 2399
1840 if (conf->mirrors[i].rdev == NULL || 2400 if (conf->mirrors[i].rdev == NULL ||
1841 test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 2401 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
@@ -1887,19 +2447,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1887 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2447 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1888 &sync_blocks, still_degraded); 2448 &sync_blocks, still_degraded);
1889 2449
2450 any_working = 0;
1890 for (j=0; j<conf->copies;j++) { 2451 for (j=0; j<conf->copies;j++) {
2452 int k;
1891 int d = r10_bio->devs[j].devnum; 2453 int d = r10_bio->devs[j].devnum;
2454 sector_t from_addr, to_addr;
2455 mdk_rdev_t *rdev;
2456 sector_t sector, first_bad;
2457 int bad_sectors;
1892 if (!conf->mirrors[d].rdev || 2458 if (!conf->mirrors[d].rdev ||
1893 !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) 2459 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
1894 continue; 2460 continue;
1895 /* This is where we read from */ 2461 /* This is where we read from */
2462 any_working = 1;
2463 rdev = conf->mirrors[d].rdev;
2464 sector = r10_bio->devs[j].addr;
2465
2466 if (is_badblock(rdev, sector, max_sync,
2467 &first_bad, &bad_sectors)) {
2468 if (first_bad > sector)
2469 max_sync = first_bad - sector;
2470 else {
2471 bad_sectors -= (sector
2472 - first_bad);
2473 if (max_sync > bad_sectors)
2474 max_sync = bad_sectors;
2475 continue;
2476 }
2477 }
1896 bio = r10_bio->devs[0].bio; 2478 bio = r10_bio->devs[0].bio;
1897 bio->bi_next = biolist; 2479 bio->bi_next = biolist;
1898 biolist = bio; 2480 biolist = bio;
1899 bio->bi_private = r10_bio; 2481 bio->bi_private = r10_bio;
1900 bio->bi_end_io = end_sync_read; 2482 bio->bi_end_io = end_sync_read;
1901 bio->bi_rw = READ; 2483 bio->bi_rw = READ;
1902 bio->bi_sector = r10_bio->devs[j].addr + 2484 from_addr = r10_bio->devs[j].addr;
2485 bio->bi_sector = from_addr +
1903 conf->mirrors[d].rdev->data_offset; 2486 conf->mirrors[d].rdev->data_offset;
1904 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2487 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1905 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2488 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
@@ -1916,26 +2499,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1916 bio->bi_private = r10_bio; 2499 bio->bi_private = r10_bio;
1917 bio->bi_end_io = end_sync_write; 2500 bio->bi_end_io = end_sync_write;
1918 bio->bi_rw = WRITE; 2501 bio->bi_rw = WRITE;
1919 bio->bi_sector = r10_bio->devs[k].addr + 2502 to_addr = r10_bio->devs[k].addr;
2503 bio->bi_sector = to_addr +
1920 conf->mirrors[i].rdev->data_offset; 2504 conf->mirrors[i].rdev->data_offset;
1921 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 2505 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1922 2506
1923 r10_bio->devs[0].devnum = d; 2507 r10_bio->devs[0].devnum = d;
2508 r10_bio->devs[0].addr = from_addr;
1924 r10_bio->devs[1].devnum = i; 2509 r10_bio->devs[1].devnum = i;
2510 r10_bio->devs[1].addr = to_addr;
1925 2511
1926 break; 2512 break;
1927 } 2513 }
1928 if (j == conf->copies) { 2514 if (j == conf->copies) {
1929 /* Cannot recover, so abort the recovery */ 2515 /* Cannot recover, so abort the recovery or
2516 * record a bad block */
1930 put_buf(r10_bio); 2517 put_buf(r10_bio);
1931 if (rb2) 2518 if (rb2)
1932 atomic_dec(&rb2->remaining); 2519 atomic_dec(&rb2->remaining);
1933 r10_bio = rb2; 2520 r10_bio = rb2;
1934 if (!test_and_set_bit(MD_RECOVERY_INTR, 2521 if (any_working) {
1935 &mddev->recovery)) 2522 /* problem is that there are bad blocks
1936 printk(KERN_INFO "md/raid10:%s: insufficient " 2523 * on other device(s)
1937 "working devices for recovery.\n", 2524 */
1938 mdname(mddev)); 2525 int k;
2526 for (k = 0; k < conf->copies; k++)
2527 if (r10_bio->devs[k].devnum == i)
2528 break;
2529 if (!rdev_set_badblocks(
2530 conf->mirrors[i].rdev,
2531 r10_bio->devs[k].addr,
2532 max_sync, 0))
2533 any_working = 0;
2534 }
2535 if (!any_working) {
2536 if (!test_and_set_bit(MD_RECOVERY_INTR,
2537 &mddev->recovery))
2538 printk(KERN_INFO "md/raid10:%s: insufficient "
2539 "working devices for recovery.\n",
2540 mdname(mddev));
2541 conf->mirrors[i].recovery_disabled
2542 = mddev->recovery_disabled;
2543 }
1939 break; 2544 break;
1940 } 2545 }
1941 } 2546 }
@@ -1979,12 +2584,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1979 2584
1980 for (i=0; i<conf->copies; i++) { 2585 for (i=0; i<conf->copies; i++) {
1981 int d = r10_bio->devs[i].devnum; 2586 int d = r10_bio->devs[i].devnum;
2587 sector_t first_bad, sector;
2588 int bad_sectors;
2589
1982 bio = r10_bio->devs[i].bio; 2590 bio = r10_bio->devs[i].bio;
1983 bio->bi_end_io = NULL; 2591 bio->bi_end_io = NULL;
1984 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2592 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1985 if (conf->mirrors[d].rdev == NULL || 2593 if (conf->mirrors[d].rdev == NULL ||
1986 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) 2594 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1987 continue; 2595 continue;
2596 sector = r10_bio->devs[i].addr;
2597 if (is_badblock(conf->mirrors[d].rdev,
2598 sector, max_sync,
2599 &first_bad, &bad_sectors)) {
2600 if (first_bad > sector)
2601 max_sync = first_bad - sector;
2602 else {
2603 bad_sectors -= (sector - first_bad);
2604 if (max_sync > bad_sectors)
2605 max_sync = bad_sectors;
2606 continue;
2607 }
2608 }
1988 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2609 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1989 atomic_inc(&r10_bio->remaining); 2610 atomic_inc(&r10_bio->remaining);
1990 bio->bi_next = biolist; 2611 bio->bi_next = biolist;
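
The is_badblock() clamping used in the recovery and resync paths above either shortens the request so it ends just before a known bad range, or skips the device when the bad range already covers the start. A standalone sketch with invented sector numbers, using the corrected 'max_sync = bad_sectors' assignment:

#include <stdio.h>

/* Clamp *max_sync against a known bad range [first_bad, first_bad + bad_sectors).
 * Returns 1 if this device can still be read from (with *max_sync possibly
 * reduced), 0 if the start of the range is already bad and the device must be
 * skipped; in that case *max_sync is trimmed to the remaining bad stretch, as
 * the code above does before 'continue'.  Illustrative only.
 */
static int clamp_sync(unsigned long long sector, unsigned long long *max_sync,
		      unsigned long long first_bad, unsigned long long bad_sectors)
{
	if (first_bad > sector) {
		*max_sync = first_bad - sector;  /* read only up to the bad range */
		return 1;
	}
	bad_sectors -= (sector - first_bad);
	if (*max_sync > bad_sectors)
		*max_sync = bad_sectors;
	return 0;
}

int main(void)
{
	unsigned long long max_sync;
	int ok;

	max_sync = 64;                            /* bad range 1016..1023 inside the request */
	ok = clamp_sync(1000, &max_sync, 1016, 8);
	printf("usable=%d max_sync=%llu\n", ok, max_sync);   /* usable=1 max_sync=16 */

	max_sync = 64;                            /* bad range 992..1023 covers the start */
	ok = clamp_sync(1000, &max_sync, 992, 32);
	printf("usable=%d max_sync=%llu\n", ok, max_sync);   /* usable=0 max_sync=24 */
	return 0;
}
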
@@ -1992,7 +2613,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1992 bio->bi_private = r10_bio; 2613 bio->bi_private = r10_bio;
1993 bio->bi_end_io = end_sync_read; 2614 bio->bi_end_io = end_sync_read;
1994 bio->bi_rw = READ; 2615 bio->bi_rw = READ;
1995 bio->bi_sector = r10_bio->devs[i].addr + 2616 bio->bi_sector = sector +
1996 conf->mirrors[d].rdev->data_offset; 2617 conf->mirrors[d].rdev->data_offset;
1997 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2618 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1998 count++; 2619 count++;
@@ -2079,7 +2700,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
2079 return sectors_skipped + nr_sectors; 2700 return sectors_skipped + nr_sectors;
2080 giveup: 2701 giveup:
2081 /* There is nowhere to write, so all non-sync 2702 /* There is nowhere to write, so all non-sync
2082 * drives must be failed, so try the next chunk... 2703 * drives must be failed or in resync, or all drives
2704 * have a bad block, so try the next chunk...
2083 */ 2705 */
2084 if (sector_nr + max_sync < max_sector) 2706 if (sector_nr + max_sync < max_sector)
2085 max_sector = sector_nr + max_sync; 2707 max_sector = sector_nr + max_sync;
@@ -2249,6 +2871,7 @@ static int run(mddev_t *mddev)
2249 (conf->raid_disks / conf->near_copies)); 2871 (conf->raid_disks / conf->near_copies));
2250 2872
2251 list_for_each_entry(rdev, &mddev->disks, same_set) { 2873 list_for_each_entry(rdev, &mddev->disks, same_set) {
2874
2252 disk_idx = rdev->raid_disk; 2875 disk_idx = rdev->raid_disk;
2253 if (disk_idx >= conf->raid_disks 2876 if (disk_idx >= conf->raid_disks
2254 || disk_idx < 0) 2877 || disk_idx < 0)
@@ -2271,7 +2894,7 @@ static int run(mddev_t *mddev)
2271 disk->head_position = 0; 2894 disk->head_position = 0;
2272 } 2895 }
2273 /* need to check that every block has at least one working mirror */ 2896 /* need to check that every block has at least one working mirror */
2274 if (!enough(conf)) { 2897 if (!enough(conf, -1)) {
2275 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 2898 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2276 mdname(mddev)); 2899 mdname(mddev));
2277 goto out_free_conf; 2900 goto out_free_conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 944b1104d3b4..79cb52a0d4a2 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t;
6struct mirror_info { 6struct mirror_info {
7 mdk_rdev_t *rdev; 7 mdk_rdev_t *rdev;
8 sector_t head_position; 8 sector_t head_position;
9 int recovery_disabled; /* matches
10 * mddev->recovery_disabled
11 * when we shouldn't try
12 * recovering this device.
13 */
9}; 14};
10 15
11typedef struct r10bio_s r10bio_t; 16typedef struct r10bio_s r10bio_t;
@@ -113,10 +118,26 @@ struct r10bio_s {
113 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 118 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
114 */ 119 */
115#define IO_BLOCKED ((struct bio*)1) 120#define IO_BLOCKED ((struct bio*)1)
121/* When we successfully write to a known bad-block, we need to remove the
122 * bad-block marking which must be done from process context. So we record
123 * the success by setting devs[n].bio to IO_MADE_GOOD
124 */
125#define IO_MADE_GOOD ((struct bio *)2)
126
127#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
116 128
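
IO_BLOCKED and IO_MADE_GOOD above are sentinel pointer values (1 and 2) stored in a field that otherwise holds a real struct bio pointer, and BIO_SPECIAL() tells both apart from a genuine allocation. A tiny standalone illustration of the idiom (the struct and the describe() helper are stand-ins, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct bio { int dummy; };

#define IO_BLOCKED   ((struct bio *)1)  /* device should not be used for this request */
#define IO_MADE_GOOD ((struct bio *)2)  /* write succeeded over a known bad block */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void describe(struct bio *bio)
{
	if (bio == NULL)
		printf("no bio\n");
	else if (bio == IO_BLOCKED)
		printf("sentinel: blocked\n");
	else if (bio == IO_MADE_GOOD)
		printf("sentinel: made good\n");
	else
		printf("real bio at %p\n", (void *)bio);

	/* only real pointers may be dereferenced or freed */
	if (bio && !BIO_SPECIAL(bio))
		free(bio);
}

int main(void)
{
	describe(NULL);
	describe(IO_BLOCKED);
	describe(IO_MADE_GOOD);
	describe(malloc(sizeof(struct bio)));
	return 0;
}
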
117/* bits for r10bio.state */ 129/* bits for r10bio.state */
118#define R10BIO_Uptodate 0 130#define R10BIO_Uptodate 0
119#define R10BIO_IsSync 1 131#define R10BIO_IsSync 1
120#define R10BIO_IsRecover 2 132#define R10BIO_IsRecover 2
121#define R10BIO_Degraded 3 133#define R10BIO_Degraded 3
134/* Set ReadError on bios that experience a read error
135 * so that raid10d knows what to do with them.
136 */
137#define R10BIO_ReadError 4
138/* If a write for this request means we can clear some
139 * known-bad-block records, we set this flag.
140 */
141#define R10BIO_MadeGood 5
142#define R10BIO_WriteError 6
122#endif 143#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b72edf35ec54..dbae459fb02d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -51,6 +51,7 @@
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h> 52#include <linux/cpu.h>
53#include <linux/slab.h> 53#include <linux/slab.h>
54#include <linux/ratelimit.h>
54#include "md.h" 55#include "md.h"
55#include "raid5.h" 56#include "raid5.h"
56#include "raid0.h" 57#include "raid0.h"
@@ -96,8 +97,6 @@
96#define __inline__ 97#define __inline__
97#endif 98#endif
98 99
99#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
100
101/* 100/*
102 * We maintain a biased count of active stripes in the bottom 16 bits of 101 * We maintain a biased count of active stripes in the bottom 16 bits of
103 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
341 (unsigned long long)sh->sector, i, dev->toread, 340 (unsigned long long)sh->sector, i, dev->toread,
342 dev->read, dev->towrite, dev->written, 341 dev->read, dev->towrite, dev->written,
343 test_bit(R5_LOCKED, &dev->flags)); 342 test_bit(R5_LOCKED, &dev->flags));
344 BUG(); 343 WARN_ON(1);
345 } 344 }
346 dev->flags = 0; 345 dev->flags = 0;
347 raid5_build_block(sh, i, previous); 346 raid5_build_block(sh, i, previous);
@@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
527 atomic_inc(&rdev->nr_pending); 526 atomic_inc(&rdev->nr_pending);
528 rcu_read_unlock(); 527 rcu_read_unlock();
529 528
529 /* We have already checked bad blocks for reads. Now
530 * need to check for writes.
531 */
532 while ((rw & WRITE) && rdev &&
533 test_bit(WriteErrorSeen, &rdev->flags)) {
534 sector_t first_bad;
535 int bad_sectors;
536 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
537 &first_bad, &bad_sectors);
538 if (!bad)
539 break;
540
541 if (bad < 0) {
542 set_bit(BlockedBadBlocks, &rdev->flags);
543 if (!conf->mddev->external &&
544 conf->mddev->flags) {
545 /* It is very unlikely, but we might
546 * still need to write out the
547 * bad block log - better give it
548 * a chance*/
549 md_check_recovery(conf->mddev);
550 }
551 md_wait_for_blocked_rdev(rdev, conf->mddev);
552 } else {
553 /* Acknowledged bad block - skip the write */
554 rdev_dec_pending(rdev, conf->mddev);
555 rdev = NULL;
556 }
557 }
558
530 if (rdev) { 559 if (rdev) {
531 if (s->syncing || s->expanding || s->expanded) 560 if (s->syncing || s->expanding || s->expanded)
532 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 561 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
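
The write-path loop above branches on the sign of is_badblock(): zero means the range is clean, a positive result means acknowledged bad blocks (the write is skipped), and a negative result means bad blocks not yet recorded in the on-disk log (the write must wait). A standalone restatement of that dispatch with an obviously fake is_badblock():

#include <stdio.h>

/* Fake stand-in for the md core's is_badblock(): 0 when the range is clean,
 * >0 when it overlaps acknowledged bad blocks, <0 when it overlaps bad blocks
 * still waiting to be written to the on-disk bad-block log.
 */
static int fake_is_badblock(int scenario)
{
	return scenario;   /* the caller picks the outcome directly */
}

static void decide_write(int scenario)
{
	int bad = fake_is_badblock(scenario);

	if (!bad)
		printf("range clean: issue the write\n");
	else if (bad < 0)
		printf("unacknowledged bad block: wait for the log, then retry\n");
	else
		printf("acknowledged bad block: drop the write for this device\n");
}

int main(void)
{
	decide_write(0);
	decide_write(-1);
	decide_write(1);
	return 0;
}
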
@@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
548 bi->bi_io_vec[0].bv_offset = 0; 577 bi->bi_io_vec[0].bv_offset = 0;
549 bi->bi_size = STRIPE_SIZE; 578 bi->bi_size = STRIPE_SIZE;
550 bi->bi_next = NULL; 579 bi->bi_next = NULL;
551 if ((rw & WRITE) &&
552 test_bit(R5_ReWrite, &sh->dev[i].flags))
553 atomic_add(STRIPE_SECTORS,
554 &rdev->corrected_errors);
555 generic_make_request(bi); 580 generic_make_request(bi);
556 } else { 581 } else {
557 if (rw & WRITE) 582 if (rw & WRITE)
@@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1020 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1045 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1021 struct bio *wbi; 1046 struct bio *wbi;
1022 1047
1023 spin_lock(&sh->lock); 1048 spin_lock_irq(&sh->raid_conf->device_lock);
1024 chosen = dev->towrite; 1049 chosen = dev->towrite;
1025 dev->towrite = NULL; 1050 dev->towrite = NULL;
1026 BUG_ON(dev->written); 1051 BUG_ON(dev->written);
1027 wbi = dev->written = chosen; 1052 wbi = dev->written = chosen;
1028 spin_unlock(&sh->lock); 1053 spin_unlock_irq(&sh->raid_conf->device_lock);
1029 1054
1030 while (wbi && wbi->bi_sector < 1055 while (wbi && wbi->bi_sector <
1031 dev->sector + STRIPE_SECTORS) { 1056 dev->sector + STRIPE_SECTORS) {
@@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1315static int grow_one_stripe(raid5_conf_t *conf) 1340static int grow_one_stripe(raid5_conf_t *conf)
1316{ 1341{
1317 struct stripe_head *sh; 1342 struct stripe_head *sh;
1318 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 1343 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1319 if (!sh) 1344 if (!sh)
1320 return 0; 1345 return 0;
1321 memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); 1346
1322 sh->raid_conf = conf; 1347 sh->raid_conf = conf;
1323 spin_lock_init(&sh->lock);
1324 #ifdef CONFIG_MULTICORE_RAID456 1348 #ifdef CONFIG_MULTICORE_RAID456
1325 init_waitqueue_head(&sh->ops.wait_for_ops); 1349 init_waitqueue_head(&sh->ops.wait_for_ops);
1326 #endif 1350 #endif
@@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1435 return -ENOMEM; 1459 return -ENOMEM;
1436 1460
1437 for (i = conf->max_nr_stripes; i; i--) { 1461 for (i = conf->max_nr_stripes; i; i--) {
1438 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1462 nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1439 if (!nsh) 1463 if (!nsh)
1440 break; 1464 break;
1441 1465
1442 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1443
1444 nsh->raid_conf = conf; 1466 nsh->raid_conf = conf;
1445 spin_lock_init(&nsh->lock);
1446 #ifdef CONFIG_MULTICORE_RAID456 1467 #ifdef CONFIG_MULTICORE_RAID456
1447 init_waitqueue_head(&nsh->ops.wait_for_ops); 1468 init_waitqueue_head(&nsh->ops.wait_for_ops);
1448 #endif 1469 #endif
@@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
1587 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1608 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1588 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1609 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1589 rdev = conf->disks[i].rdev; 1610 rdev = conf->disks[i].rdev;
1590 printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1611 printk_ratelimited(
1591 " (%lu sectors at %llu on %s)\n", 1612 KERN_INFO
1592 mdname(conf->mddev), STRIPE_SECTORS, 1613 "md/raid:%s: read error corrected"
1593 (unsigned long long)(sh->sector 1614 " (%lu sectors at %llu on %s)\n",
1594 + rdev->data_offset), 1615 mdname(conf->mddev), STRIPE_SECTORS,
1595 bdevname(rdev->bdev, b)); 1616 (unsigned long long)(sh->sector
1617 + rdev->data_offset),
1618 bdevname(rdev->bdev, b));
1619 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1596 clear_bit(R5_ReadError, &sh->dev[i].flags); 1620 clear_bit(R5_ReadError, &sh->dev[i].flags);
1597 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1621 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1598 } 1622 }
@@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
1606 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1630 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1607 atomic_inc(&rdev->read_errors); 1631 atomic_inc(&rdev->read_errors);
1608 if (conf->mddev->degraded >= conf->max_degraded) 1632 if (conf->mddev->degraded >= conf->max_degraded)
1609 printk_rl(KERN_WARNING 1633 printk_ratelimited(
1610 "md/raid:%s: read error not correctable " 1634 KERN_WARNING
1611 "(sector %llu on %s).\n", 1635 "md/raid:%s: read error not correctable "
1612 mdname(conf->mddev), 1636 "(sector %llu on %s).\n",
1613 (unsigned long long)(sh->sector 1637 mdname(conf->mddev),
1614 + rdev->data_offset), 1638 (unsigned long long)(sh->sector
1615 bdn); 1639 + rdev->data_offset),
1640 bdn);
1616 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1641 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1617 /* Oh, no!!! */ 1642 /* Oh, no!!! */
1618 printk_rl(KERN_WARNING 1643 printk_ratelimited(
1619 "md/raid:%s: read error NOT corrected!! " 1644 KERN_WARNING
1620 "(sector %llu on %s).\n", 1645 "md/raid:%s: read error NOT corrected!! "
1621 mdname(conf->mddev), 1646 "(sector %llu on %s).\n",
1622 (unsigned long long)(sh->sector 1647 mdname(conf->mddev),
1623 + rdev->data_offset), 1648 (unsigned long long)(sh->sector
1624 bdn); 1649 + rdev->data_offset),
1650 bdn);
1625 else if (atomic_read(&rdev->read_errors) 1651 else if (atomic_read(&rdev->read_errors)
1626 > conf->max_nr_stripes) 1652 > conf->max_nr_stripes)
1627 printk(KERN_WARNING 1653 printk(KERN_WARNING
@@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
1649 raid5_conf_t *conf = sh->raid_conf; 1675 raid5_conf_t *conf = sh->raid_conf;
1650 int disks = sh->disks, i; 1676 int disks = sh->disks, i;
1651 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1677 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1678 sector_t first_bad;
1679 int bad_sectors;
1652 1680
1653 for (i=0 ; i<disks; i++) 1681 for (i=0 ; i<disks; i++)
1654 if (bi == &sh->dev[i].req) 1682 if (bi == &sh->dev[i].req)
@@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error)
1662 return; 1690 return;
1663 } 1691 }
1664 1692
1665 if (!uptodate) 1693 if (!uptodate) {
1666 md_error(conf->mddev, conf->disks[i].rdev); 1694 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
1695 set_bit(R5_WriteError, &sh->dev[i].flags);
1696 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
1697 &first_bad, &bad_sectors))
1698 set_bit(R5_MadeGood, &sh->dev[i].flags);
1667 1699
1668 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1700 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1669 1701
@@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1710 */ 1742 */
1711 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1743 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1712 } 1744 }
1745 set_bit(Blocked, &rdev->flags);
1713 set_bit(Faulty, &rdev->flags); 1746 set_bit(Faulty, &rdev->flags);
1714 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1747 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1715 printk(KERN_ALERT 1748 printk(KERN_ALERT
@@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1760 /* 1793 /*
1761 * Select the parity disk based on the user selected algorithm. 1794 * Select the parity disk based on the user selected algorithm.
1762 */ 1795 */
1763 pd_idx = qd_idx = ~0; 1796 pd_idx = qd_idx = -1;
1764 switch(conf->level) { 1797 switch(conf->level) {
1765 case 4: 1798 case 4:
1766 pd_idx = data_disks; 1799 pd_idx = data_disks;
@@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2143 raid5_conf_t *conf = sh->raid_conf; 2176 raid5_conf_t *conf = sh->raid_conf;
2144 int firstwrite=0; 2177 int firstwrite=0;
2145 2178
2146 pr_debug("adding bh b#%llu to stripe s#%llu\n", 2179 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2147 (unsigned long long)bi->bi_sector, 2180 (unsigned long long)bi->bi_sector,
2148 (unsigned long long)sh->sector); 2181 (unsigned long long)sh->sector);
2149 2182
2150 2183
2151 spin_lock(&sh->lock);
2152 spin_lock_irq(&conf->device_lock); 2184 spin_lock_irq(&conf->device_lock);
2153 if (forwrite) { 2185 if (forwrite) {
2154 bip = &sh->dev[dd_idx].towrite; 2186 bip = &sh->dev[dd_idx].towrite;
@@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2169 bi->bi_next = *bip; 2201 bi->bi_next = *bip;
2170 *bip = bi; 2202 *bip = bi;
2171 bi->bi_phys_segments++; 2203 bi->bi_phys_segments++;
2172 spin_unlock_irq(&conf->device_lock);
2173 spin_unlock(&sh->lock);
2174
2175 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2176 (unsigned long long)bi->bi_sector,
2177 (unsigned long long)sh->sector, dd_idx);
2178
2179 if (conf->mddev->bitmap && firstwrite) {
2180 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2181 STRIPE_SECTORS, 0);
2182 sh->bm_seq = conf->seq_flush+1;
2183 set_bit(STRIPE_BIT_DELAY, &sh->state);
2184 }
2185 2204
2186 if (forwrite) { 2205 if (forwrite) {
2187 /* check if page is covered */ 2206 /* check if page is covered */
@@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2196 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2215 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2197 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2216 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2198 } 2217 }
2218 spin_unlock_irq(&conf->device_lock);
2219
2220 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2221 (unsigned long long)(*bip)->bi_sector,
2222 (unsigned long long)sh->sector, dd_idx);
2223
2224 if (conf->mddev->bitmap && firstwrite) {
2225 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2226 STRIPE_SECTORS, 0);
2227 sh->bm_seq = conf->seq_flush+1;
2228 set_bit(STRIPE_BIT_DELAY, &sh->state);
2229 }
2199 return 1; 2230 return 1;
2200 2231
2201 overlap: 2232 overlap:
2202 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2233 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2203 spin_unlock_irq(&conf->device_lock); 2234 spin_unlock_irq(&conf->device_lock);
2204 spin_unlock(&sh->lock);
2205 return 0; 2235 return 0;
2206} 2236}
2207 2237
@@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2238 rcu_read_lock(); 2268 rcu_read_lock();
2239 rdev = rcu_dereference(conf->disks[i].rdev); 2269 rdev = rcu_dereference(conf->disks[i].rdev);
2240 if (rdev && test_bit(In_sync, &rdev->flags)) 2270 if (rdev && test_bit(In_sync, &rdev->flags))
2241 /* multiple read failures in one stripe */ 2271 atomic_inc(&rdev->nr_pending);
2242 md_error(conf->mddev, rdev); 2272 else
2273 rdev = NULL;
2243 rcu_read_unlock(); 2274 rcu_read_unlock();
2275 if (rdev) {
2276 if (!rdev_set_badblocks(
2277 rdev,
2278 sh->sector,
2279 STRIPE_SECTORS, 0))
2280 md_error(conf->mddev, rdev);
2281 rdev_dec_pending(rdev, conf->mddev);
2282 }
2244 } 2283 }
2245 spin_lock_irq(&conf->device_lock); 2284 spin_lock_irq(&conf->device_lock);
2246 /* fail all writes first */ 2285 /* fail all writes first */
@@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2308 if (bitmap_end) 2347 if (bitmap_end)
2309 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2348 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2310 STRIPE_SECTORS, 0, 0); 2349 STRIPE_SECTORS, 0, 0);
2350 /* If we were in the middle of a write the parity block might
2351 * still be locked - so just clear all R5_LOCKED flags
2352 */
2353 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2311 } 2354 }
2312 2355
2313 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2356 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2315 md_wakeup_thread(conf->mddev->thread); 2358 md_wakeup_thread(conf->mddev->thread);
2316} 2359}
2317 2360
2318/* fetch_block5 - checks the given member device to see if its data needs 2361static void
2319 * to be read or computed to satisfy a request. 2362handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
2320 * 2363 struct stripe_head_state *s)
2321 * Returns 1 when no more member devices need to be checked, otherwise returns
2322 * 0 to tell the loop in handle_stripe_fill5 to continue
2323 */
2324static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2325 int disk_idx, int disks)
2326{
2327 struct r5dev *dev = &sh->dev[disk_idx];
2328 struct r5dev *failed_dev = &sh->dev[s->failed_num];
2329
2330 /* is the data in this block needed, and can we get it? */
2331 if (!test_bit(R5_LOCKED, &dev->flags) &&
2332 !test_bit(R5_UPTODATE, &dev->flags) &&
2333 (dev->toread ||
2334 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2335 s->syncing || s->expanding ||
2336 (s->failed &&
2337 (failed_dev->toread ||
2338 (failed_dev->towrite &&
2339 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
2340 /* We would like to get this block, possibly by computing it,
2341 * otherwise read it if the backing disk is insync
2342 */
2343 if ((s->uptodate == disks - 1) &&
2344 (s->failed && disk_idx == s->failed_num)) {
2345 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2346 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2347 set_bit(R5_Wantcompute, &dev->flags);
2348 sh->ops.target = disk_idx;
2349 sh->ops.target2 = -1;
2350 s->req_compute = 1;
2351 /* Careful: from this point on 'uptodate' is in the eye
2352 * of raid_run_ops which services 'compute' operations
2353 * before writes. R5_Wantcompute flags a block that will
2354 * be R5_UPTODATE by the time it is needed for a
2355 * subsequent operation.
2356 */
2357 s->uptodate++;
2358 return 1; /* uptodate + compute == disks */
2359 } else if (test_bit(R5_Insync, &dev->flags)) {
2360 set_bit(R5_LOCKED, &dev->flags);
2361 set_bit(R5_Wantread, &dev->flags);
2362 s->locked++;
2363 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2364 s->syncing);
2365 }
2366 }
2367
2368 return 0;
2369}
2370
2371/**
2372 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
2373 */
2374static void handle_stripe_fill5(struct stripe_head *sh,
2375 struct stripe_head_state *s, int disks)
2376{ 2364{
2365 int abort = 0;
2377 int i; 2366 int i;
2378 2367
2379 /* look for blocks to read/compute, skip this if a compute 2368 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2380 * is already in flight, or if the stripe contents are in the 2369 clear_bit(STRIPE_SYNCING, &sh->state);
2381 * midst of changing due to a write 2370 s->syncing = 0;
2371 /* There is nothing more to do for sync/check/repair.
2372 * For recover we need to record a bad block on all
2373 * non-sync devices, or abort the recovery
2382 */ 2374 */
2383 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2375 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2384 !sh->reconstruct_state) 2376 return;
2385 for (i = disks; i--; ) 2377 /* During recovery devices cannot be removed, so locking and
2386 if (fetch_block5(sh, s, i, disks)) 2378 * refcounting of rdevs is not needed
2387 break; 2379 */
2388 set_bit(STRIPE_HANDLE, &sh->state); 2380 for (i = 0; i < conf->raid_disks; i++) {
2381 mdk_rdev_t *rdev = conf->disks[i].rdev;
2382 if (!rdev
2383 || test_bit(Faulty, &rdev->flags)
2384 || test_bit(In_sync, &rdev->flags))
2385 continue;
2386 if (!rdev_set_badblocks(rdev, sh->sector,
2387 STRIPE_SECTORS, 0))
2388 abort = 1;
2389 }
2390 if (abort) {
2391 conf->recovery_disabled = conf->mddev->recovery_disabled;
2392 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2393 }
2389} 2394}
2390 2395
2391/* fetch_block6 - checks the given member device to see if its data needs 2396/* fetch_block - checks the given member device to see if its data needs
2392 * to be read or computed to satisfy a request. 2397 * to be read or computed to satisfy a request.
2393 * 2398 *
2394 * Returns 1 when no more member devices need to be checked, otherwise returns 2399 * Returns 1 when no more member devices need to be checked, otherwise returns
2395 * 0 to tell the loop in handle_stripe_fill6 to continue 2400 * 0 to tell the loop in handle_stripe_fill to continue
2396 */ 2401 */
2397static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, 2402static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2398 struct r6_state *r6s, int disk_idx, int disks) 2403 int disk_idx, int disks)
2399{ 2404{
2400 struct r5dev *dev = &sh->dev[disk_idx]; 2405 struct r5dev *dev = &sh->dev[disk_idx];
2401 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], 2406 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2402 &sh->dev[r6s->failed_num[1]] }; 2407 &sh->dev[s->failed_num[1]] };
2403 2408
2409 /* is the data in this block needed, and can we get it? */
2404 if (!test_bit(R5_LOCKED, &dev->flags) && 2410 if (!test_bit(R5_LOCKED, &dev->flags) &&
2405 !test_bit(R5_UPTODATE, &dev->flags) && 2411 !test_bit(R5_UPTODATE, &dev->flags) &&
2406 (dev->toread || 2412 (dev->toread ||
2407 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2413 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2408 s->syncing || s->expanding || 2414 s->syncing || s->expanding ||
2409 (s->failed >= 1 && 2415 (s->failed >= 1 && fdev[0]->toread) ||
2410 (fdev[0]->toread || s->to_write)) || 2416 (s->failed >= 2 && fdev[1]->toread) ||
2411 (s->failed >= 2 && 2417 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2412 (fdev[1]->toread || s->to_write)))) { 2418 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2419 (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2413 /* we would like to get this block, possibly by computing it, 2420 /* we would like to get this block, possibly by computing it,
2414 * otherwise read it if the backing disk is insync 2421 * otherwise read it if the backing disk is insync
2415 */ 2422 */
2416 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2423 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2417 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2424 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2418 if ((s->uptodate == disks - 1) && 2425 if ((s->uptodate == disks - 1) &&
2419 (s->failed && (disk_idx == r6s->failed_num[0] || 2426 (s->failed && (disk_idx == s->failed_num[0] ||
2420 disk_idx == r6s->failed_num[1]))) { 2427 disk_idx == s->failed_num[1]))) {
2421 /* have disk failed, and we're requested to fetch it; 2428 /* have disk failed, and we're requested to fetch it;
2422 * do compute it 2429 * do compute it
2423 */ 2430 */
@@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2429 sh->ops.target = disk_idx; 2436 sh->ops.target = disk_idx;
2430 sh->ops.target2 = -1; /* no 2nd target */ 2437 sh->ops.target2 = -1; /* no 2nd target */
2431 s->req_compute = 1; 2438 s->req_compute = 1;
2439 /* Careful: from this point on 'uptodate' is in the eye
2440 * of raid_run_ops which services 'compute' operations
2441 * before writes. R5_Wantcompute flags a block that will
2442 * be R5_UPTODATE by the time it is needed for a
2443 * subsequent operation.
2444 */
2432 s->uptodate++; 2445 s->uptodate++;
2433 return 1; 2446 return 1;
2434 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2447 } else if (s->uptodate == disks-2 && s->failed >= 2) {
@@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2469} 2482}
2470 2483
2471/** 2484/**
2472 * handle_stripe_fill6 - read or compute data to satisfy pending requests. 2485 * handle_stripe_fill - read or compute data to satisfy pending requests.
2473 */ 2486 */
2474static void handle_stripe_fill6(struct stripe_head *sh, 2487static void handle_stripe_fill(struct stripe_head *sh,
2475 struct stripe_head_state *s, struct r6_state *r6s, 2488 struct stripe_head_state *s,
2476 int disks) 2489 int disks)
2477{ 2490{
2478 int i; 2491 int i;
2479 2492
@@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh,
2484 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2497 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2485 !sh->reconstruct_state) 2498 !sh->reconstruct_state)
2486 for (i = disks; i--; ) 2499 for (i = disks; i--; )
2487 if (fetch_block6(sh, s, r6s, i, disks)) 2500 if (fetch_block(sh, s, i, disks))
2488 break; 2501 break;
2489 set_bit(STRIPE_HANDLE, &sh->state); 2502 set_bit(STRIPE_HANDLE, &sh->state);
2490} 2503}
@@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
2540 md_wakeup_thread(conf->mddev->thread); 2553 md_wakeup_thread(conf->mddev->thread);
2541} 2554}
2542 2555
2543static void handle_stripe_dirtying5(raid5_conf_t *conf, 2556static void handle_stripe_dirtying(raid5_conf_t *conf,
2544 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2557 struct stripe_head *sh,
2558 struct stripe_head_state *s,
2559 int disks)
2545{ 2560{
2546 int rmw = 0, rcw = 0, i; 2561 int rmw = 0, rcw = 0, i;
2547 for (i = disks; i--; ) { 2562 if (conf->max_degraded == 2) {
2563 /* RAID6 requires 'rcw' in current implementation
2564 * Calculate the real rcw later - for now fake it
2565 * to look like rcw is cheaper
2566 */
2567 rcw = 1; rmw = 2;
2568 } else for (i = disks; i--; ) {
2548 /* would I have to read this buffer for read_modify_write */ 2569 /* would I have to read this buffer for read_modify_write */
2549 struct r5dev *dev = &sh->dev[i]; 2570 struct r5dev *dev = &sh->dev[i];
2550 if ((dev->towrite || i == sh->pd_idx) && 2571 if ((dev->towrite || i == sh->pd_idx) &&
@@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2591 } 2612 }
2592 } 2613 }
2593 } 2614 }
2594 if (rcw <= rmw && rcw > 0) 2615 if (rcw <= rmw && rcw > 0) {
2595 /* want reconstruct write, but need to get some data */ 2616 /* want reconstruct write, but need to get some data */
2617 rcw = 0;
2596 for (i = disks; i--; ) { 2618 for (i = disks; i--; ) {
2597 struct r5dev *dev = &sh->dev[i]; 2619 struct r5dev *dev = &sh->dev[i];
2598 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2620 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2599 i != sh->pd_idx && 2621 i != sh->pd_idx && i != sh->qd_idx &&
2600 !test_bit(R5_LOCKED, &dev->flags) && 2622 !test_bit(R5_LOCKED, &dev->flags) &&
2601 !(test_bit(R5_UPTODATE, &dev->flags) || 2623 !(test_bit(R5_UPTODATE, &dev->flags) ||
2602 test_bit(R5_Wantcompute, &dev->flags)) && 2624 test_bit(R5_Wantcompute, &dev->flags))) {
2603 test_bit(R5_Insync, &dev->flags)) { 2625 rcw++;
2626 if (!test_bit(R5_Insync, &dev->flags))
2627 continue; /* it's a failed drive */
2604 if ( 2628 if (
2605 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2629 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2606 pr_debug("Read_old block " 2630 pr_debug("Read_old block "
@@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2614 } 2638 }
2615 } 2639 }
2616 } 2640 }
2641 }
2617 /* now if nothing is locked, and if we have enough data, 2642 /* now if nothing is locked, and if we have enough data,
2618 * we can start a write request 2643 * we can start a write request
2619 */ 2644 */
@@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2630 schedule_reconstruction(sh, s, rcw == 0, 0); 2655 schedule_reconstruction(sh, s, rcw == 0, 0);
2631} 2656}
2632 2657
2633static void handle_stripe_dirtying6(raid5_conf_t *conf,
2634 struct stripe_head *sh, struct stripe_head_state *s,
2635 struct r6_state *r6s, int disks)
2636{
2637 int rcw = 0, pd_idx = sh->pd_idx, i;
2638 int qd_idx = sh->qd_idx;
2639
2640 set_bit(STRIPE_HANDLE, &sh->state);
2641 for (i = disks; i--; ) {
2642 struct r5dev *dev = &sh->dev[i];
2643 /* check if we haven't enough data */
2644 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2645 i != pd_idx && i != qd_idx &&
2646 !test_bit(R5_LOCKED, &dev->flags) &&
2647 !(test_bit(R5_UPTODATE, &dev->flags) ||
2648 test_bit(R5_Wantcompute, &dev->flags))) {
2649 rcw++;
2650 if (!test_bit(R5_Insync, &dev->flags))
2651 continue; /* it's a failed drive */
2652
2653 if (
2654 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2655 pr_debug("Read_old stripe %llu "
2656 "block %d for Reconstruct\n",
2657 (unsigned long long)sh->sector, i);
2658 set_bit(R5_LOCKED, &dev->flags);
2659 set_bit(R5_Wantread, &dev->flags);
2660 s->locked++;
2661 } else {
2662 pr_debug("Request delayed stripe %llu "
2663 "block %d for Reconstruct\n",
2664 (unsigned long long)sh->sector, i);
2665 set_bit(STRIPE_DELAYED, &sh->state);
2666 set_bit(STRIPE_HANDLE, &sh->state);
2667 }
2668 }
2669 }
2670 /* now if nothing is locked, and if we have enough data, we can start a
2671 * write request
2672 */
2673 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2674 s->locked == 0 && rcw == 0 &&
2675 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2676 schedule_reconstruction(sh, s, 1, 0);
2677 }
2678}
2679
2680static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2658static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2681 struct stripe_head_state *s, int disks) 2659 struct stripe_head_state *s, int disks)
2682{ 2660{
@@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2695 s->uptodate--; 2673 s->uptodate--;
2696 break; 2674 break;
2697 } 2675 }
2698 dev = &sh->dev[s->failed_num]; 2676 dev = &sh->dev[s->failed_num[0]];
2699 /* fall through */ 2677 /* fall through */
2700 case check_state_compute_result: 2678 case check_state_compute_result:
2701 sh->check_state = check_state_idle; 2679 sh->check_state = check_state_idle;
@@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2767 2745
2768static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2746static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2769 struct stripe_head_state *s, 2747 struct stripe_head_state *s,
2770 struct r6_state *r6s, int disks) 2748 int disks)
2771{ 2749{
2772 int pd_idx = sh->pd_idx; 2750 int pd_idx = sh->pd_idx;
2773 int qd_idx = sh->qd_idx; 2751 int qd_idx = sh->qd_idx;
@@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2786 switch (sh->check_state) { 2764 switch (sh->check_state) {
2787 case check_state_idle: 2765 case check_state_idle:
2788 /* start a new check operation if there are < 2 failures */ 2766 /* start a new check operation if there are < 2 failures */
2789 if (s->failed == r6s->q_failed) { 2767 if (s->failed == s->q_failed) {
2790 /* The only possible failed device holds Q, so it 2768 /* The only possible failed device holds Q, so it
2791 * makes sense to check P (If anything else were failed, 2769 * makes sense to check P (If anything else were failed,
2792 * we would have used P to recreate it). 2770 * we would have used P to recreate it).
2793 */ 2771 */
2794 sh->check_state = check_state_run; 2772 sh->check_state = check_state_run;
2795 } 2773 }
2796 if (!r6s->q_failed && s->failed < 2) { 2774 if (!s->q_failed && s->failed < 2) {
2797 /* Q is not failed, and we didn't use it to generate 2775 /* Q is not failed, and we didn't use it to generate
2798 * anything, so it makes sense to check it 2776 * anything, so it makes sense to check it
2799 */ 2777 */
@@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2835 */ 2813 */
2836 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2814 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2837 if (s->failed == 2) { 2815 if (s->failed == 2) {
2838 dev = &sh->dev[r6s->failed_num[1]]; 2816 dev = &sh->dev[s->failed_num[1]];
2839 s->locked++; 2817 s->locked++;
2840 set_bit(R5_LOCKED, &dev->flags); 2818 set_bit(R5_LOCKED, &dev->flags);
2841 set_bit(R5_Wantwrite, &dev->flags); 2819 set_bit(R5_Wantwrite, &dev->flags);
2842 } 2820 }
2843 if (s->failed >= 1) { 2821 if (s->failed >= 1) {
2844 dev = &sh->dev[r6s->failed_num[0]]; 2822 dev = &sh->dev[s->failed_num[0]];
2845 s->locked++; 2823 s->locked++;
2846 set_bit(R5_LOCKED, &dev->flags); 2824 set_bit(R5_LOCKED, &dev->flags);
2847 set_bit(R5_Wantwrite, &dev->flags); 2825 set_bit(R5_Wantwrite, &dev->flags);
@@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2928 } 2906 }
2929} 2907}
2930 2908
2931static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2909static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
2932 struct r6_state *r6s)
2933{ 2910{
2934 int i; 2911 int i;
2935 2912
@@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2971 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2948 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2972 for (j = 0; j < conf->raid_disks; j++) 2949 for (j = 0; j < conf->raid_disks; j++)
2973 if (j != sh2->pd_idx && 2950 if (j != sh2->pd_idx &&
2974 (!r6s || j != sh2->qd_idx) && 2951 j != sh2->qd_idx &&
2975 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2952 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2976 break; 2953 break;
2977 if (j == conf->raid_disks) { 2954 if (j == conf->raid_disks) {
@@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
3006 * 2983 *
3007 */ 2984 */
3008 2985
3009static void handle_stripe5(struct stripe_head *sh) 2986static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3010{ 2987{
3011 raid5_conf_t *conf = sh->raid_conf; 2988 raid5_conf_t *conf = sh->raid_conf;
3012 int disks = sh->disks, i; 2989 int disks = sh->disks;
3013 struct bio *return_bi = NULL;
3014 struct stripe_head_state s;
3015 struct r5dev *dev; 2990 struct r5dev *dev;
3016 mdk_rdev_t *blocked_rdev = NULL; 2991 int i;
3017 int prexor;
3018 int dec_preread_active = 0;
3019 2992
3020 memset(&s, 0, sizeof(s)); 2993 memset(s, 0, sizeof(*s));
3021 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
3022 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
3023 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
3024 sh->reconstruct_state);
3025 2994
3026 spin_lock(&sh->lock); 2995 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
3027 clear_bit(STRIPE_HANDLE, &sh->state); 2996 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3028 clear_bit(STRIPE_DELAYED, &sh->state); 2997 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3029 2998 s->failed_num[0] = -1;
3030 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2999 s->failed_num[1] = -1;
3031 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3032 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3033 3000
3034 /* Now to look around and see what can be done */ 3001 /* Now to look around and see what can be done */
3035 rcu_read_lock(); 3002 rcu_read_lock();
3003 spin_lock_irq(&conf->device_lock);
3036 for (i=disks; i--; ) { 3004 for (i=disks; i--; ) {
3037 mdk_rdev_t *rdev; 3005 mdk_rdev_t *rdev;
3006 sector_t first_bad;
3007 int bad_sectors;
3008 int is_bad = 0;
3038 3009
3039 dev = &sh->dev[i]; 3010 dev = &sh->dev[i];
3040 3011
3041 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 3012 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3042 "written %p\n", i, dev->flags, dev->toread, dev->read, 3013 i, dev->flags, dev->toread, dev->towrite, dev->written);
3043 dev->towrite, dev->written); 3014 /* maybe we can reply to a read
3044
3045 /* maybe we can request a biofill operation
3046 * 3015 *
3047 * new wantfill requests are only permitted while 3016 * new wantfill requests are only permitted while
3048 * ops_complete_biofill is guaranteed to be inactive 3017 * ops_complete_biofill is guaranteed to be inactive
@@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh)
3052 set_bit(R5_Wantfill, &dev->flags); 3021 set_bit(R5_Wantfill, &dev->flags);
3053 3022
3054 /* now count some things */ 3023 /* now count some things */
3055 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3024 if (test_bit(R5_LOCKED, &dev->flags))
3056 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3025 s->locked++;
3057 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 3026 if (test_bit(R5_UPTODATE, &dev->flags))
3027 s->uptodate++;
3028 if (test_bit(R5_Wantcompute, &dev->flags)) {
3029 s->compute++;
3030 BUG_ON(s->compute > 2);
3031 }
3058 3032
3059 if (test_bit(R5_Wantfill, &dev->flags)) 3033 if (test_bit(R5_Wantfill, &dev->flags))
3060 s.to_fill++; 3034 s->to_fill++;
3061 else if (dev->toread) 3035 else if (dev->toread)
3062 s.to_read++; 3036 s->to_read++;
3063 if (dev->towrite) { 3037 if (dev->towrite) {
3064 s.to_write++; 3038 s->to_write++;
3065 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3039 if (!test_bit(R5_OVERWRITE, &dev->flags))
3066 s.non_overwrite++; 3040 s->non_overwrite++;
3067 } 3041 }
3068 if (dev->written) 3042 if (dev->written)
3069 s.written++; 3043 s->written++;
3070 rdev = rcu_dereference(conf->disks[i].rdev); 3044 rdev = rcu_dereference(conf->disks[i].rdev);
3071 if (blocked_rdev == NULL && 3045 if (rdev) {
3072 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3046 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3073 blocked_rdev = rdev; 3047 &first_bad, &bad_sectors);
3074 atomic_inc(&rdev->nr_pending); 3048 if (s->blocked_rdev == NULL
3049 && (test_bit(Blocked, &rdev->flags)
3050 || is_bad < 0)) {
3051 if (is_bad < 0)
3052 set_bit(BlockedBadBlocks,
3053 &rdev->flags);
3054 s->blocked_rdev = rdev;
3055 atomic_inc(&rdev->nr_pending);
3056 }
3075 } 3057 }
3076 clear_bit(R5_Insync, &dev->flags); 3058 clear_bit(R5_Insync, &dev->flags);
3077 if (!rdev) 3059 if (!rdev)
3078 /* Not in-sync */; 3060 /* Not in-sync */;
3079 else if (test_bit(In_sync, &rdev->flags)) 3061 else if (is_bad) {
3062 /* also not in-sync */
3063 if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3064 /* treat as in-sync, but with a read error
3065 * which we can now try to correct
3066 */
3067 set_bit(R5_Insync, &dev->flags);
3068 set_bit(R5_ReadError, &dev->flags);
3069 }
3070 } else if (test_bit(In_sync, &rdev->flags))
3080 set_bit(R5_Insync, &dev->flags); 3071 set_bit(R5_Insync, &dev->flags);
3081 else { 3072 else {
3082 /* could be in-sync depending on recovery/reshape status */ 3073 /* in sync if before recovery_offset */
3083 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3074 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3084 set_bit(R5_Insync, &dev->flags); 3075 set_bit(R5_Insync, &dev->flags);
3085 } 3076 }
3077 if (test_bit(R5_WriteError, &dev->flags)) {
3078 clear_bit(R5_Insync, &dev->flags);
3079 if (!test_bit(Faulty, &rdev->flags)) {
3080 s->handle_bad_blocks = 1;
3081 atomic_inc(&rdev->nr_pending);
3082 } else
3083 clear_bit(R5_WriteError, &dev->flags);
3084 }
3085 if (test_bit(R5_MadeGood, &dev->flags)) {
3086 if (!test_bit(Faulty, &rdev->flags)) {
3087 s->handle_bad_blocks = 1;
3088 atomic_inc(&rdev->nr_pending);
3089 } else
3090 clear_bit(R5_MadeGood, &dev->flags);
3091 }
3086 if (!test_bit(R5_Insync, &dev->flags)) { 3092 if (!test_bit(R5_Insync, &dev->flags)) {
3087 /* The ReadError flag will just be confusing now */ 3093 /* The ReadError flag will just be confusing now */
3088 clear_bit(R5_ReadError, &dev->flags); 3094 clear_bit(R5_ReadError, &dev->flags);
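The per-device loop above consults the new bad-block log through is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors). The following is only a conceptual sketch of that kind of interval query, with a single hypothetical recorded range instead of md's packed, sorted log (which can also return -1 for unacknowledged entries):

	/* Illustrative sketch, not the md implementation. */
	#include <stdio.h>

	typedef unsigned long long sector_t;

	struct bad_range {              /* hypothetical recorded range */
		sector_t start;
		int      len;
	};

	static int toy_is_badblock(const struct bad_range *bad,
				   sector_t sector, int sectors,
				   sector_t *first_bad, int *bad_sectors)
	{
		sector_t end = sector + sectors;
		sector_t bad_end = bad->start + bad->len;

		if (bad_end <= sector || bad->start >= end)
			return 0;                   /* no overlap: region is clean */
		*first_bad = bad->start > sector ? bad->start : sector;
		*bad_sectors = (bad_end < end ? bad_end : end) - *first_bad;
		return 1;                           /* overlap: caller must avoid it */
	}

	int main(void)
	{
		struct bad_range bad = { .start = 1000, .len = 8 };
		sector_t first_bad;
		int bad_sectors;

		if (toy_is_badblock(&bad, 996, 16, &first_bad, &bad_sectors))
			printf("bad from %llu for %d sectors\n", first_bad, bad_sectors);
		return 0;
	}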
@@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh)
3091 if (test_bit(R5_ReadError, &dev->flags)) 3097 if (test_bit(R5_ReadError, &dev->flags))
3092 clear_bit(R5_Insync, &dev->flags); 3098 clear_bit(R5_Insync, &dev->flags);
3093 if (!test_bit(R5_Insync, &dev->flags)) { 3099 if (!test_bit(R5_Insync, &dev->flags)) {
3094 s.failed++; 3100 if (s->failed < 2)
3095 s.failed_num = i; 3101 s->failed_num[s->failed] = i;
3102 s->failed++;
3096 } 3103 }
3097 } 3104 }
3105 spin_unlock_irq(&conf->device_lock);
3098 rcu_read_unlock(); 3106 rcu_read_unlock();
3099
3100 if (unlikely(blocked_rdev)) {
3101 if (s.syncing || s.expanding || s.expanded ||
3102 s.to_write || s.written) {
3103 set_bit(STRIPE_HANDLE, &sh->state);
3104 goto unlock;
3105 }
3106 /* There is nothing for the blocked_rdev to block */
3107 rdev_dec_pending(blocked_rdev, conf->mddev);
3108 blocked_rdev = NULL;
3109 }
3110
3111 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3112 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3113 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3114 }
3115
3116 pr_debug("locked=%d uptodate=%d to_read=%d"
3117 " to_write=%d failed=%d failed_num=%d\n",
3118 s.locked, s.uptodate, s.to_read, s.to_write,
3119 s.failed, s.failed_num);
3120 /* check if the array has lost two devices and, if so, some requests might
3121 * need to be failed
3122 */
3123 if (s.failed > 1 && s.to_read+s.to_write+s.written)
3124 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3125 if (s.failed > 1 && s.syncing) {
3126 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3127 clear_bit(STRIPE_SYNCING, &sh->state);
3128 s.syncing = 0;
3129 }
3130
3131 /* might be able to return some write requests if the parity block
3132 * is safe, or on a failed drive
3133 */
3134 dev = &sh->dev[sh->pd_idx];
3135 if ( s.written &&
3136 ((test_bit(R5_Insync, &dev->flags) &&
3137 !test_bit(R5_LOCKED, &dev->flags) &&
3138 test_bit(R5_UPTODATE, &dev->flags)) ||
3139 (s.failed == 1 && s.failed_num == sh->pd_idx)))
3140 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3141
3142 /* Now we might consider reading some blocks, either to check/generate
3143 * parity, or to satisfy requests
3144 * or to load a block that is being partially written.
3145 */
3146 if (s.to_read || s.non_overwrite ||
3147 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3148 handle_stripe_fill5(sh, &s, disks);
3149
3150 /* Now we check to see if any write operations have recently
3151 * completed
3152 */
3153 prexor = 0;
3154 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3155 prexor = 1;
3156 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3157 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3158 sh->reconstruct_state = reconstruct_state_idle;
3159
3160 /* All the 'written' buffers and the parity block are ready to
3161 * be written back to disk
3162 */
3163 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3164 for (i = disks; i--; ) {
3165 dev = &sh->dev[i];
3166 if (test_bit(R5_LOCKED, &dev->flags) &&
3167 (i == sh->pd_idx || dev->written)) {
3168 pr_debug("Writing block %d\n", i);
3169 set_bit(R5_Wantwrite, &dev->flags);
3170 if (prexor)
3171 continue;
3172 if (!test_bit(R5_Insync, &dev->flags) ||
3173 (i == sh->pd_idx && s.failed == 0))
3174 set_bit(STRIPE_INSYNC, &sh->state);
3175 }
3176 }
3177 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3178 dec_preread_active = 1;
3179 }
3180
3181 /* Now to consider new write requests and what else, if anything
3182 * should be read. We do not handle new writes when:
3183 * 1/ A 'write' operation (copy+xor) is already in flight.
3184 * 2/ A 'check' operation is in flight, as it may clobber the parity
3185 * block.
3186 */
3187 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3188 handle_stripe_dirtying5(conf, sh, &s, disks);
3189
3190 /* maybe we need to check and possibly fix the parity for this stripe
3191 * Any reads will already have been scheduled, so we just see if enough
3192 * data is available. The parity check is held off while parity
3193 * dependent operations are in flight.
3194 */
3195 if (sh->check_state ||
3196 (s.syncing && s.locked == 0 &&
3197 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3198 !test_bit(STRIPE_INSYNC, &sh->state)))
3199 handle_parity_checks5(conf, sh, &s, disks);
3200
3201 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3202 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3203 clear_bit(STRIPE_SYNCING, &sh->state);
3204 }
3205
3206 /* If the failed drive is just a ReadError, then we might need to progress
3207 * the repair/check process
3208 */
3209 if (s.failed == 1 && !conf->mddev->ro &&
3210 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
3211 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
3212 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
3213 ) {
3214 dev = &sh->dev[s.failed_num];
3215 if (!test_bit(R5_ReWrite, &dev->flags)) {
3216 set_bit(R5_Wantwrite, &dev->flags);
3217 set_bit(R5_ReWrite, &dev->flags);
3218 set_bit(R5_LOCKED, &dev->flags);
3219 s.locked++;
3220 } else {
3221 /* let's read it back */
3222 set_bit(R5_Wantread, &dev->flags);
3223 set_bit(R5_LOCKED, &dev->flags);
3224 s.locked++;
3225 }
3226 }
3227
3228 /* Finish reconstruct operations initiated by the expansion process */
3229 if (sh->reconstruct_state == reconstruct_state_result) {
3230 struct stripe_head *sh2
3231 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3232 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3233 /* sh cannot be written until sh2 has been read.
3234 * so arrange for sh to be delayed a little
3235 */
3236 set_bit(STRIPE_DELAYED, &sh->state);
3237 set_bit(STRIPE_HANDLE, &sh->state);
3238 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3239 &sh2->state))
3240 atomic_inc(&conf->preread_active_stripes);
3241 release_stripe(sh2);
3242 goto unlock;
3243 }
3244 if (sh2)
3245 release_stripe(sh2);
3246
3247 sh->reconstruct_state = reconstruct_state_idle;
3248 clear_bit(STRIPE_EXPANDING, &sh->state);
3249 for (i = conf->raid_disks; i--; ) {
3250 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3251 set_bit(R5_LOCKED, &sh->dev[i].flags);
3252 s.locked++;
3253 }
3254 }
3255
3256 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3257 !sh->reconstruct_state) {
3258 /* Need to write out all blocks after computing parity */
3259 sh->disks = conf->raid_disks;
3260 stripe_set_idx(sh->sector, conf, 0, sh);
3261 schedule_reconstruction(sh, &s, 1, 1);
3262 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3263 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3264 atomic_dec(&conf->reshape_stripes);
3265 wake_up(&conf->wait_for_overlap);
3266 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3267 }
3268
3269 if (s.expanding && s.locked == 0 &&
3270 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3271 handle_stripe_expansion(conf, sh, NULL);
3272
3273 unlock:
3274 spin_unlock(&sh->lock);
3275
3276 /* wait for this device to become unblocked */
3277 if (unlikely(blocked_rdev))
3278 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3279
3280 if (s.ops_request)
3281 raid_run_ops(sh, s.ops_request);
3282
3283 ops_run_io(sh, &s);
3284
3285 if (dec_preread_active) {
3286 /* We delay this until after ops_run_io so that if make_request
3287 * is waiting on a flush, it won't continue until the writes
3288 * have actually been submitted.
3289 */
3290 atomic_dec(&conf->preread_active_stripes);
3291 if (atomic_read(&conf->preread_active_stripes) <
3292 IO_THRESHOLD)
3293 md_wakeup_thread(conf->mddev->thread);
3294 }
3295 return_io(return_bi);
3296} 3107}
3297 3108
3298static void handle_stripe6(struct stripe_head *sh) 3109static void handle_stripe(struct stripe_head *sh)
3299{ 3110{
3111 struct stripe_head_state s;
3300 raid5_conf_t *conf = sh->raid_conf; 3112 raid5_conf_t *conf = sh->raid_conf;
3113 int i;
3114 int prexor;
3301 int disks = sh->disks; 3115 int disks = sh->disks;
3302 struct bio *return_bi = NULL; 3116 struct r5dev *pdev, *qdev;
3303 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; 3117
3304 struct stripe_head_state s; 3118 clear_bit(STRIPE_HANDLE, &sh->state);
3305 struct r6_state r6s; 3119 if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
3306 struct r5dev *dev, *pdev, *qdev; 3120 /* already being handled, ensure it gets handled
3307 mdk_rdev_t *blocked_rdev = NULL; 3121 * again when current action finishes */
3308 int dec_preread_active = 0; 3122 set_bit(STRIPE_HANDLE, &sh->state);
3123 return;
3124 }
3125
3126 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3127 set_bit(STRIPE_SYNCING, &sh->state);
3128 clear_bit(STRIPE_INSYNC, &sh->state);
3129 }
3130 clear_bit(STRIPE_DELAYED, &sh->state);
3309 3131
3310 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3132 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3311 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3133 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3312 (unsigned long long)sh->sector, sh->state, 3134 (unsigned long long)sh->sector, sh->state,
3313 atomic_read(&sh->count), pd_idx, qd_idx, 3135 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
3314 sh->check_state, sh->reconstruct_state); 3136 sh->check_state, sh->reconstruct_state);
3315 memset(&s, 0, sizeof(s));
3316
3317 spin_lock(&sh->lock);
3318 clear_bit(STRIPE_HANDLE, &sh->state);
3319 clear_bit(STRIPE_DELAYED, &sh->state);
3320
3321 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3322 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3323 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3324 /* Now to look around and see what can be done */
3325
3326 rcu_read_lock();
3327 for (i=disks; i--; ) {
3328 mdk_rdev_t *rdev;
3329 dev = &sh->dev[i];
3330 3137
3331 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3138 analyse_stripe(sh, &s);
3332 i, dev->flags, dev->toread, dev->towrite, dev->written);
3333 /* maybe we can reply to a read
3334 *
3335 * new wantfill requests are only permitted while
3336 * ops_complete_biofill is guaranteed to be inactive
3337 */
3338 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3339 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3340 set_bit(R5_Wantfill, &dev->flags);
3341 3139
3342 /* now count some things */ 3140 if (s.handle_bad_blocks) {
3343 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3141 set_bit(STRIPE_HANDLE, &sh->state);
3344 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3142 goto finish;
3345 if (test_bit(R5_Wantcompute, &dev->flags)) {
3346 s.compute++;
3347 BUG_ON(s.compute > 2);
3348 }
3349
3350 if (test_bit(R5_Wantfill, &dev->flags)) {
3351 s.to_fill++;
3352 } else if (dev->toread)
3353 s.to_read++;
3354 if (dev->towrite) {
3355 s.to_write++;
3356 if (!test_bit(R5_OVERWRITE, &dev->flags))
3357 s.non_overwrite++;
3358 }
3359 if (dev->written)
3360 s.written++;
3361 rdev = rcu_dereference(conf->disks[i].rdev);
3362 if (blocked_rdev == NULL &&
3363 rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3364 blocked_rdev = rdev;
3365 atomic_inc(&rdev->nr_pending);
3366 }
3367 clear_bit(R5_Insync, &dev->flags);
3368 if (!rdev)
3369 /* Not in-sync */;
3370 else if (test_bit(In_sync, &rdev->flags))
3371 set_bit(R5_Insync, &dev->flags);
3372 else {
3373 /* in sync if before recovery_offset */
3374 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3375 set_bit(R5_Insync, &dev->flags);
3376 }
3377 if (!test_bit(R5_Insync, &dev->flags)) {
3378 /* The ReadError flag will just be confusing now */
3379 clear_bit(R5_ReadError, &dev->flags);
3380 clear_bit(R5_ReWrite, &dev->flags);
3381 }
3382 if (test_bit(R5_ReadError, &dev->flags))
3383 clear_bit(R5_Insync, &dev->flags);
3384 if (!test_bit(R5_Insync, &dev->flags)) {
3385 if (s.failed < 2)
3386 r6s.failed_num[s.failed] = i;
3387 s.failed++;
3388 }
3389 } 3143 }
3390 rcu_read_unlock();
3391 3144
3392 if (unlikely(blocked_rdev)) { 3145 if (unlikely(s.blocked_rdev)) {
3393 if (s.syncing || s.expanding || s.expanded || 3146 if (s.syncing || s.expanding || s.expanded ||
3394 s.to_write || s.written) { 3147 s.to_write || s.written) {
3395 set_bit(STRIPE_HANDLE, &sh->state); 3148 set_bit(STRIPE_HANDLE, &sh->state);
3396 goto unlock; 3149 goto finish;
3397 } 3150 }
3398 /* There is nothing for the blocked_rdev to block */ 3151 /* There is nothing for the blocked_rdev to block */
3399 rdev_dec_pending(blocked_rdev, conf->mddev); 3152 rdev_dec_pending(s.blocked_rdev, conf->mddev);
3400 blocked_rdev = NULL; 3153 s.blocked_rdev = NULL;
3401 } 3154 }
3402 3155
3403 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3156 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh)
3408 pr_debug("locked=%d uptodate=%d to_read=%d" 3161 pr_debug("locked=%d uptodate=%d to_read=%d"
3409 " to_write=%d failed=%d failed_num=%d,%d\n", 3162 " to_write=%d failed=%d failed_num=%d,%d\n",
3410 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3163 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3411 r6s.failed_num[0], r6s.failed_num[1]); 3164 s.failed_num[0], s.failed_num[1]);
3412 /* check if the array has lost >2 devices and, if so, some requests 3165 /* check if the array has lost more than max_degraded devices and,
3413 * might need to be failed 3166 * if so, some requests might need to be failed.
3414 */ 3167 */
3415 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3168 if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written)
3416 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3169 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3417 if (s.failed > 2 && s.syncing) { 3170 if (s.failed > conf->max_degraded && s.syncing)
3418 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3171 handle_failed_sync(conf, sh, &s);
3419 clear_bit(STRIPE_SYNCING, &sh->state);
3420 s.syncing = 0;
3421 }
3422 3172
3423 /* 3173 /*
3424 * might be able to return some write requests if the parity blocks 3174 * might be able to return some write requests if the parity blocks
3425 * are safe, or on a failed drive 3175 * are safe, or on a failed drive
3426 */ 3176 */
3427 pdev = &sh->dev[pd_idx]; 3177 pdev = &sh->dev[sh->pd_idx];
3428 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3178 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3429 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3179 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3430 qdev = &sh->dev[qd_idx]; 3180 qdev = &sh->dev[sh->qd_idx];
3431 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3181 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3432 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3182 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3433 3183 || conf->level < 6;
3434 if ( s.written && 3184
3435 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3185 if (s.written &&
3186 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3436 && !test_bit(R5_LOCKED, &pdev->flags) 3187 && !test_bit(R5_LOCKED, &pdev->flags)
3437 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3188 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3438 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3189 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3439 && !test_bit(R5_LOCKED, &qdev->flags) 3190 && !test_bit(R5_LOCKED, &qdev->flags)
3440 && test_bit(R5_UPTODATE, &qdev->flags))))) 3191 && test_bit(R5_UPTODATE, &qdev->flags)))))
3441 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3192 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3442 3193
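Restated in isolation, the p_failed/q_failed computation above reduces to one helper applied to both parity slots; the `conf->level < 6` term simply forces q_failed on RAID5, where no Q block exists. A hedged sketch with made-up index values:

	#include <stdbool.h>
	#include <stdio.h>

	static bool leg_failed(const int failed_num[2], int failed, int idx)
	{
		return (failed >= 1 && failed_num[0] == idx) ||
		       (failed >= 2 && failed_num[1] == idx);
	}

	int main(void)
	{
		int failed_num[2] = { 3, -1 };    /* device 3 is out */
		int failed = 1, pd_idx = 3, qd_idx = 4, level = 5;
		bool p_failed = leg_failed(failed_num, failed, pd_idx);
		bool q_failed = leg_failed(failed_num, failed, qd_idx) || level < 6;

		printf("p_failed=%d q_failed=%d\n", p_failed, q_failed);
		return 0;
	}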
3443 /* Now we might consider reading some blocks, either to check/generate 3194 /* Now we might consider reading some blocks, either to check/generate
3444 * parity, or to satisfy requests 3195 * parity, or to satisfy requests
3445 * or to load a block that is being partially written. 3196 * or to load a block that is being partially written.
3446 */ 3197 */
3447 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3198 if (s.to_read || s.non_overwrite
3448 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3199 || (conf->level == 6 && s.to_write && s.failed)
3449 handle_stripe_fill6(sh, &s, &r6s, disks); 3200 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3201 handle_stripe_fill(sh, &s, disks);
3450 3202
3451 /* Now we check to see if any write operations have recently 3203 /* Now we check to see if any write operations have recently
3452 * completed 3204 * completed
3453 */ 3205 */
3454 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3206 prexor = 0;
3455 3207 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3208 prexor = 1;
3209 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3210 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3456 sh->reconstruct_state = reconstruct_state_idle; 3211 sh->reconstruct_state = reconstruct_state_idle;
3457 /* All the 'written' buffers and the parity blocks are ready to 3212
3213 /* All the 'written' buffers and the parity block are ready to
3458 * be written back to disk 3214 * be written back to disk
3459 */ 3215 */
3460 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3216 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3461 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3217 BUG_ON(sh->qd_idx >= 0 &&
3218 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3462 for (i = disks; i--; ) { 3219 for (i = disks; i--; ) {
3463 dev = &sh->dev[i]; 3220 struct r5dev *dev = &sh->dev[i];
3464 if (test_bit(R5_LOCKED, &dev->flags) && 3221 if (test_bit(R5_LOCKED, &dev->flags) &&
3465 (i == sh->pd_idx || i == qd_idx || 3222 (i == sh->pd_idx || i == sh->qd_idx ||
3466 dev->written)) { 3223 dev->written)) {
3467 pr_debug("Writing block %d\n", i); 3224 pr_debug("Writing block %d\n", i);
3468 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3469 set_bit(R5_Wantwrite, &dev->flags); 3225 set_bit(R5_Wantwrite, &dev->flags);
3226 if (prexor)
3227 continue;
3470 if (!test_bit(R5_Insync, &dev->flags) || 3228 if (!test_bit(R5_Insync, &dev->flags) ||
3471 ((i == sh->pd_idx || i == qd_idx) && 3229 ((i == sh->pd_idx || i == sh->qd_idx) &&
3472 s.failed == 0)) 3230 s.failed == 0))
3473 set_bit(STRIPE_INSYNC, &sh->state); 3231 set_bit(STRIPE_INSYNC, &sh->state);
3474 } 3232 }
3475 } 3233 }
3476 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3234 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3477 dec_preread_active = 1; 3235 s.dec_preread_active = 1;
3478 } 3236 }
3479 3237
3480 /* Now to consider new write requests and what else, if anything 3238 /* Now to consider new write requests and what else, if anything
3481 * should be read. We do not handle new writes when: 3239 * should be read. We do not handle new writes when:
3482 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3240 * 1/ A 'write' operation (copy+xor) is already in flight.
3483 * 2/ A 'check' operation is in flight, as it may clobber the parity 3241 * 2/ A 'check' operation is in flight, as it may clobber the parity
3484 * block. 3242 * block.
3485 */ 3243 */
3486 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3244 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3487 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3245 handle_stripe_dirtying(conf, sh, &s, disks);
3488 3246
3489 /* maybe we need to check and possibly fix the parity for this stripe 3247 /* maybe we need to check and possibly fix the parity for this stripe
3490 * Any reads will already have been scheduled, so we just see if enough 3248 * Any reads will already have been scheduled, so we just see if enough
@@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh)
3494 if (sh->check_state || 3252 if (sh->check_state ||
3495 (s.syncing && s.locked == 0 && 3253 (s.syncing && s.locked == 0 &&
3496 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3254 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3497 !test_bit(STRIPE_INSYNC, &sh->state))) 3255 !test_bit(STRIPE_INSYNC, &sh->state))) {
3498 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3256 if (conf->level == 6)
3257 handle_parity_checks6(conf, sh, &s, disks);
3258 else
3259 handle_parity_checks5(conf, sh, &s, disks);
3260 }
3499 3261
3500 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3262 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3501 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3263 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3502 clear_bit(STRIPE_SYNCING, &sh->state); 3264 clear_bit(STRIPE_SYNCING, &sh->state);
3503 } 3265 }
3504 3266
3505 /* If the failed drives are just a ReadError, then we might need 3267 /* If the failed drives are just a ReadError, then we might need
3506 * to progress the repair/check process 3268 * to progress the repair/check process
3507 */ 3269 */
3508 if (s.failed <= 2 && !conf->mddev->ro) 3270 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3509 for (i = 0; i < s.failed; i++) { 3271 for (i = 0; i < s.failed; i++) {
3510 dev = &sh->dev[r6s.failed_num[i]]; 3272 struct r5dev *dev = &sh->dev[s.failed_num[i]];
3511 if (test_bit(R5_ReadError, &dev->flags) 3273 if (test_bit(R5_ReadError, &dev->flags)
3512 && !test_bit(R5_LOCKED, &dev->flags) 3274 && !test_bit(R5_LOCKED, &dev->flags)
3513 && test_bit(R5_UPTODATE, &dev->flags) 3275 && test_bit(R5_UPTODATE, &dev->flags)
@@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh)
3526 } 3288 }
3527 } 3289 }
3528 3290
3291
3529 /* Finish reconstruct operations initiated by the expansion process */ 3292 /* Finish reconstruct operations initiated by the expansion process */
3530 if (sh->reconstruct_state == reconstruct_state_result) { 3293 if (sh->reconstruct_state == reconstruct_state_result) {
3294 struct stripe_head *sh_src
3295 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3296 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3297 /* sh cannot be written until sh_src has been read.
3298 * so arrange for sh to be delayed a little
3299 */
3300 set_bit(STRIPE_DELAYED, &sh->state);
3301 set_bit(STRIPE_HANDLE, &sh->state);
3302 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3303 &sh_src->state))
3304 atomic_inc(&conf->preread_active_stripes);
3305 release_stripe(sh_src);
3306 goto finish;
3307 }
3308 if (sh_src)
3309 release_stripe(sh_src);
3310
3531 sh->reconstruct_state = reconstruct_state_idle; 3311 sh->reconstruct_state = reconstruct_state_idle;
3532 clear_bit(STRIPE_EXPANDING, &sh->state); 3312 clear_bit(STRIPE_EXPANDING, &sh->state);
3533 for (i = conf->raid_disks; i--; ) { 3313 for (i = conf->raid_disks; i--; ) {
@@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh)
3539 3319
3540 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3320 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3541 !sh->reconstruct_state) { 3321 !sh->reconstruct_state) {
3542 struct stripe_head *sh2 3322 /* Need to write out all blocks after computing parity */
3543 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3544 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3545 /* sh cannot be written until sh2 has been read.
3546 * so arrange for sh to be delayed a little
3547 */
3548 set_bit(STRIPE_DELAYED, &sh->state);
3549 set_bit(STRIPE_HANDLE, &sh->state);
3550 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3551 &sh2->state))
3552 atomic_inc(&conf->preread_active_stripes);
3553 release_stripe(sh2);
3554 goto unlock;
3555 }
3556 if (sh2)
3557 release_stripe(sh2);
3558
3559 /* Need to write out all blocks after computing P&Q */
3560 sh->disks = conf->raid_disks; 3323 sh->disks = conf->raid_disks;
3561 stripe_set_idx(sh->sector, conf, 0, sh); 3324 stripe_set_idx(sh->sector, conf, 0, sh);
3562 schedule_reconstruction(sh, &s, 1, 1); 3325 schedule_reconstruction(sh, &s, 1, 1);
@@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh)
3569 3332
3570 if (s.expanding && s.locked == 0 && 3333 if (s.expanding && s.locked == 0 &&
3571 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3334 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3572 handle_stripe_expansion(conf, sh, &r6s); 3335 handle_stripe_expansion(conf, sh);
3573
3574 unlock:
3575 spin_unlock(&sh->lock);
3576 3336
3337finish:
3577 /* wait for this device to become unblocked */ 3338 /* wait for this device to become unblocked */
3578 if (unlikely(blocked_rdev)) 3339 if (unlikely(s.blocked_rdev))
3579 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3340 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3341
3342 if (s.handle_bad_blocks)
3343 for (i = disks; i--; ) {
3344 mdk_rdev_t *rdev;
3345 struct r5dev *dev = &sh->dev[i];
3346 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3347 /* We own a safe reference to the rdev */
3348 rdev = conf->disks[i].rdev;
3349 if (!rdev_set_badblocks(rdev, sh->sector,
3350 STRIPE_SECTORS, 0))
3351 md_error(conf->mddev, rdev);
3352 rdev_dec_pending(rdev, conf->mddev);
3353 }
3354 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3355 rdev = conf->disks[i].rdev;
3356 rdev_clear_badblocks(rdev, sh->sector,
3357 STRIPE_SECTORS);
3358 rdev_dec_pending(rdev, conf->mddev);
3359 }
3360 }
3580 3361
3581 if (s.ops_request) 3362 if (s.ops_request)
3582 raid_run_ops(sh, s.ops_request); 3363 raid_run_ops(sh, s.ops_request);
3583 3364
3584 ops_run_io(sh, &s); 3365 ops_run_io(sh, &s);
3585 3366
3586 3367 if (s.dec_preread_active) {
3587 if (dec_preread_active) {
3588 /* We delay this until after ops_run_io so that if make_request 3368 /* We delay this until after ops_run_io so that if make_request
3589 * is waiting on a flush, it won't continue until the writes 3369 * is waiting on a flush, it won't continue until the writes
3590 * have actually been submitted. 3370 * have actually been submitted.
@@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh)
3595 md_wakeup_thread(conf->mddev->thread); 3375 md_wakeup_thread(conf->mddev->thread);
3596 } 3376 }
3597 3377
3598 return_io(return_bi); 3378 return_io(s.return_bi);
3599}
3600 3379
3601static void handle_stripe(struct stripe_head *sh) 3380 clear_bit(STRIPE_ACTIVE, &sh->state);
3602{
3603 if (sh->raid_conf->level == 6)
3604 handle_stripe6(sh);
3605 else
3606 handle_stripe5(sh);
3607} 3381}
3608 3382
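The handle_bad_blocks pass in the finish: section above applies a record-first policy: try to log the bad range with rdev_set_badblocks(), and only fail the whole device via md_error() if that is impossible. A toy sketch of that decision, using stand-in helpers rather than the md API:

	#include <stdbool.h>
	#include <stdio.h>

	struct toy_rdev { bool badblocks_full; bool faulty; };

	static bool toy_record_badblocks(struct toy_rdev *rdev,
					 unsigned long long sector, int sectors)
	{
		(void)sector; (void)sectors;
		return !rdev->badblocks_full;   /* pretend the log accepted it */
	}

	static void on_write_error(struct toy_rdev *rdev,
				   unsigned long long sector, int sectors)
	{
		if (!toy_record_badblocks(rdev, sector, sectors)) {
			rdev->faulty = true;        /* fall back: kick the device */
			printf("device failed\n");
		} else {
			printf("recorded %d bad sectors at %llu\n", sectors, sector);
		}
	}

	int main(void)
	{
		struct toy_rdev rdev = { .badblocks_full = false, .faulty = false };
		on_write_error(&rdev, 2048, 8);
		return 0;
	}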
3609static void raid5_activate_delayed(raid5_conf_t *conf) 3383static void raid5_activate_delayed(raid5_conf_t *conf)
@@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3833 rcu_read_lock(); 3607 rcu_read_lock();
3834 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3608 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3835 if (rdev && test_bit(In_sync, &rdev->flags)) { 3609 if (rdev && test_bit(In_sync, &rdev->flags)) {
3610 sector_t first_bad;
3611 int bad_sectors;
3612
3836 atomic_inc(&rdev->nr_pending); 3613 atomic_inc(&rdev->nr_pending);
3837 rcu_read_unlock(); 3614 rcu_read_unlock();
3838 raid_bio->bi_next = (void*)rdev; 3615 raid_bio->bi_next = (void*)rdev;
@@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3840 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3617 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3841 align_bi->bi_sector += rdev->data_offset; 3618 align_bi->bi_sector += rdev->data_offset;
3842 3619
3843 if (!bio_fits_rdev(align_bi)) { 3620 if (!bio_fits_rdev(align_bi) ||
3844 /* too big in some way */ 3621 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3622 &first_bad, &bad_sectors)) {
3623 /* too big in some way, or has a known bad block */
3845 bio_put(align_bi); 3624 bio_put(align_bi);
3846 rdev_dec_pending(rdev, mddev); 3625 rdev_dec_pending(rdev, mddev);
3847 return 0; 3626 return 0;
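The added guard in chunk_aligned_read() above refuses to bypass the stripe cache when the aligned bio overlaps a known bad range; bi_size is in bytes, so it is shifted down by 9 to get sectors. A self-contained sketch of that conversion and overlap test with hypothetical values:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long bi_sector = 4096;   /* start of the aligned read */
		unsigned int bi_size = 64 * 1024;      /* bytes in the bio          */
		unsigned long long bad_start = 4160;   /* hypothetical bad range    */
		int bad_len = 16;

		unsigned int sectors = bi_size >> 9;   /* 128 sectors               */
		int overlaps = !(bad_start + bad_len <= bi_sector ||
				 bad_start >= bi_sector + sectors);

		printf("%u sectors, overlaps bad range: %s\n", sectors,
		       overlaps ? "yes (fall back to stripe cache)" : "no");
		return 0;
	}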
@@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4016 } 3795 }
4017 } 3796 }
4018 3797
4019 if (bio_data_dir(bi) == WRITE && 3798 if (rw == WRITE &&
4020 logical_sector >= mddev->suspend_lo && 3799 logical_sector >= mddev->suspend_lo &&
4021 logical_sector < mddev->suspend_hi) { 3800 logical_sector < mddev->suspend_hi) {
4022 release_stripe(sh); 3801 release_stripe(sh);
@@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4034 } 3813 }
4035 3814
4036 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3815 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4037 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 3816 !add_stripe_bio(sh, bi, dd_idx, rw)) {
4038 /* Stripe is busy expanding or 3817 /* Stripe is busy expanding or
4039 * add failed due to overlap. Flush everything 3818 * add failed due to overlap. Flush everything
4040 * and wait a while 3819 * and wait a while
@@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4375 4154
4376 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4155 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4377 4156
4378 spin_lock(&sh->lock); 4157 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4379 set_bit(STRIPE_SYNCING, &sh->state);
4380 clear_bit(STRIPE_INSYNC, &sh->state);
4381 spin_unlock(&sh->lock);
4382 4158
4383 handle_stripe(sh); 4159 handle_stripe(sh);
4384 release_stripe(sh); 4160 release_stripe(sh);
@@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev)
4509 release_stripe(sh); 4285 release_stripe(sh);
4510 cond_resched(); 4286 cond_resched();
4511 4287
4288 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4289 md_check_recovery(mddev);
4290
4512 spin_lock_irq(&conf->device_lock); 4291 spin_lock_irq(&conf->device_lock);
4513 } 4292 }
4514 pr_debug("%d stripes handled\n", handled); 4293 pr_debug("%d stripes handled\n", handled);
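The new check in raid5d() a few lines above runs md_check_recovery() whenever any mddev flag other than MD_CHANGE_PENDING is set. The "anything but this bit" mask idiom it relies on, shown with made-up flag values:

	#include <stdio.h>

	int main(void)
	{
		unsigned long PENDING = 1ul << 2;             /* stand-in bit    */
		unsigned long flags = (1ul << 0) | PENDING;   /* dirty + pending */

		if (flags & ~PENDING)
			printf("other work pending -> would run recovery check\n");
		return 0;
	}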
@@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
5313 * isn't possible. 5092 * isn't possible.
5314 */ 5093 */
5315 if (!test_bit(Faulty, &rdev->flags) && 5094 if (!test_bit(Faulty, &rdev->flags) &&
5095 mddev->recovery_disabled != conf->recovery_disabled &&
5316 !has_failed(conf) && 5096 !has_failed(conf) &&
5317 number < conf->raid_disks) { 5097 number < conf->raid_disks) {
5318 err = -EBUSY; 5098 err = -EBUSY;
@@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5341 int first = 0; 5121 int first = 0;
5342 int last = conf->raid_disks - 1; 5122 int last = conf->raid_disks - 1;
5343 5123
5124 if (mddev->recovery_disabled == conf->recovery_disabled)
5125 return -EBUSY;
5126
5344 if (has_failed(conf)) 5127 if (has_failed(conf))
5345 /* no point adding a device */ 5128 /* no point adding a device */
5346 return -EINVAL; 5129 return -EINVAL;
@@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev)
5519 if (rdev->raid_disk < 0 && 5302 if (rdev->raid_disk < 0 &&
5520 !test_bit(Faulty, &rdev->flags)) { 5303 !test_bit(Faulty, &rdev->flags)) {
5521 if (raid5_add_disk(mddev, rdev) == 0) { 5304 if (raid5_add_disk(mddev, rdev) == 0) {
5522 char nm[20];
5523 if (rdev->raid_disk 5305 if (rdev->raid_disk
5524 >= conf->previous_raid_disks) { 5306 >= conf->previous_raid_disks) {
5525 set_bit(In_sync, &rdev->flags); 5307 set_bit(In_sync, &rdev->flags);
5526 added_devices++; 5308 added_devices++;
5527 } else 5309 } else
5528 rdev->recovery_offset = 0; 5310 rdev->recovery_offset = 0;
5529 sprintf(nm, "rd%d", rdev->raid_disk); 5311
5530 if (sysfs_create_link(&mddev->kobj, 5312 if (sysfs_link_rdev(mddev, rdev))
5531 &rdev->kobj, nm))
5532 /* Failure here is OK */; 5313 /* Failure here is OK */;
5533 } 5314 }
5534 } else if (rdev->raid_disk >= conf->previous_raid_disks 5315 } else if (rdev->raid_disk >= conf->previous_raid_disks
@@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5624 d++) { 5405 d++) {
5625 mdk_rdev_t *rdev = conf->disks[d].rdev; 5406 mdk_rdev_t *rdev = conf->disks[d].rdev;
5626 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5407 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5627 char nm[20]; 5408 sysfs_unlink_rdev(mddev, rdev);
5628 sprintf(nm, "rd%d", rdev->raid_disk);
5629 sysfs_remove_link(&mddev->kobj, nm);
5630 rdev->raid_disk = -1; 5409 rdev->raid_disk = -1;
5631 } 5410 }
5632 } 5411 }
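The sysfs_link_rdev()/sysfs_unlink_rdev() helpers adopted above wrap the open-coded sequence the removed lines show: format an "rd%d" name for the member device and create or remove the sysfs link. Just the name construction, for illustration:

	#include <stdio.h>

	int main(void)
	{
		char nm[20];
		int raid_disk = 3;

		snprintf(nm, sizeof(nm), "rd%d", raid_disk);
		printf("would link/unlink sysfs entry '%s'\n", nm);
		return 0;
	}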
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3ca77a2613ba..11b9566184b2 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -6,11 +6,11 @@
6 6
7/* 7/*
8 * 8 *
9 * Each stripe contains one buffer per disc. Each buffer can be in 9 * Each stripe contains one buffer per device. Each buffer can be in
10 * one of a number of states stored in "flags". Changes between 10 * one of a number of states stored in "flags". Changes between
11 * these states happen *almost* exclusively under a per-stripe 11 * these states happen *almost* exclusively under the protection of the
12 * spinlock. Some very specific changes can happen in bi_end_io, and 12 * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
13 * these are not protected by the spin lock. 13 * these are not protected by STRIPE_ACTIVE.
14 * 14 *
15 * The flag bits that are used to represent these states are: 15 * The flag bits that are used to represent these states are:
16 * R5_UPTODATE and R5_LOCKED 16 * R5_UPTODATE and R5_LOCKED
@@ -76,12 +76,10 @@
76 * block and the cached buffer are successfully written, any buffer on 76 * block and the cached buffer are successfully written, any buffer on
77 * a written list can be returned with b_end_io. 77 * a written list can be returned with b_end_io.
78 * 78 *
79 * The write list and read list both act as fifos. The read list is 79 * The write list and read list both act as fifos. The read list,
80 * protected by the device_lock. The write and written lists are 80 * write list and written list are protected by the device_lock.
81 * protected by the stripe lock. The device_lock, which can be 81 * The device_lock is only for list manipulations and will only be
82 * claimed while the stipe lock is held, is only for list 82 * held for a very short time. It can be claimed from interrupts.
83 * manipulations and will only be held for a very short time. It can
84 * be claimed from interrupts.
85 * 83 *
86 * 84 *
87 * Stripes in the stripe cache can be on one of two lists (or on 85 * Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
96 * 94 *
97 * The inactive_list, handle_list and hash bucket lists are all protected by the 95 * The inactive_list, handle_list and hash bucket lists are all protected by the
98 * device_lock. 96 * device_lock.
99 * - stripes on the inactive_list never have their stripe_lock held.
100 * - stripes have a reference counter. If count==0, they are on a list. 97 * - stripes have a reference counter. If count==0, they are on a list.
101 * - If a stripe might need handling, STRIPE_HANDLE is set. 98 * - If a stripe might need handling, STRIPE_HANDLE is set.
102 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on 99 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
116 * attach a request to an active stripe (add_stripe_bh()) 113 * attach a request to an active stripe (add_stripe_bh())
117 * lockdev attach-buffer unlockdev 114 * lockdev attach-buffer unlockdev
118 * handle a stripe (handle_stripe()) 115 * handle a stripe (handle_stripe())
119 * lockstripe clrSTRIPE_HANDLE ... 116 * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
120 * (lockdev check-buffers unlockdev) .. 117 * (lockdev check-buffers unlockdev) ..
121 * change-state .. 118 * change-state ..
122 * record io/ops needed unlockstripe schedule io/ops 119 * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
123 * release an active stripe (release_stripe()) 120 * release an active stripe (release_stripe())
124 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev 121 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
125 * 122 *
@@ -128,8 +125,7 @@
128 * on a cached buffer, and plus one if the stripe is undergoing stripe 125 * on a cached buffer, and plus one if the stripe is undergoing stripe
129 * operations. 126 * operations.
130 * 127 *
131 * Stripe operations are performed outside the stripe lock, 128 * The stripe operations are:
132 * the stripe operations are:
133 * -copying data between the stripe cache and user application buffers 129 * -copying data between the stripe cache and user application buffers
134 * -computing blocks to save a disk access, or to recover a missing block 130 * -computing blocks to save a disk access, or to recover a missing block
135 * -updating the parity on a write operation (reconstruct write and 131 * -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
159 */ 155 */
160 156
161/* 157/*
162 * Operations state - intermediate states that are visible outside of sh->lock 158 * Operations state - intermediate states that are visible outside of
159 * STRIPE_ACTIVE.
163 * In general _idle indicates nothing is running, _run indicates a data 160 * In general _idle indicates nothing is running, _run indicates a data
164 * processing operation is active, and _result means the data processing result 161 * processing operation is active, and _result means the data processing result
165 * is stable and can be acted upon. For simple operations like biofill and 162 * is stable and can be acted upon. For simple operations like biofill and
@@ -209,7 +206,6 @@ struct stripe_head {
209 short ddf_layout;/* use DDF ordering to calculate Q */ 206 short ddf_layout;/* use DDF ordering to calculate Q */
210 unsigned long state; /* state flags */ 207 unsigned long state; /* state flags */
211 atomic_t count; /* nr of active thread/requests */ 208 atomic_t count; /* nr of active thread/requests */
212 spinlock_t lock;
213 int bm_seq; /* sequence number for bitmap flushes */ 209 int bm_seq; /* sequence number for bitmap flushes */
214 int disks; /* disks in stripe */ 210 int disks; /* disks in stripe */
215 enum check_states check_state; 211 enum check_states check_state;
@@ -240,19 +236,20 @@ struct stripe_head {
240}; 236};
241 237
242/* stripe_head_state - collects and tracks the dynamic state of a stripe_head 238/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
243 * for handle_stripe. It is only valid under spin_lock(sh->lock); 239 * for handle_stripe.
244 */ 240 */
245struct stripe_head_state { 241struct stripe_head_state {
246 int syncing, expanding, expanded; 242 int syncing, expanding, expanded;
247 int locked, uptodate, to_read, to_write, failed, written; 243 int locked, uptodate, to_read, to_write, failed, written;
248 int to_fill, compute, req_compute, non_overwrite; 244 int to_fill, compute, req_compute, non_overwrite;
249 int failed_num; 245 int failed_num[2];
246 int p_failed, q_failed;
247 int dec_preread_active;
250 unsigned long ops_request; 248 unsigned long ops_request;
251};
252 249
253/* r6_state - extra state data only relevant to r6 */ 250 struct bio *return_bi;
254struct r6_state { 251 mdk_rdev_t *blocked_rdev;
255 int p_failed, q_failed, failed_num[2]; 252 int handle_bad_blocks;
256}; 253};
257 254
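With r6_state folded in, a single stripe_head_state now carries both failure slots plus the blocked-rdev and bad-block bookkeeping; analyse_stripe() zeroes it and marks both failed_num slots as empty (-1) on every pass. A trimmed stand-in struct showing that reset, not the kernel definition:

	#include <stdio.h>
	#include <string.h>

	struct toy_stripe_state {
		int syncing, expanding, expanded;
		int locked, uptodate, to_read, to_write, failed, written;
		int failed_num[2];
		int p_failed, q_failed;
		int handle_bad_blocks;
	};

	int main(void)
	{
		struct toy_stripe_state s;

		memset(&s, 0, sizeof(s));
		s.failed_num[0] = -1;   /* -1 = no failed device in this slot */
		s.failed_num[1] = -1;

		printf("failed=%d slots=%d,%d\n",
		       s.failed, s.failed_num[0], s.failed_num[1]);
		return 0;
	}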
258/* Flags */ 255/* Flags */
@@ -268,14 +265,16 @@ struct r6_state {
268#define R5_ReWrite 9 /* have tried to over-write the readerror */ 265#define R5_ReWrite 9 /* have tried to over-write the readerror */
269 266
270#define R5_Expanded 10 /* This block now has post-expand data */ 267#define R5_Expanded 10 /* This block now has post-expand data */
271#define R5_Wantcompute 11 /* compute_block in progress treat as 268#define R5_Wantcompute 11 /* compute_block in progress treat as
272 * uptodate 269 * uptodate
273 */ 270 */
274#define R5_Wantfill 12 /* dev->toread contains a bio that needs 271#define R5_Wantfill 12 /* dev->toread contains a bio that needs
275 * filling 272 * filling
276 */ 273 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 274#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */ 275#define R5_WantFUA 14 /* Write should be FUA */
276#define R5_WriteError 15 /* got a write error - need to record it */
277#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
279/* 278/*
280 * Write method 279 * Write method
281 */ 280 */
@@ -289,21 +288,25 @@ struct r6_state {
289/* 288/*
290 * Stripe state 289 * Stripe state
291 */ 290 */
292#define STRIPE_HANDLE 2 291enum {
293#define STRIPE_SYNCING 3 292 STRIPE_ACTIVE,
294#define STRIPE_INSYNC 4 293 STRIPE_HANDLE,
295#define STRIPE_PREREAD_ACTIVE 5 294 STRIPE_SYNC_REQUESTED,
296#define STRIPE_DELAYED 6 295 STRIPE_SYNCING,
297#define STRIPE_DEGRADED 7 296 STRIPE_INSYNC,
298#define STRIPE_BIT_DELAY 8 297 STRIPE_PREREAD_ACTIVE,
299#define STRIPE_EXPANDING 9 298 STRIPE_DELAYED,
300#define STRIPE_EXPAND_SOURCE 10 299 STRIPE_DEGRADED,
301#define STRIPE_EXPAND_READY 11 300 STRIPE_BIT_DELAY,
302#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ 301 STRIPE_EXPANDING,
303#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ 302 STRIPE_EXPAND_SOURCE,
304#define STRIPE_BIOFILL_RUN 14 303 STRIPE_EXPAND_READY,
305#define STRIPE_COMPUTE_RUN 15 304 STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
306#define STRIPE_OPS_REQ_PENDING 16 305 STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
306 STRIPE_BIOFILL_RUN,
307 STRIPE_COMPUTE_RUN,
308 STRIPE_OPS_REQ_PENDING,
309};
307 310
308/* 311/*
309 * Operation request flags 312 * Operation request flags
@@ -336,7 +339,7 @@ struct r6_state {
336 * PREREAD_ACTIVE. 339 * PREREAD_ACTIVE.
337 * In stripe_handle, if we find pre-reading is necessary, we do it if 340 * In stripe_handle, if we find pre-reading is necessary, we do it if
338 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. 341 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
339 * HANDLE gets cleared if stripe_handle leave nothing locked. 342 * HANDLE gets cleared if stripe_handle leaves nothing locked.
340 */ 343 */
341 344
342 345
@@ -399,7 +402,7 @@ struct raid5_private_data {
399 * (fresh device added). 402 * (fresh device added).
400 * Cleared when a sync completes. 403 * Cleared when a sync completes.
401 */ 404 */
402 405 int recovery_disabled;
403 /* per cpu variables */ 406 /* per cpu variables */
404 struct raid5_percpu { 407 struct raid5_percpu {
405 struct page *spare_page; /* Used when checking P/Q in raid6 */ 408 struct page *spare_page; /* Used when checking P/Q in raid6 */