author     Linus Torvalds <torvalds@linux-foundation.org>  2011-07-28 08:50:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-07-28 08:50:27 -0400
commit     6140333d3656f62ac7e6a5af87e7fe92cfb8d655
tree       d96f7ad2196b4383f5ca4396c956e24c82b2952c
parent     6f56c218666b5c7eff354364357307d18c10058b
parent     58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (75 commits)
  md/raid10: handle further errors during fix_read_error better.
  md/raid10: Handle read errors during recovery better.
  md/raid10: simplify read error handling during recovery.
  md/raid10: record bad blocks due to write errors during resync/recovery.
  md/raid10: attempt to fix read errors during resync/check
  md/raid10: Handle write errors by updating badblock log.
  md/raid10: clear bad-block record when write succeeds.
  md/raid10: avoid writing to known bad blocks on known bad drives.
  md/raid10 record bad blocks as needed during recovery.
  md/raid10: avoid reading known bad blocks during resync/recovery.
  md/raid10 - avoid reading from known bad blocks - part 3
  md/raid10: avoid reading from known bad blocks - part 2
  md/raid10: avoid reading from known bad blocks - part 1
  md/raid10: Split handle_read_error out from raid10d.
  md/raid10: simplify/reindent some loops.
  md/raid5: Clear bad blocks on successful write.
  md/raid5. Don't write to known bad block on doubtful devices.
  md/raid5: write errors should be recorded as bad blocks if possible.
  md/raid5: use bad-block log to improve handling of uncorrectable read errors.
  md/raid5: avoid reading from known bad blocks.
  ...
-rw-r--r--   Documentation/md.txt         29
-rw-r--r--   drivers/md/bitmap.c         137
-rw-r--r--   drivers/md/bitmap.h           5
-rw-r--r--   drivers/md/md.c             871
-rw-r--r--   drivers/md/md.h             110
-rw-r--r--   drivers/md/raid1.c          962
-rw-r--r--   drivers/md/raid1.h           26
-rw-r--r--   drivers/md/raid10.c        1183
-rw-r--r--   drivers/md/raid10.h          21
-rw-r--r--   drivers/md/raid5.c         1015
-rw-r--r--   drivers/md/raid5.h           99
-rw-r--r--   include/linux/raid/md_p.h    14
12 files changed, 3093 insertions, 1379 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index f0eee83ff78a..fc94770f44ab 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -360,18 +360,20 @@ Each directory contains:
       A file recording the current state of the device in the array
       which can be a comma separated list of
         faulty - device has been kicked from active use due to
-                 a detected fault
+                 a detected fault or it has unacknowledged bad
+                 blocks
         in_sync - device is a fully in-sync member of the array
         writemostly - device will only be subject to read
                  requests if there are no other options.
                  This applies only to raid1 arrays.
-        blocked - device has failed, metadata is "external",
-                 and the failure hasn't been acknowledged yet.
+        blocked - device has failed, and the failure hasn't been
+                 acknowledged yet by the metadata handler.
                  Writes that would write to this device if
                  it were not faulty are blocked.
         spare - device is working, but not a full member.
                  This includes spares that are in the process
                  of being recovered to
+        write_error - device has ever seen a write error.
       This list may grow in future.
       This can be written to.
       Writing "faulty" simulates a failure on the device.
@@ -379,9 +381,11 @@ Each directory contains:
       Writing "writemostly" sets the writemostly flag.
       Writing "-writemostly" clears the writemostly flag.
       Writing "blocked" sets the "blocked" flag.
-      Writing "-blocked" clears the "blocked" flag and allows writes
-      to complete.
+      Writing "-blocked" clears the "blocked" flags and allows writes
+      to complete and possibly simulates an error.
       Writing "in_sync" sets the in_sync flag.
+      Writing "write_error" sets writeerrorseen flag.
+      Writing "-write_error" clears writeerrorseen flag.
 
       This file responds to select/poll. Any change to 'faulty'
       or 'blocked' causes an event.
@@ -419,7 +423,6 @@ Each directory contains:
       written, it will be rejected.
 
    recovery_start
-
       When the device is not 'in_sync', this records the number of
       sectors from the start of the device which are known to be
       correct. This is normally zero, but during a recovery
@@ -435,6 +438,20 @@ Each directory contains:
       Setting this to 'none' is equivalent to setting 'in_sync'.
       Setting to any other value also clears the 'in_sync' flag.
 
+   bad_blocks
+      This gives the list of all known bad blocks in the form of
+      start address and length (in sectors respectively). If output
+      is too big to fit in a page, it will be truncated. Writing
+      "sector length" to this file adds new acknowledged (i.e.
+      recorded to disk safely) bad blocks.
+
+   unacknowledged_bad_blocks
+      This gives the list of known-but-not-yet-saved-to-disk bad
+      blocks in the same form of 'bad_blocks'. If output is too big
+      to fit in a page, it will be truncated. Writing to this file
+      adds bad blocks without acknowledging them. This is largely
+      for testing.
+
 
 
 An active md device will also contain and entry for each active device
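
The two new sysfs files documented above expose the per-device bad block list. As a rough illustration (not part of this commit), a user-space program could add and list bad ranges as below; the array name (md0) and member directory (dev-sdb1) are placeholders, and the output is whatever badblocks_show() prints ("start length" pairs, one per line):

/*
 * Illustrative only -- not code from this merge.  Exercises the
 * bad_blocks file described in Documentation/md.txt above.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/md0/md/dev-sdb1/bad_blocks";
	char line[256];
	FILE *f;

	/* add an acknowledged bad range: "sector length" */
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "2048 16\n");	/* 16 sectors starting at sector 2048 */
	fclose(f);

	/* read the table back: one "start length" entry per line */
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}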
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd3..0dc6546b77a8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@
29#include "md.h" 29#include "md.h"
30#include "bitmap.h" 30#include "bitmap.h"
31 31
32#include <linux/dm-dirty-log.h>
33/* debug macros */ 32/* debug macros */
34 33
35#define DEBUG 0 34#define DEBUG 0
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
775 * 0 or page 1 774 * 0 or page 1
776 */ 775 */
777static inline struct page *filemap_get_page(struct bitmap *bitmap, 776static inline struct page *filemap_get_page(struct bitmap *bitmap,
778 unsigned long chunk) 777 unsigned long chunk)
779{ 778{
780 if (bitmap->filemap == NULL)
781 return NULL;
782 if (file_page_index(bitmap, chunk) >= bitmap->file_pages) 779 if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
783 return NULL; 780 return NULL;
784 return bitmap->filemap[file_page_index(bitmap, chunk) 781 return bitmap->filemap[file_page_index(bitmap, chunk)
@@ -878,28 +875,19 @@ enum bitmap_page_attr {
878static inline void set_page_attr(struct bitmap *bitmap, struct page *page, 875static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
879 enum bitmap_page_attr attr) 876 enum bitmap_page_attr attr)
880{ 877{
881 if (page) 878 __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
882 __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
883 else
884 __set_bit(attr, &bitmap->logattrs);
885} 879}
886 880
887static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, 881static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
888 enum bitmap_page_attr attr) 882 enum bitmap_page_attr attr)
889{ 883{
890 if (page) 884 __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
891 __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
892 else
893 __clear_bit(attr, &bitmap->logattrs);
894} 885}
895 886
896static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, 887static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
897 enum bitmap_page_attr attr) 888 enum bitmap_page_attr attr)
898{ 889{
899 if (page) 890 return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
900 return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
901 else
902 return test_bit(attr, &bitmap->logattrs);
903} 891}
904 892
905/* 893/*
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
912static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) 900static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
913{ 901{
914 unsigned long bit; 902 unsigned long bit;
915 struct page *page = NULL; 903 struct page *page;
916 void *kaddr; 904 void *kaddr;
917 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); 905 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
918 906
919 if (!bitmap->filemap) { 907 if (!bitmap->filemap)
920 struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; 908 return;
921 if (log)
922 log->type->mark_region(log, chunk);
923 } else {
924 909
925 page = filemap_get_page(bitmap, chunk); 910 page = filemap_get_page(bitmap, chunk);
926 if (!page) 911 if (!page)
927 return; 912 return;
928 bit = file_page_offset(bitmap, chunk); 913 bit = file_page_offset(bitmap, chunk);
929 914
930 /* set the bit */ 915 /* set the bit */
931 kaddr = kmap_atomic(page, KM_USER0); 916 kaddr = kmap_atomic(page, KM_USER0);
932 if (bitmap->flags & BITMAP_HOSTENDIAN) 917 if (bitmap->flags & BITMAP_HOSTENDIAN)
933 set_bit(bit, kaddr); 918 set_bit(bit, kaddr);
934 else 919 else
935 __test_and_set_bit_le(bit, kaddr); 920 __set_bit_le(bit, kaddr);
936 kunmap_atomic(kaddr, KM_USER0); 921 kunmap_atomic(kaddr, KM_USER0);
937 PRINTK("set file bit %lu page %lu\n", bit, page->index); 922 PRINTK("set file bit %lu page %lu\n", bit, page->index);
938 }
939 /* record page number so it gets flushed to disk when unplug occurs */ 923 /* record page number so it gets flushed to disk when unplug occurs */
940 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 924 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
941} 925}
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap)
952 936
953 if (!bitmap) 937 if (!bitmap)
954 return; 938 return;
955 if (!bitmap->filemap) {
956 /* Must be using a dirty_log */
957 struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
958 dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
959 need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
960 if (dirty || need_write)
961 if (log->type->flush(log))
962 bitmap->flags |= BITMAP_WRITE_ERROR;
963 goto out;
964 }
965 939
966 /* look at each page to see if there are any set bits that need to be 940 /* look at each page to see if there are any set bits that need to be
967 * flushed out to disk */ 941 * flushed out to disk */
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap)
990 else 964 else
991 md_super_wait(bitmap->mddev); 965 md_super_wait(bitmap->mddev);
992 } 966 }
993out:
994 if (bitmap->flags & BITMAP_WRITE_ERROR) 967 if (bitmap->flags & BITMAP_WRITE_ERROR)
995 bitmap_file_kick(bitmap); 968 bitmap_file_kick(bitmap);
996} 969}
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev)
1199 struct page *page = NULL, *lastpage = NULL; 1172 struct page *page = NULL, *lastpage = NULL;
1200 sector_t blocks; 1173 sector_t blocks;
1201 void *paddr; 1174 void *paddr;
1202 struct dm_dirty_log *log = mddev->bitmap_info.log;
1203 1175
1204 /* Use a mutex to guard daemon_work against 1176 /* Use a mutex to guard daemon_work against
1205 * bitmap_destroy. 1177 * bitmap_destroy.
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev)
1224 spin_lock_irqsave(&bitmap->lock, flags); 1196 spin_lock_irqsave(&bitmap->lock, flags);
1225 for (j = 0; j < bitmap->chunks; j++) { 1197 for (j = 0; j < bitmap->chunks; j++) {
1226 bitmap_counter_t *bmc; 1198 bitmap_counter_t *bmc;
1227 if (!bitmap->filemap) { 1199 if (!bitmap->filemap)
1228 if (!log) 1200 /* error or shutdown */
1229 /* error or shutdown */ 1201 break;
1230 break; 1202
1231 } else 1203 page = filemap_get_page(bitmap, j);
1232 page = filemap_get_page(bitmap, j);
1233 1204
1234 if (page != lastpage) { 1205 if (page != lastpage) {
1235 /* skip this page unless it's marked as needing cleaning */ 1206 /* skip this page unless it's marked as needing cleaning */
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev)
1298 -1); 1269 -1);
1299 1270
1300 /* clear the bit */ 1271 /* clear the bit */
1301 if (page) { 1272 paddr = kmap_atomic(page, KM_USER0);
1302 paddr = kmap_atomic(page, KM_USER0); 1273 if (bitmap->flags & BITMAP_HOSTENDIAN)
1303 if (bitmap->flags & BITMAP_HOSTENDIAN) 1274 clear_bit(file_page_offset(bitmap, j),
1304 clear_bit(file_page_offset(bitmap, j), 1275 paddr);
1305 paddr); 1276 else
1306 else 1277 __clear_bit_le(
1307 __test_and_clear_bit_le(file_page_offset(bitmap, j), 1278 file_page_offset(bitmap,
1308 paddr); 1279 j),
1309 kunmap_atomic(paddr, KM_USER0); 1280 paddr);
1310 } else 1281 kunmap_atomic(paddr, KM_USER0);
1311 log->type->clear_region(log, j);
1312 } 1282 }
1313 } else 1283 } else
1314 j |= PAGE_COUNTER_MASK; 1284 j |= PAGE_COUNTER_MASK;
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev)
1316 spin_unlock_irqrestore(&bitmap->lock, flags); 1286 spin_unlock_irqrestore(&bitmap->lock, flags);
1317 1287
1318 /* now sync the final page */ 1288 /* now sync the final page */
1319 if (lastpage != NULL || log != NULL) { 1289 if (lastpage != NULL) {
1320 spin_lock_irqsave(&bitmap->lock, flags); 1290 spin_lock_irqsave(&bitmap->lock, flags);
1321 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { 1291 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
1322 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1292 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1323 spin_unlock_irqrestore(&bitmap->lock, flags); 1293 spin_unlock_irqrestore(&bitmap->lock, flags);
1324 if (lastpage) 1294 write_page(bitmap, lastpage, 0);
1325 write_page(bitmap, lastpage, 0);
1326 else
1327 if (log->type->flush(log))
1328 bitmap->flags |= BITMAP_WRITE_ERROR;
1329 } else { 1295 } else {
1330 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1296 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1331 spin_unlock_irqrestore(&bitmap->lock, flags); 1297 spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev)
1767 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1733 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1768 1734
1769 if (!file 1735 if (!file
1770 && !mddev->bitmap_info.offset 1736 && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1771 && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
1772 return 0; 1737 return 0;
1773 1738
1774 BUG_ON(file && mddev->bitmap_info.offset); 1739 BUG_ON(file && mddev->bitmap_info.offset);
1775 BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);
1776 1740
1777 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1741 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1778 if (!bitmap) 1742 if (!bitmap)
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev)
1863int bitmap_load(mddev_t *mddev) 1827int bitmap_load(mddev_t *mddev)
1864{ 1828{
1865 int err = 0; 1829 int err = 0;
1830 sector_t start = 0;
1866 sector_t sector = 0; 1831 sector_t sector = 0;
1867 struct bitmap *bitmap = mddev->bitmap; 1832 struct bitmap *bitmap = mddev->bitmap;
1868 1833
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev)
1881 } 1846 }
1882 bitmap_close_sync(bitmap); 1847 bitmap_close_sync(bitmap);
1883 1848
1884 if (mddev->bitmap_info.log) { 1849 if (mddev->degraded == 0
1885 unsigned long i; 1850 || bitmap->events_cleared == mddev->events)
1886 struct dm_dirty_log *log = mddev->bitmap_info.log; 1851 /* no need to keep dirty bits to optimise a
1887 for (i = 0; i < bitmap->chunks; i++) 1852 * re-add of a missing device */
1888 if (!log->type->in_sync(log, i, 1)) 1853 start = mddev->recovery_cp;
1889 bitmap_set_memory_bits(bitmap, 1854
1890 (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), 1855 err = bitmap_init_from_disk(bitmap, start);
1891 1); 1856
1892 } else {
1893 sector_t start = 0;
1894 if (mddev->degraded == 0
1895 || bitmap->events_cleared == mddev->events)
1896 /* no need to keep dirty bits to optimise a
1897 * re-add of a missing device */
1898 start = mddev->recovery_cp;
1899
1900 err = bitmap_init_from_disk(bitmap, start);
1901 }
1902 if (err) 1857 if (err)
1903 goto out; 1858 goto out;
1904 1859
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891ac..a28f2e5588c6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap {
212 unsigned long file_pages; /* number of pages in the file */ 212 unsigned long file_pages; /* number of pages in the file */
213 int last_page_size; /* bytes in the last page */ 213 int last_page_size; /* bytes in the last page */
214 214
215 unsigned long logattrs; /* used when filemap_attr doesn't exist
216 * because we are working with a dirty_log
217 */
218
219 unsigned long flags; 215 unsigned long flags;
220 216
221 int allclean; 217 int allclean;
@@ -237,7 +233,6 @@ struct bitmap {
237 wait_queue_head_t behind_wait; 233 wait_queue_head_t behind_wait;
238 234
239 struct sysfs_dirent *sysfs_can_clear; 235 struct sysfs_dirent *sysfs_can_clear;
240
241}; 236};
242 237
243/* the bitmap API */ 238/* the bitmap API */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70b..8e221a20f5d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
215} 215}
216EXPORT_SYMBOL_GPL(bio_clone_mddev); 216EXPORT_SYMBOL_GPL(bio_clone_mddev);
217 217
218void md_trim_bio(struct bio *bio, int offset, int size)
219{
220 /* 'bio' is a cloned bio which we need to trim to match
221 * the given offset and size.
222 * This requires adjusting bi_sector, bi_size, and bi_io_vec
223 */
224 int i;
225 struct bio_vec *bvec;
226 int sofar = 0;
227
228 size <<= 9;
229 if (offset == 0 && size == bio->bi_size)
230 return;
231
232 bio->bi_sector += offset;
233 bio->bi_size = size;
234 offset <<= 9;
235 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
236
237 while (bio->bi_idx < bio->bi_vcnt &&
238 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
239 /* remove this whole bio_vec */
240 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
241 bio->bi_idx++;
242 }
243 if (bio->bi_idx < bio->bi_vcnt) {
244 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
245 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
246 }
247 /* avoid any complications with bi_idx being non-zero*/
248 if (bio->bi_idx) {
249 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
250 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
251 bio->bi_vcnt -= bio->bi_idx;
252 bio->bi_idx = 0;
253 }
254 /* Make sure vcnt and last bv are not too big */
255 bio_for_each_segment(bvec, bio, i) {
256 if (sofar + bvec->bv_len > size)
257 bvec->bv_len = size - sofar;
258 if (bvec->bv_len == 0) {
259 bio->bi_vcnt = i;
260 break;
261 }
262 sofar += bvec->bv_len;
263 }
264}
265EXPORT_SYMBOL_GPL(md_trim_bio);
266
218/* 267/*
219 * We have a system wide 'event count' that is incremented 268 * We have a system wide 'event count' that is incremented
220 * on any 'interesting' event, and readers of /proc/mdstat 269 * on any 'interesting' event, and readers of /proc/mdstat
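
md_trim_bio() above takes a sector offset and a sector count and shrinks an already-cloned bio to that sub-range. A hypothetical caller (sketched here, not taken from this patch, though the raid personalities in this merge use it in a similar way) might clone the original request and keep only the sectors known to be good before submitting it:

/* Sketch only: 'submit_good_prefix' and 'good_sectors' are made up
 * for illustration; the rdev selection and bi_sector setup are elided.
 */
static void submit_good_prefix(mddev_t *mddev, struct bio *master,
			       int good_sectors)
{
	struct bio *rbio = bio_clone_mddev(master, GFP_NOIO, mddev);

	/* offset 0, length 'good_sectors': both arguments are sectors */
	md_trim_bio(rbio, 0, good_sectors);
	/* ...choose an rdev, set rbio->bi_bdev and rbio->bi_sector... */
	generic_make_request(rbio);
}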
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
757 rdev->sb_start = 0; 806 rdev->sb_start = 0;
758 rdev->sectors = 0; 807 rdev->sectors = 0;
759 } 808 }
809 if (rdev->bb_page) {
810 put_page(rdev->bb_page);
811 rdev->bb_page = NULL;
812 }
760} 813}
761 814
762 815
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1025 ret = -EINVAL; 1078 ret = -EINVAL;
1026 1079
1027 bdevname(rdev->bdev, b); 1080 bdevname(rdev->bdev, b);
1028 sb = (mdp_super_t*)page_address(rdev->sb_page); 1081 sb = page_address(rdev->sb_page);
1029 1082
1030 if (sb->md_magic != MD_SB_MAGIC) { 1083 if (sb->md_magic != MD_SB_MAGIC) {
1031 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 1084 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1054 rdev->preferred_minor = sb->md_minor; 1107 rdev->preferred_minor = sb->md_minor;
1055 rdev->data_offset = 0; 1108 rdev->data_offset = 0;
1056 rdev->sb_size = MD_SB_BYTES; 1109 rdev->sb_size = MD_SB_BYTES;
1110 rdev->badblocks.shift = -1;
1057 1111
1058 if (sb->level == LEVEL_MULTIPATH) 1112 if (sb->level == LEVEL_MULTIPATH)
1059 rdev->desc_nr = -1; 1113 rdev->desc_nr = -1;
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1064 ret = 1; 1118 ret = 1;
1065 } else { 1119 } else {
1066 __u64 ev1, ev2; 1120 __u64 ev1, ev2;
1067 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 1121 mdp_super_t *refsb = page_address(refdev->sb_page);
1068 if (!uuid_equal(refsb, sb)) { 1122 if (!uuid_equal(refsb, sb)) {
1069 printk(KERN_WARNING "md: %s has different UUID to %s\n", 1123 printk(KERN_WARNING "md: %s has different UUID to %s\n",
1070 b, bdevname(refdev->bdev,b2)); 1124 b, bdevname(refdev->bdev,b2));
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1099static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1153static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1100{ 1154{
1101 mdp_disk_t *desc; 1155 mdp_disk_t *desc;
1102 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 1156 mdp_super_t *sb = page_address(rdev->sb_page);
1103 __u64 ev1 = md_event(sb); 1157 __u64 ev1 = md_event(sb);
1104 1158
1105 rdev->raid_disk = -1; 1159 rdev->raid_disk = -1;
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1230 1284
1231 rdev->sb_size = MD_SB_BYTES; 1285 rdev->sb_size = MD_SB_BYTES;
1232 1286
1233 sb = (mdp_super_t*)page_address(rdev->sb_page); 1287 sb = page_address(rdev->sb_page);
1234 1288
1235 memset(sb, 0, sizeof(*sb)); 1289 memset(sb, 0, sizeof(*sb));
1236 1290
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1395 return cpu_to_le32(csum); 1449 return cpu_to_le32(csum);
1396} 1450}
1397 1451
1452static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1453 int acknowledged);
1398static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1454static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1399{ 1455{
1400 struct mdp_superblock_1 *sb; 1456 struct mdp_superblock_1 *sb;
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1435 if (ret) return ret; 1491 if (ret) return ret;
1436 1492
1437 1493
1438 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1494 sb = page_address(rdev->sb_page);
1439 1495
1440 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1496 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1441 sb->major_version != cpu_to_le32(1) || 1497 sb->major_version != cpu_to_le32(1) ||
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1473 else 1529 else
1474 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1530 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1475 1531
1532 if (!rdev->bb_page) {
1533 rdev->bb_page = alloc_page(GFP_KERNEL);
1534 if (!rdev->bb_page)
1535 return -ENOMEM;
1536 }
1537 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1538 rdev->badblocks.count == 0) {
1539 /* need to load the bad block list.
1540 * Currently we limit it to one page.
1541 */
1542 s32 offset;
1543 sector_t bb_sector;
1544 u64 *bbp;
1545 int i;
1546 int sectors = le16_to_cpu(sb->bblog_size);
1547 if (sectors > (PAGE_SIZE / 512))
1548 return -EINVAL;
1549 offset = le32_to_cpu(sb->bblog_offset);
1550 if (offset == 0)
1551 return -EINVAL;
1552 bb_sector = (long long)offset;
1553 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1554 rdev->bb_page, READ, true))
1555 return -EIO;
1556 bbp = (u64 *)page_address(rdev->bb_page);
1557 rdev->badblocks.shift = sb->bblog_shift;
1558 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1559 u64 bb = le64_to_cpu(*bbp);
1560 int count = bb & (0x3ff);
1561 u64 sector = bb >> 10;
1562 sector <<= sb->bblog_shift;
1563 count <<= sb->bblog_shift;
1564 if (bb + 1 == 0)
1565 break;
1566 if (md_set_badblocks(&rdev->badblocks,
1567 sector, count, 1) == 0)
1568 return -EINVAL;
1569 }
1570 } else if (sb->bblog_offset == 0)
1571 rdev->badblocks.shift = -1;
1572
1476 if (!refdev) { 1573 if (!refdev) {
1477 ret = 1; 1574 ret = 1;
1478 } else { 1575 } else {
1479 __u64 ev1, ev2; 1576 __u64 ev1, ev2;
1480 struct mdp_superblock_1 *refsb = 1577 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1481 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1482 1578
1483 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1579 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1484 sb->level != refsb->level || 1580 sb->level != refsb->level ||
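
For reference, the on-disk bad block log parsed by super_1_load() above packs each entry into a little-endian u64: the low 10 bits hold a sector count, the remaining bits hold the start sector, both scaled up by bblog_shift, and an all-ones word terminates the log. A stand-alone decoder (an illustrative sketch, not code from this patch) would look like:

/* Illustrative user-space decoder mirroring the loop in super_1_load(). */
#include <stdint.h>
#include <stdio.h>

static void decode_bblog(const uint64_t *bbp, int nentries, int bblog_shift)
{
	for (int i = 0; i < nentries; i++) {
		uint64_t bb = bbp[i];		/* assume already CPU-endian */
		uint64_t count = bb & 0x3ff;	/* low 10 bits: length */
		uint64_t sector = bb >> 10;	/* rest: start sector */

		if (bb + 1 == 0)		/* all-ones entry ends the log */
			break;
		printf("bad: sector %llu, %llu sectors\n",
		       (unsigned long long)(sector << bblog_shift),
		       (unsigned long long)(count << bblog_shift));
	}
}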
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1513 1609
1514static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1610static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1515{ 1611{
1516 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1612 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1517 __u64 ev1 = le64_to_cpu(sb->events); 1613 __u64 ev1 = le64_to_cpu(sb->events);
1518 1614
1519 rdev->raid_disk = -1; 1615 rdev->raid_disk = -1;
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1619 int max_dev, i; 1715 int max_dev, i;
1620 /* make rdev->sb match mddev and rdev data. */ 1716 /* make rdev->sb match mddev and rdev data. */
1621 1717
1622 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1718 sb = page_address(rdev->sb_page);
1623 1719
1624 sb->feature_map = 0; 1720 sb->feature_map = 0;
1625 sb->pad0 = 0; 1721 sb->pad0 = 0;
1626 sb->recovery_offset = cpu_to_le64(0); 1722 sb->recovery_offset = cpu_to_le64(0);
1627 memset(sb->pad1, 0, sizeof(sb->pad1)); 1723 memset(sb->pad1, 0, sizeof(sb->pad1));
1628 memset(sb->pad2, 0, sizeof(sb->pad2));
1629 memset(sb->pad3, 0, sizeof(sb->pad3)); 1724 memset(sb->pad3, 0, sizeof(sb->pad3));
1630 1725
1631 sb->utime = cpu_to_le64((__u64)mddev->utime); 1726 sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1665 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1760 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1666 } 1761 }
1667 1762
1763 if (rdev->badblocks.count == 0)
1764 /* Nothing to do for bad blocks*/ ;
1765 else if (sb->bblog_offset == 0)
1766 /* Cannot record bad blocks on this device */
1767 md_error(mddev, rdev);
1768 else {
1769 struct badblocks *bb = &rdev->badblocks;
1770 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1771 u64 *p = bb->page;
1772 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1773 if (bb->changed) {
1774 unsigned seq;
1775
1776retry:
1777 seq = read_seqbegin(&bb->lock);
1778
1779 memset(bbp, 0xff, PAGE_SIZE);
1780
1781 for (i = 0 ; i < bb->count ; i++) {
1782 u64 internal_bb = *p++;
1783 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1784 | BB_LEN(internal_bb));
1785 *bbp++ = cpu_to_le64(store_bb);
1786 }
1787 if (read_seqretry(&bb->lock, seq))
1788 goto retry;
1789
1790 bb->sector = (rdev->sb_start +
1791 (int)le32_to_cpu(sb->bblog_offset));
1792 bb->size = le16_to_cpu(sb->bblog_size);
1793 bb->changed = 0;
1794 }
1795 }
1796
1668 max_dev = 0; 1797 max_dev = 0;
1669 list_for_each_entry(rdev2, &mddev->disks, same_set) 1798 list_for_each_entry(rdev2, &mddev->disks, same_set)
1670 if (rdev2->desc_nr+1 > max_dev) 1799 if (rdev2->desc_nr+1 > max_dev)
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1724 num_sectors = max_sectors; 1853 num_sectors = max_sectors;
1725 rdev->sb_start = sb_start; 1854 rdev->sb_start = sb_start;
1726 } 1855 }
1727 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1856 sb = page_address(rdev->sb_page);
1728 sb->data_size = cpu_to_le64(num_sectors); 1857 sb->data_size = cpu_to_le64(num_sectors);
1729 sb->super_offset = rdev->sb_start; 1858 sb->super_offset = rdev->sb_start;
1730 sb->sb_csum = calc_sb_1_csum(sb); 1859 sb->sb_csum = calc_sb_1_csum(sb);
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1922 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2051 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
1923 2052
1924 /* May as well allow recovery to be retried once */ 2053 /* May as well allow recovery to be retried once */
1925 mddev->recovery_disabled = 0; 2054 mddev->recovery_disabled++;
1926 2055
1927 return 0; 2056 return 0;
1928 2057
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1953 sysfs_remove_link(&rdev->kobj, "block"); 2082 sysfs_remove_link(&rdev->kobj, "block");
1954 sysfs_put(rdev->sysfs_state); 2083 sysfs_put(rdev->sysfs_state);
1955 rdev->sysfs_state = NULL; 2084 rdev->sysfs_state = NULL;
2085 kfree(rdev->badblocks.page);
2086 rdev->badblocks.count = 0;
2087 rdev->badblocks.page = NULL;
1956 /* We need to delay this, otherwise we can deadlock when 2088 /* We need to delay this, otherwise we can deadlock when
1957 * writing to 'remove' to "dev/state". We also need 2089 * writing to 'remove' to "dev/state". We also need
1958 * to delay it due to rcu usage. 2090 * to delay it due to rcu usage.
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
2127 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 2259 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2128 switch (major_version) { 2260 switch (major_version) {
2129 case 0: 2261 case 0:
2130 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 2262 print_sb_90(page_address(rdev->sb_page));
2131 break; 2263 break;
2132 case 1: 2264 case 1:
2133 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 2265 print_sb_1(page_address(rdev->sb_page));
2134 break; 2266 break;
2135 } 2267 }
2136 } else 2268 } else
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
2194 mdk_rdev_t *rdev; 2326 mdk_rdev_t *rdev;
2195 int sync_req; 2327 int sync_req;
2196 int nospares = 0; 2328 int nospares = 0;
2329 int any_badblocks_changed = 0;
2197 2330
2198repeat: 2331repeat:
2199 /* First make sure individual recovery_offsets are correct */ 2332 /* First make sure individual recovery_offsets are correct */
@@ -2208,8 +2341,18 @@ repeat:
2208 if (!mddev->persistent) { 2341 if (!mddev->persistent) {
2209 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2342 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2210 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2343 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2211 if (!mddev->external) 2344 if (!mddev->external) {
2212 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2345 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2346 list_for_each_entry(rdev, &mddev->disks, same_set) {
2347 if (rdev->badblocks.changed) {
2348 md_ack_all_badblocks(&rdev->badblocks);
2349 md_error(mddev, rdev);
2350 }
2351 clear_bit(Blocked, &rdev->flags);
2352 clear_bit(BlockedBadBlocks, &rdev->flags);
2353 wake_up(&rdev->blocked_wait);
2354 }
2355 }
2213 wake_up(&mddev->sb_wait); 2356 wake_up(&mddev->sb_wait);
2214 return; 2357 return;
2215 } 2358 }
@@ -2265,6 +2408,14 @@ repeat:
2265 MD_BUG(); 2408 MD_BUG();
2266 mddev->events --; 2409 mddev->events --;
2267 } 2410 }
2411
2412 list_for_each_entry(rdev, &mddev->disks, same_set) {
2413 if (rdev->badblocks.changed)
2414 any_badblocks_changed++;
2415 if (test_bit(Faulty, &rdev->flags))
2416 set_bit(FaultRecorded, &rdev->flags);
2417 }
2418
2268 sync_sbs(mddev, nospares); 2419 sync_sbs(mddev, nospares);
2269 spin_unlock_irq(&mddev->write_lock); 2420 spin_unlock_irq(&mddev->write_lock);
2270 2421
@@ -2290,6 +2441,13 @@ repeat:
2290 bdevname(rdev->bdev,b), 2441 bdevname(rdev->bdev,b),
2291 (unsigned long long)rdev->sb_start); 2442 (unsigned long long)rdev->sb_start);
2292 rdev->sb_events = mddev->events; 2443 rdev->sb_events = mddev->events;
2444 if (rdev->badblocks.size) {
2445 md_super_write(mddev, rdev,
2446 rdev->badblocks.sector,
2447 rdev->badblocks.size << 9,
2448 rdev->bb_page);
2449 rdev->badblocks.size = 0;
2450 }
2293 2451
2294 } else 2452 } else
2295 dprintk(")\n"); 2453 dprintk(")\n");
@@ -2313,6 +2471,15 @@ repeat:
2313 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2471 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2314 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2472 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2315 2473
2474 list_for_each_entry(rdev, &mddev->disks, same_set) {
2475 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2476 clear_bit(Blocked, &rdev->flags);
2477
2478 if (any_badblocks_changed)
2479 md_ack_all_badblocks(&rdev->badblocks);
2480 clear_bit(BlockedBadBlocks, &rdev->flags);
2481 wake_up(&rdev->blocked_wait);
2482 }
2316} 2483}
2317 2484
2318/* words written to sysfs files may, or may not, be \n terminated. 2485/* words written to sysfs files may, or may not, be \n terminated.
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2347 char *sep = ""; 2514 char *sep = "";
2348 size_t len = 0; 2515 size_t len = 0;
2349 2516
2350 if (test_bit(Faulty, &rdev->flags)) { 2517 if (test_bit(Faulty, &rdev->flags) ||
2518 rdev->badblocks.unacked_exist) {
2351 len+= sprintf(page+len, "%sfaulty",sep); 2519 len+= sprintf(page+len, "%sfaulty",sep);
2352 sep = ","; 2520 sep = ",";
2353 } 2521 }
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2359 len += sprintf(page+len, "%swrite_mostly",sep); 2527 len += sprintf(page+len, "%swrite_mostly",sep);
2360 sep = ","; 2528 sep = ",";
2361 } 2529 }
2362 if (test_bit(Blocked, &rdev->flags)) { 2530 if (test_bit(Blocked, &rdev->flags) ||
2531 rdev->badblocks.unacked_exist) {
2363 len += sprintf(page+len, "%sblocked", sep); 2532 len += sprintf(page+len, "%sblocked", sep);
2364 sep = ","; 2533 sep = ",";
2365 } 2534 }
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page)
2368 len += sprintf(page+len, "%sspare", sep); 2537 len += sprintf(page+len, "%sspare", sep);
2369 sep = ","; 2538 sep = ",";
2370 } 2539 }
2540 if (test_bit(WriteErrorSeen, &rdev->flags)) {
2541 len += sprintf(page+len, "%swrite_error", sep);
2542 sep = ",";
2543 }
2371 return len+sprintf(page+len, "\n"); 2544 return len+sprintf(page+len, "\n");
2372} 2545}
2373 2546
@@ -2375,13 +2548,15 @@ static ssize_t
2375state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2548state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2376{ 2549{
2377 /* can write 2550 /* can write
2378 * faulty - simulates and error 2551 * faulty - simulates an error
2379 * remove - disconnects the device 2552 * remove - disconnects the device
2380 * writemostly - sets write_mostly 2553 * writemostly - sets write_mostly
2381 * -writemostly - clears write_mostly 2554 * -writemostly - clears write_mostly
2382 * blocked - sets the Blocked flag 2555 * blocked - sets the Blocked flags
2383 * -blocked - clears the Blocked flag 2556 * -blocked - clears the Blocked and possibly simulates an error
2384 * insync - sets Insync providing device isn't active 2557 * insync - sets Insync providing device isn't active
2558 * write_error - sets WriteErrorSeen
2559 * -write_error - clears WriteErrorSeen
2385 */ 2560 */
2386 int err = -EINVAL; 2561 int err = -EINVAL;
2387 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2562 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2408 set_bit(Blocked, &rdev->flags); 2583 set_bit(Blocked, &rdev->flags);
2409 err = 0; 2584 err = 0;
2410 } else if (cmd_match(buf, "-blocked")) { 2585 } else if (cmd_match(buf, "-blocked")) {
2586 if (!test_bit(Faulty, &rdev->flags) &&
2587 test_bit(BlockedBadBlocks, &rdev->flags)) {
2588 /* metadata handler doesn't understand badblocks,
2589 * so we need to fail the device
2590 */
2591 md_error(rdev->mddev, rdev);
2592 }
2411 clear_bit(Blocked, &rdev->flags); 2593 clear_bit(Blocked, &rdev->flags);
2594 clear_bit(BlockedBadBlocks, &rdev->flags);
2412 wake_up(&rdev->blocked_wait); 2595 wake_up(&rdev->blocked_wait);
2413 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2596 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2414 md_wakeup_thread(rdev->mddev->thread); 2597 md_wakeup_thread(rdev->mddev->thread);
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2417 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2600 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2418 set_bit(In_sync, &rdev->flags); 2601 set_bit(In_sync, &rdev->flags);
2419 err = 0; 2602 err = 0;
2603 } else if (cmd_match(buf, "write_error")) {
2604 set_bit(WriteErrorSeen, &rdev->flags);
2605 err = 0;
2606 } else if (cmd_match(buf, "-write_error")) {
2607 clear_bit(WriteErrorSeen, &rdev->flags);
2608 err = 0;
2420 } 2609 }
2421 if (!err) 2610 if (!err)
2422 sysfs_notify_dirent_safe(rdev->sysfs_state); 2611 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2459{ 2648{
2460 char *e; 2649 char *e;
2461 int err; 2650 int err;
2462 char nm[20];
2463 int slot = simple_strtoul(buf, &e, 10); 2651 int slot = simple_strtoul(buf, &e, 10);
2464 if (strncmp(buf, "none", 4)==0) 2652 if (strncmp(buf, "none", 4)==0)
2465 slot = -1; 2653 slot = -1;
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2482 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2670 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2483 if (err) 2671 if (err)
2484 return err; 2672 return err;
2485 sprintf(nm, "rd%d", rdev->raid_disk); 2673 sysfs_unlink_rdev(rdev->mddev, rdev);
2486 sysfs_remove_link(&rdev->mddev->kobj, nm);
2487 rdev->raid_disk = -1; 2674 rdev->raid_disk = -1;
2488 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2675 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2489 md_wakeup_thread(rdev->mddev->thread); 2676 md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2522 return err; 2709 return err;
2523 } else 2710 } else
2524 sysfs_notify_dirent_safe(rdev->sysfs_state); 2711 sysfs_notify_dirent_safe(rdev->sysfs_state);
2525 sprintf(nm, "rd%d", rdev->raid_disk); 2712 if (sysfs_link_rdev(rdev->mddev, rdev))
2526 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2527 /* failure here is OK */; 2713 /* failure here is OK */;
2528 /* don't wakeup anyone, leave that to userspace. */ 2714 /* don't wakeup anyone, leave that to userspace. */
2529 } else { 2715 } else {
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
2712static struct rdev_sysfs_entry rdev_recovery_start = 2898static struct rdev_sysfs_entry rdev_recovery_start =
2713__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 2899__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2714 2900
2901
2902static ssize_t
2903badblocks_show(struct badblocks *bb, char *page, int unack);
2904static ssize_t
2905badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2906
2907static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
2908{
2909 return badblocks_show(&rdev->badblocks, page, 0);
2910}
2911static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2912{
2913 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2914 /* Maybe that ack was all we needed */
2915 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2916 wake_up(&rdev->blocked_wait);
2917 return rv;
2918}
2919static struct rdev_sysfs_entry rdev_bad_blocks =
2920__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2921
2922
2923static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
2924{
2925 return badblocks_show(&rdev->badblocks, page, 1);
2926}
2927static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2928{
2929 return badblocks_store(&rdev->badblocks, page, len, 1);
2930}
2931static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2932__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2933
2715static struct attribute *rdev_default_attrs[] = { 2934static struct attribute *rdev_default_attrs[] = {
2716 &rdev_state.attr, 2935 &rdev_state.attr,
2717 &rdev_errors.attr, 2936 &rdev_errors.attr,
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = {
2719 &rdev_offset.attr, 2938 &rdev_offset.attr,
2720 &rdev_size.attr, 2939 &rdev_size.attr,
2721 &rdev_recovery_start.attr, 2940 &rdev_recovery_start.attr,
2941 &rdev_bad_blocks.attr,
2942 &rdev_unack_bad_blocks.attr,
2722 NULL, 2943 NULL,
2723}; 2944};
2724static ssize_t 2945static ssize_t
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = {
2782 .default_attrs = rdev_default_attrs, 3003 .default_attrs = rdev_default_attrs,
2783}; 3004};
2784 3005
2785void md_rdev_init(mdk_rdev_t *rdev) 3006int md_rdev_init(mdk_rdev_t *rdev)
2786{ 3007{
2787 rdev->desc_nr = -1; 3008 rdev->desc_nr = -1;
2788 rdev->saved_raid_disk = -1; 3009 rdev->saved_raid_disk = -1;
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
2792 rdev->sb_events = 0; 3013 rdev->sb_events = 0;
2793 rdev->last_read_error.tv_sec = 0; 3014 rdev->last_read_error.tv_sec = 0;
2794 rdev->last_read_error.tv_nsec = 0; 3015 rdev->last_read_error.tv_nsec = 0;
3016 rdev->sb_loaded = 0;
3017 rdev->bb_page = NULL;
2795 atomic_set(&rdev->nr_pending, 0); 3018 atomic_set(&rdev->nr_pending, 0);
2796 atomic_set(&rdev->read_errors, 0); 3019 atomic_set(&rdev->read_errors, 0);
2797 atomic_set(&rdev->corrected_errors, 0); 3020 atomic_set(&rdev->corrected_errors, 0);
2798 3021
2799 INIT_LIST_HEAD(&rdev->same_set); 3022 INIT_LIST_HEAD(&rdev->same_set);
2800 init_waitqueue_head(&rdev->blocked_wait); 3023 init_waitqueue_head(&rdev->blocked_wait);
3024
3025 /* Add space to store bad block list.
3026 * This reserves the space even on arrays where it cannot
3027 * be used - I wonder if that matters
3028 */
3029 rdev->badblocks.count = 0;
3030 rdev->badblocks.shift = 0;
3031 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3032 seqlock_init(&rdev->badblocks.lock);
3033 if (rdev->badblocks.page == NULL)
3034 return -ENOMEM;
3035
3036 return 0;
2801} 3037}
2802EXPORT_SYMBOL_GPL(md_rdev_init); 3038EXPORT_SYMBOL_GPL(md_rdev_init);
2803/* 3039/*
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2823 return ERR_PTR(-ENOMEM); 3059 return ERR_PTR(-ENOMEM);
2824 } 3060 }
2825 3061
2826 md_rdev_init(rdev); 3062 err = md_rdev_init(rdev);
2827 if ((err = alloc_disk_sb(rdev))) 3063 if (err)
3064 goto abort_free;
3065 err = alloc_disk_sb(rdev);
3066 if (err)
2828 goto abort_free; 3067 goto abort_free;
2829 3068
2830 err = lock_rdev(rdev, newdev, super_format == -2); 3069 err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2860 goto abort_free; 3099 goto abort_free;
2861 } 3100 }
2862 } 3101 }
3102 if (super_format == -1)
3103 /* hot-add for 0.90, or non-persistent: so no badblocks */
3104 rdev->badblocks.shift = -1;
2863 3105
2864 return rdev; 3106 return rdev;
2865 3107
2866abort_free: 3108abort_free:
2867 if (rdev->sb_page) { 3109 if (rdev->bdev)
2868 if (rdev->bdev) 3110 unlock_rdev(rdev);
2869 unlock_rdev(rdev); 3111 free_disk_sb(rdev);
2870 free_disk_sb(rdev); 3112 kfree(rdev->badblocks.page);
2871 }
2872 kfree(rdev); 3113 kfree(rdev);
2873 return ERR_PTR(err); 3114 return ERR_PTR(err);
2874} 3115}
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3149 } 3390 }
3150 3391
3151 list_for_each_entry(rdev, &mddev->disks, same_set) { 3392 list_for_each_entry(rdev, &mddev->disks, same_set) {
3152 char nm[20];
3153 if (rdev->raid_disk < 0) 3393 if (rdev->raid_disk < 0)
3154 continue; 3394 continue;
3155 if (rdev->new_raid_disk >= mddev->raid_disks) 3395 if (rdev->new_raid_disk >= mddev->raid_disks)
3156 rdev->new_raid_disk = -1; 3396 rdev->new_raid_disk = -1;
3157 if (rdev->new_raid_disk == rdev->raid_disk) 3397 if (rdev->new_raid_disk == rdev->raid_disk)
3158 continue; 3398 continue;
3159 sprintf(nm, "rd%d", rdev->raid_disk); 3399 sysfs_unlink_rdev(mddev, rdev);
3160 sysfs_remove_link(&mddev->kobj, nm);
3161 } 3400 }
3162 list_for_each_entry(rdev, &mddev->disks, same_set) { 3401 list_for_each_entry(rdev, &mddev->disks, same_set) {
3163 if (rdev->raid_disk < 0) 3402 if (rdev->raid_disk < 0)
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3168 if (rdev->raid_disk < 0) 3407 if (rdev->raid_disk < 0)
3169 clear_bit(In_sync, &rdev->flags); 3408 clear_bit(In_sync, &rdev->flags);
3170 else { 3409 else {
3171 char nm[20]; 3410 if (sysfs_link_rdev(mddev, rdev))
3172 sprintf(nm, "rd%d", rdev->raid_disk); 3411 printk(KERN_WARNING "md: cannot register rd%d"
3173 if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3412 " for %s after level change\n",
3174 printk("md: cannot register %s for %s after level change\n", 3413 rdev->raid_disk, mdname(mddev));
3175 nm, mdname(mddev));
3176 } 3414 }
3177 } 3415 }
3178 3416
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev)
4504 } 4742 }
4505 4743
4506 if (mddev->bio_set == NULL) 4744 if (mddev->bio_set == NULL)
4507 mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); 4745 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
4746 sizeof(mddev_t *));
4508 4747
4509 spin_lock(&pers_lock); 4748 spin_lock(&pers_lock);
4510 pers = find_pers(mddev->level, mddev->clevel); 4749 pers = find_pers(mddev->level, mddev->clevel);
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev)
4621 smp_wmb(); 4860 smp_wmb();
4622 mddev->ready = 1; 4861 mddev->ready = 1;
4623 list_for_each_entry(rdev, &mddev->disks, same_set) 4862 list_for_each_entry(rdev, &mddev->disks, same_set)
4624 if (rdev->raid_disk >= 0) { 4863 if (rdev->raid_disk >= 0)
4625 char nm[20]; 4864 if (sysfs_link_rdev(mddev, rdev))
4626 sprintf(nm, "rd%d", rdev->raid_disk);
4627 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4628 /* failure here is OK */; 4865 /* failure here is OK */;
4629 }
4630 4866
4631 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4867 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4632 4868
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4854 sysfs_notify_dirent_safe(mddev->sysfs_state); 5090 sysfs_notify_dirent_safe(mddev->sysfs_state);
4855 5091
4856 list_for_each_entry(rdev, &mddev->disks, same_set) 5092 list_for_each_entry(rdev, &mddev->disks, same_set)
4857 if (rdev->raid_disk >= 0) { 5093 if (rdev->raid_disk >= 0)
4858 char nm[20]; 5094 sysfs_unlink_rdev(mddev, rdev);
4859 sprintf(nm, "rd%d", rdev->raid_disk);
4860 sysfs_remove_link(&mddev->kobj, nm);
4861 }
4862 5095
4863 set_capacity(disk, 0); 5096 set_capacity(disk, 0);
4864 mutex_unlock(&mddev->open_mutex); 5097 mutex_unlock(&mddev->open_mutex);
@@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6198 if (!rdev || test_bit(Faulty, &rdev->flags)) 6431 if (!rdev || test_bit(Faulty, &rdev->flags))
6199 return; 6432 return;
6200 6433
6201 if (mddev->external) 6434 if (!mddev->pers || !mddev->pers->error_handler)
6202 set_bit(Blocked, &rdev->flags);
6203/*
6204 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
6205 mdname(mddev),
6206 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
6207 __builtin_return_address(0),__builtin_return_address(1),
6208 __builtin_return_address(2),__builtin_return_address(3));
6209*/
6210 if (!mddev->pers)
6211 return;
6212 if (!mddev->pers->error_handler)
6213 return; 6435 return;
6214 mddev->pers->error_handler(mddev,rdev); 6436 mddev->pers->error_handler(mddev,rdev);
6215 if (mddev->degraded) 6437 if (mddev->degraded)
@@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev)
6933 atomic_add(sectors, &mddev->recovery_active); 7155 atomic_add(sectors, &mddev->recovery_active);
6934 } 7156 }
6935 7157
7158 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7159 break;
7160
6936 j += sectors; 7161 j += sectors;
6937 if (j>1) mddev->curr_resync = j; 7162 if (j>1) mddev->curr_resync = j;
6938 mddev->curr_mark_cnt = io_sectors; 7163 mddev->curr_mark_cnt = io_sectors;
6939 if (last_check == 0) 7164 if (last_check == 0)
6940 /* this is the earliers that rebuilt will be 7165 /* this is the earliest that rebuild will be
6941 * visible in /proc/mdstat 7166 * visible in /proc/mdstat
6942 */ 7167 */
6943 md_new_event(mddev); 7168 md_new_event(mddev);
@@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev)
6946 continue; 7171 continue;
6947 7172
6948 last_check = io_sectors; 7173 last_check = io_sectors;
6949
6950 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6951 break;
6952
6953 repeat: 7174 repeat:
6954 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 7175 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6955 /* step marks */ 7176 /* step marks */
@@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev)
7067 atomic_read(&rdev->nr_pending)==0) { 7288 atomic_read(&rdev->nr_pending)==0) {
7068 if (mddev->pers->hot_remove_disk( 7289 if (mddev->pers->hot_remove_disk(
7069 mddev, rdev->raid_disk)==0) { 7290 mddev, rdev->raid_disk)==0) {
7070 char nm[20]; 7291 sysfs_unlink_rdev(mddev, rdev);
7071 sprintf(nm,"rd%d", rdev->raid_disk);
7072 sysfs_remove_link(&mddev->kobj, nm);
7073 rdev->raid_disk = -1; 7292 rdev->raid_disk = -1;
7074 } 7293 }
7075 } 7294 }
7076 7295
7077 if (mddev->degraded && !mddev->recovery_disabled) { 7296 if (mddev->degraded) {
7078 list_for_each_entry(rdev, &mddev->disks, same_set) { 7297 list_for_each_entry(rdev, &mddev->disks, same_set) {
7079 if (rdev->raid_disk >= 0 && 7298 if (rdev->raid_disk >= 0 &&
7080 !test_bit(In_sync, &rdev->flags) && 7299 !test_bit(In_sync, &rdev->flags) &&
7081 !test_bit(Faulty, &rdev->flags) && 7300 !test_bit(Faulty, &rdev->flags))
7082 !test_bit(Blocked, &rdev->flags))
7083 spares++; 7301 spares++;
7084 if (rdev->raid_disk < 0 7302 if (rdev->raid_disk < 0
7085 && !test_bit(Faulty, &rdev->flags)) { 7303 && !test_bit(Faulty, &rdev->flags)) {
7086 rdev->recovery_offset = 0; 7304 rdev->recovery_offset = 0;
7087 if (mddev->pers-> 7305 if (mddev->pers->
7088 hot_add_disk(mddev, rdev) == 0) { 7306 hot_add_disk(mddev, rdev) == 0) {
7089 char nm[20]; 7307 if (sysfs_link_rdev(mddev, rdev))
7090 sprintf(nm, "rd%d", rdev->raid_disk);
7091 if (sysfs_create_link(&mddev->kobj,
7092 &rdev->kobj, nm))
7093 /* failure here is OK */; 7308 /* failure here is OK */;
7094 spares++; 7309 spares++;
7095 md_new_event(mddev); 7310 md_new_event(mddev);
@@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev)
7138 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7353 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7139 sysfs_notify_dirent_safe(mddev->sysfs_action); 7354 sysfs_notify_dirent_safe(mddev->sysfs_action);
7140 md_new_event(mddev); 7355 md_new_event(mddev);
7356 if (mddev->event_work.func)
7357 queue_work(md_misc_wq, &mddev->event_work);
7141} 7358}
7142 7359
7143/* 7360/*
@@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev)
7170 if (mddev->bitmap) 7387 if (mddev->bitmap)
7171 bitmap_daemon_work(mddev); 7388 bitmap_daemon_work(mddev);
7172 7389
7173 if (mddev->ro)
7174 return;
7175
7176 if (signal_pending(current)) { 7390 if (signal_pending(current)) {
7177 if (mddev->pers->sync_request && !mddev->external) { 7391 if (mddev->pers->sync_request && !mddev->external) {
7178 printk(KERN_INFO "md: %s in immediate safe mode\n", 7392 printk(KERN_INFO "md: %s in immediate safe mode\n",
@@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev)
7209 atomic_read(&rdev->nr_pending)==0) { 7423 atomic_read(&rdev->nr_pending)==0) {
7210 if (mddev->pers->hot_remove_disk( 7424 if (mddev->pers->hot_remove_disk(
7211 mddev, rdev->raid_disk)==0) { 7425 mddev, rdev->raid_disk)==0) {
7212 char nm[20]; 7426 sysfs_unlink_rdev(mddev, rdev);
7213 sprintf(nm,"rd%d", rdev->raid_disk);
7214 sysfs_remove_link(&mddev->kobj, nm);
7215 rdev->raid_disk = -1; 7427 rdev->raid_disk = -1;
7216 } 7428 }
7217 } 7429 }
@@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7331{ 7543{
7332 sysfs_notify_dirent_safe(rdev->sysfs_state); 7544 sysfs_notify_dirent_safe(rdev->sysfs_state);
7333 wait_event_timeout(rdev->blocked_wait, 7545 wait_event_timeout(rdev->blocked_wait,
7334 !test_bit(Blocked, &rdev->flags), 7546 !test_bit(Blocked, &rdev->flags) &&
7547 !test_bit(BlockedBadBlocks, &rdev->flags),
7335 msecs_to_jiffies(5000)); 7548 msecs_to_jiffies(5000));
7336 rdev_dec_pending(rdev, mddev); 7549 rdev_dec_pending(rdev, mddev);
7337} 7550}
7338EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7551EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7339 7552
7553
7554/* Bad block management.
7555 * We can record which blocks on each device are 'bad' and so just
7556 * fail those blocks, or that stripe, rather than the whole device.
7557 * Entries in the bad-block table are 64bits wide. This comprises:
7558 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7559 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7560 * A 'shift' can be set so that larger blocks are tracked and
7561 * consequently larger devices can be covered.
7562 * 'Acknowledged' flag - 1 bit. - the most significant bit.
7563 *
7564 * Locking of the bad-block table uses a seqlock so md_is_badblock
7565 * might need to retry if it is very unlucky.
7566 * We will sometimes want to check for bad blocks in a bi_end_io function,
7567 * so we use the write_seqlock_irq variant.
7568 *
7569 * When looking for a bad block we specify a range and want to
7570 * know if any block in the range is bad. So we binary-search
7571 * to the last range that starts at-or-before the given endpoint,
7572 * (or "before the sector after the target range")
7573 * then see if it ends after the given start.
7574 * We return
7575 * 0 if there are no known bad blocks in the range
7576 * 1 if there are known bad block which are all acknowledged
7577 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7578 * plus the start/length of the first bad section we overlap.
7579 */
7580int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7581 sector_t *first_bad, int *bad_sectors)
7582{
7583 int hi;
7584 int lo = 0;
7585 u64 *p = bb->page;
7586 int rv = 0;
7587 sector_t target = s + sectors;
7588 unsigned seq;
7589
7590 if (bb->shift > 0) {
7591 /* round the start down, and the end up */
7592 s >>= bb->shift;
7593 target += (1<<bb->shift) - 1;
7594 target >>= bb->shift;
7595 sectors = target - s;
7596 }
7597 /* 'target' is now the first block after the bad range */
7598
7599retry:
7600 seq = read_seqbegin(&bb->lock);
7601
7602 hi = bb->count;
7603
7604 /* Binary search between lo and hi for 'target'
7605 * i.e. for the last range that starts before 'target'
7606 */
7607 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7608 * are known not to be the last range before target.
7609 * VARIANT: hi-lo is the number of possible
7610 * ranges, and decreases until it reaches 1
7611 */
7612 while (hi - lo > 1) {
7613 int mid = (lo + hi) / 2;
7614 sector_t a = BB_OFFSET(p[mid]);
7615 if (a < target)
7616 /* This could still be the one, earlier ranges
7617 * could not. */
7618 lo = mid;
7619 else
7620 /* This and later ranges are definitely out. */
7621 hi = mid;
7622 }
7623 /* 'lo' might be the last that started before target, but 'hi' isn't */
7624 if (hi > lo) {
7625 /* need to check all range that end after 's' to see if
7626 * any are unacknowledged.
7627 */
7628 while (lo >= 0 &&
7629 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7630 if (BB_OFFSET(p[lo]) < target) {
7631 /* starts before the end, and finishes after
7632 * the start, so they must overlap
7633 */
7634 if (rv != -1 && BB_ACK(p[lo]))
7635 rv = 1;
7636 else
7637 rv = -1;
7638 *first_bad = BB_OFFSET(p[lo]);
7639 *bad_sectors = BB_LEN(p[lo]);
7640 }
7641 lo--;
7642 }
7643 }
7644
7645 if (read_seqretry(&bb->lock, seq))
7646 goto retry;
7647
7648 return rv;
7649}
7650EXPORT_SYMBOL_GPL(md_is_badblock);
7651
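For readers tracing the lookup above, the following stand-alone sketch (my own simplification, not part of the patch) performs the same search: binary-search for the last range that starts before the end of the queried span, then walk backwards over every range that still ends after its start, reporting 0, 1 or -1 exactly as described in the comment. It uses an unpacked struct instead of the packed u64 entries and omits the seqlock retry.

#include <stdio.h>

struct bad_range {
	unsigned long long start;	/* first bad sector */
	int len;			/* number of bad sectors */
	int ack;			/* acknowledged in metadata? */
};

/* 0 = no overlap, 1 = overlaps only acknowledged ranges, -1 = overlaps an unacknowledged range */
static int check_bad(const struct bad_range *p, int count,
		     unsigned long long s, int sectors,
		     unsigned long long *first_bad, int *bad_sectors)
{
	unsigned long long target = s + sectors;	/* first sector after the queried range */
	int lo = 0, hi = count, rv = 0;

	/* Binary search for the last range that starts before 'target'. */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		if (p[mid].start < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* Walk backwards over every range that ends after 's'. */
		while (lo >= 0 && p[lo].start + p[lo].len > s) {
			if (p[lo].start < target) {	/* genuine overlap */
				rv = (rv != -1 && p[lo].ack) ? 1 : -1;
				*first_bad = p[lo].start;
				*bad_sectors = p[lo].len;
			}
			lo--;
		}
	}
	return rv;
}

int main(void)
{
	/* Table must be kept sorted by start sector, as md does. */
	struct bad_range tbl[] = { {100, 8, 1}, {200, 4, 0} };
	unsigned long long fb;
	int bs;

	printf("%d\n", check_bad(tbl, 2, 96, 16, &fb, &bs));	/* 1: only an acked range overlaps */
	printf("%d\n", check_bad(tbl, 2, 198, 8, &fb, &bs));	/* -1: an unacked range overlaps */
	printf("%d\n", check_bad(tbl, 2, 120, 8, &fb, &bs));	/* 0: clean */
	return 0;
}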
7652/*
7653 * Add a range of bad blocks to the table.
7654 * This might extend the table, or might contract it
7655 * if two adjacent ranges can be merged.
7656 * We binary-search to find the 'insertion' point, then
7657 * decide how best to handle it.
7658 */
7659static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7660 int acknowledged)
7661{
7662 u64 *p;
7663 int lo, hi;
7664 int rv = 1;
7665
7666 if (bb->shift < 0)
7667 /* badblocks are disabled */
7668 return 0;
7669
7670 if (bb->shift) {
7671 /* round the start down, and the end up */
7672 sector_t next = s + sectors;
7673 s >>= bb->shift;
7674 next += (1<<bb->shift) - 1;
7675 next >>= bb->shift;
7676 sectors = next - s;
7677 }
7678
7679 write_seqlock_irq(&bb->lock);
7680
7681 p = bb->page;
7682 lo = 0;
7683 hi = bb->count;
7684 /* Find the last range that starts at-or-before 's' */
7685 while (hi - lo > 1) {
7686 int mid = (lo + hi) / 2;
7687 sector_t a = BB_OFFSET(p[mid]);
7688 if (a <= s)
7689 lo = mid;
7690 else
7691 hi = mid;
7692 }
7693 if (hi > lo && BB_OFFSET(p[lo]) > s)
7694 hi = lo;
7695
7696 if (hi > lo) {
7697 /* we found a range that might merge with the start
7698 * of our new range
7699 */
7700 sector_t a = BB_OFFSET(p[lo]);
7701 sector_t e = a + BB_LEN(p[lo]);
7702 int ack = BB_ACK(p[lo]);
7703 if (e >= s) {
7704 /* Yes, we can merge with a previous range */
7705 if (s == a && s + sectors >= e)
7706 /* new range covers old */
7707 ack = acknowledged;
7708 else
7709 ack = ack && acknowledged;
7710
7711 if (e < s + sectors)
7712 e = s + sectors;
7713 if (e - a <= BB_MAX_LEN) {
7714 p[lo] = BB_MAKE(a, e-a, ack);
7715 s = e;
7716 } else {
7717 /* does not all fit in one range,
7718 * make p[lo] maximal
7719 */
7720 if (BB_LEN(p[lo]) != BB_MAX_LEN)
7721 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
7722 s = a + BB_MAX_LEN;
7723 }
7724 sectors = e - s;
7725 }
7726 }
7727 if (sectors && hi < bb->count) {
7728 /* 'hi' points to the first range that starts after 's'.
7729 * Maybe we can merge with the start of that range */
7730 sector_t a = BB_OFFSET(p[hi]);
7731 sector_t e = a + BB_LEN(p[hi]);
7732 int ack = BB_ACK(p[hi]);
7733 if (a <= s + sectors) {
7734 /* merging is possible */
7735 if (e <= s + sectors) {
7736 /* full overlap */
7737 e = s + sectors;
7738 ack = acknowledged;
7739 } else
7740 ack = ack && acknowledged;
7741
7742 a = s;
7743 if (e - a <= BB_MAX_LEN) {
7744 p[hi] = BB_MAKE(a, e-a, ack);
7745 s = e;
7746 } else {
7747 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
7748 s = a + BB_MAX_LEN;
7749 }
7750 sectors = e - s;
7751 lo = hi;
7752 hi++;
7753 }
7754 }
7755 if (sectors == 0 && hi < bb->count) {
7756 /* we might be able to combine lo and hi */
7757 /* Note: 's' is at the end of 'lo' */
7758 sector_t a = BB_OFFSET(p[hi]);
7759 int lolen = BB_LEN(p[lo]);
7760 int hilen = BB_LEN(p[hi]);
7761 int newlen = lolen + hilen - (s - a);
7762 if (s >= a && newlen < BB_MAX_LEN) {
7763 /* yes, we can combine them */
7764 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
7765 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
7766 memmove(p + hi, p + hi + 1,
7767 (bb->count - hi - 1) * 8);
7768 bb->count--;
7769 }
7770 }
7771 while (sectors) {
7772 /* didn't merge (it all).
7773 * Need to add a range just before 'hi' */
7774 if (bb->count >= MD_MAX_BADBLOCKS) {
7775 /* No room for more */
7776 rv = 0;
7777 break;
7778 } else {
7779 int this_sectors = sectors;
7780 memmove(p + hi + 1, p + hi,
7781 (bb->count - hi) * 8);
7782 bb->count++;
7783
7784 if (this_sectors > BB_MAX_LEN)
7785 this_sectors = BB_MAX_LEN;
7786 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
7787 sectors -= this_sectors;
7788 s += this_sectors;
7789 }
7790 }
7791
7792 bb->changed = 1;
7793 if (!acknowledged)
7794 bb->unacked_exist = 1;
7795 write_sequnlock_irq(&bb->lock);
7796
7797 return rv;
7798}
7799
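Because a single entry can describe at most BB_MAX_LEN (512) sectors, a long bad range ends up as several consecutive entries. A stand-alone sketch (mine, not from the patch) of the same chunking that the tail loop of md_set_badblocks performs:

#include <stdio.h>

#define BB_MAX_LEN 512

int main(void)
{
	unsigned long long s = 10000;	/* start of the bad range */
	int sectors = 1300;		/* total bad sectors to record */

	/* Same shape as the "didn't merge it all" tail above:
	 * emit entries of at most BB_MAX_LEN sectors until the range is covered.
	 */
	while (sectors) {
		int this_sectors = sectors;

		if (this_sectors > BB_MAX_LEN)
			this_sectors = BB_MAX_LEN;
		printf("entry: start=%llu len=%d\n", s, this_sectors);
		sectors -= this_sectors;
		s += this_sectors;
	}
	return 0;	/* prints entries of 512, 512 and 276 sectors */
}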
7800int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
7801 int acknowledged)
7802{
7803 int rv = md_set_badblocks(&rdev->badblocks,
7804 s + rdev->data_offset, sectors, acknowledged);
7805 if (rv) {
7806 /* Make sure they get written out promptly */
7807 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
7808 md_wakeup_thread(rdev->mddev->thread);
7809 }
7810 return rv;
7811}
7812EXPORT_SYMBOL_GPL(rdev_set_badblocks);
7813
7814/*
7815 * Remove a range of bad blocks from the table.
 7816 * This may involve extending the table if we split a region,
7817 * but it must not fail. So if the table becomes full, we just
7818 * drop the remove request.
7819 */
7820static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
7821{
7822 u64 *p;
7823 int lo, hi;
7824 sector_t target = s + sectors;
7825 int rv = 0;
7826
7827 if (bb->shift > 0) {
7828 /* When clearing we round the start up and the end down.
7829 * This should not matter as the shift should align with
7830 * the block size and no rounding should ever be needed.
 7831 * However it is better to think a block is bad when it
7832 * isn't than to think a block is not bad when it is.
7833 */
7834 s += (1<<bb->shift) - 1;
7835 s >>= bb->shift;
7836 target >>= bb->shift;
7837 sectors = target - s;
7838 }
7839
7840 write_seqlock_irq(&bb->lock);
7841
7842 p = bb->page;
7843 lo = 0;
7844 hi = bb->count;
7845 /* Find the last range that starts before 'target' */
7846 while (hi - lo > 1) {
7847 int mid = (lo + hi) / 2;
7848 sector_t a = BB_OFFSET(p[mid]);
7849 if (a < target)
7850 lo = mid;
7851 else
7852 hi = mid;
7853 }
7854 if (hi > lo) {
7855 /* p[lo] is the last range that could overlap the
7856 * current range. Earlier ranges could also overlap,
7857 * but only this one can overlap the end of the range.
7858 */
7859 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
7860 /* Partial overlap, leave the tail of this range */
7861 int ack = BB_ACK(p[lo]);
7862 sector_t a = BB_OFFSET(p[lo]);
7863 sector_t end = a + BB_LEN(p[lo]);
7864
7865 if (a < s) {
7866 /* we need to split this range */
7867 if (bb->count >= MD_MAX_BADBLOCKS) {
7868 rv = 0;
7869 goto out;
7870 }
7871 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
7872 bb->count++;
7873 p[lo] = BB_MAKE(a, s-a, ack);
7874 lo++;
7875 }
7876 p[lo] = BB_MAKE(target, end - target, ack);
7877 /* there is no longer an overlap */
7878 hi = lo;
7879 lo--;
7880 }
7881 while (lo >= 0 &&
7882 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7883 /* This range does overlap */
7884 if (BB_OFFSET(p[lo]) < s) {
7885 /* Keep the early parts of this range. */
7886 int ack = BB_ACK(p[lo]);
7887 sector_t start = BB_OFFSET(p[lo]);
7888 p[lo] = BB_MAKE(start, s - start, ack);
 7889 /* now lo doesn't overlap, so... */
7890 break;
7891 }
7892 lo--;
7893 }
7894 /* 'lo' is strictly before, 'hi' is strictly after,
7895 * anything between needs to be discarded
7896 */
7897 if (hi - lo > 1) {
7898 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
7899 bb->count -= (hi - lo - 1);
7900 }
7901 }
7902
7903 bb->changed = 1;
7904out:
7905 write_sequnlock_irq(&bb->lock);
7906 return rv;
7907}
7908
7909int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
7910{
7911 return md_clear_badblocks(&rdev->badblocks,
7912 s + rdev->data_offset,
7913 sectors);
7914}
7915EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
7916
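Note the asymmetry the comment in md_clear_badblocks describes: setting rounds the range outwards (start down, end up) while clearing rounds it inwards, so with a non-zero shift a clear may end up clearing nothing rather than optimistically dropping coverage. A stand-alone sketch of the two roundings (my own, mirroring the arithmetic above):

#include <stdio.h>

/* Round a sector range outwards to 'shift'-sized blocks (used when setting). */
static void round_out(unsigned long long *s, int *sectors, int shift)
{
	unsigned long long next = *s + *sectors;

	*s >>= shift;
	next += (1ULL << shift) - 1;
	next >>= shift;
	*sectors = next - *s;
}

/* Round a sector range inwards to 'shift'-sized blocks (used when clearing). */
static void round_in(unsigned long long *s, int *sectors, int shift)
{
	unsigned long long target = *s + *sectors;

	*s += (1ULL << shift) - 1;
	*s >>= shift;
	target >>= shift;
	*sectors = target - *s;
}

int main(void)
{
	unsigned long long s1 = 1001, s2 = 1001;
	int n1 = 10, n2 = 10, shift = 3;	/* 8-sector blocks */

	round_out(&s1, &n1, shift);
	round_in(&s2, &n2, shift);
	printf("set:   block %llu, %d blocks\n", s1, n1);	/* 125, 2 */
	printf("clear: block %llu, %d blocks\n", s2, n2);	/* 126, 0: nothing fully covered */
	return 0;
}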
7917/*
7918 * Acknowledge all bad blocks in a list.
7919 * This only succeeds if ->changed is clear. It is used by
7920 * in-kernel metadata updates
7921 */
7922void md_ack_all_badblocks(struct badblocks *bb)
7923{
7924 if (bb->page == NULL || bb->changed)
7925 /* no point even trying */
7926 return;
7927 write_seqlock_irq(&bb->lock);
7928
7929 if (bb->changed == 0) {
7930 u64 *p = bb->page;
7931 int i;
7932 for (i = 0; i < bb->count ; i++) {
7933 if (!BB_ACK(p[i])) {
7934 sector_t start = BB_OFFSET(p[i]);
7935 int len = BB_LEN(p[i]);
7936 p[i] = BB_MAKE(start, len, 1);
7937 }
7938 }
7939 bb->unacked_exist = 0;
7940 }
7941 write_sequnlock_irq(&bb->lock);
7942}
7943EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
7944
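Acknowledging an entry only flips its top bit; rebuilding the entry with BB_MAKE(start, len, 1), as the loop above does, is equivalent to OR-ing in BB_ACK_MASK. A small stand-alone check (mine, with the BB_* definitions copied from the md.h part of this patch):

#include <assert.h>
#include <stdint.h>

typedef uint64_t u64;

#define BB_LEN_MASK    (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK    (0x8000000000000000ULL)
#define BB_OFFSET(x)   (((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)      (((x) & BB_LEN_MASK) + 1)
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

int main(void)
{
	u64 e = BB_MAKE(4096ULL, 16, 0);			/* unacknowledged entry */
	u64 acked = BB_MAKE(BB_OFFSET(e), BB_LEN(e), 1);	/* what md_ack_all_badblocks stores */

	assert(acked == (e | BB_ACK_MASK));	/* only the ack bit changes */
	return 0;
}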
7945/* sysfs access to bad-blocks list.
7946 * We present two files.
7947 * 'bad-blocks' lists sector numbers and lengths of ranges that
7948 * are recorded as bad. The list is truncated to fit within
7949 * the one-page limit of sysfs.
7950 * Writing "sector length" to this file adds an acknowledged
 7951 * bad block to the list.
7952 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
7953 * been acknowledged. Writing to this file adds bad blocks
7954 * without acknowledging them. This is largely for testing.
7955 */
7956
7957static ssize_t
7958badblocks_show(struct badblocks *bb, char *page, int unack)
7959{
7960 size_t len;
7961 int i;
7962 u64 *p = bb->page;
7963 unsigned seq;
7964
7965 if (bb->shift < 0)
7966 return 0;
7967
7968retry:
7969 seq = read_seqbegin(&bb->lock);
7970
7971 len = 0;
7972 i = 0;
7973
7974 while (len < PAGE_SIZE && i < bb->count) {
7975 sector_t s = BB_OFFSET(p[i]);
7976 unsigned int length = BB_LEN(p[i]);
7977 int ack = BB_ACK(p[i]);
7978 i++;
7979
7980 if (unack && ack)
7981 continue;
7982
7983 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
7984 (unsigned long long)s << bb->shift,
7985 length << bb->shift);
7986 }
7987 if (unack && len == 0)
7988 bb->unacked_exist = 0;
7989
7990 if (read_seqretry(&bb->lock, seq))
7991 goto retry;
7992
7993 return len;
7994}
7995
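The resulting text format is one "sector length" pair per line, with both values scaled back up by the shift. A stand-alone sketch of the same formatting (mine, not from the patch):

#include <stdio.h>

struct bad_range { unsigned long long start; int len; int ack; };

/* Print entries the way badblocks_show does: values scaled back by 'shift',
 * optionally restricted to unacknowledged entries.
 */
static void show(const struct bad_range *p, int count, int shift, int unack_only)
{
	int i;

	for (i = 0; i < count; i++) {
		if (unack_only && p[i].ack)
			continue;
		printf("%llu %u\n",
		       p[i].start << shift,
		       ((unsigned)p[i].len) << shift);
	}
}

int main(void)
{
	struct bad_range tbl[] = { {125, 2, 1}, {4000, 1, 0} };

	show(tbl, 2, 3, 0);	/* "1000 16" and "32000 8" with 8-sector blocks */
	show(tbl, 2, 3, 1);	/* only the unacknowledged entry: "32000 8" */
	return 0;
}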
7996#define DO_DEBUG 1
7997
7998static ssize_t
7999badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8000{
8001 unsigned long long sector;
8002 int length;
8003 char newline;
8004#ifdef DO_DEBUG
8005 /* Allow clearing via sysfs *only* for testing/debugging.
8006 * Normally only a successful write may clear a badblock
8007 */
8008 int clear = 0;
8009 if (page[0] == '-') {
8010 clear = 1;
8011 page++;
8012 }
8013#endif /* DO_DEBUG */
8014
8015 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8016 case 3:
8017 if (newline != '\n')
8018 return -EINVAL;
8019 case 2:
8020 if (length <= 0)
8021 return -EINVAL;
8022 break;
8023 default:
8024 return -EINVAL;
8025 }
8026
8027#ifdef DO_DEBUG
8028 if (clear) {
8029 md_clear_badblocks(bb, sector, length);
8030 return len;
8031 }
8032#endif /* DO_DEBUG */
8033 if (md_set_badblocks(bb, sector, length, !unack))
8034 return len;
8035 else
8036 return -ENOSPC;
8037}
8038
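For completeness, here is a stand-alone test of the accepted input syntax (my own harness, not from the patch): a "sector length" pair, optionally newline-terminated, with a leading '-' honoured only in the DO_DEBUG build to request clearing.

#include <stdio.h>

/* Parse one write to the bad-blocks attribute the way badblocks_store does.
 * Returns 0 on success, -1 for a malformed request.
 */
static int parse(const char *page, unsigned long long *sector, int *length, int *clear)
{
	char newline;

	*clear = 0;
	if (page[0] == '-') {		/* debug-only clear request */
		*clear = 1;
		page++;
	}

	switch (sscanf(page, "%llu %d%c", sector, length, &newline)) {
	case 3:
		if (newline != '\n')
			return -1;
		/* fall through */
	case 2:
		if (*length <= 0)
			return -1;
		return 0;
	default:
		return -1;
	}
}

int main(void)
{
	const char *tests[] = { "1000 8\n", "-1000 8\n", "1000 0\n", "1000 8 junk\n" };
	unsigned i;

	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
		unsigned long long sector;
		int length, clear;
		int rc = parse(tests[i], &sector, &length, &clear);

		printf("input %u: rc=%d clear=%d sector=%llu len=%d\n",
		       i, rc, clear,
		       rc == 0 ? sector : 0, rc == 0 ? length : 0);
	}
	return 0;
}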
7340static int md_notify_reboot(struct notifier_block *this, 8039static int md_notify_reboot(struct notifier_block *this,
7341 unsigned long code, void *x) 8040 unsigned long code, void *x)
7342{ 8041{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c26c7a08ae6..1e586bb4452e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,13 @@
29typedef struct mddev_s mddev_t; 29typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t; 30typedef struct mdk_rdev_s mdk_rdev_t;
31 31
32/* Bad block numbers are stored sorted in a single page.
 33 * 64 bits are used for each block or extent.
34 * 54 bits are sector number, 9 bits are extent size,
35 * 1 bit is an 'acknowledged' flag.
36 */
37#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
38
32/* 39/*
33 * MD's 'extended' device 40 * MD's 'extended' device
34 */ 41 */
@@ -48,7 +55,7 @@ struct mdk_rdev_s
48 struct block_device *meta_bdev; 55 struct block_device *meta_bdev;
49 struct block_device *bdev; /* block device handle */ 56 struct block_device *bdev; /* block device handle */
50 57
51 struct page *sb_page; 58 struct page *sb_page, *bb_page;
52 int sb_loaded; 59 int sb_loaded;
53 __u64 sb_events; 60 __u64 sb_events;
54 sector_t data_offset; /* start of data in array */ 61 sector_t data_offset; /* start of data in array */
@@ -74,9 +81,29 @@ struct mdk_rdev_s
74#define In_sync 2 /* device is in_sync with rest of array */ 81#define In_sync 2 /* device is in_sync with rest of array */
75#define WriteMostly 4 /* Avoid reading if at all possible */ 82#define WriteMostly 4 /* Avoid reading if at all possible */
76#define AutoDetected 7 /* added by auto-detect */ 83#define AutoDetected 7 /* added by auto-detect */
77#define Blocked 8 /* An error occurred on an externally 84#define Blocked 8 /* An error occurred but has not yet
78 * managed array, don't allow writes 85 * been acknowledged by the metadata
86 * handler, so don't allow writes
79 * until it is cleared */ 87 * until it is cleared */
88#define WriteErrorSeen 9 /* A write error has been seen on this
89 * device
90 */
91#define FaultRecorded 10 /* Intermediate state for clearing
92 * Blocked. The Fault is/will-be
93 * recorded in the metadata, but that
94 * metadata hasn't been stored safely
95 * on disk yet.
96 */
97#define BlockedBadBlocks 11 /* A writer is blocked because they
98 * found an unacknowledged bad-block.
99 * This can safely be cleared at any
100 * time, and the writer will re-check.
101 * It may be set at any time, and at
102 * worst the writer will timeout and
103 * re-check. So setting it as
104 * accurately as possible is good, but
105 * not absolutely critical.
106 */
80 wait_queue_head_t blocked_wait; 107 wait_queue_head_t blocked_wait;
81 108
82 int desc_nr; /* descriptor index in the superblock */ 109 int desc_nr; /* descriptor index in the superblock */
@@ -111,8 +138,54 @@ struct mdk_rdev_s
111 138
112 struct sysfs_dirent *sysfs_state; /* handle for 'state' 139 struct sysfs_dirent *sysfs_state; /* handle for 'state'
113 * sysfs entry */ 140 * sysfs entry */
141
142 struct badblocks {
143 int count; /* count of bad blocks */
144 int unacked_exist; /* there probably are unacknowledged
145 * bad blocks. This is only cleared
146 * when a read discovers none
147 */
148 int shift; /* shift from sectors to block size
149 * a -ve shift means badblocks are
150 * disabled.*/
151 u64 *page; /* badblock list */
152 int changed;
153 seqlock_t lock;
154
155 sector_t sector;
156 sector_t size; /* in sectors */
157 } badblocks;
114}; 158};
115 159
160#define BB_LEN_MASK (0x00000000000001FFULL)
161#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
162#define BB_ACK_MASK (0x8000000000000000ULL)
163#define BB_MAX_LEN 512
164#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
165#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
166#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
167#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
168
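As a quick illustration of the packed format these macros define (start sector in the upper 54 bits, length minus one in the low 9 bits, ack flag in bit 63), the following stand-alone user-space sketch, which is not part of the patch, copies the definitions and round-trips one entry:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* Same packing as drivers/md/md.h */
#define BB_LEN_MASK    (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK    (0x8000000000000000ULL)
#define BB_MAX_LEN     512
#define BB_OFFSET(x)   (((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)      (((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)      (!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

int main(void)
{
	u64 e = BB_MAKE(123456789ULL, 8, 1);	/* 8 bad sectors at 123456789, acknowledged */

	assert(BB_OFFSET(e) == 123456789ULL);
	assert(BB_LEN(e) == 8);
	assert(BB_ACK(e) == 1);

	printf("entry=%#llx start=%llu len=%u ack=%d\n",
	       (unsigned long long)e,
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned)BB_LEN(e), BB_ACK(e));
	return 0;
}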
169extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
170 sector_t *first_bad, int *bad_sectors);
171static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
172 sector_t *first_bad, int *bad_sectors)
173{
174 if (unlikely(rdev->badblocks.count)) {
175 int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
176 sectors,
177 first_bad, bad_sectors);
178 if (rv)
179 *first_bad -= rdev->data_offset;
180 return rv;
181 }
182 return 0;
183}
184extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
185 int acknowledged);
186extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
187extern void md_ack_all_badblocks(struct badblocks *bb);
188
116struct mddev_s 189struct mddev_s
117{ 190{
118 void *private; 191 void *private;
@@ -239,9 +312,12 @@ struct mddev_s
239#define MD_RECOVERY_FROZEN 9 312#define MD_RECOVERY_FROZEN 9
240 313
241 unsigned long recovery; 314 unsigned long recovery;
242 int recovery_disabled; /* if we detect that recovery 315 /* If a RAID personality determines that recovery (of a particular
243 * will always fail, set this 316 * device) will fail due to a read error on the source device, it
244 * so we don't loop trying */ 317 * takes a copy of this number and does not attempt recovery again
318 * until this number changes.
319 */
320 int recovery_disabled;
245 321
246 int in_sync; /* know to not need resync */ 322 int in_sync; /* know to not need resync */
247 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so 323 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
@@ -304,11 +380,6 @@ struct mddev_s
304 * hot-adding a bitmap. It should 380 * hot-adding a bitmap. It should
305 * eventually be settable by sysfs. 381 * eventually be settable by sysfs.
306 */ 382 */
307 /* When md is serving under dm, it might use a
308 * dirty_log to store the bits.
309 */
310 struct dm_dirty_log *log;
311
312 struct mutex mutex; 383 struct mutex mutex;
313 unsigned long chunksize; 384 unsigned long chunksize;
314 unsigned long daemon_sleep; /* how many jiffies between updates? */ 385 unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev)
413 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; 484 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
414} 485}
415 486
487static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
488{
489 char nm[20];
490 sprintf(nm, "rd%d", rdev->raid_disk);
491 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
492}
493
494static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
495{
496 char nm[20];
497 sprintf(nm, "rd%d", rdev->raid_disk);
498 sysfs_remove_link(&mddev->kobj, nm);
499}
500
416/* 501/*
417 * iterates through some rdev ringlist. It's safe to remove the 502 * iterates through some rdev ringlist. It's safe to remove the
 418 * current 'rdev'. Don't touch 'tmp' though. 503 * current 'rdev'. Don't touch 'tmp' though.
@@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev);
505extern int md_run(mddev_t *mddev); 590extern int md_run(mddev_t *mddev);
506extern void md_stop(mddev_t *mddev); 591extern void md_stop(mddev_t *mddev);
507extern void md_stop_writes(mddev_t *mddev); 592extern void md_stop_writes(mddev_t *mddev);
508extern void md_rdev_init(mdk_rdev_t *rdev); 593extern int md_rdev_init(mdk_rdev_t *rdev);
509 594
510extern void mddev_suspend(mddev_t *mddev); 595extern void mddev_suspend(mddev_t *mddev);
511extern void mddev_resume(mddev_t *mddev); 596extern void mddev_resume(mddev_t *mddev);
@@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
514extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 599extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
515 mddev_t *mddev); 600 mddev_t *mddev);
516extern int mddev_check_plugged(mddev_t *mddev); 601extern int mddev_check_plugged(mddev_t *mddev);
602extern void md_trim_bio(struct bio *bio, int offset, int size);
517#endif /* _MD_MD_H */ 603#endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7431b6d8447..32323f0afd89 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,16 +35,13 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/ratelimit.h>
38#include "md.h" 39#include "md.h"
39#include "raid1.h" 40#include "raid1.h"
40#include "bitmap.h" 41#include "bitmap.h"
41 42
42#define DEBUG 0 43#define DEBUG 0
43#if DEBUG 44#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
44#define PRINTK(x...) printk(x)
45#else
46#define PRINTK(x...)
47#endif
48 45
49/* 46/*
50 * Number of guaranteed r1bios in case of extreme VM load: 47 * Number of guaranteed r1bios in case of extreme VM load:
@@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
166 163
167 for (i = 0; i < conf->raid_disks; i++) { 164 for (i = 0; i < conf->raid_disks; i++) {
168 struct bio **bio = r1_bio->bios + i; 165 struct bio **bio = r1_bio->bios + i;
169 if (*bio && *bio != IO_BLOCKED) 166 if (!BIO_SPECIAL(*bio))
170 bio_put(*bio); 167 bio_put(*bio);
171 *bio = NULL; 168 *bio = NULL;
172 } 169 }
@@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio)
176{ 173{
177 conf_t *conf = r1_bio->mddev->private; 174 conf_t *conf = r1_bio->mddev->private;
178 175
179 /*
180 * Wake up any possible resync thread that waits for the device
181 * to go idle.
182 */
183 allow_barrier(conf);
184
185 put_all_bios(conf, r1_bio); 176 put_all_bios(conf, r1_bio);
186 mempool_free(r1_bio, conf->r1bio_pool); 177 mempool_free(r1_bio, conf->r1bio_pool);
187} 178}
@@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio)
222 * operation and are ready to return a success/failure code to the buffer 213 * operation and are ready to return a success/failure code to the buffer
223 * cache layer. 214 * cache layer.
224 */ 215 */
216static void call_bio_endio(r1bio_t *r1_bio)
217{
218 struct bio *bio = r1_bio->master_bio;
219 int done;
220 conf_t *conf = r1_bio->mddev->private;
221
222 if (bio->bi_phys_segments) {
223 unsigned long flags;
224 spin_lock_irqsave(&conf->device_lock, flags);
225 bio->bi_phys_segments--;
226 done = (bio->bi_phys_segments == 0);
227 spin_unlock_irqrestore(&conf->device_lock, flags);
228 } else
229 done = 1;
230
231 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
232 clear_bit(BIO_UPTODATE, &bio->bi_flags);
233 if (done) {
234 bio_endio(bio, 0);
235 /*
236 * Wake up any possible resync thread that waits for the device
237 * to go idle.
238 */
239 allow_barrier(conf);
240 }
241}
242
225static void raid_end_bio_io(r1bio_t *r1_bio) 243static void raid_end_bio_io(r1bio_t *r1_bio)
226{ 244{
227 struct bio *bio = r1_bio->master_bio; 245 struct bio *bio = r1_bio->master_bio;
@@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
234 (unsigned long long) bio->bi_sector + 252 (unsigned long long) bio->bi_sector +
235 (bio->bi_size >> 9) - 1); 253 (bio->bi_size >> 9) - 1);
236 254
237 bio_endio(bio, 255 call_bio_endio(r1_bio);
238 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
239 } 256 }
240 free_r1bio(r1_bio); 257 free_r1bio(r1_bio);
241} 258}
@@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error)
287 * oops, read error: 304 * oops, read error:
288 */ 305 */
289 char b[BDEVNAME_SIZE]; 306 char b[BDEVNAME_SIZE];
290 if (printk_ratelimit()) 307 printk_ratelimited(
291 printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", 308 KERN_ERR "md/raid1:%s: %s: "
292 mdname(conf->mddev), 309 "rescheduling sector %llu\n",
293 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 310 mdname(conf->mddev),
311 bdevname(conf->mirrors[mirror].rdev->bdev,
312 b),
313 (unsigned long long)r1_bio->sector);
314 set_bit(R1BIO_ReadError, &r1_bio->state);
294 reschedule_retry(r1_bio); 315 reschedule_retry(r1_bio);
295 } 316 }
296 317
297 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 318 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
298} 319}
299 320
321static void close_write(r1bio_t *r1_bio)
322{
323 /* it really is the end of this request */
324 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
325 /* free extra copy of the data pages */
326 int i = r1_bio->behind_page_count;
327 while (i--)
328 safe_put_page(r1_bio->behind_bvecs[i].bv_page);
329 kfree(r1_bio->behind_bvecs);
330 r1_bio->behind_bvecs = NULL;
331 }
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state),
336 test_bit(R1BIO_BehindIO, &r1_bio->state));
337 md_write_end(r1_bio->mddev);
338}
339
300static void r1_bio_write_done(r1bio_t *r1_bio) 340static void r1_bio_write_done(r1bio_t *r1_bio)
301{ 341{
302 if (atomic_dec_and_test(&r1_bio->remaining)) 342 if (!atomic_dec_and_test(&r1_bio->remaining))
303 { 343 return;
304 /* it really is the end of this request */ 344
305 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 345 if (test_bit(R1BIO_WriteError, &r1_bio->state))
306 /* free extra copy of the data pages */ 346 reschedule_retry(r1_bio);
307 int i = r1_bio->behind_page_count; 347 else {
308 while (i--) 348 close_write(r1_bio);
309 safe_put_page(r1_bio->behind_pages[i]); 349 if (test_bit(R1BIO_MadeGood, &r1_bio->state))
310 kfree(r1_bio->behind_pages); 350 reschedule_retry(r1_bio);
311 r1_bio->behind_pages = NULL; 351 else
312 } 352 raid_end_bio_io(r1_bio);
313 /* clear the bitmap if all writes complete successfully */
314 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
315 r1_bio->sectors,
316 !test_bit(R1BIO_Degraded, &r1_bio->state),
317 test_bit(R1BIO_BehindIO, &r1_bio->state));
318 md_write_end(r1_bio->mddev);
319 raid_end_bio_io(r1_bio);
320 } 353 }
321} 354}
322 355
@@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
336 /* 369 /*
337 * 'one mirror IO has finished' event handler: 370 * 'one mirror IO has finished' event handler:
338 */ 371 */
339 r1_bio->bios[mirror] = NULL;
340 to_put = bio;
341 if (!uptodate) { 372 if (!uptodate) {
342 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 373 set_bit(WriteErrorSeen,
343 /* an I/O failed, we can't clear the bitmap */ 374 &conf->mirrors[mirror].rdev->flags);
344 set_bit(R1BIO_Degraded, &r1_bio->state); 375 set_bit(R1BIO_WriteError, &r1_bio->state);
345 } else 376 } else {
346 /* 377 /*
347 * Set R1BIO_Uptodate in our master bio, so that we 378 * Set R1BIO_Uptodate in our master bio, so that we
 348 * will return a good error code to the higher 379 * will return a good error code to the higher
@@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error)
353 * to user-side. So if something waits for IO, then it 384 * to user-side. So if something waits for IO, then it
354 * will wait for the 'master' bio. 385 * will wait for the 'master' bio.
355 */ 386 */
387 sector_t first_bad;
388 int bad_sectors;
389
390 r1_bio->bios[mirror] = NULL;
391 to_put = bio;
356 set_bit(R1BIO_Uptodate, &r1_bio->state); 392 set_bit(R1BIO_Uptodate, &r1_bio->state);
357 393
394 /* Maybe we can clear some bad blocks. */
395 if (is_badblock(conf->mirrors[mirror].rdev,
396 r1_bio->sector, r1_bio->sectors,
397 &first_bad, &bad_sectors)) {
398 r1_bio->bios[mirror] = IO_MADE_GOOD;
399 set_bit(R1BIO_MadeGood, &r1_bio->state);
400 }
401 }
402
358 update_head_pos(mirror, r1_bio); 403 update_head_pos(mirror, r1_bio);
359 404
360 if (behind) { 405 if (behind) {
@@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
377 (unsigned long long) mbio->bi_sector, 422 (unsigned long long) mbio->bi_sector,
378 (unsigned long long) mbio->bi_sector + 423 (unsigned long long) mbio->bi_sector +
379 (mbio->bi_size >> 9) - 1); 424 (mbio->bi_size >> 9) - 1);
380 bio_endio(mbio, 0); 425 call_bio_endio(r1_bio);
381 } 426 }
382 } 427 }
383 } 428 }
384 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 429 if (r1_bio->bios[mirror] == NULL)
430 rdev_dec_pending(conf->mirrors[mirror].rdev,
431 conf->mddev);
385 432
386 /* 433 /*
387 * Let's see if all mirrored write operations have finished 434 * Let's see if all mirrored write operations have finished
@@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
408 * 455 *
409 * The rdev for the device selected will have nr_pending incremented. 456 * The rdev for the device selected will have nr_pending incremented.
410 */ 457 */
411static int read_balance(conf_t *conf, r1bio_t *r1_bio) 458static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
412{ 459{
413 const sector_t this_sector = r1_bio->sector; 460 const sector_t this_sector = r1_bio->sector;
414 const int sectors = r1_bio->sectors; 461 int sectors;
462 int best_good_sectors;
415 int start_disk; 463 int start_disk;
416 int best_disk; 464 int best_disk;
417 int i; 465 int i;
@@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
426 * We take the first readable disk when above the resync window. 474 * We take the first readable disk when above the resync window.
427 */ 475 */
428 retry: 476 retry:
477 sectors = r1_bio->sectors;
429 best_disk = -1; 478 best_disk = -1;
430 best_dist = MaxSector; 479 best_dist = MaxSector;
480 best_good_sectors = 0;
481
431 if (conf->mddev->recovery_cp < MaxSector && 482 if (conf->mddev->recovery_cp < MaxSector &&
432 (this_sector + sectors >= conf->next_resync)) { 483 (this_sector + sectors >= conf->next_resync)) {
433 choose_first = 1; 484 choose_first = 1;
@@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
439 490
440 for (i = 0 ; i < conf->raid_disks ; i++) { 491 for (i = 0 ; i < conf->raid_disks ; i++) {
441 sector_t dist; 492 sector_t dist;
493 sector_t first_bad;
494 int bad_sectors;
495
442 int disk = start_disk + i; 496 int disk = start_disk + i;
443 if (disk >= conf->raid_disks) 497 if (disk >= conf->raid_disks)
444 disk -= conf->raid_disks; 498 disk -= conf->raid_disks;
@@ -461,6 +515,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
461 /* This is a reasonable device to use. It might 515 /* This is a reasonable device to use. It might
462 * even be best. 516 * even be best.
463 */ 517 */
518 if (is_badblock(rdev, this_sector, sectors,
519 &first_bad, &bad_sectors)) {
520 if (best_dist < MaxSector)
521 /* already have a better device */
522 continue;
523 if (first_bad <= this_sector) {
524 /* cannot read here. If this is the 'primary'
525 * device, then we must not read beyond
526 * bad_sectors from another device..
527 */
528 bad_sectors -= (this_sector - first_bad);
529 if (choose_first && sectors > bad_sectors)
530 sectors = bad_sectors;
531 if (best_good_sectors > sectors)
532 best_good_sectors = sectors;
533
534 } else {
535 sector_t good_sectors = first_bad - this_sector;
536 if (good_sectors > best_good_sectors) {
537 best_good_sectors = good_sectors;
538 best_disk = disk;
539 }
540 if (choose_first)
541 break;
542 }
543 continue;
544 } else
545 best_good_sectors = sectors;
546
464 dist = abs(this_sector - conf->mirrors[disk].head_position); 547 dist = abs(this_sector - conf->mirrors[disk].head_position);
465 if (choose_first 548 if (choose_first
466 /* Don't change to another disk for sequential reads */ 549 /* Don't change to another disk for sequential reads */
@@ -489,10 +572,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
489 rdev_dec_pending(rdev, conf->mddev); 572 rdev_dec_pending(rdev, conf->mddev);
490 goto retry; 573 goto retry;
491 } 574 }
575 sectors = best_good_sectors;
492 conf->next_seq_sect = this_sector + sectors; 576 conf->next_seq_sect = this_sector + sectors;
493 conf->last_used = best_disk; 577 conf->last_used = best_disk;
494 } 578 }
495 rcu_read_unlock(); 579 rcu_read_unlock();
580 *max_sectors = sectors;
496 581
497 return best_disk; 582 return best_disk;
498} 583}
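The clamping in the new bad-block branch of read_balance is plain interval arithmetic: if the bad range starts at or before this_sector the device cannot serve the first sector at all (and on the choose_first path the request itself may be shortened to the surviving bad_sectors), otherwise the device can serve first_bad - this_sector good sectors before the bad range. A stand-alone sketch of that arithmetic (mine, not from the patch):

#include <stdio.h>

/* Given a request [this_sector, this_sector+sectors) and a bad range starting at
 * first_bad with bad_sectors sectors on this device, report how many sectors the
 * device can usefully serve from the start of the request (0 means "cannot start here").
 */
static int usable_from_start(unsigned long long this_sector, int sectors,
			     unsigned long long first_bad, int bad_sectors)
{
	if (first_bad <= this_sector)
		return 0;	/* the very first sector is bad on this device */
	if (first_bad - this_sector < (unsigned long long)sectors)
		return (int)(first_bad - this_sector);	/* good sectors before the bad range */
	return sectors;		/* bad range lies beyond the request */
}

int main(void)
{
	/* Request 64 sectors at 1000; bad range 1040..1047 on this device. */
	printf("%d\n", usable_from_start(1000, 64, 1040, 8));	/* 40 */
	/* Bad range covers the start of the request. */
	printf("%d\n", usable_from_start(1000, 64, 996, 16));	/* 0 */
	return 0;
}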
@@ -672,30 +757,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
672{ 757{
673 int i; 758 int i;
674 struct bio_vec *bvec; 759 struct bio_vec *bvec;
675 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), 760 struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
676 GFP_NOIO); 761 GFP_NOIO);
677 if (unlikely(!pages)) 762 if (unlikely(!bvecs))
678 return; 763 return;
679 764
680 bio_for_each_segment(bvec, bio, i) { 765 bio_for_each_segment(bvec, bio, i) {
681 pages[i] = alloc_page(GFP_NOIO); 766 bvecs[i] = *bvec;
682 if (unlikely(!pages[i])) 767 bvecs[i].bv_page = alloc_page(GFP_NOIO);
768 if (unlikely(!bvecs[i].bv_page))
683 goto do_sync_io; 769 goto do_sync_io;
684 memcpy(kmap(pages[i]) + bvec->bv_offset, 770 memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
685 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); 771 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
686 kunmap(pages[i]); 772 kunmap(bvecs[i].bv_page);
687 kunmap(bvec->bv_page); 773 kunmap(bvec->bv_page);
688 } 774 }
689 r1_bio->behind_pages = pages; 775 r1_bio->behind_bvecs = bvecs;
690 r1_bio->behind_page_count = bio->bi_vcnt; 776 r1_bio->behind_page_count = bio->bi_vcnt;
691 set_bit(R1BIO_BehindIO, &r1_bio->state); 777 set_bit(R1BIO_BehindIO, &r1_bio->state);
692 return; 778 return;
693 779
694do_sync_io: 780do_sync_io:
695 for (i = 0; i < bio->bi_vcnt; i++) 781 for (i = 0; i < bio->bi_vcnt; i++)
696 if (pages[i]) 782 if (bvecs[i].bv_page)
697 put_page(pages[i]); 783 put_page(bvecs[i].bv_page);
698 kfree(pages); 784 kfree(bvecs);
699 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 785 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
700} 786}
701 787
@@ -705,7 +791,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
705 mirror_info_t *mirror; 791 mirror_info_t *mirror;
706 r1bio_t *r1_bio; 792 r1bio_t *r1_bio;
707 struct bio *read_bio; 793 struct bio *read_bio;
708 int i, targets = 0, disks; 794 int i, disks;
709 struct bitmap *bitmap; 795 struct bitmap *bitmap;
710 unsigned long flags; 796 unsigned long flags;
711 const int rw = bio_data_dir(bio); 797 const int rw = bio_data_dir(bio);
@@ -713,6 +799,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
713 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 799 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
714 mdk_rdev_t *blocked_rdev; 800 mdk_rdev_t *blocked_rdev;
715 int plugged; 801 int plugged;
802 int first_clone;
803 int sectors_handled;
804 int max_sectors;
716 805
717 /* 806 /*
718 * Register the new request and wait if the reconstruction 807 * Register the new request and wait if the reconstruction
@@ -759,11 +848,24 @@ static int make_request(mddev_t *mddev, struct bio * bio)
759 r1_bio->mddev = mddev; 848 r1_bio->mddev = mddev;
760 r1_bio->sector = bio->bi_sector; 849 r1_bio->sector = bio->bi_sector;
761 850
851 /* We might need to issue multiple reads to different
852 * devices if there are bad blocks around, so we keep
853 * track of the number of reads in bio->bi_phys_segments.
854 * If this is 0, there is only one r1_bio and no locking
855 * will be needed when requests complete. If it is
856 * non-zero, then it is the number of not-completed requests.
857 */
858 bio->bi_phys_segments = 0;
859 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
860
762 if (rw == READ) { 861 if (rw == READ) {
763 /* 862 /*
764 * read balancing logic: 863 * read balancing logic:
765 */ 864 */
766 int rdisk = read_balance(conf, r1_bio); 865 int rdisk;
866
867read_again:
868 rdisk = read_balance(conf, r1_bio, &max_sectors);
767 869
768 if (rdisk < 0) { 870 if (rdisk < 0) {
769 /* couldn't find anywhere to read from */ 871 /* couldn't find anywhere to read from */
@@ -784,6 +886,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
784 r1_bio->read_disk = rdisk; 886 r1_bio->read_disk = rdisk;
785 887
786 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 888 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
889 md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
890 max_sectors);
787 891
788 r1_bio->bios[rdisk] = read_bio; 892 r1_bio->bios[rdisk] = read_bio;
789 893
@@ -793,16 +897,52 @@ static int make_request(mddev_t *mddev, struct bio * bio)
793 read_bio->bi_rw = READ | do_sync; 897 read_bio->bi_rw = READ | do_sync;
794 read_bio->bi_private = r1_bio; 898 read_bio->bi_private = r1_bio;
795 899
796 generic_make_request(read_bio); 900 if (max_sectors < r1_bio->sectors) {
901 /* could not read all from this device, so we will
902 * need another r1_bio.
903 */
904
905 sectors_handled = (r1_bio->sector + max_sectors
906 - bio->bi_sector);
907 r1_bio->sectors = max_sectors;
908 spin_lock_irq(&conf->device_lock);
909 if (bio->bi_phys_segments == 0)
910 bio->bi_phys_segments = 2;
911 else
912 bio->bi_phys_segments++;
913 spin_unlock_irq(&conf->device_lock);
914 /* Cannot call generic_make_request directly
915 * as that will be queued in __make_request
916 * and subsequent mempool_alloc might block waiting
917 * for it. So hand bio over to raid1d.
918 */
919 reschedule_retry(r1_bio);
920
921 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
922
923 r1_bio->master_bio = bio;
924 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
925 r1_bio->state = 0;
926 r1_bio->mddev = mddev;
927 r1_bio->sector = bio->bi_sector + sectors_handled;
928 goto read_again;
929 } else
930 generic_make_request(read_bio);
797 return 0; 931 return 0;
798 } 932 }
799 933
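The splitting scheme described in the comment above ("We might need to issue multiple reads to different devices...") can be modelled outside the kernel as: carve the request into chunks bounded by what each pass can handle, count the outstanding chunks, and complete the parent only when the count reaches zero. A stand-alone model (mine; it simplifies the kernel's convention, which leaves bi_phys_segments at 0 for the common unsplit case and starts it at 2 on the first split):

#include <stdio.h>

struct parent_req {
	int total_sectors;
	int pending;		/* outstanding chunks, playing the role of bi_phys_segments */
	int completed;		/* set once the whole request has been ended */
};

static void chunk_done(struct parent_req *req)
{
	if (--req->pending == 0) {
		req->completed = 1;
		printf("parent request completed\n");
	}
}

int main(void)
{
	struct parent_req req = { .total_sectors = 100, .pending = 0, .completed = 0 };
	int handled = 0;
	int max_per_chunk[] = { 24, 40, 36 };	/* e.g. limited by bad-block boundaries; covers 100 */
	int i;

	/* Issue chunks; each chunk is accounted for before it can complete. */
	for (i = 0; handled < req.total_sectors; i++) {
		int this = req.total_sectors - handled;

		if (this > max_per_chunk[i])
			this = max_per_chunk[i];
		req.pending++;
		printf("chunk %d: sectors %d..%d\n", i, handled, handled + this - 1);
		handled += this;
	}

	/* Later, completions arrive (order does not matter). */
	while (req.pending)
		chunk_done(&req);

	return 0;
}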
800 /* 934 /*
801 * WRITE: 935 * WRITE:
802 */ 936 */
803 /* first select target devices under spinlock and 937 /* first select target devices under rcu_lock and
804 * inc refcount on their rdev. Record them by setting 938 * inc refcount on their rdev. Record them by setting
805 * bios[x] to bio 939 * bios[x] to bio
940 * If there are known/acknowledged bad blocks on any device on
941 * which we have seen a write error, we want to avoid writing those
942 * blocks.
943 * This potentially requires several writes to write around
 944 * the bad blocks. Each set of writes gets its own r1bio
945 * with a set of bios attached.
806 */ 946 */
807 plugged = mddev_check_plugged(mddev); 947 plugged = mddev_check_plugged(mddev);
808 948
@@ -810,6 +950,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
810 retry_write: 950 retry_write:
811 blocked_rdev = NULL; 951 blocked_rdev = NULL;
812 rcu_read_lock(); 952 rcu_read_lock();
953 max_sectors = r1_bio->sectors;
813 for (i = 0; i < disks; i++) { 954 for (i = 0; i < disks; i++) {
814 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 955 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
815 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 956 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -817,17 +958,56 @@ static int make_request(mddev_t *mddev, struct bio * bio)
817 blocked_rdev = rdev; 958 blocked_rdev = rdev;
818 break; 959 break;
819 } 960 }
820 if (rdev && !test_bit(Faulty, &rdev->flags)) { 961 r1_bio->bios[i] = NULL;
821 atomic_inc(&rdev->nr_pending); 962 if (!rdev || test_bit(Faulty, &rdev->flags)) {
822 if (test_bit(Faulty, &rdev->flags)) { 963 set_bit(R1BIO_Degraded, &r1_bio->state);
964 continue;
965 }
966
967 atomic_inc(&rdev->nr_pending);
968 if (test_bit(WriteErrorSeen, &rdev->flags)) {
969 sector_t first_bad;
970 int bad_sectors;
971 int is_bad;
972
973 is_bad = is_badblock(rdev, r1_bio->sector,
974 max_sectors,
975 &first_bad, &bad_sectors);
976 if (is_bad < 0) {
977 /* mustn't write here until the bad block is
978 * acknowledged*/
979 set_bit(BlockedBadBlocks, &rdev->flags);
980 blocked_rdev = rdev;
981 break;
982 }
983 if (is_bad && first_bad <= r1_bio->sector) {
984 /* Cannot write here at all */
985 bad_sectors -= (r1_bio->sector - first_bad);
986 if (bad_sectors < max_sectors)
987 /* mustn't write more than bad_sectors
988 * to other devices yet
989 */
990 max_sectors = bad_sectors;
823 rdev_dec_pending(rdev, mddev); 991 rdev_dec_pending(rdev, mddev);
824 r1_bio->bios[i] = NULL; 992 /* We don't set R1BIO_Degraded as that
825 } else { 993 * only applies if the disk is
826 r1_bio->bios[i] = bio; 994 * missing, so it might be re-added,
827 targets++; 995 * and we want to know to recover this
996 * chunk.
997 * In this case the device is here,
998 * and the fact that this chunk is not
999 * in-sync is recorded in the bad
1000 * block log
1001 */
1002 continue;
828 } 1003 }
829 } else 1004 if (is_bad) {
830 r1_bio->bios[i] = NULL; 1005 int good_sectors = first_bad - r1_bio->sector;
1006 if (good_sectors < max_sectors)
1007 max_sectors = good_sectors;
1008 }
1009 }
1010 r1_bio->bios[i] = bio;
831 } 1011 }
832 rcu_read_unlock(); 1012 rcu_read_unlock();
833 1013
@@ -838,51 +1018,57 @@ static int make_request(mddev_t *mddev, struct bio * bio)
838 for (j = 0; j < i; j++) 1018 for (j = 0; j < i; j++)
839 if (r1_bio->bios[j]) 1019 if (r1_bio->bios[j])
840 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1020 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
841 1021 r1_bio->state = 0;
842 allow_barrier(conf); 1022 allow_barrier(conf);
843 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1023 md_wait_for_blocked_rdev(blocked_rdev, mddev);
844 wait_barrier(conf); 1024 wait_barrier(conf);
845 goto retry_write; 1025 goto retry_write;
846 } 1026 }
847 1027
848 BUG_ON(targets == 0); /* we never fail the last device */ 1028 if (max_sectors < r1_bio->sectors) {
849 1029 /* We are splitting this write into multiple parts, so
850 if (targets < conf->raid_disks) { 1030 * we need to prepare for allocating another r1_bio.
851 /* array is degraded, we will not clear the bitmap 1031 */
852 * on I/O completion (see raid1_end_write_request) */ 1032 r1_bio->sectors = max_sectors;
853 set_bit(R1BIO_Degraded, &r1_bio->state); 1033 spin_lock_irq(&conf->device_lock);
1034 if (bio->bi_phys_segments == 0)
1035 bio->bi_phys_segments = 2;
1036 else
1037 bio->bi_phys_segments++;
1038 spin_unlock_irq(&conf->device_lock);
854 } 1039 }
855 1040 sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
856 /* do behind I/O ?
857 * Not if there are too many, or cannot allocate memory,
858 * or a reader on WriteMostly is waiting for behind writes
859 * to flush */
860 if (bitmap &&
861 (atomic_read(&bitmap->behind_writes)
862 < mddev->bitmap_info.max_write_behind) &&
863 !waitqueue_active(&bitmap->behind_wait))
864 alloc_behind_pages(bio, r1_bio);
865 1041
866 atomic_set(&r1_bio->remaining, 1); 1042 atomic_set(&r1_bio->remaining, 1);
867 atomic_set(&r1_bio->behind_remaining, 0); 1043 atomic_set(&r1_bio->behind_remaining, 0);
868 1044
869 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, 1045 first_clone = 1;
870 test_bit(R1BIO_BehindIO, &r1_bio->state));
871 for (i = 0; i < disks; i++) { 1046 for (i = 0; i < disks; i++) {
872 struct bio *mbio; 1047 struct bio *mbio;
873 if (!r1_bio->bios[i]) 1048 if (!r1_bio->bios[i])
874 continue; 1049 continue;
875 1050
876 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1051 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
877 r1_bio->bios[i] = mbio; 1052 md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
878 1053
879 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 1054 if (first_clone) {
880 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1055 /* do behind I/O ?
881 mbio->bi_end_io = raid1_end_write_request; 1056 * Not if there are too many, or cannot
882 mbio->bi_rw = WRITE | do_flush_fua | do_sync; 1057 * allocate memory, or a reader on WriteMostly
883 mbio->bi_private = r1_bio; 1058 * is waiting for behind writes to flush */
884 1059 if (bitmap &&
885 if (r1_bio->behind_pages) { 1060 (atomic_read(&bitmap->behind_writes)
1061 < mddev->bitmap_info.max_write_behind) &&
1062 !waitqueue_active(&bitmap->behind_wait))
1063 alloc_behind_pages(mbio, r1_bio);
1064
1065 bitmap_startwrite(bitmap, r1_bio->sector,
1066 r1_bio->sectors,
1067 test_bit(R1BIO_BehindIO,
1068 &r1_bio->state));
1069 first_clone = 0;
1070 }
1071 if (r1_bio->behind_bvecs) {
886 struct bio_vec *bvec; 1072 struct bio_vec *bvec;
887 int j; 1073 int j;
888 1074
@@ -894,11 +1080,20 @@ static int make_request(mddev_t *mddev, struct bio * bio)
894 * them all 1080 * them all
895 */ 1081 */
896 __bio_for_each_segment(bvec, mbio, j, 0) 1082 __bio_for_each_segment(bvec, mbio, j, 0)
897 bvec->bv_page = r1_bio->behind_pages[j]; 1083 bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
898 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 1084 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
899 atomic_inc(&r1_bio->behind_remaining); 1085 atomic_inc(&r1_bio->behind_remaining);
900 } 1086 }
901 1087
1088 r1_bio->bios[i] = mbio;
1089
1090 mbio->bi_sector = (r1_bio->sector +
1091 conf->mirrors[i].rdev->data_offset);
1092 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1093 mbio->bi_end_io = raid1_end_write_request;
1094 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
1095 mbio->bi_private = r1_bio;
1096
902 atomic_inc(&r1_bio->remaining); 1097 atomic_inc(&r1_bio->remaining);
903 spin_lock_irqsave(&conf->device_lock, flags); 1098 spin_lock_irqsave(&conf->device_lock, flags);
904 bio_list_add(&conf->pending_bio_list, mbio); 1099 bio_list_add(&conf->pending_bio_list, mbio);
@@ -909,6 +1104,19 @@ static int make_request(mddev_t *mddev, struct bio * bio)
909 /* In case raid1d snuck in to freeze_array */ 1104 /* In case raid1d snuck in to freeze_array */
910 wake_up(&conf->wait_barrier); 1105 wake_up(&conf->wait_barrier);
911 1106
1107 if (sectors_handled < (bio->bi_size >> 9)) {
1108 /* We need another r1_bio. It has already been counted
1109 * in bio->bi_phys_segments
1110 */
1111 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1112 r1_bio->master_bio = bio;
1113 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1114 r1_bio->state = 0;
1115 r1_bio->mddev = mddev;
1116 r1_bio->sector = bio->bi_sector + sectors_handled;
1117 goto retry_write;
1118 }
1119
912 if (do_sync || !bitmap || !plugged) 1120 if (do_sync || !bitmap || !plugged)
913 md_wakeup_thread(mddev->thread); 1121 md_wakeup_thread(mddev->thread);
914 1122
@@ -952,9 +1160,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
952 * However don't try a recovery from this drive as 1160 * However don't try a recovery from this drive as
953 * it is very likely to fail. 1161 * it is very likely to fail.
954 */ 1162 */
955 mddev->recovery_disabled = 1; 1163 conf->recovery_disabled = mddev->recovery_disabled;
956 return; 1164 return;
957 } 1165 }
1166 set_bit(Blocked, &rdev->flags);
958 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1167 if (test_and_clear_bit(In_sync, &rdev->flags)) {
959 unsigned long flags; 1168 unsigned long flags;
960 spin_lock_irqsave(&conf->device_lock, flags); 1169 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1027,7 +1236,7 @@ static int raid1_spare_active(mddev_t *mddev)
1027 && !test_bit(Faulty, &rdev->flags) 1236 && !test_bit(Faulty, &rdev->flags)
1028 && !test_and_set_bit(In_sync, &rdev->flags)) { 1237 && !test_and_set_bit(In_sync, &rdev->flags)) {
1029 count++; 1238 count++;
1030 sysfs_notify_dirent(rdev->sysfs_state); 1239 sysfs_notify_dirent_safe(rdev->sysfs_state);
1031 } 1240 }
1032 } 1241 }
1033 spin_lock_irqsave(&conf->device_lock, flags); 1242 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1048,6 +1257,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1048 int first = 0; 1257 int first = 0;
1049 int last = mddev->raid_disks - 1; 1258 int last = mddev->raid_disks - 1;
1050 1259
1260 if (mddev->recovery_disabled == conf->recovery_disabled)
1261 return -EBUSY;
1262
1051 if (rdev->raid_disk >= 0) 1263 if (rdev->raid_disk >= 0)
1052 first = last = rdev->raid_disk; 1264 first = last = rdev->raid_disk;
1053 1265
@@ -1103,7 +1315,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1103 * is not possible. 1315 * is not possible.
1104 */ 1316 */
1105 if (!test_bit(Faulty, &rdev->flags) && 1317 if (!test_bit(Faulty, &rdev->flags) &&
1106 !mddev->recovery_disabled && 1318 mddev->recovery_disabled != conf->recovery_disabled &&
1107 mddev->degraded < conf->raid_disks) { 1319 mddev->degraded < conf->raid_disks) {
1108 err = -EBUSY; 1320 err = -EBUSY;
1109 goto abort; 1321 goto abort;
@@ -1155,6 +1367,8 @@ static void end_sync_write(struct bio *bio, int error)
1155 conf_t *conf = mddev->private; 1367 conf_t *conf = mddev->private;
1156 int i; 1368 int i;
1157 int mirror=0; 1369 int mirror=0;
1370 sector_t first_bad;
1371 int bad_sectors;
1158 1372
1159 for (i = 0; i < conf->raid_disks; i++) 1373 for (i = 0; i < conf->raid_disks; i++)
1160 if (r1_bio->bios[i] == bio) { 1374 if (r1_bio->bios[i] == bio) {
@@ -1172,18 +1386,48 @@ static void end_sync_write(struct bio *bio, int error)
1172 s += sync_blocks; 1386 s += sync_blocks;
1173 sectors_to_go -= sync_blocks; 1387 sectors_to_go -= sync_blocks;
1174 } while (sectors_to_go > 0); 1388 } while (sectors_to_go > 0);
1175 md_error(mddev, conf->mirrors[mirror].rdev); 1389 set_bit(WriteErrorSeen,
1176 } 1390 &conf->mirrors[mirror].rdev->flags);
1391 set_bit(R1BIO_WriteError, &r1_bio->state);
1392 } else if (is_badblock(conf->mirrors[mirror].rdev,
1393 r1_bio->sector,
1394 r1_bio->sectors,
1395 &first_bad, &bad_sectors) &&
1396 !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
1397 r1_bio->sector,
1398 r1_bio->sectors,
1399 &first_bad, &bad_sectors)
1400 )
1401 set_bit(R1BIO_MadeGood, &r1_bio->state);
1177 1402
1178 update_head_pos(mirror, r1_bio); 1403 update_head_pos(mirror, r1_bio);
1179 1404
1180 if (atomic_dec_and_test(&r1_bio->remaining)) { 1405 if (atomic_dec_and_test(&r1_bio->remaining)) {
1181 sector_t s = r1_bio->sectors; 1406 int s = r1_bio->sectors;
1182 put_buf(r1_bio); 1407 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1183 md_done_sync(mddev, s, uptodate); 1408 test_bit(R1BIO_WriteError, &r1_bio->state))
1409 reschedule_retry(r1_bio);
1410 else {
1411 put_buf(r1_bio);
1412 md_done_sync(mddev, s, uptodate);
1413 }
1184 } 1414 }
1185} 1415}
1186 1416
1417static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1418 int sectors, struct page *page, int rw)
1419{
1420 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1421 /* success */
1422 return 1;
1423 if (rw == WRITE)
1424 set_bit(WriteErrorSeen, &rdev->flags);
1425 /* need to record an error - either for the block or the device */
1426 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1427 md_error(rdev->mddev, rdev);
1428 return 0;
1429}
1430
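r1_sync_page_io encodes the new error-handling order: a failed I/O no longer fails the device outright; the block is recorded as bad first, and only if even that fails (badblocks disabled or the table full) is md_error called. A stand-alone sketch of that decision order (my own model with stubbed-out I/O and table updates, not the kernel API):

#include <stdio.h>

enum outcome { IO_OK, BLOCK_RECORDED_BAD, DEVICE_FAILED };

/* Stand-ins for sync_page_io() and rdev_set_badblocks(); in the driver these
 * do real I/O and update the bad-block log.
 */
static int do_sync_io(int will_succeed)         { return will_succeed; }
static int record_bad_block(int table_has_room) { return table_has_room; }

static enum outcome sync_page_io_or_record(int io_succeeds, int table_has_room)
{
	if (do_sync_io(io_succeeds))
		return IO_OK;
	if (record_bad_block(table_has_room))
		return BLOCK_RECORDED_BAD;	/* device stays in the array */
	return DEVICE_FAILED;			/* last resort: md_error() */
}

int main(void)
{
	printf("%d %d %d\n",
	       sync_page_io_or_record(1, 1),	/* 0: I/O fine */
	       sync_page_io_or_record(0, 1),	/* 1: bad block logged, device kept */
	       sync_page_io_or_record(0, 0));	/* 2: cannot log, device failed */
	return 0;
}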
1187static int fix_sync_read_error(r1bio_t *r1_bio) 1431static int fix_sync_read_error(r1bio_t *r1_bio)
1188{ 1432{
1189 /* Try some synchronous reads of other devices to get 1433 /* Try some synchronous reads of other devices to get
@@ -1193,6 +1437,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1193 * We don't need to freeze the array, because being in an 1437 * We don't need to freeze the array, because being in an
1194 * active sync request, there is no normal IO, and 1438 * active sync request, there is no normal IO, and
1195 * no overlapping syncs. 1439 * no overlapping syncs.
1440 * We don't need to check is_badblock() again as we
1441 * made sure that anything with a bad block in range
1442 * will have bi_end_io clear.
1196 */ 1443 */
1197 mddev_t *mddev = r1_bio->mddev; 1444 mddev_t *mddev = r1_bio->mddev;
1198 conf_t *conf = mddev->private; 1445 conf_t *conf = mddev->private;
@@ -1217,9 +1464,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1217 * active, and resync is currently active 1464 * active, and resync is currently active
1218 */ 1465 */
1219 rdev = conf->mirrors[d].rdev; 1466 rdev = conf->mirrors[d].rdev;
1220 if (sync_page_io(rdev, 1467 if (sync_page_io(rdev, sect, s<<9,
1221 sect,
1222 s<<9,
1223 bio->bi_io_vec[idx].bv_page, 1468 bio->bi_io_vec[idx].bv_page,
1224 READ, false)) { 1469 READ, false)) {
1225 success = 1; 1470 success = 1;
@@ -1233,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1233 1478
1234 if (!success) { 1479 if (!success) {
1235 char b[BDEVNAME_SIZE]; 1480 char b[BDEVNAME_SIZE];
1236 /* Cannot read from anywhere, array is toast */ 1481 int abort = 0;
1237 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1482 /* Cannot read from anywhere, this block is lost.
1483 * Record a bad block on each device. If that doesn't
1484 * work just disable and interrupt the recovery.
1485 * Don't fail devices as that won't really help.
1486 */
1238 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" 1487 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1239 " for block %llu\n", 1488 " for block %llu\n",
1240 mdname(mddev), 1489 mdname(mddev),
1241 bdevname(bio->bi_bdev, b), 1490 bdevname(bio->bi_bdev, b),
1242 (unsigned long long)r1_bio->sector); 1491 (unsigned long long)r1_bio->sector);
1243 md_done_sync(mddev, r1_bio->sectors, 0); 1492 for (d = 0; d < conf->raid_disks; d++) {
1244 put_buf(r1_bio); 1493 rdev = conf->mirrors[d].rdev;
1245 return 0; 1494 if (!rdev || test_bit(Faulty, &rdev->flags))
1495 continue;
1496 if (!rdev_set_badblocks(rdev, sect, s, 0))
1497 abort = 1;
1498 }
1499 if (abort) {
1500 mddev->recovery_disabled = 1;
1501 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1502 md_done_sync(mddev, r1_bio->sectors, 0);
1503 put_buf(r1_bio);
1504 return 0;
1505 }
1506 /* Try next page */
1507 sectors -= s;
1508 sect += s;
1509 idx++;
1510 continue;
1246 } 1511 }
1247 1512
1248 start = d; 1513 start = d;
@@ -1254,16 +1519,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1254 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1519 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1255 continue; 1520 continue;
1256 rdev = conf->mirrors[d].rdev; 1521 rdev = conf->mirrors[d].rdev;
1257 if (sync_page_io(rdev, 1522 if (r1_sync_page_io(rdev, sect, s,
1258 sect, 1523 bio->bi_io_vec[idx].bv_page,
1259 s<<9, 1524 WRITE) == 0) {
1260 bio->bi_io_vec[idx].bv_page,
1261 WRITE, false) == 0) {
1262 r1_bio->bios[d]->bi_end_io = NULL; 1525 r1_bio->bios[d]->bi_end_io = NULL;
1263 rdev_dec_pending(rdev, mddev); 1526 rdev_dec_pending(rdev, mddev);
1264 md_error(mddev, rdev); 1527 }
1265 } else
1266 atomic_add(s, &rdev->corrected_errors);
1267 } 1528 }
1268 d = start; 1529 d = start;
1269 while (d != r1_bio->read_disk) { 1530 while (d != r1_bio->read_disk) {
@@ -1273,12 +1534,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1273 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1534 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1274 continue; 1535 continue;
1275 rdev = conf->mirrors[d].rdev; 1536 rdev = conf->mirrors[d].rdev;
1276 if (sync_page_io(rdev, 1537 if (r1_sync_page_io(rdev, sect, s,
1277 sect, 1538 bio->bi_io_vec[idx].bv_page,
1278 s<<9, 1539 READ) != 0)
1279 bio->bi_io_vec[idx].bv_page, 1540 atomic_add(s, &rdev->corrected_errors);
1280 READ, false) == 0)
1281 md_error(mddev, rdev);
1282 } 1541 }
1283 sectors -= s; 1542 sectors -= s;
1284 sect += s; 1543 sect += s;
@@ -1420,7 +1679,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1420 * 1679 *
1421 * 1. Retries failed read operations on working mirrors. 1680 * 1. Retries failed read operations on working mirrors.
1422 * 2. Updates the raid superblock when problems encounter. 1681 * 2. Updates the raid superblock when problems encounter.
1423 * 3. Performs writes following reads for array syncronising. 1682 * 3. Performs writes following reads for array synchronising.
1424 */ 1683 */
1425 1684
1426static void fix_read_error(conf_t *conf, int read_disk, 1685static void fix_read_error(conf_t *conf, int read_disk,
@@ -1443,9 +1702,14 @@ static void fix_read_error(conf_t *conf, int read_disk,
1443 * which is the thread that might remove 1702 * which is the thread that might remove
1444 * a device. If raid1d ever becomes multi-threaded.... 1703 * a device. If raid1d ever becomes multi-threaded....
1445 */ 1704 */
1705 sector_t first_bad;
1706 int bad_sectors;
1707
1446 rdev = conf->mirrors[d].rdev; 1708 rdev = conf->mirrors[d].rdev;
1447 if (rdev && 1709 if (rdev &&
1448 test_bit(In_sync, &rdev->flags) && 1710 test_bit(In_sync, &rdev->flags) &&
1711 is_badblock(rdev, sect, s,
1712 &first_bad, &bad_sectors) == 0 &&
1449 sync_page_io(rdev, sect, s<<9, 1713 sync_page_io(rdev, sect, s<<9,
1450 conf->tmppage, READ, false)) 1714 conf->tmppage, READ, false))
1451 success = 1; 1715 success = 1;
@@ -1457,8 +1721,10 @@ static void fix_read_error(conf_t *conf, int read_disk,
1457 } while (!success && d != read_disk); 1721 } while (!success && d != read_disk);
1458 1722
1459 if (!success) { 1723 if (!success) {
1460 /* Cannot read from anywhere -- bye bye array */ 1724 /* Cannot read from anywhere - mark it bad */
1461 md_error(mddev, conf->mirrors[read_disk].rdev); 1725 mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
1726 if (!rdev_set_badblocks(rdev, sect, s, 0))
1727 md_error(mddev, rdev);
1462 break; 1728 break;
1463 } 1729 }
1464 /* write it back and re-read */ 1730 /* write it back and re-read */
@@ -1469,13 +1735,9 @@ static void fix_read_error(conf_t *conf, int read_disk,
1469 d--; 1735 d--;
1470 rdev = conf->mirrors[d].rdev; 1736 rdev = conf->mirrors[d].rdev;
1471 if (rdev && 1737 if (rdev &&
1472 test_bit(In_sync, &rdev->flags)) { 1738 test_bit(In_sync, &rdev->flags))
1473 if (sync_page_io(rdev, sect, s<<9, 1739 r1_sync_page_io(rdev, sect, s,
1474 conf->tmppage, WRITE, false) 1740 conf->tmppage, WRITE);
1475 == 0)
1476 /* Well, this device is dead */
1477 md_error(mddev, rdev);
1478 }
1479 } 1741 }
1480 d = start; 1742 d = start;
1481 while (d != read_disk) { 1743 while (d != read_disk) {
@@ -1486,12 +1748,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1486 rdev = conf->mirrors[d].rdev; 1748 rdev = conf->mirrors[d].rdev;
1487 if (rdev && 1749 if (rdev &&
1488 test_bit(In_sync, &rdev->flags)) { 1750 test_bit(In_sync, &rdev->flags)) {
1489 if (sync_page_io(rdev, sect, s<<9, 1751 if (r1_sync_page_io(rdev, sect, s,
1490 conf->tmppage, READ, false) 1752 conf->tmppage, READ)) {
1491 == 0)
1492 /* Well, this device is dead */
1493 md_error(mddev, rdev);
1494 else {
1495 atomic_add(s, &rdev->corrected_errors); 1753 atomic_add(s, &rdev->corrected_errors);
1496 printk(KERN_INFO 1754 printk(KERN_INFO
1497 "md/raid1:%s: read error corrected " 1755 "md/raid1:%s: read error corrected "
@@ -1508,21 +1766,255 @@ static void fix_read_error(conf_t *conf, int read_disk,
1508 } 1766 }
1509} 1767}
1510 1768
1769static void bi_complete(struct bio *bio, int error)
1770{
1771 complete((struct completion *)bio->bi_private);
1772}
1773
1774static int submit_bio_wait(int rw, struct bio *bio)
1775{
1776 struct completion event;
1777 rw |= REQ_SYNC;
1778
1779 init_completion(&event);
1780 bio->bi_private = &event;
1781 bio->bi_end_io = bi_complete;
1782 submit_bio(rw, bio);
1783 wait_for_completion(&event);
1784
1785 return test_bit(BIO_UPTODATE, &bio->bi_flags);
1786}
1787
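submit_bio_wait() above is a thin synchronous wrapper: it parks the caller on a completion that bi_complete() signals from the bio's end_io callback. A standalone userspace sketch of the same wait-on-completion pattern, using POSIX threads purely for illustration (none of the names or types below come from the patch):

/* Userspace sketch of the completion pattern behind submit_bio_wait():
 * an asynchronous operation signals a "completion" from its callback,
 * and the synchronous wrapper simply waits for it.  Illustrative only. */
#include <pthread.h>
#include <stdio.h>

struct completion {
    pthread_mutex_t lock;
    pthread_cond_t  cond;
    int done;
};

static void init_completion(struct completion *c)
{
    pthread_mutex_init(&c->lock, NULL);
    pthread_cond_init(&c->cond, NULL);
    c->done = 0;
}

static void complete(struct completion *c)
{
    pthread_mutex_lock(&c->lock);
    c->done = 1;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
    pthread_mutex_lock(&c->lock);
    while (!c->done)
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

/* stands in for the bio end_io callback running in another context */
static void *async_io(void *arg)
{
    complete(arg);              /* like bi_end_io -> bi_complete() */
    return NULL;
}

int main(void)
{
    struct completion event;
    pthread_t t;

    init_completion(&event);
    pthread_create(&t, NULL, async_io, &event);  /* "submit" the I/O */
    wait_for_completion(&event);                 /* block until it finishes */
    pthread_join(t, NULL);
    printf("I/O completed synchronously from the caller's view\n");
    return 0;
}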
1788static int narrow_write_error(r1bio_t *r1_bio, int i)
1789{
1790 mddev_t *mddev = r1_bio->mddev;
1791 conf_t *conf = mddev->private;
1792 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1793 int vcnt, idx;
1794 struct bio_vec *vec;
1795
1796 /* bio has the data to be written to device 'i' where
1797 * we just recently had a write error.
1798 * We repeatedly clone the bio and trim down to one block,
1799 * then try the write. Where the write fails we record
1800 * a bad block.
1801 * It is conceivable that the bio doesn't exactly align with
1802 * blocks. We must handle this somehow.
1803 *
1804 * We currently own a reference on the rdev.
1805 */
1806
1807 int block_sectors;
1808 sector_t sector;
1809 int sectors;
1810 int sect_to_write = r1_bio->sectors;
1811 int ok = 1;
1812
1813 if (rdev->badblocks.shift < 0)
1814 return 0;
1815
1816 block_sectors = 1 << rdev->badblocks.shift;
1817 sector = r1_bio->sector;
1818 sectors = ((sector + block_sectors)
1819 & ~(sector_t)(block_sectors - 1))
1820 - sector;
1821
1822 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
1823 vcnt = r1_bio->behind_page_count;
1824 vec = r1_bio->behind_bvecs;
1825 idx = 0;
1826 while (vec[idx].bv_page == NULL)
1827 idx++;
1828 } else {
1829 vcnt = r1_bio->master_bio->bi_vcnt;
1830 vec = r1_bio->master_bio->bi_io_vec;
1831 idx = r1_bio->master_bio->bi_idx;
1832 }
1833 while (sect_to_write) {
1834 struct bio *wbio;
1835 if (sectors > sect_to_write)
1836 sectors = sect_to_write;
1837 /* Write at 'sector' for 'sectors'*/
1838
1839 wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
1840 memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
1841 wbio->bi_sector = r1_bio->sector;
1842 wbio->bi_rw = WRITE;
1843 wbio->bi_vcnt = vcnt;
1844 wbio->bi_size = r1_bio->sectors << 9;
1845 wbio->bi_idx = idx;
1846
1847 md_trim_bio(wbio, sector - r1_bio->sector, sectors);
1848 wbio->bi_sector += rdev->data_offset;
1849 wbio->bi_bdev = rdev->bdev;
1850 if (submit_bio_wait(WRITE, wbio) == 0)
1851 /* failure! */
1852 ok = rdev_set_badblocks(rdev, sector,
1853 sectors, 0)
1854 && ok;
1855
1856 bio_put(wbio);
1857 sect_to_write -= sectors;
1858 sector += sectors;
1859 sectors = block_sectors;
1860 }
1861 return ok;
1862}
1863
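As its comment says, narrow_write_error() retries the failed write one bad-block-log granule at a time: the first chunk only runs up to the next block boundary so that every later chunk is block aligned, and any chunk that still fails is recorded with rdev_set_badblocks(). A small standalone sketch of just that chunking arithmetic, assuming an 8-sector granularity and invented sector numbers:

/* Sketch of the chunk arithmetic in narrow_write_error(): the first write
 * runs only up to the next block boundary, after that every chunk is a
 * whole block (block_sectors = 1 << badblocks.shift).  Standalone example. */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
    int shift = 3;                  /* assume 8-sector bad-block granularity */
    int block_sectors = 1 << shift;
    sector_t sector = 1019;         /* start of the failed write (invented) */
    int sect_to_write = 21;         /* length of the failed write (invented) */

    /* distance from 'sector' up to the next block boundary */
    int sectors = ((sector + block_sectors) & ~(sector_t)(block_sectors - 1))
                  - sector;

    while (sect_to_write) {
        if (sectors > sect_to_write)
            sectors = sect_to_write;
        printf("retry write: sector %llu, %d sectors\n", sector, sectors);
        /* a failed retry here would record rdev_set_badblocks(sector, sectors) */
        sect_to_write -= sectors;
        sector += sectors;
        sectors = block_sectors;    /* all later chunks are block aligned */
    }
    return 0;
}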
1864static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
1865{
1866 int m;
1867 int s = r1_bio->sectors;
1868 for (m = 0; m < conf->raid_disks ; m++) {
1869 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1870 struct bio *bio = r1_bio->bios[m];
1871 if (bio->bi_end_io == NULL)
1872 continue;
1873 if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1874 test_bit(R1BIO_MadeGood, &r1_bio->state)) {
1875 rdev_clear_badblocks(rdev, r1_bio->sector, s);
1876 }
1877 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1878 test_bit(R1BIO_WriteError, &r1_bio->state)) {
1879 if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
1880 md_error(conf->mddev, rdev);
1881 }
1882 }
1883 put_buf(r1_bio);
1884 md_done_sync(conf->mddev, s, 1);
1885}
1886
1887static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
1888{
1889 int m;
1890 for (m = 0; m < conf->raid_disks ; m++)
1891 if (r1_bio->bios[m] == IO_MADE_GOOD) {
1892 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1893 rdev_clear_badblocks(rdev,
1894 r1_bio->sector,
1895 r1_bio->sectors);
1896 rdev_dec_pending(rdev, conf->mddev);
1897 } else if (r1_bio->bios[m] != NULL) {
1898 /* This drive got a write error. We need to
1899 * narrow down and record precise write
1900 * errors.
1901 */
1902 if (!narrow_write_error(r1_bio, m)) {
1903 md_error(conf->mddev,
1904 conf->mirrors[m].rdev);
1905 /* an I/O failed, we can't clear the bitmap */
1906 set_bit(R1BIO_Degraded, &r1_bio->state);
1907 }
1908 rdev_dec_pending(conf->mirrors[m].rdev,
1909 conf->mddev);
1910 }
1911 if (test_bit(R1BIO_WriteError, &r1_bio->state))
1912 close_write(r1_bio);
1913 raid_end_bio_io(r1_bio);
1914}
1915
1916static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
1917{
1918 int disk;
1919 int max_sectors;
1920 mddev_t *mddev = conf->mddev;
1921 struct bio *bio;
1922 char b[BDEVNAME_SIZE];
1923 mdk_rdev_t *rdev;
1924
1925 clear_bit(R1BIO_ReadError, &r1_bio->state);
1926 /* we got a read error. Maybe the drive is bad. Maybe just
1927 * the block and we can fix it.
1928 * We freeze all other IO, and try reading the block from
1929 * other devices. When we find one, we re-write
1930 * and check it that fixes the read error.
1931 * This is all done synchronously while the array is
1932 * frozen
1933 */
1934 if (mddev->ro == 0) {
1935 freeze_array(conf);
1936 fix_read_error(conf, r1_bio->read_disk,
1937 r1_bio->sector, r1_bio->sectors);
1938 unfreeze_array(conf);
1939 } else
1940 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1941
1942 bio = r1_bio->bios[r1_bio->read_disk];
1943 bdevname(bio->bi_bdev, b);
1944read_more:
1945 disk = read_balance(conf, r1_bio, &max_sectors);
1946 if (disk == -1) {
1947 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1948 " read error for block %llu\n",
1949 mdname(mddev), b, (unsigned long long)r1_bio->sector);
1950 raid_end_bio_io(r1_bio);
1951 } else {
1952 const unsigned long do_sync
1953 = r1_bio->master_bio->bi_rw & REQ_SYNC;
1954 if (bio) {
1955 r1_bio->bios[r1_bio->read_disk] =
1956 mddev->ro ? IO_BLOCKED : NULL;
1957 bio_put(bio);
1958 }
1959 r1_bio->read_disk = disk;
1960 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
1961 md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
1962 r1_bio->bios[r1_bio->read_disk] = bio;
1963 rdev = conf->mirrors[disk].rdev;
1964 printk_ratelimited(KERN_ERR
1965 "md/raid1:%s: redirecting sector %llu"
1966 " to other mirror: %s\n",
1967 mdname(mddev),
1968 (unsigned long long)r1_bio->sector,
1969 bdevname(rdev->bdev, b));
1970 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1971 bio->bi_bdev = rdev->bdev;
1972 bio->bi_end_io = raid1_end_read_request;
1973 bio->bi_rw = READ | do_sync;
1974 bio->bi_private = r1_bio;
1975 if (max_sectors < r1_bio->sectors) {
1976 /* Drat - have to split this up more */
1977 struct bio *mbio = r1_bio->master_bio;
1978 int sectors_handled = (r1_bio->sector + max_sectors
1979 - mbio->bi_sector);
1980 r1_bio->sectors = max_sectors;
1981 spin_lock_irq(&conf->device_lock);
1982 if (mbio->bi_phys_segments == 0)
1983 mbio->bi_phys_segments = 2;
1984 else
1985 mbio->bi_phys_segments++;
1986 spin_unlock_irq(&conf->device_lock);
1987 generic_make_request(bio);
1988 bio = NULL;
1989
1990 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1991
1992 r1_bio->master_bio = mbio;
1993 r1_bio->sectors = (mbio->bi_size >> 9)
1994 - sectors_handled;
1995 r1_bio->state = 0;
1996 set_bit(R1BIO_ReadError, &r1_bio->state);
1997 r1_bio->mddev = mddev;
1998 r1_bio->sector = mbio->bi_sector + sectors_handled;
1999
2000 goto read_more;
2001 } else
2002 generic_make_request(bio);
2003 }
2004}
2005
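handle_read_error() may only be able to redirect part of the request, because read_balance() now reports how far the chosen mirror can be read before a known bad block; the remainder goes into a fresh r1_bio, and the master bio's bi_phys_segments count keeps the master request alive until every piece completes. A standalone model of that bookkeeping, with made-up sizes:

/* Standalone model of the read-splitting bookkeeping in handle_read_error():
 * each pass services up to 'max_sectors' and the master request is only
 * completed once every sub-read has finished.  All values are invented. */
#include <stdio.h>

int main(void)
{
    unsigned long long sector = 4096;   /* start of the master bio */
    int remaining = 192;                /* its length, in sectors */
    int phys_segments = 0;              /* counts outstanding sub-requests */

    /* pretend read_balance() keeps returning a 64-sector readable window */
    while (remaining) {
        int max_sectors = 64;
        if (max_sectors > remaining)
            max_sectors = remaining;
        if (max_sectors < remaining) {
            /* more to come: account for the extra sub-request */
            phys_segments = phys_segments ? phys_segments + 1 : 2;
        }
        printf("sub-read at %llu for %d sectors (segments=%d)\n",
               sector, max_sectors, phys_segments);
        sector += max_sectors;
        remaining -= max_sectors;
    }
    /* raid_end_bio_io() only ends the master bio once the segment count,
     * if it was used, has been decremented back to zero */
    return 0;
}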
1511static void raid1d(mddev_t *mddev) 2006static void raid1d(mddev_t *mddev)
1512{ 2007{
1513 r1bio_t *r1_bio; 2008 r1bio_t *r1_bio;
1514 struct bio *bio;
1515 unsigned long flags; 2009 unsigned long flags;
1516 conf_t *conf = mddev->private; 2010 conf_t *conf = mddev->private;
1517 struct list_head *head = &conf->retry_list; 2011 struct list_head *head = &conf->retry_list;
1518 mdk_rdev_t *rdev;
1519 struct blk_plug plug; 2012 struct blk_plug plug;
1520 2013
1521 md_check_recovery(mddev); 2014 md_check_recovery(mddev);
1522 2015
1523 blk_start_plug(&plug); 2016 blk_start_plug(&plug);
1524 for (;;) { 2017 for (;;) {
1525 char b[BDEVNAME_SIZE];
1526 2018
1527 if (atomic_read(&mddev->plug_cnt) == 0) 2019 if (atomic_read(&mddev->plug_cnt) == 0)
1528 flush_pending_writes(conf); 2020 flush_pending_writes(conf);
@@ -1539,62 +2031,26 @@ static void raid1d(mddev_t *mddev)
1539 2031
1540 mddev = r1_bio->mddev; 2032 mddev = r1_bio->mddev;
1541 conf = mddev->private; 2033 conf = mddev->private;
1542 if (test_bit(R1BIO_IsSync, &r1_bio->state)) 2034 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1543 sync_request_write(mddev, r1_bio); 2035 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1544 else { 2036 test_bit(R1BIO_WriteError, &r1_bio->state))
1545 int disk; 2037 handle_sync_write_finished(conf, r1_bio);
1546 2038 else
1547 /* we got a read error. Maybe the drive is bad. Maybe just 2039 sync_request_write(mddev, r1_bio);
1548 * the block and we can fix it. 2040 } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1549 * We freeze all other IO, and try reading the block from 2041 test_bit(R1BIO_WriteError, &r1_bio->state))
1550 * other devices. When we find one, we re-write 2042 handle_write_finished(conf, r1_bio);
1551 * and check it that fixes the read error. 2043 else if (test_bit(R1BIO_ReadError, &r1_bio->state))
1552 * This is all done synchronously while the array is 2044 handle_read_error(conf, r1_bio);
1553 * frozen 2045 else
2046 /* just a partial read to be scheduled from separate
2047 * context
1554 */ 2048 */
1555 if (mddev->ro == 0) { 2049 generic_make_request(r1_bio->bios[r1_bio->read_disk]);
1556 freeze_array(conf); 2050
1557 fix_read_error(conf, r1_bio->read_disk,
1558 r1_bio->sector,
1559 r1_bio->sectors);
1560 unfreeze_array(conf);
1561 } else
1562 md_error(mddev,
1563 conf->mirrors[r1_bio->read_disk].rdev);
1564
1565 bio = r1_bio->bios[r1_bio->read_disk];
1566 if ((disk=read_balance(conf, r1_bio)) == -1) {
1567 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1568 " read error for block %llu\n",
1569 mdname(mddev),
1570 bdevname(bio->bi_bdev,b),
1571 (unsigned long long)r1_bio->sector);
1572 raid_end_bio_io(r1_bio);
1573 } else {
1574 const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
1575 r1_bio->bios[r1_bio->read_disk] =
1576 mddev->ro ? IO_BLOCKED : NULL;
1577 r1_bio->read_disk = disk;
1578 bio_put(bio);
1579 bio = bio_clone_mddev(r1_bio->master_bio,
1580 GFP_NOIO, mddev);
1581 r1_bio->bios[r1_bio->read_disk] = bio;
1582 rdev = conf->mirrors[disk].rdev;
1583 if (printk_ratelimit())
1584 printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
1585 " other mirror: %s\n",
1586 mdname(mddev),
1587 (unsigned long long)r1_bio->sector,
1588 bdevname(rdev->bdev,b));
1589 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1590 bio->bi_bdev = rdev->bdev;
1591 bio->bi_end_io = raid1_end_read_request;
1592 bio->bi_rw = READ | do_sync;
1593 bio->bi_private = r1_bio;
1594 generic_make_request(bio);
1595 }
1596 }
1597 cond_resched(); 2051 cond_resched();
2052 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2053 md_check_recovery(mddev);
1598 } 2054 }
1599 blk_finish_plug(&plug); 2055 blk_finish_plug(&plug);
1600} 2056}
@@ -1636,6 +2092,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1636 int write_targets = 0, read_targets = 0; 2092 int write_targets = 0, read_targets = 0;
1637 sector_t sync_blocks; 2093 sector_t sync_blocks;
1638 int still_degraded = 0; 2094 int still_degraded = 0;
2095 int good_sectors = RESYNC_SECTORS;
2096 int min_bad = 0; /* number of sectors that are bad in all devices */
1639 2097
1640 if (!conf->r1buf_pool) 2098 if (!conf->r1buf_pool)
1641 if (init_resync(conf)) 2099 if (init_resync(conf))
@@ -1723,36 +2181,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1723 2181
1724 rdev = rcu_dereference(conf->mirrors[i].rdev); 2182 rdev = rcu_dereference(conf->mirrors[i].rdev);
1725 if (rdev == NULL || 2183 if (rdev == NULL ||
1726 test_bit(Faulty, &rdev->flags)) { 2184 test_bit(Faulty, &rdev->flags)) {
1727 still_degraded = 1; 2185 still_degraded = 1;
1728 continue;
1729 } else if (!test_bit(In_sync, &rdev->flags)) { 2186 } else if (!test_bit(In_sync, &rdev->flags)) {
1730 bio->bi_rw = WRITE; 2187 bio->bi_rw = WRITE;
1731 bio->bi_end_io = end_sync_write; 2188 bio->bi_end_io = end_sync_write;
1732 write_targets ++; 2189 write_targets ++;
1733 } else { 2190 } else {
1734 /* may need to read from here */ 2191 /* may need to read from here */
1735 bio->bi_rw = READ; 2192 sector_t first_bad = MaxSector;
1736 bio->bi_end_io = end_sync_read; 2193 int bad_sectors;
1737 if (test_bit(WriteMostly, &rdev->flags)) { 2194
1738 if (wonly < 0) 2195 if (is_badblock(rdev, sector_nr, good_sectors,
1739 wonly = i; 2196 &first_bad, &bad_sectors)) {
1740 } else { 2197 if (first_bad > sector_nr)
1741 if (disk < 0) 2198 good_sectors = first_bad - sector_nr;
1742 disk = i; 2199 else {
2200 bad_sectors -= (sector_nr - first_bad);
2201 if (min_bad == 0 ||
2202 min_bad > bad_sectors)
2203 min_bad = bad_sectors;
2204 }
2205 }
2206 if (sector_nr < first_bad) {
2207 if (test_bit(WriteMostly, &rdev->flags)) {
2208 if (wonly < 0)
2209 wonly = i;
2210 } else {
2211 if (disk < 0)
2212 disk = i;
2213 }
2214 bio->bi_rw = READ;
2215 bio->bi_end_io = end_sync_read;
2216 read_targets++;
1743 } 2217 }
1744 read_targets++;
1745 } 2218 }
1746 atomic_inc(&rdev->nr_pending); 2219 if (bio->bi_end_io) {
1747 bio->bi_sector = sector_nr + rdev->data_offset; 2220 atomic_inc(&rdev->nr_pending);
1748 bio->bi_bdev = rdev->bdev; 2221 bio->bi_sector = sector_nr + rdev->data_offset;
1749 bio->bi_private = r1_bio; 2222 bio->bi_bdev = rdev->bdev;
2223 bio->bi_private = r1_bio;
2224 }
1750 } 2225 }
1751 rcu_read_unlock(); 2226 rcu_read_unlock();
1752 if (disk < 0) 2227 if (disk < 0)
1753 disk = wonly; 2228 disk = wonly;
1754 r1_bio->read_disk = disk; 2229 r1_bio->read_disk = disk;
1755 2230
2231 if (read_targets == 0 && min_bad > 0) {
2232 /* These sectors are bad on all InSync devices, so we
2233 * need to mark them bad on all write targets
2234 */
2235 int ok = 1;
2236 for (i = 0 ; i < conf->raid_disks ; i++)
2237 if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2238 mdk_rdev_t *rdev =
2239 rcu_dereference(conf->mirrors[i].rdev);
2240 ok = rdev_set_badblocks(rdev, sector_nr,
2241 min_bad, 0
2242 ) && ok;
2243 }
2244 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2245 *skipped = 1;
2246 put_buf(r1_bio);
2247
2248 if (!ok) {
2249 /* Cannot record the badblocks, so need to
2250 * abort the resync.
2251 * If there are multiple read targets, could just
2252 * fail the really bad ones ???
2253 */
2254 conf->recovery_disabled = mddev->recovery_disabled;
2255 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2256 return 0;
2257 } else
2258 return min_bad;
2259
2260 }
2261 if (min_bad > 0 && min_bad < good_sectors) {
2262 /* only resync enough to reach the next bad->good
2263 * transition */
2264 good_sectors = min_bad;
2265 }
2266
1756 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) 2267 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1757 /* extra read targets are also write targets */ 2268 /* extra read targets are also write targets */
1758 write_targets += read_targets-1; 2269 write_targets += read_targets-1;
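In the resync path above, good_sectors clips the window at the first bad block of any usable read source, while min_bad tracks the shortest bad run that starts right at sector_nr; if no device can read the start of the window, those sectors are marked bad on the write targets and skipped. A standalone sketch of that computation with invented bad-block data (the kernel gets the same min effect by passing the already-clipped length to is_badblock()):

/* Standalone sketch of the good_sectors / min_bad computation in
 * sync_request().  The per-device bad ranges are invented. */
#include <stdio.h>

struct badrange { int has_bad; unsigned long long first_bad; int bad_sectors; };

int main(void)
{
    unsigned long long sector_nr = 1000;
    int good_sectors = 128;             /* resync window, e.g. RESYNC_SECTORS */
    int min_bad = 0;
    int read_targets = 0;

    /* pretend these are the in-sync devices' bad-block lookups at sector_nr */
    struct badrange dev[2] = {
        { 1, 1040, 16 },    /* bad run starts inside the window */
        { 1, 1000, 24 },    /* bad run starts right at sector_nr */
    };

    for (int i = 0; i < 2; i++) {
        if (dev[i].has_bad) {
            if (dev[i].first_bad > sector_nr) {
                int good = (int)(dev[i].first_bad - sector_nr);
                if (good < good_sectors)
                    good_sectors = good;        /* clip the window */
            } else {
                int bad = dev[i].bad_sectors
                        - (int)(sector_nr - dev[i].first_bad);
                if (min_bad == 0 || min_bad > bad)
                    min_bad = bad;
            }
        }
        if (sector_nr < dev[i].first_bad)
            read_targets++;     /* device can serve the start of the window */
    }

    printf("read_targets=%d good_sectors=%d min_bad=%d\n",
           read_targets, good_sectors, min_bad);
    /* read_targets==0 && min_bad>0  -> record the range bad on write targets
     * 0 < min_bad < good_sectors    -> only resync up to that transition */
    return 0;
}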
@@ -1769,6 +2280,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1769 2280
1770 if (max_sector > mddev->resync_max) 2281 if (max_sector > mddev->resync_max)
1771 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 2282 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2283 if (max_sector > sector_nr + good_sectors)
2284 max_sector = sector_nr + good_sectors;
1772 nr_sectors = 0; 2285 nr_sectors = 0;
1773 sync_blocks = 0; 2286 sync_blocks = 0;
1774 do { 2287 do {
@@ -2154,18 +2667,13 @@ static int raid1_reshape(mddev_t *mddev)
2154 for (d = d2 = 0; d < conf->raid_disks; d++) { 2667 for (d = d2 = 0; d < conf->raid_disks; d++) {
2155 mdk_rdev_t *rdev = conf->mirrors[d].rdev; 2668 mdk_rdev_t *rdev = conf->mirrors[d].rdev;
2156 if (rdev && rdev->raid_disk != d2) { 2669 if (rdev && rdev->raid_disk != d2) {
2157 char nm[20]; 2670 sysfs_unlink_rdev(mddev, rdev);
2158 sprintf(nm, "rd%d", rdev->raid_disk);
2159 sysfs_remove_link(&mddev->kobj, nm);
2160 rdev->raid_disk = d2; 2671 rdev->raid_disk = d2;
2161 sprintf(nm, "rd%d", rdev->raid_disk); 2672 sysfs_unlink_rdev(mddev, rdev);
2162 sysfs_remove_link(&mddev->kobj, nm); 2673 if (sysfs_link_rdev(mddev, rdev))
2163 if (sysfs_create_link(&mddev->kobj,
2164 &rdev->kobj, nm))
2165 printk(KERN_WARNING 2674 printk(KERN_WARNING
2166 "md/raid1:%s: cannot register " 2675 "md/raid1:%s: cannot register rd%d\n",
2167 "%s\n", 2676 mdname(mddev), rdev->raid_disk);
2168 mdname(mddev), nm);
2169 } 2677 }
2170 if (rdev) 2678 if (rdev)
2171 newmirrors[d2++].rdev = rdev; 2679 newmirrors[d2++].rdev = rdev;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e743a64fac4f..e0d676b48974 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -48,6 +48,12 @@ struct r1_private_data_s {
48 * (fresh device added). 48 * (fresh device added).
49 * Cleared when a sync completes. 49 * Cleared when a sync completes.
50 */ 50 */
51 int recovery_disabled; /* when the same as
52 * mddev->recovery_disabled
53 * we don't allow recovery
54 * to be attempted as we
55 * expect a read error
56 */
51 57
52 wait_queue_head_t wait_barrier; 58 wait_queue_head_t wait_barrier;
53 59
@@ -95,7 +101,7 @@ struct r1bio_s {
95 101
96 struct list_head retry_list; 102 struct list_head retry_list;
97 /* Next two are only valid when R1BIO_BehindIO is set */ 103 /* Next two are only valid when R1BIO_BehindIO is set */
98 struct page **behind_pages; 104 struct bio_vec *behind_bvecs;
99 int behind_page_count; 105 int behind_page_count;
100 /* 106 /*
101 * if the IO is in WRITE direction, then multiple bios are used. 107 * if the IO is in WRITE direction, then multiple bios are used.
@@ -110,13 +116,24 @@ struct r1bio_s {
110 * correct the read error. To keep track of bad blocks on a per-bio 116 * correct the read error. To keep track of bad blocks on a per-bio
111 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 117 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
112 */ 118 */
113#define IO_BLOCKED ((struct bio*)1) 119#define IO_BLOCKED ((struct bio *)1)
120/* When we successfully write to a known bad-block, we need to remove the
121 * bad-block marking which must be done from process context. So we record
122 * the success by setting bios[n] to IO_MADE_GOOD
123 */
124#define IO_MADE_GOOD ((struct bio *)2)
125
126#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
114 127
115/* bits for r1bio.state */ 128/* bits for r1bio.state */
116#define R1BIO_Uptodate 0 129#define R1BIO_Uptodate 0
117#define R1BIO_IsSync 1 130#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 131#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 132#define R1BIO_BehindIO 3
133/* Set ReadError on bios that experience a readerror so that
134 * raid1d knows what to do with them.
135 */
136#define R1BIO_ReadError 4
120/* For write-behind requests, we call bi_end_io when 137/* For write-behind requests, we call bi_end_io when
121 * the last non-write-behind device completes, providing 138 * the last non-write-behind device completes, providing
122 * any write was successful. Otherwise we call when 139 * any write was successful. Otherwise we call when
@@ -125,6 +142,11 @@ struct r1bio_s {
125 * Record that bi_end_io was called with this flag... 142 * Record that bi_end_io was called with this flag...
126 */ 143 */
127#define R1BIO_Returned 6 144#define R1BIO_Returned 6
145/* If a write for this request means we can clear some
146 * known-bad-block records, we set this flag
147 */
148#define R1BIO_MadeGood 7
149#define R1BIO_WriteError 8
128 150
129extern int md_raid1_congested(mddev_t *mddev, int bits); 151extern int md_raid1_congested(mddev_t *mddev, int bits);
130 152
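The new raid1.h constants encode per-device status directly in the bios[] pointer: tiny integer values cast to struct bio * act as sentinels, and BIO_SPECIAL() separates them from real bios. A standalone illustration of the pattern (the struct and checks below are stand-ins, not the kernel definitions):

/* Standalone illustration of the sentinel-pointer trick used by
 * IO_BLOCKED / IO_MADE_GOOD / BIO_SPECIAL() in raid1.h. */
#include <stdio.h>

struct bio { int dummy; };

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void inspect(struct bio *b)
{
    if (b == NULL)
        printf("no request outstanding on this device\n");
    else if (b == IO_BLOCKED)
        printf("device skipped: known bad block\n");
    else if (b == IO_MADE_GOOD)
        printf("write succeeded over a known bad block: clear the record\n");
    else if (!BIO_SPECIAL(b))
        printf("real bio at %p still in flight\n", (void *)b);
}

int main(void)
{
    struct bio real;
    inspect(NULL);
    inspect(IO_BLOCKED);
    inspect(IO_MADE_GOOD);
    inspect(&real);
    return 0;
}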
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e846688962f..8b29cd4f01c8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/seq_file.h> 24#include <linux/seq_file.h>
25#include <linux/ratelimit.h>
25#include "md.h" 26#include "md.h"
26#include "raid10.h" 27#include "raid10.h"
27#include "raid0.h" 28#include "raid0.h"
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 for (j = 0 ; j < nalloc; j++) { 124 for (j = 0 ; j < nalloc; j++) {
124 bio = r10_bio->devs[j].bio; 125 bio = r10_bio->devs[j].bio;
125 for (i = 0; i < RESYNC_PAGES; i++) { 126 for (i = 0; i < RESYNC_PAGES; i++) {
126 page = alloc_page(gfp_flags); 127 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
128 &conf->mddev->recovery)) {
129 /* we can share bv_page's during recovery */
130 struct bio *rbio = r10_bio->devs[0].bio;
131 page = rbio->bi_io_vec[i].bv_page;
132 get_page(page);
133 } else
134 page = alloc_page(gfp_flags);
127 if (unlikely(!page)) 135 if (unlikely(!page))
128 goto out_free_pages; 136 goto out_free_pages;
129 137
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
173 181
174 for (i = 0; i < conf->copies; i++) { 182 for (i = 0; i < conf->copies; i++) {
175 struct bio **bio = & r10_bio->devs[i].bio; 183 struct bio **bio = & r10_bio->devs[i].bio;
176 if (*bio && *bio != IO_BLOCKED) 184 if (!BIO_SPECIAL(*bio))
177 bio_put(*bio); 185 bio_put(*bio);
178 *bio = NULL; 186 *bio = NULL;
179 } 187 }
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
183{ 191{
184 conf_t *conf = r10_bio->mddev->private; 192 conf_t *conf = r10_bio->mddev->private;
185 193
186 /*
187 * Wake up any possible resync thread that waits for the device
188 * to go idle.
189 */
190 allow_barrier(conf);
191
192 put_all_bios(conf, r10_bio); 194 put_all_bios(conf, r10_bio);
193 mempool_free(r10_bio, conf->r10bio_pool); 195 mempool_free(r10_bio, conf->r10bio_pool);
194} 196}
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
227static void raid_end_bio_io(r10bio_t *r10_bio) 229static void raid_end_bio_io(r10bio_t *r10_bio)
228{ 230{
229 struct bio *bio = r10_bio->master_bio; 231 struct bio *bio = r10_bio->master_bio;
232 int done;
233 conf_t *conf = r10_bio->mddev->private;
230 234
231 bio_endio(bio, 235 if (bio->bi_phys_segments) {
232 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); 236 unsigned long flags;
237 spin_lock_irqsave(&conf->device_lock, flags);
238 bio->bi_phys_segments--;
239 done = (bio->bi_phys_segments == 0);
240 spin_unlock_irqrestore(&conf->device_lock, flags);
241 } else
242 done = 1;
243 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
244 clear_bit(BIO_UPTODATE, &bio->bi_flags);
245 if (done) {
246 bio_endio(bio, 0);
247 /*
248 * Wake up any possible resync thread that waits for the device
249 * to go idle.
250 */
251 allow_barrier(conf);
252 }
233 free_r10bio(r10_bio); 253 free_r10bio(r10_bio);
234} 254}
235 255
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
244 r10_bio->devs[slot].addr + (r10_bio->sectors); 264 r10_bio->devs[slot].addr + (r10_bio->sectors);
245} 265}
246 266
267/*
268 * Find the disk number which triggered given bio
269 */
270static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
271 struct bio *bio, int *slotp)
272{
273 int slot;
274
275 for (slot = 0; slot < conf->copies; slot++)
276 if (r10_bio->devs[slot].bio == bio)
277 break;
278
279 BUG_ON(slot == conf->copies);
280 update_head_pos(slot, r10_bio);
281
282 if (slotp)
283 *slotp = slot;
284 return r10_bio->devs[slot].devnum;
285}
286
247static void raid10_end_read_request(struct bio *bio, int error) 287static void raid10_end_read_request(struct bio *bio, int error)
248{ 288{
249 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 289 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -277,34 +317,45 @@ static void raid10_end_read_request(struct bio *bio, int error)
277 * oops, read error - keep the refcount on the rdev 317 * oops, read error - keep the refcount on the rdev
278 */ 318 */
279 char b[BDEVNAME_SIZE]; 319 char b[BDEVNAME_SIZE];
280 if (printk_ratelimit()) 320 printk_ratelimited(KERN_ERR
281 printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", 321 "md/raid10:%s: %s: rescheduling sector %llu\n",
282 mdname(conf->mddev), 322 mdname(conf->mddev),
283 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 323 bdevname(conf->mirrors[dev].rdev->bdev, b),
324 (unsigned long long)r10_bio->sector);
325 set_bit(R10BIO_ReadError, &r10_bio->state);
284 reschedule_retry(r10_bio); 326 reschedule_retry(r10_bio);
285 } 327 }
286} 328}
287 329
330static void close_write(r10bio_t *r10_bio)
331{
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334 r10_bio->sectors,
335 !test_bit(R10BIO_Degraded, &r10_bio->state),
336 0);
337 md_write_end(r10_bio->mddev);
338}
339
288static void raid10_end_write_request(struct bio *bio, int error) 340static void raid10_end_write_request(struct bio *bio, int error)
289{ 341{
290 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 342 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
291 r10bio_t *r10_bio = bio->bi_private; 343 r10bio_t *r10_bio = bio->bi_private;
292 int slot, dev; 344 int dev;
345 int dec_rdev = 1;
293 conf_t *conf = r10_bio->mddev->private; 346 conf_t *conf = r10_bio->mddev->private;
347 int slot;
294 348
295 for (slot = 0; slot < conf->copies; slot++) 349 dev = find_bio_disk(conf, r10_bio, bio, &slot);
296 if (r10_bio->devs[slot].bio == bio)
297 break;
298 dev = r10_bio->devs[slot].devnum;
299 350
300 /* 351 /*
301 * this branch is our 'one mirror IO has finished' event handler: 352 * this branch is our 'one mirror IO has finished' event handler:
302 */ 353 */
303 if (!uptodate) { 354 if (!uptodate) {
304 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 355 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
305 /* an I/O failed, we can't clear the bitmap */ 356 set_bit(R10BIO_WriteError, &r10_bio->state);
306 set_bit(R10BIO_Degraded, &r10_bio->state); 357 dec_rdev = 0;
307 } else 358 } else {
308 /* 359 /*
309 * Set R10BIO_Uptodate in our master bio, so that 360 * Set R10BIO_Uptodate in our master bio, so that
310 * we will return a good error code for to the higher 361 * we will return a good error code for to the higher
@@ -314,9 +365,22 @@ static void raid10_end_write_request(struct bio *bio, int error)
314 * user-side. So if something waits for IO, then it will 365 * user-side. So if something waits for IO, then it will
315 * wait for the 'master' bio. 366 * wait for the 'master' bio.
316 */ 367 */
368 sector_t first_bad;
369 int bad_sectors;
370
317 set_bit(R10BIO_Uptodate, &r10_bio->state); 371 set_bit(R10BIO_Uptodate, &r10_bio->state);
318 372
319 update_head_pos(slot, r10_bio); 373 /* Maybe we can clear some bad blocks. */
374 if (is_badblock(conf->mirrors[dev].rdev,
375 r10_bio->devs[slot].addr,
376 r10_bio->sectors,
377 &first_bad, &bad_sectors)) {
378 bio_put(bio);
379 r10_bio->devs[slot].bio = IO_MADE_GOOD;
380 dec_rdev = 0;
381 set_bit(R10BIO_MadeGood, &r10_bio->state);
382 }
383 }
320 384
321 /* 385 /*
322 * 386 *
@@ -324,16 +388,18 @@ static void raid10_end_write_request(struct bio *bio, int error)
324 * already. 388 * already.
325 */ 389 */
326 if (atomic_dec_and_test(&r10_bio->remaining)) { 390 if (atomic_dec_and_test(&r10_bio->remaining)) {
327 /* clear the bitmap if all writes complete successfully */ 391 if (test_bit(R10BIO_WriteError, &r10_bio->state))
328 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 392 reschedule_retry(r10_bio);
329 r10_bio->sectors, 393 else {
330 !test_bit(R10BIO_Degraded, &r10_bio->state), 394 close_write(r10_bio);
331 0); 395 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
332 md_write_end(r10_bio->mddev); 396 reschedule_retry(r10_bio);
333 raid_end_bio_io(r10_bio); 397 else
398 raid_end_bio_io(r10_bio);
399 }
334 } 400 }
335 401 if (dec_rdev)
336 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); 402 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
337} 403}
338 404
339 405
@@ -484,11 +550,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
484 * FIXME: possibly should rethink readbalancing and do it differently 550 * FIXME: possibly should rethink readbalancing and do it differently
485 * depending on near_copies / far_copies geometry. 551 * depending on near_copies / far_copies geometry.
486 */ 552 */
487static int read_balance(conf_t *conf, r10bio_t *r10_bio) 553static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
488{ 554{
489 const sector_t this_sector = r10_bio->sector; 555 const sector_t this_sector = r10_bio->sector;
490 int disk, slot; 556 int disk, slot;
491 const int sectors = r10_bio->sectors; 557 int sectors = r10_bio->sectors;
558 int best_good_sectors;
492 sector_t new_distance, best_dist; 559 sector_t new_distance, best_dist;
493 mdk_rdev_t *rdev; 560 mdk_rdev_t *rdev;
494 int do_balance; 561 int do_balance;
@@ -497,8 +564,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
497 raid10_find_phys(conf, r10_bio); 564 raid10_find_phys(conf, r10_bio);
498 rcu_read_lock(); 565 rcu_read_lock();
499retry: 566retry:
567 sectors = r10_bio->sectors;
500 best_slot = -1; 568 best_slot = -1;
501 best_dist = MaxSector; 569 best_dist = MaxSector;
570 best_good_sectors = 0;
502 do_balance = 1; 571 do_balance = 1;
503 /* 572 /*
504 * Check if we can balance. We can balance on the whole 573 * Check if we can balance. We can balance on the whole
@@ -511,6 +580,10 @@ retry:
511 do_balance = 0; 580 do_balance = 0;
512 581
513 for (slot = 0; slot < conf->copies ; slot++) { 582 for (slot = 0; slot < conf->copies ; slot++) {
583 sector_t first_bad;
584 int bad_sectors;
585 sector_t dev_sector;
586
514 if (r10_bio->devs[slot].bio == IO_BLOCKED) 587 if (r10_bio->devs[slot].bio == IO_BLOCKED)
515 continue; 588 continue;
516 disk = r10_bio->devs[slot].devnum; 589 disk = r10_bio->devs[slot].devnum;
@@ -520,6 +593,37 @@ retry:
520 if (!test_bit(In_sync, &rdev->flags)) 593 if (!test_bit(In_sync, &rdev->flags))
521 continue; 594 continue;
522 595
596 dev_sector = r10_bio->devs[slot].addr;
597 if (is_badblock(rdev, dev_sector, sectors,
598 &first_bad, &bad_sectors)) {
599 if (best_dist < MaxSector)
600 /* Already have a better slot */
601 continue;
602 if (first_bad <= dev_sector) {
603 /* Cannot read here. If this is the
604 * 'primary' device, then we must not read
605 * beyond 'bad_sectors' from another device.
606 */
607 bad_sectors -= (dev_sector - first_bad);
608 if (!do_balance && sectors > bad_sectors)
609 sectors = bad_sectors;
610 if (best_good_sectors > sectors)
611 best_good_sectors = sectors;
612 } else {
613 sector_t good_sectors =
614 first_bad - dev_sector;
615 if (good_sectors > best_good_sectors) {
616 best_good_sectors = good_sectors;
617 best_slot = slot;
618 }
619 if (!do_balance)
620 /* Must read from here */
621 break;
622 }
623 continue;
624 } else
625 best_good_sectors = sectors;
626
523 if (!do_balance) 627 if (!do_balance)
524 break; 628 break;
525 629
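read_balance() now also reports, through *max_sectors, how much of the request the chosen slot can actually serve before its first recorded bad block; a bad range that already covers dev_sector rules the slot out unless nothing better exists. A standalone sketch of the window arithmetic with invented numbers:

/* Standalone sketch of how read_balance() trims the readable window around
 * a known bad block.  Sector numbers are invented for illustration. */
#include <stdio.h>

int main(void)
{
    unsigned long long dev_sector = 5000;  /* where the read starts on this slot */
    int sectors = 256;                     /* requested length */
    unsigned long long first_bad = 5100;   /* start of a recorded bad range */
    int bad_sectors = 32;                  /* its length */

    if (first_bad <= dev_sector) {
        /* bad range covers the start: this slot cannot serve the read, and
         * at most 'remaining_bad' sectors must come from somewhere else */
        int remaining_bad = bad_sectors - (int)(dev_sector - first_bad);
        printf("cannot read here; %d sectors are bad from the start\n",
               remaining_bad);
    } else {
        /* bad range starts later: we may read up to it */
        int good = (int)(first_bad - dev_sector);
        if (good > sectors)
            good = sectors;
        printf("can read %d of %d sectors before the bad block at %llu\n",
               good, sectors, first_bad);
    }
    return 0;
}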
@@ -561,6 +665,7 @@ retry:
561 } else 665 } else
562 disk = -1; 666 disk = -1;
563 rcu_read_unlock(); 667 rcu_read_unlock();
668 *max_sectors = best_good_sectors;
564 669
565 return disk; 670 return disk;
566} 671}
@@ -734,6 +839,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
734 unsigned long flags; 839 unsigned long flags;
735 mdk_rdev_t *blocked_rdev; 840 mdk_rdev_t *blocked_rdev;
736 int plugged; 841 int plugged;
842 int sectors_handled;
843 int max_sectors;
737 844
738 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 845 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
739 md_flush_request(mddev, bio); 846 md_flush_request(mddev, bio);
@@ -808,12 +915,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
808 r10_bio->sector = bio->bi_sector; 915 r10_bio->sector = bio->bi_sector;
809 r10_bio->state = 0; 916 r10_bio->state = 0;
810 917
918 /* We might need to issue multiple reads to different
919 * devices if there are bad blocks around, so we keep
920 * track of the number of reads in bio->bi_phys_segments.
921 * If this is 0, there is only one r10_bio and no locking
922 * will be needed when the request completes. If it is
923 * non-zero, then it is the number of not-completed requests.
924 */
925 bio->bi_phys_segments = 0;
926 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
927
811 if (rw == READ) { 928 if (rw == READ) {
812 /* 929 /*
813 * read balancing logic: 930 * read balancing logic:
814 */ 931 */
815 int disk = read_balance(conf, r10_bio); 932 int disk;
816 int slot = r10_bio->read_slot; 933 int slot;
934
935read_again:
936 disk = read_balance(conf, r10_bio, &max_sectors);
937 slot = r10_bio->read_slot;
817 if (disk < 0) { 938 if (disk < 0) {
818 raid_end_bio_io(r10_bio); 939 raid_end_bio_io(r10_bio);
819 return 0; 940 return 0;
@@ -821,6 +942,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 mirror = conf->mirrors + disk; 942 mirror = conf->mirrors + disk;
822 943
823 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 944 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
945 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
946 max_sectors);
824 947
825 r10_bio->devs[slot].bio = read_bio; 948 r10_bio->devs[slot].bio = read_bio;
826 949
@@ -831,7 +954,37 @@ static int make_request(mddev_t *mddev, struct bio * bio)
831 read_bio->bi_rw = READ | do_sync; 954 read_bio->bi_rw = READ | do_sync;
832 read_bio->bi_private = r10_bio; 955 read_bio->bi_private = r10_bio;
833 956
834 generic_make_request(read_bio); 957 if (max_sectors < r10_bio->sectors) {
958 /* Could not read all from this device, so we will
959 * need another r10_bio.
960 */
961 sectors_handled = (r10_bio->sectors + max_sectors
962 - bio->bi_sector);
963 r10_bio->sectors = max_sectors;
964 spin_lock_irq(&conf->device_lock);
965 if (bio->bi_phys_segments == 0)
966 bio->bi_phys_segments = 2;
967 else
968 bio->bi_phys_segments++;
969 spin_unlock(&conf->device_lock);
970 /* Cannot call generic_make_request directly
971 * as that will be queued in __generic_make_request
972 * and subsequent mempool_alloc might block
973 * waiting for it. so hand bio over to raid10d.
974 */
975 reschedule_retry(r10_bio);
976
977 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
978
979 r10_bio->master_bio = bio;
980 r10_bio->sectors = ((bio->bi_size >> 9)
981 - sectors_handled);
982 r10_bio->state = 0;
983 r10_bio->mddev = mddev;
984 r10_bio->sector = bio->bi_sector + sectors_handled;
985 goto read_again;
986 } else
987 generic_make_request(read_bio);
835 return 0; 988 return 0;
836 } 989 }
837 990
@@ -841,13 +994,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
841 /* first select target devices under rcu_lock and 994 /* first select target devices under rcu_lock and
842 * inc refcount on their rdev. Record them by setting 995 * inc refcount on their rdev. Record them by setting
843 * bios[x] to bio 996 * bios[x] to bio
997 * If there are known/acknowledged bad blocks on any device
998 * on which we have seen a write error, we want to avoid
999 * writing to those blocks. This potentially requires several
1000 * writes to write around the bad blocks. Each set of writes
1001 * gets its own r10_bio with a set of bios attached. The number
 1002 * of r10_bios is recorded in bio->bi_phys_segments just as with
1003 * the read case.
844 */ 1004 */
845 plugged = mddev_check_plugged(mddev); 1005 plugged = mddev_check_plugged(mddev);
846 1006
847 raid10_find_phys(conf, r10_bio); 1007 raid10_find_phys(conf, r10_bio);
848 retry_write: 1008retry_write:
849 blocked_rdev = NULL; 1009 blocked_rdev = NULL;
850 rcu_read_lock(); 1010 rcu_read_lock();
1011 max_sectors = r10_bio->sectors;
1012
851 for (i = 0; i < conf->copies; i++) { 1013 for (i = 0; i < conf->copies; i++) {
852 int d = r10_bio->devs[i].devnum; 1014 int d = r10_bio->devs[i].devnum;
853 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); 1015 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -856,13 +1018,55 @@ static int make_request(mddev_t *mddev, struct bio * bio)
856 blocked_rdev = rdev; 1018 blocked_rdev = rdev;
857 break; 1019 break;
858 } 1020 }
859 if (rdev && !test_bit(Faulty, &rdev->flags)) { 1021 r10_bio->devs[i].bio = NULL;
860 atomic_inc(&rdev->nr_pending); 1022 if (!rdev || test_bit(Faulty, &rdev->flags)) {
861 r10_bio->devs[i].bio = bio;
862 } else {
863 r10_bio->devs[i].bio = NULL;
864 set_bit(R10BIO_Degraded, &r10_bio->state); 1023 set_bit(R10BIO_Degraded, &r10_bio->state);
1024 continue;
865 } 1025 }
1026 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1027 sector_t first_bad;
1028 sector_t dev_sector = r10_bio->devs[i].addr;
1029 int bad_sectors;
1030 int is_bad;
1031
1032 is_bad = is_badblock(rdev, dev_sector,
1033 max_sectors,
1034 &first_bad, &bad_sectors);
1035 if (is_bad < 0) {
1036 /* Mustn't write here until the bad block
1037 * is acknowledged
1038 */
1039 atomic_inc(&rdev->nr_pending);
1040 set_bit(BlockedBadBlocks, &rdev->flags);
1041 blocked_rdev = rdev;
1042 break;
1043 }
1044 if (is_bad && first_bad <= dev_sector) {
1045 /* Cannot write here at all */
1046 bad_sectors -= (dev_sector - first_bad);
1047 if (bad_sectors < max_sectors)
1048 /* Mustn't write more than bad_sectors
1049 * to other devices yet
1050 */
1051 max_sectors = bad_sectors;
1052 /* We don't set R10BIO_Degraded as that
1053 * only applies if the disk is missing,
1054 * so it might be re-added, and we want to
1055 * know to recover this chunk.
1056 * In this case the device is here, and the
1057 * fact that this chunk is not in-sync is
1058 * recorded in the bad block log.
1059 */
1060 continue;
1061 }
1062 if (is_bad) {
1063 int good_sectors = first_bad - dev_sector;
1064 if (good_sectors < max_sectors)
1065 max_sectors = good_sectors;
1066 }
1067 }
1068 r10_bio->devs[i].bio = bio;
1069 atomic_inc(&rdev->nr_pending);
866 } 1070 }
867 rcu_read_unlock(); 1071 rcu_read_unlock();
868 1072
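On the write path every copy's bad-block log is consulted before the bios are built: an unacknowledged bad block blocks the whole request, while acknowledged ones shrink max_sectors so that no copy is asked to write into a known bad range; the rest of the request is then retried as another r10_bio, counted in bi_phys_segments as described above. A standalone sketch of the max_sectors computation, with invented ranges:

/* Standalone sketch of the write-side max_sectors computation in
 * make_request(): the first chunk is shrunk so that no copy writes into
 * (or past) a known bad range.  All numbers are invented. */
#include <stdio.h>

struct bb { int has_bad; unsigned long long first_bad; int bad_sectors; };

int main(void)
{
    unsigned long long dev_sector = 9000;  /* same logical chunk on each copy */
    int max_sectors = 512;                 /* full request length */
    struct bb copy[3] = {
        { 0, 0, 0 },          /* clean copy                     */
        { 1, 9200, 64 },      /* bad range further into the I/O */
        { 1, 8960, 80 },      /* bad range covering the start   */
    };

    for (int i = 0; i < 3; i++) {
        if (!copy[i].has_bad)
            continue;
        if (copy[i].first_bad <= dev_sector) {
            /* cannot write to this copy at all; also cap what the other
             * copies write so this copy can catch up later */
            int bad = copy[i].bad_sectors
                    - (int)(dev_sector - copy[i].first_bad);
            if (bad < max_sectors)
                max_sectors = bad;
        } else {
            int good = (int)(copy[i].first_bad - dev_sector);
            if (good < max_sectors)
                max_sectors = good;
        }
    }
    printf("write the first %d sectors now, retry the rest in a new r10_bio\n",
           max_sectors);
    return 0;
}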
@@ -882,8 +1086,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
882 goto retry_write; 1086 goto retry_write;
883 } 1087 }
884 1088
1089 if (max_sectors < r10_bio->sectors) {
1090 /* We are splitting this into multiple parts, so
1091 * we need to prepare for allocating another r10_bio.
1092 */
1093 r10_bio->sectors = max_sectors;
1094 spin_lock_irq(&conf->device_lock);
1095 if (bio->bi_phys_segments == 0)
1096 bio->bi_phys_segments = 2;
1097 else
1098 bio->bi_phys_segments++;
1099 spin_unlock_irq(&conf->device_lock);
1100 }
1101 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1102
885 atomic_set(&r10_bio->remaining, 1); 1103 atomic_set(&r10_bio->remaining, 1);
886 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); 1104 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
887 1105
888 for (i = 0; i < conf->copies; i++) { 1106 for (i = 0; i < conf->copies; i++) {
889 struct bio *mbio; 1107 struct bio *mbio;
@@ -892,10 +1110,12 @@ static int make_request(mddev_t *mddev, struct bio * bio)
892 continue; 1110 continue;
893 1111
894 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1112 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1113 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1114 max_sectors);
895 r10_bio->devs[i].bio = mbio; 1115 r10_bio->devs[i].bio = mbio;
896 1116
897 mbio->bi_sector = r10_bio->devs[i].addr+ 1117 mbio->bi_sector = (r10_bio->devs[i].addr+
898 conf->mirrors[d].rdev->data_offset; 1118 conf->mirrors[d].rdev->data_offset);
899 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1119 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
900 mbio->bi_end_io = raid10_end_write_request; 1120 mbio->bi_end_io = raid10_end_write_request;
901 mbio->bi_rw = WRITE | do_sync | do_fua; 1121 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -920,6 +1140,21 @@ static int make_request(mddev_t *mddev, struct bio * bio)
920 /* In case raid10d snuck in to freeze_array */ 1140 /* In case raid10d snuck in to freeze_array */
921 wake_up(&conf->wait_barrier); 1141 wake_up(&conf->wait_barrier);
922 1142
1143 if (sectors_handled < (bio->bi_size >> 9)) {
1144 /* We need another r10_bio. It has already been counted
1145 * in bio->bi_phys_segments.
1146 */
1147 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1148
1149 r10_bio->master_bio = bio;
1150 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1151
1152 r10_bio->mddev = mddev;
1153 r10_bio->sector = bio->bi_sector + sectors_handled;
1154 r10_bio->state = 0;
1155 goto retry_write;
1156 }
1157
923 if (do_sync || !mddev->bitmap || !plugged) 1158 if (do_sync || !mddev->bitmap || !plugged)
924 md_wakeup_thread(mddev->thread); 1159 md_wakeup_thread(mddev->thread);
925 return 0; 1160 return 0;
@@ -949,6 +1184,30 @@ static void status(struct seq_file *seq, mddev_t *mddev)
949 seq_printf(seq, "]"); 1184 seq_printf(seq, "]");
950} 1185}
951 1186
1187/* check if there are enough drives for
 1188 * every block to appear on at least one.
1189 * Don't consider the device numbered 'ignore'
1190 * as we might be about to remove it.
1191 */
1192static int enough(conf_t *conf, int ignore)
1193{
1194 int first = 0;
1195
1196 do {
1197 int n = conf->copies;
1198 int cnt = 0;
1199 while (n--) {
1200 if (conf->mirrors[first].rdev &&
1201 first != ignore)
1202 cnt++;
1203 first = (first+1) % conf->raid_disks;
1204 }
1205 if (cnt == 0)
1206 return 0;
1207 } while (first != 0);
1208 return 1;
1209}
1210
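enough() walks the mirrors in groups of 'copies' consecutive slots, which is where RAID10 lays out the copies of each block, and refuses if any group would be left with no working device once 'ignore' is removed. A standalone model of the same walk over a toy device map:

/* Standalone model of enough(): with 'copies' replicas laid out on
 * consecutive devices, every window of 'copies' slots must contain at
 * least one working device (ignoring the one we plan to remove). */
#include <stdio.h>

static int enough(const int *present, int raid_disks, int copies, int ignore)
{
    int first = 0;
    do {
        int n = copies, cnt = 0;
        while (n--) {
            if (present[first] && first != ignore)
                cnt++;
            first = (first + 1) % raid_disks;
        }
        if (cnt == 0)
            return 0;
    } while (first != 0);
    return 1;
}

int main(void)
{
    int present[4] = { 1, 1, 0, 1 };   /* device 2 has already failed */
    printf("remove dev 0? %s\n", enough(present, 4, 2, 0) ? "ok" : "no");
    printf("remove dev 3? %s\n", enough(present, 4, 2, 3) ? "ok" : "no");
    return 0;
}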
952static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1211static void error(mddev_t *mddev, mdk_rdev_t *rdev)
953{ 1212{
954 char b[BDEVNAME_SIZE]; 1213 char b[BDEVNAME_SIZE];
@@ -961,13 +1220,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
961 * else mark the drive as failed 1220 * else mark the drive as failed
962 */ 1221 */
963 if (test_bit(In_sync, &rdev->flags) 1222 if (test_bit(In_sync, &rdev->flags)
964 && conf->raid_disks-mddev->degraded == 1) 1223 && !enough(conf, rdev->raid_disk))
965 /* 1224 /*
966 * Don't fail the drive, just return an IO error. 1225 * Don't fail the drive, just return an IO error.
967 * The test should really be more sophisticated than
968 * "working_disks == 1", but it isn't critical, and
969 * can wait until we do more sophisticated "is the drive
970 * really dead" tests...
971 */ 1226 */
972 return; 1227 return;
973 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1228 if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -980,6 +1235,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
980 */ 1235 */
981 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1236 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
982 } 1237 }
1238 set_bit(Blocked, &rdev->flags);
983 set_bit(Faulty, &rdev->flags); 1239 set_bit(Faulty, &rdev->flags);
984 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1240 set_bit(MD_CHANGE_DEVS, &mddev->flags);
985 printk(KERN_ALERT 1241 printk(KERN_ALERT
@@ -1022,27 +1278,6 @@ static void close_sync(conf_t *conf)
1022 conf->r10buf_pool = NULL; 1278 conf->r10buf_pool = NULL;
1023} 1279}
1024 1280
1025/* check if there are enough drives for
1026 * every block to appear on atleast one
1027 */
1028static int enough(conf_t *conf)
1029{
1030 int first = 0;
1031
1032 do {
1033 int n = conf->copies;
1034 int cnt = 0;
1035 while (n--) {
1036 if (conf->mirrors[first].rdev)
1037 cnt++;
1038 first = (first+1) % conf->raid_disks;
1039 }
1040 if (cnt == 0)
1041 return 0;
1042 } while (first != 0);
1043 return 1;
1044}
1045
1046static int raid10_spare_active(mddev_t *mddev) 1281static int raid10_spare_active(mddev_t *mddev)
1047{ 1282{
1048 int i; 1283 int i;
@@ -1078,7 +1313,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1078 conf_t *conf = mddev->private; 1313 conf_t *conf = mddev->private;
1079 int err = -EEXIST; 1314 int err = -EEXIST;
1080 int mirror; 1315 int mirror;
1081 mirror_info_t *p;
1082 int first = 0; 1316 int first = 0;
1083 int last = conf->raid_disks - 1; 1317 int last = conf->raid_disks - 1;
1084 1318
@@ -1087,44 +1321,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1087 * very different from resync 1321 * very different from resync
1088 */ 1322 */
1089 return -EBUSY; 1323 return -EBUSY;
1090 if (!enough(conf)) 1324 if (!enough(conf, -1))
1091 return -EINVAL; 1325 return -EINVAL;
1092 1326
1093 if (rdev->raid_disk >= 0) 1327 if (rdev->raid_disk >= 0)
1094 first = last = rdev->raid_disk; 1328 first = last = rdev->raid_disk;
1095 1329
1096 if (rdev->saved_raid_disk >= 0 && 1330 if (rdev->saved_raid_disk >= first &&
1097 rdev->saved_raid_disk >= first &&
1098 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1331 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1099 mirror = rdev->saved_raid_disk; 1332 mirror = rdev->saved_raid_disk;
1100 else 1333 else
1101 mirror = first; 1334 mirror = first;
1102 for ( ; mirror <= last ; mirror++) 1335 for ( ; mirror <= last ; mirror++) {
1103 if ( !(p=conf->mirrors+mirror)->rdev) { 1336 mirror_info_t *p = &conf->mirrors[mirror];
1104 1337 if (p->recovery_disabled == mddev->recovery_disabled)
1105 disk_stack_limits(mddev->gendisk, rdev->bdev, 1338 continue;
1106 rdev->data_offset << 9); 1339 if (!p->rdev)
1107 /* as we don't honour merge_bvec_fn, we must 1340 continue;
1108 * never risk violating it, so limit
1109 * ->max_segments to one lying with a single
1110 * page, as a one page request is never in
1111 * violation.
1112 */
1113 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1114 blk_queue_max_segments(mddev->queue, 1);
1115 blk_queue_segment_boundary(mddev->queue,
1116 PAGE_CACHE_SIZE - 1);
1117 }
1118 1341
1119 p->head_position = 0; 1342 disk_stack_limits(mddev->gendisk, rdev->bdev,
1120 rdev->raid_disk = mirror; 1343 rdev->data_offset << 9);
1121 err = 0; 1344 /* as we don't honour merge_bvec_fn, we must
1122 if (rdev->saved_raid_disk != mirror) 1345 * never risk violating it, so limit
1123 conf->fullsync = 1; 1346 * ->max_segments to one lying with a single
1124 rcu_assign_pointer(p->rdev, rdev); 1347 * page, as a one page request is never in
1125 break; 1348 * violation.
1349 */
1350 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1351 blk_queue_max_segments(mddev->queue, 1);
1352 blk_queue_segment_boundary(mddev->queue,
1353 PAGE_CACHE_SIZE - 1);
1126 } 1354 }
1127 1355
1356 p->head_position = 0;
1357 rdev->raid_disk = mirror;
1358 err = 0;
1359 if (rdev->saved_raid_disk != mirror)
1360 conf->fullsync = 1;
1361 rcu_assign_pointer(p->rdev, rdev);
1362 break;
1363 }
1364
1128 md_integrity_add_rdev(rdev, mddev); 1365 md_integrity_add_rdev(rdev, mddev);
1129 print_conf(conf); 1366 print_conf(conf);
1130 return err; 1367 return err;
@@ -1149,7 +1386,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1149 * is not possible. 1386 * is not possible.
1150 */ 1387 */
1151 if (!test_bit(Faulty, &rdev->flags) && 1388 if (!test_bit(Faulty, &rdev->flags) &&
1152 enough(conf)) { 1389 mddev->recovery_disabled != p->recovery_disabled &&
1390 enough(conf, -1)) {
1153 err = -EBUSY; 1391 err = -EBUSY;
1154 goto abort; 1392 goto abort;
1155 } 1393 }
@@ -1174,24 +1412,18 @@ static void end_sync_read(struct bio *bio, int error)
1174{ 1412{
1175 r10bio_t *r10_bio = bio->bi_private; 1413 r10bio_t *r10_bio = bio->bi_private;
1176 conf_t *conf = r10_bio->mddev->private; 1414 conf_t *conf = r10_bio->mddev->private;
1177 int i,d; 1415 int d;
1178 1416
1179 for (i=0; i<conf->copies; i++) 1417 d = find_bio_disk(conf, r10_bio, bio, NULL);
1180 if (r10_bio->devs[i].bio == bio)
1181 break;
1182 BUG_ON(i == conf->copies);
1183 update_head_pos(i, r10_bio);
1184 d = r10_bio->devs[i].devnum;
1185 1418
1186 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1419 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1187 set_bit(R10BIO_Uptodate, &r10_bio->state); 1420 set_bit(R10BIO_Uptodate, &r10_bio->state);
1188 else { 1421 else
1422 /* The write handler will notice the lack of
1423 * R10BIO_Uptodate and record any errors etc
1424 */
1189 atomic_add(r10_bio->sectors, 1425 atomic_add(r10_bio->sectors,
1190 &conf->mirrors[d].rdev->corrected_errors); 1426 &conf->mirrors[d].rdev->corrected_errors);
1191 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1192 md_error(r10_bio->mddev,
1193 conf->mirrors[d].rdev);
1194 }
1195 1427
1196 /* for reconstruct, we always reschedule after a read. 1428 /* for reconstruct, we always reschedule after a read.
1197 * for resync, only after all reads 1429 * for resync, only after all reads
@@ -1206,40 +1438,60 @@ static void end_sync_read(struct bio *bio, int error)
1206 } 1438 }
1207} 1439}
1208 1440
1209static void end_sync_write(struct bio *bio, int error) 1441static void end_sync_request(r10bio_t *r10_bio)
1210{ 1442{
1211 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1212 r10bio_t *r10_bio = bio->bi_private;
1213 mddev_t *mddev = r10_bio->mddev; 1443 mddev_t *mddev = r10_bio->mddev;
1214 conf_t *conf = mddev->private;
1215 int i,d;
1216
1217 for (i = 0; i < conf->copies; i++)
1218 if (r10_bio->devs[i].bio == bio)
1219 break;
1220 d = r10_bio->devs[i].devnum;
1221 1444
1222 if (!uptodate)
1223 md_error(mddev, conf->mirrors[d].rdev);
1224
1225 update_head_pos(i, r10_bio);
1226
1227 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1228 while (atomic_dec_and_test(&r10_bio->remaining)) { 1445 while (atomic_dec_and_test(&r10_bio->remaining)) {
1229 if (r10_bio->master_bio == NULL) { 1446 if (r10_bio->master_bio == NULL) {
1230 /* the primary of several recovery bios */ 1447 /* the primary of several recovery bios */
1231 sector_t s = r10_bio->sectors; 1448 sector_t s = r10_bio->sectors;
1232 put_buf(r10_bio); 1449 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1450 test_bit(R10BIO_WriteError, &r10_bio->state))
1451 reschedule_retry(r10_bio);
1452 else
1453 put_buf(r10_bio);
1233 md_done_sync(mddev, s, 1); 1454 md_done_sync(mddev, s, 1);
1234 break; 1455 break;
1235 } else { 1456 } else {
1236 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; 1457 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1237 put_buf(r10_bio); 1458 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1459 test_bit(R10BIO_WriteError, &r10_bio->state))
1460 reschedule_retry(r10_bio);
1461 else
1462 put_buf(r10_bio);
1238 r10_bio = r10_bio2; 1463 r10_bio = r10_bio2;
1239 } 1464 }
1240 } 1465 }
1241} 1466}
1242 1467
1468static void end_sync_write(struct bio *bio, int error)
1469{
1470 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1471 r10bio_t *r10_bio = bio->bi_private;
1472 mddev_t *mddev = r10_bio->mddev;
1473 conf_t *conf = mddev->private;
1474 int d;
1475 sector_t first_bad;
1476 int bad_sectors;
1477 int slot;
1478
1479 d = find_bio_disk(conf, r10_bio, bio, &slot);
1480
1481 if (!uptodate) {
1482 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
1483 set_bit(R10BIO_WriteError, &r10_bio->state);
1484 } else if (is_badblock(conf->mirrors[d].rdev,
1485 r10_bio->devs[slot].addr,
1486 r10_bio->sectors,
1487 &first_bad, &bad_sectors))
1488 set_bit(R10BIO_MadeGood, &r10_bio->state);
1489
1490 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1491
1492 end_sync_request(r10_bio);
1493}
1494
1243/* 1495/*
1244 * Note: sync and recover and handled very differently for raid10 1496 * Note: sync and recover and handled very differently for raid10
1245 * This code is for resync. 1497 * This code is for resync.
@@ -1299,11 +1551,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1299 if (j == vcnt) 1551 if (j == vcnt)
1300 continue; 1552 continue;
1301 mddev->resync_mismatches += r10_bio->sectors; 1553 mddev->resync_mismatches += r10_bio->sectors;
1554 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1555 /* Don't fix anything. */
1556 continue;
1302 } 1557 }
1303 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1558 /* Ok, we need to write this bio, either to correct an
1304 /* Don't fix anything. */ 1559 * inconsistency or to correct an unreadable block.
1305 continue;
1306 /* Ok, we need to write this bio
1307 * First we need to fixup bv_offset, bv_len and 1560 * First we need to fixup bv_offset, bv_len and
1308 * bi_vecs, as the read request might have corrupted these 1561 * bi_vecs, as the read request might have corrupted these
1309 */ 1562 */
@@ -1355,32 +1608,107 @@ done:
1355 * The second for writing. 1608 * The second for writing.
1356 * 1609 *
1357 */ 1610 */
1611static void fix_recovery_read_error(r10bio_t *r10_bio)
1612{
1613 /* We got a read error during recovery.
1614 * We repeat the read in smaller page-sized sections.
1615 * If a read succeeds, write it to the new device or record
1616 * a bad block if we cannot.
1617 * If a read fails, record a bad block on both old and
1618 * new devices.
1619 */
1620 mddev_t *mddev = r10_bio->mddev;
1621 conf_t *conf = mddev->private;
1622 struct bio *bio = r10_bio->devs[0].bio;
1623 sector_t sect = 0;
1624 int sectors = r10_bio->sectors;
1625 int idx = 0;
1626 int dr = r10_bio->devs[0].devnum;
1627 int dw = r10_bio->devs[1].devnum;
1628
1629 while (sectors) {
1630 int s = sectors;
1631 mdk_rdev_t *rdev;
1632 sector_t addr;
1633 int ok;
1634
1635 if (s > (PAGE_SIZE>>9))
1636 s = PAGE_SIZE >> 9;
1637
1638 rdev = conf->mirrors[dr].rdev;
 1639 addr = r10_bio->devs[0].addr + sect;
1640 ok = sync_page_io(rdev,
1641 addr,
1642 s << 9,
1643 bio->bi_io_vec[idx].bv_page,
1644 READ, false);
1645 if (ok) {
1646 rdev = conf->mirrors[dw].rdev;
1647 addr = r10_bio->devs[1].addr + sect;
1648 ok = sync_page_io(rdev,
1649 addr,
1650 s << 9,
1651 bio->bi_io_vec[idx].bv_page,
1652 WRITE, false);
1653 if (!ok)
1654 set_bit(WriteErrorSeen, &rdev->flags);
1655 }
1656 if (!ok) {
1657 /* We don't worry if we cannot set a bad block -
1658 * it really is bad so there is no loss in not
1659 * recording it yet
1660 */
1661 rdev_set_badblocks(rdev, addr, s, 0);
1662
1663 if (rdev != conf->mirrors[dw].rdev) {
1664 /* need bad block on destination too */
1665 mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
1666 addr = r10_bio->devs[1].addr + sect;
1667 ok = rdev_set_badblocks(rdev2, addr, s, 0);
1668 if (!ok) {
1669 /* just abort the recovery */
1670 printk(KERN_NOTICE
1671 "md/raid10:%s: recovery aborted"
1672 " due to read error\n",
1673 mdname(mddev));
1674
1675 conf->mirrors[dw].recovery_disabled
1676 = mddev->recovery_disabled;
1677 set_bit(MD_RECOVERY_INTR,
1678 &mddev->recovery);
1679 break;
1680 }
1681 }
1682 }
1683
1684 sectors -= s;
1685 sect += s;
1686 idx++;
1687 }
1688}
1358 1689
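fix_recovery_read_error() re-tries the failed recovery read in page-sized pieces, copying each readable piece to the new device and recording bad blocks for the pieces that cannot be read or written, rather than failing the whole device. A standalone sketch of the chunking loop, assuming a 4096-byte page and stubbed I/O:

/* Standalone sketch of the page-sized retry loop in
 * fix_recovery_read_error().  The read/write calls are stubs and one
 * chunk is made to "fail" purely for illustration. */
#include <stdio.h>

#define PAGE_SECTORS (4096 >> 9)    /* 8 sectors per page */

static int fake_read(unsigned long long addr)  { return addr != 2016; }
static int fake_write(unsigned long long addr) { (void)addr; return 1; }

int main(void)
{
    unsigned long long sect = 0;
    int sectors = 40;               /* length of the failed recovery read */
    unsigned long long src_base = 2000, dst_base = 7000;

    while (sectors) {
        int s = sectors;
        if (s > PAGE_SECTORS)
            s = PAGE_SECTORS;

        if (fake_read(src_base + sect) && fake_write(dst_base + sect)) {
            printf("copied %d sectors at offset %llu\n", s, sect);
        } else {
            /* the kernel records the range as bad on the source and, if
             * necessary, on the destination instead of failing the device */
            printf("recording %d bad sectors at offset %llu\n", s, sect);
        }

        sectors -= s;
        sect += s;
    }
    return 0;
}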
1359static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) 1690static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1360{ 1691{
1361 conf_t *conf = mddev->private; 1692 conf_t *conf = mddev->private;
1362 int i, d; 1693 int d;
1363 struct bio *bio, *wbio; 1694 struct bio *wbio;
1364 1695
1696 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1697 fix_recovery_read_error(r10_bio);
1698 end_sync_request(r10_bio);
1699 return;
1700 }
1365 1701
1366 /* move the pages across to the second bio 1702 /*
1703 * share the pages with the first bio
1367 * and submit the write request 1704 * and submit the write request
1368 */ 1705 */
1369 bio = r10_bio->devs[0].bio;
1370 wbio = r10_bio->devs[1].bio; 1706 wbio = r10_bio->devs[1].bio;
1371 for (i=0; i < wbio->bi_vcnt; i++) {
1372 struct page *p = bio->bi_io_vec[i].bv_page;
1373 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1374 wbio->bi_io_vec[i].bv_page = p;
1375 }
1376 d = r10_bio->devs[1].devnum; 1707 d = r10_bio->devs[1].devnum;
1377 1708
1378 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1709 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1379 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1710 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1380 if (test_bit(R10BIO_Uptodate, &r10_bio->state)) 1711 generic_make_request(wbio);
1381 generic_make_request(wbio);
1382 else
1383 bio_endio(wbio, -EIO);
1384} 1712}
1385 1713
1386 1714
@@ -1421,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1421 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 1749 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1422} 1750}
1423 1751
1752static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1753 int sectors, struct page *page, int rw)
1754{
1755 sector_t first_bad;
1756 int bad_sectors;
1757
1758 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
1759 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
1760 return -1;
1761 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1762 /* success */
1763 return 1;
1764 if (rw == WRITE)
1765 set_bit(WriteErrorSeen, &rdev->flags);
1766 /* need to record an error - either for the block or the device */
1767 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1768 md_error(rdev->mddev, rdev);
1769 return 0;
1770}
1771
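r10_sync_page_io() folds the bad-block check, the I/O itself and the error recording into one helper with a three-way result: -1 means the block is already known bad and should not be touched, 1 means success, 0 means the failure has just been recorded. A tiny standalone illustration of driving such a tri-state helper (the helper below is a stub, not the kernel function):

/* Tiny illustration of the tri-state convention used by r10_sync_page_io(). */
#include <stdio.h>

static int try_page_io(int known_bad, int io_ok)
{
    if (known_bad)
        return -1;          /* caller should pick another device */
    if (io_ok)
        return 1;           /* data transferred */
    /* a real helper would record a bad block or fail the device here */
    return 0;
}

int main(void)
{
    int cases[3][2] = { { 1, 0 }, { 0, 1 }, { 0, 0 } };
    for (int i = 0; i < 3; i++) {
        switch (try_page_io(cases[i][0], cases[i][1])) {
        case -1: printf("skipped: block already known bad\n"); break;
        case  1: printf("ok\n"); break;
        case  0: printf("failed, error recorded\n"); break;
        }
    }
    return 0;
}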
1424/* 1772/*
1425 * This is a kernel thread which: 1773 * This is a kernel thread which:
1426 * 1774 *
@@ -1476,10 +1824,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1476 1824
1477 rcu_read_lock(); 1825 rcu_read_lock();
1478 do { 1826 do {
1827 sector_t first_bad;
1828 int bad_sectors;
1829
1479 d = r10_bio->devs[sl].devnum; 1830 d = r10_bio->devs[sl].devnum;
1480 rdev = rcu_dereference(conf->mirrors[d].rdev); 1831 rdev = rcu_dereference(conf->mirrors[d].rdev);
1481 if (rdev && 1832 if (rdev &&
1482 test_bit(In_sync, &rdev->flags)) { 1833 test_bit(In_sync, &rdev->flags) &&
1834 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
1835 &first_bad, &bad_sectors) == 0) {
1483 atomic_inc(&rdev->nr_pending); 1836 atomic_inc(&rdev->nr_pending);
1484 rcu_read_unlock(); 1837 rcu_read_unlock();
1485 success = sync_page_io(rdev, 1838 success = sync_page_io(rdev,
@@ -1499,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1499 rcu_read_unlock(); 1852 rcu_read_unlock();
1500 1853
1501 if (!success) { 1854 if (!success) {
1502 /* Cannot read from anywhere -- bye bye array */ 1855 /* Cannot read from anywhere, just mark the block
1856 * as bad on the first device to discourage future
1857 * reads.
1858 */
1503 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 1859 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1504 md_error(mddev, conf->mirrors[dn].rdev); 1860 rdev = conf->mirrors[dn].rdev;
1861
1862 if (!rdev_set_badblocks(
1863 rdev,
1864 r10_bio->devs[r10_bio->read_slot].addr
1865 + sect,
1866 s, 0))
1867 md_error(mddev, rdev);
1505 break; 1868 break;
1506 } 1869 }
1507 1870
@@ -1516,80 +1879,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1516 sl--; 1879 sl--;
1517 d = r10_bio->devs[sl].devnum; 1880 d = r10_bio->devs[sl].devnum;
1518 rdev = rcu_dereference(conf->mirrors[d].rdev); 1881 rdev = rcu_dereference(conf->mirrors[d].rdev);
1519 if (rdev && 1882 if (!rdev ||
1520 test_bit(In_sync, &rdev->flags)) { 1883 !test_bit(In_sync, &rdev->flags))
1521 atomic_inc(&rdev->nr_pending); 1884 continue;
1522 rcu_read_unlock(); 1885
1523 atomic_add(s, &rdev->corrected_errors); 1886 atomic_inc(&rdev->nr_pending);
1524 if (sync_page_io(rdev, 1887 rcu_read_unlock();
1525 r10_bio->devs[sl].addr + 1888 if (r10_sync_page_io(rdev,
1526 sect, 1889 r10_bio->devs[sl].addr +
1527 s<<9, conf->tmppage, WRITE, false) 1890 sect,
1528 == 0) { 1891 s<<9, conf->tmppage, WRITE)
1529 /* Well, this device is dead */ 1892 == 0) {
1530 printk(KERN_NOTICE 1893 /* Well, this device is dead */
1531 "md/raid10:%s: read correction " 1894 printk(KERN_NOTICE
1532 "write failed" 1895 "md/raid10:%s: read correction "
1533 " (%d sectors at %llu on %s)\n", 1896 "write failed"
1534 mdname(mddev), s, 1897 " (%d sectors at %llu on %s)\n",
1535 (unsigned long long)( 1898 mdname(mddev), s,
1536 sect + rdev->data_offset), 1899 (unsigned long long)(
1537 bdevname(rdev->bdev, b)); 1900 sect + rdev->data_offset),
1538 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1901 bdevname(rdev->bdev, b));
1539 "drive\n", 1902 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1540 mdname(mddev), 1903 "drive\n",
1541 bdevname(rdev->bdev, b)); 1904 mdname(mddev),
1542 md_error(mddev, rdev); 1905 bdevname(rdev->bdev, b));
1543 }
1544 rdev_dec_pending(rdev, mddev);
1545 rcu_read_lock();
1546 } 1906 }
1907 rdev_dec_pending(rdev, mddev);
1908 rcu_read_lock();
1547 } 1909 }
1548 sl = start; 1910 sl = start;
1549 while (sl != r10_bio->read_slot) { 1911 while (sl != r10_bio->read_slot) {
1912 char b[BDEVNAME_SIZE];
1550 1913
1551 if (sl==0) 1914 if (sl==0)
1552 sl = conf->copies; 1915 sl = conf->copies;
1553 sl--; 1916 sl--;
1554 d = r10_bio->devs[sl].devnum; 1917 d = r10_bio->devs[sl].devnum;
1555 rdev = rcu_dereference(conf->mirrors[d].rdev); 1918 rdev = rcu_dereference(conf->mirrors[d].rdev);
1556 if (rdev && 1919 if (!rdev ||
1557 test_bit(In_sync, &rdev->flags)) { 1920 !test_bit(In_sync, &rdev->flags))
1558 char b[BDEVNAME_SIZE]; 1921 continue;
1559 atomic_inc(&rdev->nr_pending);
1560 rcu_read_unlock();
1561 if (sync_page_io(rdev,
1562 r10_bio->devs[sl].addr +
1563 sect,
1564 s<<9, conf->tmppage,
1565 READ, false) == 0) {
1566 /* Well, this device is dead */
1567 printk(KERN_NOTICE
1568 "md/raid10:%s: unable to read back "
1569 "corrected sectors"
1570 " (%d sectors at %llu on %s)\n",
1571 mdname(mddev), s,
1572 (unsigned long long)(
1573 sect + rdev->data_offset),
1574 bdevname(rdev->bdev, b));
1575 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1576 mdname(mddev),
1577 bdevname(rdev->bdev, b));
1578
1579 md_error(mddev, rdev);
1580 } else {
1581 printk(KERN_INFO
1582 "md/raid10:%s: read error corrected"
1583 " (%d sectors at %llu on %s)\n",
1584 mdname(mddev), s,
1585 (unsigned long long)(
1586 sect + rdev->data_offset),
1587 bdevname(rdev->bdev, b));
1588 }
1589 1922
1590 rdev_dec_pending(rdev, mddev); 1923 atomic_inc(&rdev->nr_pending);
1591 rcu_read_lock(); 1924 rcu_read_unlock();
1925 switch (r10_sync_page_io(rdev,
1926 r10_bio->devs[sl].addr +
1927 sect,
1928 s<<9, conf->tmppage,
1929 READ)) {
1930 case 0:
1931 /* Well, this device is dead */
1932 printk(KERN_NOTICE
1933 "md/raid10:%s: unable to read back "
1934 "corrected sectors"
1935 " (%d sectors at %llu on %s)\n",
1936 mdname(mddev), s,
1937 (unsigned long long)(
1938 sect + rdev->data_offset),
1939 bdevname(rdev->bdev, b));
1940 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1941 "drive\n",
1942 mdname(mddev),
1943 bdevname(rdev->bdev, b));
1944 break;
1945 case 1:
1946 printk(KERN_INFO
1947 "md/raid10:%s: read error corrected"
1948 " (%d sectors at %llu on %s)\n",
1949 mdname(mddev), s,
1950 (unsigned long long)(
1951 sect + rdev->data_offset),
1952 bdevname(rdev->bdev, b));
1953 atomic_add(s, &rdev->corrected_errors);
1592 } 1954 }
1955
1956 rdev_dec_pending(rdev, mddev);
1957 rcu_read_lock();
1593 } 1958 }
1594 rcu_read_unlock(); 1959 rcu_read_unlock();
1595 1960
@@ -1598,21 +1963,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1598 } 1963 }
1599} 1964}
1600 1965
1966static void bi_complete(struct bio *bio, int error)
1967{
1968 complete((struct completion *)bio->bi_private);
1969}
1970
1971static int submit_bio_wait(int rw, struct bio *bio)
1972{
1973 struct completion event;
1974 rw |= REQ_SYNC;
1975
1976 init_completion(&event);
1977 bio->bi_private = &event;
1978 bio->bi_end_io = bi_complete;
1979 submit_bio(rw, bio);
1980 wait_for_completion(&event);
1981
1982 return test_bit(BIO_UPTODATE, &bio->bi_flags);
1983}
1984
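bi_complete() and submit_bio_wait() above turn an asynchronous bio submission into a synchronous call: the submitter parks on a completion that the end_io callback signals. A rough userspace model of the same pattern using POSIX threads (illustrative names only, not the kernel's completion API):

#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* Stands in for the block layer invoking bi_end_io when the I/O finishes. */
static void *fake_end_io(void *arg)
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion event;
	pthread_t t;

	init_completion(&event);
	pthread_create(&t, NULL, fake_end_io, &event);	/* "submit_bio" */
	wait_for_completion(&event);			/* block until done */
	pthread_join(t, NULL);
	printf("completed synchronously from the caller's point of view\n");
	return 0;
}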
1985static int narrow_write_error(r10bio_t *r10_bio, int i)
1986{
1987 struct bio *bio = r10_bio->master_bio;
1988 mddev_t *mddev = r10_bio->mddev;
1989 conf_t *conf = mddev->private;
1990 mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
1991 /* bio has the data to be written to slot 'i' where
1992 * we just recently had a write error.
1993 * We repeatedly clone the bio and trim down to one block,
1994 * then try the write. Where the write fails we record
1995 * a bad block.
1996 * It is conceivable that the bio doesn't exactly align with
1997 * blocks. We must handle this.
1998 *
1999 * We currently own a reference to the rdev.
2000 */
2001
2002 int block_sectors;
2003 sector_t sector;
2004 int sectors;
2005 int sect_to_write = r10_bio->sectors;
2006 int ok = 1;
2007
2008 if (rdev->badblocks.shift < 0)
2009 return 0;
2010
2011 block_sectors = 1 << rdev->badblocks.shift;
2012 sector = r10_bio->sector;
2013 sectors = ((r10_bio->sector + block_sectors)
2014 & ~(sector_t)(block_sectors - 1))
2015 - sector;
2016
2017 while (sect_to_write) {
2018 struct bio *wbio;
2019 if (sectors > sect_to_write)
2020 sectors = sect_to_write;
2021 /* Write at 'sector' for 'sectors' */
2022 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2023 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2024 wbio->bi_sector = (r10_bio->devs[i].addr+
2025 rdev->data_offset+
2026 (sector - r10_bio->sector));
2027 wbio->bi_bdev = rdev->bdev;
2028 if (submit_bio_wait(WRITE, wbio) == 0)
2029 /* Failure! */
2030 ok = rdev_set_badblocks(rdev, sector,
2031 sectors, 0)
2032 && ok;
2033
2034 bio_put(wbio);
2035 sect_to_write -= sectors;
2036 sector += sectors;
2037 sectors = block_sectors;
2038 }
2039 return ok;
2040}
2041
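narrow_write_error() shortens the first trimmed write so that every subsequent chunk starts on a bad-block boundary; the length of that first chunk is ((sector + block_sectors) & ~(block_sectors - 1)) - sector. A small self-contained check of the arithmetic, using example sector numbers chosen here rather than anything from the patch:

#include <assert.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/* Length of the first chunk so that every later chunk starts on a
 * bad-block boundary; block_sectors is a power of two (e.g. 8 when bad
 * blocks are recorded in 4K units with 512-byte sectors).
 */
static sector_t first_chunk(sector_t sector, sector_t block_sectors)
{
	return ((sector + block_sectors) & ~(block_sectors - 1)) - sector;
}

int main(void)
{
	/* A write starting at sector 1003: the first chunk covers
	 * 1003..1007 (5 sectors), after which 8-sector chunks start at
	 * 1008, 1016, ...
	 */
	assert(first_chunk(1003, 8) == 5);
	assert(first_chunk(1008, 8) == 8);	/* already aligned */
	printf("ok\n");
	return 0;
}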
2042static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
2043{
2044 int slot = r10_bio->read_slot;
2045 int mirror = r10_bio->devs[slot].devnum;
2046 struct bio *bio;
2047 conf_t *conf = mddev->private;
2048 mdk_rdev_t *rdev;
2049 char b[BDEVNAME_SIZE];
2050 unsigned long do_sync;
2051 int max_sectors;
2052
 2053 /* We got a read error. Maybe the drive is bad, or maybe just
 2054 * this one block is bad and we can fix it.
 2055 * We freeze all other IO, and try reading the block from
 2056 * other devices. When we find one, we re-write the block
 2057 * and check whether that fixes the read error.
2058 * This is all done synchronously while the array is
2059 * frozen.
2060 */
2061 if (mddev->ro == 0) {
2062 freeze_array(conf);
2063 fix_read_error(conf, mddev, r10_bio);
2064 unfreeze_array(conf);
2065 }
2066 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
2067
2068 bio = r10_bio->devs[slot].bio;
2069 bdevname(bio->bi_bdev, b);
2070 r10_bio->devs[slot].bio =
2071 mddev->ro ? IO_BLOCKED : NULL;
2072read_more:
2073 mirror = read_balance(conf, r10_bio, &max_sectors);
2074 if (mirror == -1) {
2075 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2076 " read error for block %llu\n",
2077 mdname(mddev), b,
2078 (unsigned long long)r10_bio->sector);
2079 raid_end_bio_io(r10_bio);
2080 bio_put(bio);
2081 return;
2082 }
2083
2084 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2085 if (bio)
2086 bio_put(bio);
2087 slot = r10_bio->read_slot;
2088 rdev = conf->mirrors[mirror].rdev;
2089 printk_ratelimited(
2090 KERN_ERR
2091 "md/raid10:%s: %s: redirecting"
2092 "sector %llu to another mirror\n",
2093 mdname(mddev),
2094 bdevname(rdev->bdev, b),
2095 (unsigned long long)r10_bio->sector);
2096 bio = bio_clone_mddev(r10_bio->master_bio,
2097 GFP_NOIO, mddev);
2098 md_trim_bio(bio,
2099 r10_bio->sector - bio->bi_sector,
2100 max_sectors);
2101 r10_bio->devs[slot].bio = bio;
2102 bio->bi_sector = r10_bio->devs[slot].addr
2103 + rdev->data_offset;
2104 bio->bi_bdev = rdev->bdev;
2105 bio->bi_rw = READ | do_sync;
2106 bio->bi_private = r10_bio;
2107 bio->bi_end_io = raid10_end_read_request;
2108 if (max_sectors < r10_bio->sectors) {
2109 /* Drat - have to split this up more */
2110 struct bio *mbio = r10_bio->master_bio;
2111 int sectors_handled =
2112 r10_bio->sector + max_sectors
2113 - mbio->bi_sector;
2114 r10_bio->sectors = max_sectors;
2115 spin_lock_irq(&conf->device_lock);
2116 if (mbio->bi_phys_segments == 0)
2117 mbio->bi_phys_segments = 2;
2118 else
2119 mbio->bi_phys_segments++;
2120 spin_unlock_irq(&conf->device_lock);
2121 generic_make_request(bio);
2122 bio = NULL;
2123
2124 r10_bio = mempool_alloc(conf->r10bio_pool,
2125 GFP_NOIO);
2126 r10_bio->master_bio = mbio;
2127 r10_bio->sectors = (mbio->bi_size >> 9)
2128 - sectors_handled;
2129 r10_bio->state = 0;
2130 set_bit(R10BIO_ReadError,
2131 &r10_bio->state);
2132 r10_bio->mddev = mddev;
2133 r10_bio->sector = mbio->bi_sector
2134 + sectors_handled;
2135
2136 goto read_more;
2137 } else
2138 generic_make_request(bio);
2139}
2140
2141static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
2142{
2143 /* Some sort of write request has finished and it
2144 * succeeded in writing where we thought there was a
2145 * bad block. So forget the bad block.
 2146 * Or possibly it failed and we need to record
2147 * a bad block.
2148 */
2149 int m;
2150 mdk_rdev_t *rdev;
2151
2152 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2153 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2154 for (m = 0; m < conf->copies; m++) {
2155 int dev = r10_bio->devs[m].devnum;
2156 rdev = conf->mirrors[dev].rdev;
2157 if (r10_bio->devs[m].bio == NULL)
2158 continue;
2159 if (test_bit(BIO_UPTODATE,
2160 &r10_bio->devs[m].bio->bi_flags)) {
2161 rdev_clear_badblocks(
2162 rdev,
2163 r10_bio->devs[m].addr,
2164 r10_bio->sectors);
2165 } else {
2166 if (!rdev_set_badblocks(
2167 rdev,
2168 r10_bio->devs[m].addr,
2169 r10_bio->sectors, 0))
2170 md_error(conf->mddev, rdev);
2171 }
2172 }
2173 put_buf(r10_bio);
2174 } else {
2175 for (m = 0; m < conf->copies; m++) {
2176 int dev = r10_bio->devs[m].devnum;
2177 struct bio *bio = r10_bio->devs[m].bio;
2178 rdev = conf->mirrors[dev].rdev;
2179 if (bio == IO_MADE_GOOD) {
2180 rdev_clear_badblocks(
2181 rdev,
2182 r10_bio->devs[m].addr,
2183 r10_bio->sectors);
2184 rdev_dec_pending(rdev, conf->mddev);
2185 } else if (bio != NULL &&
2186 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2187 if (!narrow_write_error(r10_bio, m)) {
2188 md_error(conf->mddev, rdev);
2189 set_bit(R10BIO_Degraded,
2190 &r10_bio->state);
2191 }
2192 rdev_dec_pending(rdev, conf->mddev);
2193 }
2194 }
2195 if (test_bit(R10BIO_WriteError,
2196 &r10_bio->state))
2197 close_write(r10_bio);
2198 raid_end_bio_io(r10_bio);
2199 }
2200}
2201
1601static void raid10d(mddev_t *mddev) 2202static void raid10d(mddev_t *mddev)
1602{ 2203{
1603 r10bio_t *r10_bio; 2204 r10bio_t *r10_bio;
1604 struct bio *bio;
1605 unsigned long flags; 2205 unsigned long flags;
1606 conf_t *conf = mddev->private; 2206 conf_t *conf = mddev->private;
1607 struct list_head *head = &conf->retry_list; 2207 struct list_head *head = &conf->retry_list;
1608 mdk_rdev_t *rdev;
1609 struct blk_plug plug; 2208 struct blk_plug plug;
1610 2209
1611 md_check_recovery(mddev); 2210 md_check_recovery(mddev);
1612 2211
1613 blk_start_plug(&plug); 2212 blk_start_plug(&plug);
1614 for (;;) { 2213 for (;;) {
1615 char b[BDEVNAME_SIZE];
1616 2214
1617 flush_pending_writes(conf); 2215 flush_pending_writes(conf);
1618 2216
@@ -1628,64 +2226,26 @@ static void raid10d(mddev_t *mddev)
1628 2226
1629 mddev = r10_bio->mddev; 2227 mddev = r10_bio->mddev;
1630 conf = mddev->private; 2228 conf = mddev->private;
1631 if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2229 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2230 test_bit(R10BIO_WriteError, &r10_bio->state))
2231 handle_write_completed(conf, r10_bio);
2232 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
1632 sync_request_write(mddev, r10_bio); 2233 sync_request_write(mddev, r10_bio);
1633 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2234 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1634 recovery_request_write(mddev, r10_bio); 2235 recovery_request_write(mddev, r10_bio);
2236 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2237 handle_read_error(mddev, r10_bio);
1635 else { 2238 else {
1636 int slot = r10_bio->read_slot; 2239 /* just a partial read to be scheduled from a
1637 int mirror = r10_bio->devs[slot].devnum; 2240 * separate context
1638 /* we got a read error. Maybe the drive is bad. Maybe just
1639 * the block and we can fix it.
1640 * We freeze all other IO, and try reading the block from
1641 * other devices. When we find one, we re-write
1642 * and check it that fixes the read error.
1643 * This is all done synchronously while the array is
1644 * frozen.
1645 */ 2241 */
1646 if (mddev->ro == 0) { 2242 int slot = r10_bio->read_slot;
1647 freeze_array(conf); 2243 generic_make_request(r10_bio->devs[slot].bio);
1648 fix_read_error(conf, mddev, r10_bio);
1649 unfreeze_array(conf);
1650 }
1651 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1652
1653 bio = r10_bio->devs[slot].bio;
1654 r10_bio->devs[slot].bio =
1655 mddev->ro ? IO_BLOCKED : NULL;
1656 mirror = read_balance(conf, r10_bio);
1657 if (mirror == -1) {
1658 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1659 " read error for block %llu\n",
1660 mdname(mddev),
1661 bdevname(bio->bi_bdev,b),
1662 (unsigned long long)r10_bio->sector);
1663 raid_end_bio_io(r10_bio);
1664 bio_put(bio);
1665 } else {
1666 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1667 bio_put(bio);
1668 slot = r10_bio->read_slot;
1669 rdev = conf->mirrors[mirror].rdev;
1670 if (printk_ratelimit())
1671 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1672 " another mirror\n",
1673 mdname(mddev),
1674 bdevname(rdev->bdev,b),
1675 (unsigned long long)r10_bio->sector);
1676 bio = bio_clone_mddev(r10_bio->master_bio,
1677 GFP_NOIO, mddev);
1678 r10_bio->devs[slot].bio = bio;
1679 bio->bi_sector = r10_bio->devs[slot].addr
1680 + rdev->data_offset;
1681 bio->bi_bdev = rdev->bdev;
1682 bio->bi_rw = READ | do_sync;
1683 bio->bi_private = r10_bio;
1684 bio->bi_end_io = raid10_end_read_request;
1685 generic_make_request(bio);
1686 }
1687 } 2244 }
2245
1688 cond_resched(); 2246 cond_resched();
2247 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2248 md_check_recovery(mddev);
1689 } 2249 }
1690 blk_finish_plug(&plug); 2250 blk_finish_plug(&plug);
1691} 2251}
@@ -1746,7 +2306,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1746 int i; 2306 int i;
1747 int max_sync; 2307 int max_sync;
1748 sector_t sync_blocks; 2308 sector_t sync_blocks;
1749
1750 sector_t sectors_skipped = 0; 2309 sector_t sectors_skipped = 0;
1751 int chunks_skipped = 0; 2310 int chunks_skipped = 0;
1752 2311
@@ -1828,7 +2387,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1828 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 2387 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1829 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2388 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1830 /* recovery... the complicated one */ 2389 /* recovery... the complicated one */
1831 int j, k; 2390 int j;
1832 r10_bio = NULL; 2391 r10_bio = NULL;
1833 2392
1834 for (i=0 ; i<conf->raid_disks; i++) { 2393 for (i=0 ; i<conf->raid_disks; i++) {
@@ -1836,6 +2395,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1836 r10bio_t *rb2; 2395 r10bio_t *rb2;
1837 sector_t sect; 2396 sector_t sect;
1838 int must_sync; 2397 int must_sync;
2398 int any_working;
1839 2399
1840 if (conf->mirrors[i].rdev == NULL || 2400 if (conf->mirrors[i].rdev == NULL ||
1841 test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 2401 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
@@ -1887,19 +2447,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1887 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2447 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1888 &sync_blocks, still_degraded); 2448 &sync_blocks, still_degraded);
1889 2449
2450 any_working = 0;
1890 for (j=0; j<conf->copies;j++) { 2451 for (j=0; j<conf->copies;j++) {
2452 int k;
1891 int d = r10_bio->devs[j].devnum; 2453 int d = r10_bio->devs[j].devnum;
2454 sector_t from_addr, to_addr;
2455 mdk_rdev_t *rdev;
2456 sector_t sector, first_bad;
2457 int bad_sectors;
1892 if (!conf->mirrors[d].rdev || 2458 if (!conf->mirrors[d].rdev ||
1893 !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) 2459 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
1894 continue; 2460 continue;
1895 /* This is where we read from */ 2461 /* This is where we read from */
2462 any_working = 1;
2463 rdev = conf->mirrors[d].rdev;
2464 sector = r10_bio->devs[j].addr;
2465
2466 if (is_badblock(rdev, sector, max_sync,
2467 &first_bad, &bad_sectors)) {
2468 if (first_bad > sector)
2469 max_sync = first_bad - sector;
2470 else {
2471 bad_sectors -= (sector
2472 - first_bad);
2473 if (max_sync > bad_sectors)
2474 max_sync = bad_sectors;
2475 continue;
2476 }
2477 }
1896 bio = r10_bio->devs[0].bio; 2478 bio = r10_bio->devs[0].bio;
1897 bio->bi_next = biolist; 2479 bio->bi_next = biolist;
1898 biolist = bio; 2480 biolist = bio;
1899 bio->bi_private = r10_bio; 2481 bio->bi_private = r10_bio;
1900 bio->bi_end_io = end_sync_read; 2482 bio->bi_end_io = end_sync_read;
1901 bio->bi_rw = READ; 2483 bio->bi_rw = READ;
1902 bio->bi_sector = r10_bio->devs[j].addr + 2484 from_addr = r10_bio->devs[j].addr;
2485 bio->bi_sector = from_addr +
1903 conf->mirrors[d].rdev->data_offset; 2486 conf->mirrors[d].rdev->data_offset;
1904 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2487 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1905 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2488 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
@@ -1916,26 +2499,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1916 bio->bi_private = r10_bio; 2499 bio->bi_private = r10_bio;
1917 bio->bi_end_io = end_sync_write; 2500 bio->bi_end_io = end_sync_write;
1918 bio->bi_rw = WRITE; 2501 bio->bi_rw = WRITE;
1919 bio->bi_sector = r10_bio->devs[k].addr + 2502 to_addr = r10_bio->devs[k].addr;
2503 bio->bi_sector = to_addr +
1920 conf->mirrors[i].rdev->data_offset; 2504 conf->mirrors[i].rdev->data_offset;
1921 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 2505 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1922 2506
1923 r10_bio->devs[0].devnum = d; 2507 r10_bio->devs[0].devnum = d;
2508 r10_bio->devs[0].addr = from_addr;
1924 r10_bio->devs[1].devnum = i; 2509 r10_bio->devs[1].devnum = i;
2510 r10_bio->devs[1].addr = to_addr;
1925 2511
1926 break; 2512 break;
1927 } 2513 }
1928 if (j == conf->copies) { 2514 if (j == conf->copies) {
1929 /* Cannot recover, so abort the recovery */ 2515 /* Cannot recover, so abort the recovery or
2516 * record a bad block */
1930 put_buf(r10_bio); 2517 put_buf(r10_bio);
1931 if (rb2) 2518 if (rb2)
1932 atomic_dec(&rb2->remaining); 2519 atomic_dec(&rb2->remaining);
1933 r10_bio = rb2; 2520 r10_bio = rb2;
1934 if (!test_and_set_bit(MD_RECOVERY_INTR, 2521 if (any_working) {
1935 &mddev->recovery)) 2522 /* problem is that there are bad blocks
1936 printk(KERN_INFO "md/raid10:%s: insufficient " 2523 * on other device(s)
1937 "working devices for recovery.\n", 2524 */
1938 mdname(mddev)); 2525 int k;
2526 for (k = 0; k < conf->copies; k++)
2527 if (r10_bio->devs[k].devnum == i)
2528 break;
2529 if (!rdev_set_badblocks(
2530 conf->mirrors[i].rdev,
2531 r10_bio->devs[k].addr,
2532 max_sync, 0))
2533 any_working = 0;
2534 }
2535 if (!any_working) {
2536 if (!test_and_set_bit(MD_RECOVERY_INTR,
2537 &mddev->recovery))
2538 printk(KERN_INFO "md/raid10:%s: insufficient "
2539 "working devices for recovery.\n",
2540 mdname(mddev));
2541 conf->mirrors[i].recovery_disabled
2542 = mddev->recovery_disabled;
2543 }
1939 break; 2544 break;
1940 } 2545 }
1941 } 2546 }
@@ -1979,12 +2584,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1979 2584
1980 for (i=0; i<conf->copies; i++) { 2585 for (i=0; i<conf->copies; i++) {
1981 int d = r10_bio->devs[i].devnum; 2586 int d = r10_bio->devs[i].devnum;
2587 sector_t first_bad, sector;
2588 int bad_sectors;
2589
1982 bio = r10_bio->devs[i].bio; 2590 bio = r10_bio->devs[i].bio;
1983 bio->bi_end_io = NULL; 2591 bio->bi_end_io = NULL;
1984 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2592 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1985 if (conf->mirrors[d].rdev == NULL || 2593 if (conf->mirrors[d].rdev == NULL ||
1986 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) 2594 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1987 continue; 2595 continue;
2596 sector = r10_bio->devs[i].addr;
2597 if (is_badblock(conf->mirrors[d].rdev,
2598 sector, max_sync,
2599 &first_bad, &bad_sectors)) {
2600 if (first_bad > sector)
2601 max_sync = first_bad - sector;
2602 else {
2603 bad_sectors -= (sector - first_bad);
2604 if (max_sync > bad_sectors)
 2605 max_sync = bad_sectors;
2606 continue;
2607 }
2608 }
1988 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2609 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1989 atomic_inc(&r10_bio->remaining); 2610 atomic_inc(&r10_bio->remaining);
1990 bio->bi_next = biolist; 2611 bio->bi_next = biolist;
@@ -1992,7 +2613,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1992 bio->bi_private = r10_bio; 2613 bio->bi_private = r10_bio;
1993 bio->bi_end_io = end_sync_read; 2614 bio->bi_end_io = end_sync_read;
1994 bio->bi_rw = READ; 2615 bio->bi_rw = READ;
1995 bio->bi_sector = r10_bio->devs[i].addr + 2616 bio->bi_sector = sector +
1996 conf->mirrors[d].rdev->data_offset; 2617 conf->mirrors[d].rdev->data_offset;
1997 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2618 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1998 count++; 2619 count++;
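Both the recovery and resync paths in this hunk clamp max_sync against the range reported by is_badblock(): if the bad range starts beyond the current sector, the request is shortened to stop just before it; if the current sector already lies inside the bad range, this copy is skipped and max_sync is capped at the remaining bad sectors. A compact userspace model of that clamping, with made-up sector numbers for illustration only:

#include <assert.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/* Returns the clamped max_sync and sets *skip when the current sector
 * already lies inside the reported bad range.
 */
static sector_t clamp_max_sync(sector_t sector, sector_t max_sync,
			       sector_t first_bad, sector_t bad_sectors,
			       int *skip)
{
	*skip = 0;
	if (first_bad > sector)
		return first_bad - sector;	/* stop before the bad range */
	/* sector is inside the bad range: skip this device for now */
	bad_sectors -= sector - first_bad;
	*skip = 1;
	return max_sync > bad_sectors ? bad_sectors : max_sync;
}

int main(void)
{
	int skip;

	/* bad range [120, 136) seen while syncing from sector 100 */
	assert(clamp_max_sync(100, 64, 120, 16, &skip) == 20 && !skip);
	/* already inside bad range [96, 112): skip, 12 bad sectors remain */
	assert(clamp_max_sync(100, 64, 96, 16, &skip) == 12 && skip);
	printf("ok\n");
	return 0;
}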
@@ -2079,7 +2700,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
2079 return sectors_skipped + nr_sectors; 2700 return sectors_skipped + nr_sectors;
2080 giveup: 2701 giveup:
2081 /* There is nowhere to write, so all non-sync 2702 /* There is nowhere to write, so all non-sync
2082 * drives must be failed, so try the next chunk... 2703 * drives must be failed or in resync, or all drives
 2704 * have a bad block, so try the next chunk...
2083 */ 2705 */
2084 if (sector_nr + max_sync < max_sector) 2706 if (sector_nr + max_sync < max_sector)
2085 max_sector = sector_nr + max_sync; 2707 max_sector = sector_nr + max_sync;
@@ -2249,6 +2871,7 @@ static int run(mddev_t *mddev)
2249 (conf->raid_disks / conf->near_copies)); 2871 (conf->raid_disks / conf->near_copies));
2250 2872
2251 list_for_each_entry(rdev, &mddev->disks, same_set) { 2873 list_for_each_entry(rdev, &mddev->disks, same_set) {
2874
2252 disk_idx = rdev->raid_disk; 2875 disk_idx = rdev->raid_disk;
2253 if (disk_idx >= conf->raid_disks 2876 if (disk_idx >= conf->raid_disks
2254 || disk_idx < 0) 2877 || disk_idx < 0)
@@ -2271,7 +2894,7 @@ static int run(mddev_t *mddev)
2271 disk->head_position = 0; 2894 disk->head_position = 0;
2272 } 2895 }
2273 /* need to check that every block has at least one working mirror */ 2896 /* need to check that every block has at least one working mirror */
2274 if (!enough(conf)) { 2897 if (!enough(conf, -1)) {
2275 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 2898 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2276 mdname(mddev)); 2899 mdname(mddev));
2277 goto out_free_conf; 2900 goto out_free_conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 944b1104d3b4..79cb52a0d4a2 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t;
6struct mirror_info { 6struct mirror_info {
7 mdk_rdev_t *rdev; 7 mdk_rdev_t *rdev;
8 sector_t head_position; 8 sector_t head_position;
9 int recovery_disabled; /* matches
10 * mddev->recovery_disabled
11 * when we shouldn't try
12 * recovering this device.
13 */
9}; 14};
10 15
11typedef struct r10bio_s r10bio_t; 16typedef struct r10bio_s r10bio_t;
@@ -113,10 +118,26 @@ struct r10bio_s {
113 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 118 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
114 */ 119 */
115#define IO_BLOCKED ((struct bio*)1) 120#define IO_BLOCKED ((struct bio*)1)
121/* When we successfully write to a known bad-block, we need to remove the
122 * bad-block marking which must be done from process context. So we record
123 * the success by setting devs[n].bio to IO_MADE_GOOD
124 */
125#define IO_MADE_GOOD ((struct bio *)2)
126
127#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
116 128
117/* bits for r10bio.state */ 129/* bits for r10bio.state */
118#define R10BIO_Uptodate 0 130#define R10BIO_Uptodate 0
119#define R10BIO_IsSync 1 131#define R10BIO_IsSync 1
120#define R10BIO_IsRecover 2 132#define R10BIO_IsRecover 2
121#define R10BIO_Degraded 3 133#define R10BIO_Degraded 3
134/* Set ReadError on bios that experience a read error
135 * so that raid10d knows what to do with them.
136 */
137#define R10BIO_ReadError 4
138/* If a write for this request means we can clear some
139 * known-bad-block records, we set this flag.
140 */
141#define R10BIO_MadeGood 5
142#define R10BIO_WriteError 6
122#endif 143#endif
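The header stores two out-of-band states directly in a bio pointer slot, IO_BLOCKED (1) and IO_MADE_GOOD (2), and BIO_SPECIAL() guards any code that might otherwise dereference them (NULL counts as special too). A tiny standalone sketch of this sentinel-pointer idiom, with simplified types rather than the kernel definitions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct bio;	/* opaque here: only the pointer value matters */

#define IO_BLOCKED	((struct bio *)1)
#define IO_MADE_GOOD	((struct bio *)2)
#define BIO_SPECIAL(bio)	((unsigned long)(bio) <= 2)

static const char *classify(struct bio *bio)
{
	if (bio == NULL)
		return "no request outstanding";
	if (bio == IO_BLOCKED)
		return "reads from this device are blocked";
	if (bio == IO_MADE_GOOD)
		return "write over a known bad block succeeded";
	return "a real bio, safe to dereference";
}

int main(void)
{
	struct bio *real = (struct bio *)(uintptr_t)0x1000;	/* stand-in */

	assert(BIO_SPECIAL(NULL));
	assert(BIO_SPECIAL(IO_BLOCKED) && BIO_SPECIAL(IO_MADE_GOOD));
	assert(!BIO_SPECIAL(real));
	printf("%s\n", classify(IO_MADE_GOOD));
	return 0;
}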
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b72edf35ec54..dbae459fb02d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -51,6 +51,7 @@
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h> 52#include <linux/cpu.h>
53#include <linux/slab.h> 53#include <linux/slab.h>
54#include <linux/ratelimit.h>
54#include "md.h" 55#include "md.h"
55#include "raid5.h" 56#include "raid5.h"
56#include "raid0.h" 57#include "raid0.h"
@@ -96,8 +97,6 @@
96#define __inline__ 97#define __inline__
97#endif 98#endif
98 99
99#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
100
101/* 100/*
102 * We maintain a biased count of active stripes in the bottom 16 bits of 101 * We maintain a biased count of active stripes in the bottom 16 bits of
103 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
341 (unsigned long long)sh->sector, i, dev->toread, 340 (unsigned long long)sh->sector, i, dev->toread,
342 dev->read, dev->towrite, dev->written, 341 dev->read, dev->towrite, dev->written,
343 test_bit(R5_LOCKED, &dev->flags)); 342 test_bit(R5_LOCKED, &dev->flags));
344 BUG(); 343 WARN_ON(1);
345 } 344 }
346 dev->flags = 0; 345 dev->flags = 0;
347 raid5_build_block(sh, i, previous); 346 raid5_build_block(sh, i, previous);
@@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
527 atomic_inc(&rdev->nr_pending); 526 atomic_inc(&rdev->nr_pending);
528 rcu_read_unlock(); 527 rcu_read_unlock();
529 528
 529 /* We have already checked bad blocks for reads. Now we
530 * need to check for writes.
531 */
532 while ((rw & WRITE) && rdev &&
533 test_bit(WriteErrorSeen, &rdev->flags)) {
534 sector_t first_bad;
535 int bad_sectors;
536 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
537 &first_bad, &bad_sectors);
538 if (!bad)
539 break;
540
541 if (bad < 0) {
542 set_bit(BlockedBadBlocks, &rdev->flags);
543 if (!conf->mddev->external &&
544 conf->mddev->flags) {
545 /* It is very unlikely, but we might
546 * still need to write out the
547 * bad block log - better give it
 548 * a chance */
549 md_check_recovery(conf->mddev);
550 }
551 md_wait_for_blocked_rdev(rdev, conf->mddev);
552 } else {
553 /* Acknowledged bad block - skip the write */
554 rdev_dec_pending(rdev, conf->mddev);
555 rdev = NULL;
556 }
557 }
558
530 if (rdev) { 559 if (rdev) {
531 if (s->syncing || s->expanding || s->expanded) 560 if (s->syncing || s->expanding || s->expanded)
532 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 561 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
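The write-path loop added above distinguishes three is_badblock() outcomes: no overlap (issue the write), an overlap with only acknowledged bad blocks (skip the write and drop the rdev reference), and an overlap containing an unacknowledged bad block (set BlockedBadBlocks and wait for the bad-block log to be written out before retrying). A small userspace model of that decision, with invented enum names purely for readability:

#include <stdio.h>

/* Models the return convention of is_badblock(): 0 = no overlap,
 * >0 = overlaps only acknowledged bad blocks, <0 = overlaps at least one
 * unacknowledged bad block.
 */
enum bb_overlap { BB_NONE = 0, BB_ACKED = 1, BB_UNACKED = -1 };

enum write_action { WRITE_ISSUE, WRITE_SKIP, WRITE_WAIT_FOR_LOG };

static enum write_action decide_write(enum bb_overlap bad)
{
	if (bad == BB_NONE)
		return WRITE_ISSUE;		/* nothing recorded here */
	if (bad < 0)
		return WRITE_WAIT_FOR_LOG;	/* block until log is safe */
	return WRITE_SKIP;			/* acknowledged: don't write */
}

int main(void)
{
	printf("no overlap  -> %d (issue)\n", decide_write(BB_NONE));
	printf("acked bad   -> %d (skip)\n", decide_write(BB_ACKED));
	printf("unacked bad -> %d (wait)\n", decide_write(BB_UNACKED));
	return 0;
}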
@@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
548 bi->bi_io_vec[0].bv_offset = 0; 577 bi->bi_io_vec[0].bv_offset = 0;
549 bi->bi_size = STRIPE_SIZE; 578 bi->bi_size = STRIPE_SIZE;
550 bi->bi_next = NULL; 579 bi->bi_next = NULL;
551 if ((rw & WRITE) &&
552 test_bit(R5_ReWrite, &sh->dev[i].flags))
553 atomic_add(STRIPE_SECTORS,
554 &rdev->corrected_errors);
555 generic_make_request(bi); 580 generic_make_request(bi);
556 } else { 581 } else {
557 if (rw & WRITE) 582 if (rw & WRITE)
@@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1020 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1045 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1021 struct bio *wbi; 1046 struct bio *wbi;
1022 1047
1023 spin_lock(&sh->lock); 1048 spin_lock_irq(&sh->raid_conf->device_lock);
1024 chosen = dev->towrite; 1049 chosen = dev->towrite;
1025 dev->towrite = NULL; 1050 dev->towrite = NULL;
1026 BUG_ON(dev->written); 1051 BUG_ON(dev->written);
1027 wbi = dev->written = chosen; 1052 wbi = dev->written = chosen;
1028 spin_unlock(&sh->lock); 1053 spin_unlock_irq(&sh->raid_conf->device_lock);
1029 1054
1030 while (wbi && wbi->bi_sector < 1055 while (wbi && wbi->bi_sector <
1031 dev->sector + STRIPE_SECTORS) { 1056 dev->sector + STRIPE_SECTORS) {
@@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1315static int grow_one_stripe(raid5_conf_t *conf) 1340static int grow_one_stripe(raid5_conf_t *conf)
1316{ 1341{
1317 struct stripe_head *sh; 1342 struct stripe_head *sh;
1318 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 1343 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1319 if (!sh) 1344 if (!sh)
1320 return 0; 1345 return 0;
1321 memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); 1346
1322 sh->raid_conf = conf; 1347 sh->raid_conf = conf;
1323 spin_lock_init(&sh->lock);
1324 #ifdef CONFIG_MULTICORE_RAID456 1348 #ifdef CONFIG_MULTICORE_RAID456
1325 init_waitqueue_head(&sh->ops.wait_for_ops); 1349 init_waitqueue_head(&sh->ops.wait_for_ops);
1326 #endif 1350 #endif
@@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1435 return -ENOMEM; 1459 return -ENOMEM;
1436 1460
1437 for (i = conf->max_nr_stripes; i; i--) { 1461 for (i = conf->max_nr_stripes; i; i--) {
1438 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1462 nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1439 if (!nsh) 1463 if (!nsh)
1440 break; 1464 break;
1441 1465
1442 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1443
1444 nsh->raid_conf = conf; 1466 nsh->raid_conf = conf;
1445 spin_lock_init(&nsh->lock);
1446 #ifdef CONFIG_MULTICORE_RAID456 1467 #ifdef CONFIG_MULTICORE_RAID456
1447 init_waitqueue_head(&nsh->ops.wait_for_ops); 1468 init_waitqueue_head(&nsh->ops.wait_for_ops);
1448 #endif 1469 #endif
@@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
1587 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1608 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1588 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1609 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1589 rdev = conf->disks[i].rdev; 1610 rdev = conf->disks[i].rdev;
1590 printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1611 printk_ratelimited(
1591 " (%lu sectors at %llu on %s)\n", 1612 KERN_INFO
1592 mdname(conf->mddev), STRIPE_SECTORS, 1613 "md/raid:%s: read error corrected"
1593 (unsigned long long)(sh->sector 1614 " (%lu sectors at %llu on %s)\n",
1594 + rdev->data_offset), 1615 mdname(conf->mddev), STRIPE_SECTORS,
1595 bdevname(rdev->bdev, b)); 1616 (unsigned long long)(sh->sector
1617 + rdev->data_offset),
1618 bdevname(rdev->bdev, b));
1619 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1596 clear_bit(R5_ReadError, &sh->dev[i].flags); 1620 clear_bit(R5_ReadError, &sh->dev[i].flags);
1597 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1621 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1598 } 1622 }
@@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
1606 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1630 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1607 atomic_inc(&rdev->read_errors); 1631 atomic_inc(&rdev->read_errors);
1608 if (conf->mddev->degraded >= conf->max_degraded) 1632 if (conf->mddev->degraded >= conf->max_degraded)
1609 printk_rl(KERN_WARNING 1633 printk_ratelimited(
1610 "md/raid:%s: read error not correctable " 1634 KERN_WARNING
1611 "(sector %llu on %s).\n", 1635 "md/raid:%s: read error not correctable "
1612 mdname(conf->mddev), 1636 "(sector %llu on %s).\n",
1613 (unsigned long long)(sh->sector 1637 mdname(conf->mddev),
1614 + rdev->data_offset), 1638 (unsigned long long)(sh->sector
1615 bdn); 1639 + rdev->data_offset),
1640 bdn);
1616 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1641 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1617 /* Oh, no!!! */ 1642 /* Oh, no!!! */
1618 printk_rl(KERN_WARNING 1643 printk_ratelimited(
1619 "md/raid:%s: read error NOT corrected!! " 1644 KERN_WARNING
1620 "(sector %llu on %s).\n", 1645 "md/raid:%s: read error NOT corrected!! "
1621 mdname(conf->mddev), 1646 "(sector %llu on %s).\n",
1622 (unsigned long long)(sh->sector 1647 mdname(conf->mddev),
1623 + rdev->data_offset), 1648 (unsigned long long)(sh->sector
1624 bdn); 1649 + rdev->data_offset),
1650 bdn);
1625 else if (atomic_read(&rdev->read_errors) 1651 else if (atomic_read(&rdev->read_errors)
1626 > conf->max_nr_stripes) 1652 > conf->max_nr_stripes)
1627 printk(KERN_WARNING 1653 printk(KERN_WARNING
@@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
1649 raid5_conf_t *conf = sh->raid_conf; 1675 raid5_conf_t *conf = sh->raid_conf;
1650 int disks = sh->disks, i; 1676 int disks = sh->disks, i;
1651 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1677 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1678 sector_t first_bad;
1679 int bad_sectors;
1652 1680
1653 for (i=0 ; i<disks; i++) 1681 for (i=0 ; i<disks; i++)
1654 if (bi == &sh->dev[i].req) 1682 if (bi == &sh->dev[i].req)
@@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error)
1662 return; 1690 return;
1663 } 1691 }
1664 1692
1665 if (!uptodate) 1693 if (!uptodate) {
1666 md_error(conf->mddev, conf->disks[i].rdev); 1694 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
1695 set_bit(R5_WriteError, &sh->dev[i].flags);
1696 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
1697 &first_bad, &bad_sectors))
1698 set_bit(R5_MadeGood, &sh->dev[i].flags);
1667 1699
1668 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1700 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1669 1701
@@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1710 */ 1742 */
1711 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1743 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1712 } 1744 }
1745 set_bit(Blocked, &rdev->flags);
1713 set_bit(Faulty, &rdev->flags); 1746 set_bit(Faulty, &rdev->flags);
1714 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1747 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1715 printk(KERN_ALERT 1748 printk(KERN_ALERT
@@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1760 /* 1793 /*
1761 * Select the parity disk based on the user selected algorithm. 1794 * Select the parity disk based on the user selected algorithm.
1762 */ 1795 */
1763 pd_idx = qd_idx = ~0; 1796 pd_idx = qd_idx = -1;
1764 switch(conf->level) { 1797 switch(conf->level) {
1765 case 4: 1798 case 4:
1766 pd_idx = data_disks; 1799 pd_idx = data_disks;
@@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2143 raid5_conf_t *conf = sh->raid_conf; 2176 raid5_conf_t *conf = sh->raid_conf;
2144 int firstwrite=0; 2177 int firstwrite=0;
2145 2178
2146 pr_debug("adding bh b#%llu to stripe s#%llu\n", 2179 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2147 (unsigned long long)bi->bi_sector, 2180 (unsigned long long)bi->bi_sector,
2148 (unsigned long long)sh->sector); 2181 (unsigned long long)sh->sector);
2149 2182
2150 2183
2151 spin_lock(&sh->lock);
2152 spin_lock_irq(&conf->device_lock); 2184 spin_lock_irq(&conf->device_lock);
2153 if (forwrite) { 2185 if (forwrite) {
2154 bip = &sh->dev[dd_idx].towrite; 2186 bip = &sh->dev[dd_idx].towrite;
@@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2169 bi->bi_next = *bip; 2201 bi->bi_next = *bip;
2170 *bip = bi; 2202 *bip = bi;
2171 bi->bi_phys_segments++; 2203 bi->bi_phys_segments++;
2172 spin_unlock_irq(&conf->device_lock);
2173 spin_unlock(&sh->lock);
2174
2175 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2176 (unsigned long long)bi->bi_sector,
2177 (unsigned long long)sh->sector, dd_idx);
2178
2179 if (conf->mddev->bitmap && firstwrite) {
2180 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2181 STRIPE_SECTORS, 0);
2182 sh->bm_seq = conf->seq_flush+1;
2183 set_bit(STRIPE_BIT_DELAY, &sh->state);
2184 }
2185 2204
2186 if (forwrite) { 2205 if (forwrite) {
2187 /* check if page is covered */ 2206 /* check if page is covered */
@@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2196 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2215 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2197 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2216 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2198 } 2217 }
2218 spin_unlock_irq(&conf->device_lock);
2219
2220 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2221 (unsigned long long)(*bip)->bi_sector,
2222 (unsigned long long)sh->sector, dd_idx);
2223
2224 if (conf->mddev->bitmap && firstwrite) {
2225 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2226 STRIPE_SECTORS, 0);
2227 sh->bm_seq = conf->seq_flush+1;
2228 set_bit(STRIPE_BIT_DELAY, &sh->state);
2229 }
2199 return 1; 2230 return 1;
2200 2231
2201 overlap: 2232 overlap:
2202 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2233 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2203 spin_unlock_irq(&conf->device_lock); 2234 spin_unlock_irq(&conf->device_lock);
2204 spin_unlock(&sh->lock);
2205 return 0; 2235 return 0;
2206} 2236}
2207 2237
@@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2238 rcu_read_lock(); 2268 rcu_read_lock();
2239 rdev = rcu_dereference(conf->disks[i].rdev); 2269 rdev = rcu_dereference(conf->disks[i].rdev);
2240 if (rdev && test_bit(In_sync, &rdev->flags)) 2270 if (rdev && test_bit(In_sync, &rdev->flags))
2241 /* multiple read failures in one stripe */ 2271 atomic_inc(&rdev->nr_pending);
2242 md_error(conf->mddev, rdev); 2272 else
2273 rdev = NULL;
2243 rcu_read_unlock(); 2274 rcu_read_unlock();
2275 if (rdev) {
2276 if (!rdev_set_badblocks(
2277 rdev,
2278 sh->sector,
2279 STRIPE_SECTORS, 0))
2280 md_error(conf->mddev, rdev);
2281 rdev_dec_pending(rdev, conf->mddev);
2282 }
2244 } 2283 }
2245 spin_lock_irq(&conf->device_lock); 2284 spin_lock_irq(&conf->device_lock);
2246 /* fail all writes first */ 2285 /* fail all writes first */
@@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2308 if (bitmap_end) 2347 if (bitmap_end)
2309 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2348 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2310 STRIPE_SECTORS, 0, 0); 2349 STRIPE_SECTORS, 0, 0);
2350 /* If we were in the middle of a write the parity block might
2351 * still be locked - so just clear all R5_LOCKED flags
2352 */
2353 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2311 } 2354 }
2312 2355
2313 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2356 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2315 md_wakeup_thread(conf->mddev->thread); 2358 md_wakeup_thread(conf->mddev->thread);
2316} 2359}
2317 2360
2318/* fetch_block5 - checks the given member device to see if its data needs 2361static void
2319 * to be read or computed to satisfy a request. 2362handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
2320 * 2363 struct stripe_head_state *s)
2321 * Returns 1 when no more member devices need to be checked, otherwise returns
2322 * 0 to tell the loop in handle_stripe_fill5 to continue
2323 */
2324static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2325 int disk_idx, int disks)
2326{
2327 struct r5dev *dev = &sh->dev[disk_idx];
2328 struct r5dev *failed_dev = &sh->dev[s->failed_num];
2329
2330 /* is the data in this block needed, and can we get it? */
2331 if (!test_bit(R5_LOCKED, &dev->flags) &&
2332 !test_bit(R5_UPTODATE, &dev->flags) &&
2333 (dev->toread ||
2334 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2335 s->syncing || s->expanding ||
2336 (s->failed &&
2337 (failed_dev->toread ||
2338 (failed_dev->towrite &&
2339 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
2340 /* We would like to get this block, possibly by computing it,
2341 * otherwise read it if the backing disk is insync
2342 */
2343 if ((s->uptodate == disks - 1) &&
2344 (s->failed && disk_idx == s->failed_num)) {
2345 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2346 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2347 set_bit(R5_Wantcompute, &dev->flags);
2348 sh->ops.target = disk_idx;
2349 sh->ops.target2 = -1;
2350 s->req_compute = 1;
2351 /* Careful: from this point on 'uptodate' is in the eye
2352 * of raid_run_ops which services 'compute' operations
2353 * before writes. R5_Wantcompute flags a block that will
2354 * be R5_UPTODATE by the time it is needed for a
2355 * subsequent operation.
2356 */
2357 s->uptodate++;
2358 return 1; /* uptodate + compute == disks */
2359 } else if (test_bit(R5_Insync, &dev->flags)) {
2360 set_bit(R5_LOCKED, &dev->flags);
2361 set_bit(R5_Wantread, &dev->flags);
2362 s->locked++;
2363 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2364 s->syncing);
2365 }
2366 }
2367
2368 return 0;
2369}
2370
2371/**
2372 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
2373 */
2374static void handle_stripe_fill5(struct stripe_head *sh,
2375 struct stripe_head_state *s, int disks)
2376{ 2364{
2365 int abort = 0;
2377 int i; 2366 int i;
2378 2367
2379 /* look for blocks to read/compute, skip this if a compute 2368 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2380 * is already in flight, or if the stripe contents are in the 2369 clear_bit(STRIPE_SYNCING, &sh->state);
2381 * midst of changing due to a write 2370 s->syncing = 0;
2371 /* There is nothing more to do for sync/check/repair.
2372 * For recover we need to record a bad block on all
2373 * non-sync devices, or abort the recovery
2382 */ 2374 */
2383 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2375 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2384 !sh->reconstruct_state) 2376 return;
2385 for (i = disks; i--; ) 2377 /* During recovery devices cannot be removed, so locking and
2386 if (fetch_block5(sh, s, i, disks)) 2378 * refcounting of rdevs is not needed
2387 break; 2379 */
2388 set_bit(STRIPE_HANDLE, &sh->state); 2380 for (i = 0; i < conf->raid_disks; i++) {
2381 mdk_rdev_t *rdev = conf->disks[i].rdev;
2382 if (!rdev
2383 || test_bit(Faulty, &rdev->flags)
2384 || test_bit(In_sync, &rdev->flags))
2385 continue;
2386 if (!rdev_set_badblocks(rdev, sh->sector,
2387 STRIPE_SECTORS, 0))
2388 abort = 1;
2389 }
2390 if (abort) {
2391 conf->recovery_disabled = conf->mddev->recovery_disabled;
2392 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2393 }
2389} 2394}
2390 2395
2391/* fetch_block6 - checks the given member device to see if its data needs 2396/* fetch_block - checks the given member device to see if its data needs
2392 * to be read or computed to satisfy a request. 2397 * to be read or computed to satisfy a request.
2393 * 2398 *
2394 * Returns 1 when no more member devices need to be checked, otherwise returns 2399 * Returns 1 when no more member devices need to be checked, otherwise returns
2395 * 0 to tell the loop in handle_stripe_fill6 to continue 2400 * 0 to tell the loop in handle_stripe_fill to continue
2396 */ 2401 */
2397static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, 2402static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2398 struct r6_state *r6s, int disk_idx, int disks) 2403 int disk_idx, int disks)
2399{ 2404{
2400 struct r5dev *dev = &sh->dev[disk_idx]; 2405 struct r5dev *dev = &sh->dev[disk_idx];
2401 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], 2406 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2402 &sh->dev[r6s->failed_num[1]] }; 2407 &sh->dev[s->failed_num[1]] };
2403 2408
2409 /* is the data in this block needed, and can we get it? */
2404 if (!test_bit(R5_LOCKED, &dev->flags) && 2410 if (!test_bit(R5_LOCKED, &dev->flags) &&
2405 !test_bit(R5_UPTODATE, &dev->flags) && 2411 !test_bit(R5_UPTODATE, &dev->flags) &&
2406 (dev->toread || 2412 (dev->toread ||
2407 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2413 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2408 s->syncing || s->expanding || 2414 s->syncing || s->expanding ||
2409 (s->failed >= 1 && 2415 (s->failed >= 1 && fdev[0]->toread) ||
2410 (fdev[0]->toread || s->to_write)) || 2416 (s->failed >= 2 && fdev[1]->toread) ||
2411 (s->failed >= 2 && 2417 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2412 (fdev[1]->toread || s->to_write)))) { 2418 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2419 (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2413 /* we would like to get this block, possibly by computing it, 2420 /* we would like to get this block, possibly by computing it,
2414 * otherwise read it if the backing disk is insync 2421 * otherwise read it if the backing disk is insync
2415 */ 2422 */
2416 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2423 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2417 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2424 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2418 if ((s->uptodate == disks - 1) && 2425 if ((s->uptodate == disks - 1) &&
2419 (s->failed && (disk_idx == r6s->failed_num[0] || 2426 (s->failed && (disk_idx == s->failed_num[0] ||
2420 disk_idx == r6s->failed_num[1]))) { 2427 disk_idx == s->failed_num[1]))) {
2421 /* have disk failed, and we're requested to fetch it; 2428 /* have disk failed, and we're requested to fetch it;
2422 * do compute it 2429 * do compute it
2423 */ 2430 */
@@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2429 sh->ops.target = disk_idx; 2436 sh->ops.target = disk_idx;
2430 sh->ops.target2 = -1; /* no 2nd target */ 2437 sh->ops.target2 = -1; /* no 2nd target */
2431 s->req_compute = 1; 2438 s->req_compute = 1;
2439 /* Careful: from this point on 'uptodate' is in the eye
2440 * of raid_run_ops which services 'compute' operations
2441 * before writes. R5_Wantcompute flags a block that will
2442 * be R5_UPTODATE by the time it is needed for a
2443 * subsequent operation.
2444 */
2432 s->uptodate++; 2445 s->uptodate++;
2433 return 1; 2446 return 1;
2434 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2447 } else if (s->uptodate == disks-2 && s->failed >= 2) {
@@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2469} 2482}
2470 2483
2471/** 2484/**
2472 * handle_stripe_fill6 - read or compute data to satisfy pending requests. 2485 * handle_stripe_fill - read or compute data to satisfy pending requests.
2473 */ 2486 */
2474static void handle_stripe_fill6(struct stripe_head *sh, 2487static void handle_stripe_fill(struct stripe_head *sh,
2475 struct stripe_head_state *s, struct r6_state *r6s, 2488 struct stripe_head_state *s,
2476 int disks) 2489 int disks)
2477{ 2490{
2478 int i; 2491 int i;
2479 2492
@@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh,
2484 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2497 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2485 !sh->reconstruct_state) 2498 !sh->reconstruct_state)
2486 for (i = disks; i--; ) 2499 for (i = disks; i--; )
2487 if (fetch_block6(sh, s, r6s, i, disks)) 2500 if (fetch_block(sh, s, i, disks))
2488 break; 2501 break;
2489 set_bit(STRIPE_HANDLE, &sh->state); 2502 set_bit(STRIPE_HANDLE, &sh->state);
2490} 2503}
@@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
2540 md_wakeup_thread(conf->mddev->thread); 2553 md_wakeup_thread(conf->mddev->thread);
2541} 2554}
2542 2555
2543static void handle_stripe_dirtying5(raid5_conf_t *conf, 2556static void handle_stripe_dirtying(raid5_conf_t *conf,
2544 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2557 struct stripe_head *sh,
2558 struct stripe_head_state *s,
2559 int disks)
2545{ 2560{
2546 int rmw = 0, rcw = 0, i; 2561 int rmw = 0, rcw = 0, i;
2547 for (i = disks; i--; ) { 2562 if (conf->max_degraded == 2) {
2563 /* RAID6 requires 'rcw' in current implementation
 2564 * Calculate the real rcw later - for now make it
 2565 * look like rcw is cheaper
2566 */
2567 rcw = 1; rmw = 2;
2568 } else for (i = disks; i--; ) {
2548 /* would I have to read this buffer for read_modify_write */ 2569 /* would I have to read this buffer for read_modify_write */
2549 struct r5dev *dev = &sh->dev[i]; 2570 struct r5dev *dev = &sh->dev[i];
2550 if ((dev->towrite || i == sh->pd_idx) && 2571 if ((dev->towrite || i == sh->pd_idx) &&
@@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2591 } 2612 }
2592 } 2613 }
2593 } 2614 }
2594 if (rcw <= rmw && rcw > 0) 2615 if (rcw <= rmw && rcw > 0) {
2595 /* want reconstruct write, but need to get some data */ 2616 /* want reconstruct write, but need to get some data */
2617 rcw = 0;
2596 for (i = disks; i--; ) { 2618 for (i = disks; i--; ) {
2597 struct r5dev *dev = &sh->dev[i]; 2619 struct r5dev *dev = &sh->dev[i];
2598 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2620 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2599 i != sh->pd_idx && 2621 i != sh->pd_idx && i != sh->qd_idx &&
2600 !test_bit(R5_LOCKED, &dev->flags) && 2622 !test_bit(R5_LOCKED, &dev->flags) &&
2601 !(test_bit(R5_UPTODATE, &dev->flags) || 2623 !(test_bit(R5_UPTODATE, &dev->flags) ||
2602 test_bit(R5_Wantcompute, &dev->flags)) && 2624 test_bit(R5_Wantcompute, &dev->flags))) {
2603 test_bit(R5_Insync, &dev->flags)) { 2625 rcw++;
2626 if (!test_bit(R5_Insync, &dev->flags))
2627 continue; /* it's a failed drive */
2604 if ( 2628 if (
2605 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2629 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2606 pr_debug("Read_old block " 2630 pr_debug("Read_old block "
@@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2614 } 2638 }
2615 } 2639 }
2616 } 2640 }
2641 }
2617 /* now if nothing is locked, and if we have enough data, 2642 /* now if nothing is locked, and if we have enough data,
2618 * we can start a write request 2643 * we can start a write request
2619 */ 2644 */
@@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2630 schedule_reconstruction(sh, s, rcw == 0, 0); 2655 schedule_reconstruction(sh, s, rcw == 0, 0);
2631} 2656}
2632 2657
2633static void handle_stripe_dirtying6(raid5_conf_t *conf,
2634 struct stripe_head *sh, struct stripe_head_state *s,
2635 struct r6_state *r6s, int disks)
2636{
2637 int rcw = 0, pd_idx = sh->pd_idx, i;
2638 int qd_idx = sh->qd_idx;
2639
2640 set_bit(STRIPE_HANDLE, &sh->state);
2641 for (i = disks; i--; ) {
2642 struct r5dev *dev = &sh->dev[i];
2643 /* check if we haven't enough data */
2644 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2645 i != pd_idx && i != qd_idx &&
2646 !test_bit(R5_LOCKED, &dev->flags) &&
2647 !(test_bit(R5_UPTODATE, &dev->flags) ||
2648 test_bit(R5_Wantcompute, &dev->flags))) {
2649 rcw++;
2650 if (!test_bit(R5_Insync, &dev->flags))
2651 continue; /* it's a failed drive */
2652
2653 if (
2654 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2655 pr_debug("Read_old stripe %llu "
2656 "block %d for Reconstruct\n",
2657 (unsigned long long)sh->sector, i);
2658 set_bit(R5_LOCKED, &dev->flags);
2659 set_bit(R5_Wantread, &dev->flags);
2660 s->locked++;
2661 } else {
2662 pr_debug("Request delayed stripe %llu "
2663 "block %d for Reconstruct\n",
2664 (unsigned long long)sh->sector, i);
2665 set_bit(STRIPE_DELAYED, &sh->state);
2666 set_bit(STRIPE_HANDLE, &sh->state);
2667 }
2668 }
2669 }
2670 /* now if nothing is locked, and if we have enough data, we can start a
2671 * write request
2672 */
2673 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2674 s->locked == 0 && rcw == 0 &&
2675 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2676 schedule_reconstruction(sh, s, 1, 0);
2677 }
2678}
2679
2680static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2658static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2681 struct stripe_head_state *s, int disks) 2659 struct stripe_head_state *s, int disks)
2682{ 2660{
@@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2695 s->uptodate--; 2673 s->uptodate--;
2696 break; 2674 break;
2697 } 2675 }
2698 dev = &sh->dev[s->failed_num]; 2676 dev = &sh->dev[s->failed_num[0]];
2699 /* fall through */ 2677 /* fall through */
2700 case check_state_compute_result: 2678 case check_state_compute_result:
2701 sh->check_state = check_state_idle; 2679 sh->check_state = check_state_idle;
@@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2767 2745
2768static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2746static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2769 struct stripe_head_state *s, 2747 struct stripe_head_state *s,
2770 struct r6_state *r6s, int disks) 2748 int disks)
2771{ 2749{
2772 int pd_idx = sh->pd_idx; 2750 int pd_idx = sh->pd_idx;
2773 int qd_idx = sh->qd_idx; 2751 int qd_idx = sh->qd_idx;
@@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2786 switch (sh->check_state) { 2764 switch (sh->check_state) {
2787 case check_state_idle: 2765 case check_state_idle:
2788 /* start a new check operation if there are < 2 failures */ 2766 /* start a new check operation if there are < 2 failures */
2789 if (s->failed == r6s->q_failed) { 2767 if (s->failed == s->q_failed) {
2790 /* The only possible failed device holds Q, so it 2768 /* The only possible failed device holds Q, so it
2791 * makes sense to check P (If anything else were failed, 2769 * makes sense to check P (If anything else were failed,
2792 * we would have used P to recreate it). 2770 * we would have used P to recreate it).
2793 */ 2771 */
2794 sh->check_state = check_state_run; 2772 sh->check_state = check_state_run;
2795 } 2773 }
2796 if (!r6s->q_failed && s->failed < 2) { 2774 if (!s->q_failed && s->failed < 2) {
2797 /* Q is not failed, and we didn't use it to generate 2775 /* Q is not failed, and we didn't use it to generate
2798 * anything, so it makes sense to check it 2776 * anything, so it makes sense to check it
2799 */ 2777 */
@@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2835 */ 2813 */
2836 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2814 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2837 if (s->failed == 2) { 2815 if (s->failed == 2) {
2838 dev = &sh->dev[r6s->failed_num[1]]; 2816 dev = &sh->dev[s->failed_num[1]];
2839 s->locked++; 2817 s->locked++;
2840 set_bit(R5_LOCKED, &dev->flags); 2818 set_bit(R5_LOCKED, &dev->flags);
2841 set_bit(R5_Wantwrite, &dev->flags); 2819 set_bit(R5_Wantwrite, &dev->flags);
2842 } 2820 }
2843 if (s->failed >= 1) { 2821 if (s->failed >= 1) {
2844 dev = &sh->dev[r6s->failed_num[0]]; 2822 dev = &sh->dev[s->failed_num[0]];
2845 s->locked++; 2823 s->locked++;
2846 set_bit(R5_LOCKED, &dev->flags); 2824 set_bit(R5_LOCKED, &dev->flags);
2847 set_bit(R5_Wantwrite, &dev->flags); 2825 set_bit(R5_Wantwrite, &dev->flags);
@@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2928 } 2906 }
2929} 2907}
2930 2908
2931static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2909static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
2932 struct r6_state *r6s)
2933{ 2910{
2934 int i; 2911 int i;
2935 2912
@@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2971 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2948 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2972 for (j = 0; j < conf->raid_disks; j++) 2949 for (j = 0; j < conf->raid_disks; j++)
2973 if (j != sh2->pd_idx && 2950 if (j != sh2->pd_idx &&
2974 (!r6s || j != sh2->qd_idx) && 2951 j != sh2->qd_idx &&
2975 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2952 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2976 break; 2953 break;
2977 if (j == conf->raid_disks) { 2954 if (j == conf->raid_disks) {
@@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
3006 * 2983 *
3007 */ 2984 */
3008 2985
3009static void handle_stripe5(struct stripe_head *sh) 2986static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3010{ 2987{
3011 raid5_conf_t *conf = sh->raid_conf; 2988 raid5_conf_t *conf = sh->raid_conf;
3012 int disks = sh->disks, i; 2989 int disks = sh->disks;
3013 struct bio *return_bi = NULL;
3014 struct stripe_head_state s;
3015 struct r5dev *dev; 2990 struct r5dev *dev;
3016 mdk_rdev_t *blocked_rdev = NULL; 2991 int i;
3017 int prexor;
3018 int dec_preread_active = 0;
3019 2992
3020 memset(&s, 0, sizeof(s)); 2993 memset(s, 0, sizeof(*s));
3021 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
3022 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
3023 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
3024 sh->reconstruct_state);
3025 2994
3026 spin_lock(&sh->lock); 2995 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
3027 clear_bit(STRIPE_HANDLE, &sh->state); 2996 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3028 clear_bit(STRIPE_DELAYED, &sh->state); 2997 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3029 2998 s->failed_num[0] = -1;
3030 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2999 s->failed_num[1] = -1;
3031 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3032 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3033 3000
3034 /* Now to look around and see what can be done */ 3001 /* Now to look around and see what can be done */
3035 rcu_read_lock(); 3002 rcu_read_lock();
3003 spin_lock_irq(&conf->device_lock);
3036 for (i=disks; i--; ) { 3004 for (i=disks; i--; ) {
3037 mdk_rdev_t *rdev; 3005 mdk_rdev_t *rdev;
3006 sector_t first_bad;
3007 int bad_sectors;
3008 int is_bad = 0;
3038 3009
3039 dev = &sh->dev[i]; 3010 dev = &sh->dev[i];
3040 3011
3041 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 3012 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3042 "written %p\n", i, dev->flags, dev->toread, dev->read, 3013 i, dev->flags, dev->toread, dev->towrite, dev->written);
3043 dev->towrite, dev->written); 3014 /* maybe we can reply to a read
3044
3045 /* maybe we can request a biofill operation
3046 * 3015 *
3047 * new wantfill requests are only permitted while 3016 * new wantfill requests are only permitted while
3048 * ops_complete_biofill is guaranteed to be inactive 3017 * ops_complete_biofill is guaranteed to be inactive
@@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh)
3052 set_bit(R5_Wantfill, &dev->flags); 3021 set_bit(R5_Wantfill, &dev->flags);
3053 3022
3054 /* now count some things */ 3023 /* now count some things */
3055 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3024 if (test_bit(R5_LOCKED, &dev->flags))
3056 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3025 s->locked++;
3057 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 3026 if (test_bit(R5_UPTODATE, &dev->flags))
3027 s->uptodate++;
3028 if (test_bit(R5_Wantcompute, &dev->flags)) {
3029 s->compute++;
3030 BUG_ON(s->compute > 2);
3031 }
3058 3032
3059 if (test_bit(R5_Wantfill, &dev->flags)) 3033 if (test_bit(R5_Wantfill, &dev->flags))
3060 s.to_fill++; 3034 s->to_fill++;
3061 else if (dev->toread) 3035 else if (dev->toread)
3062 s.to_read++; 3036 s->to_read++;
3063 if (dev->towrite) { 3037 if (dev->towrite) {
3064 s.to_write++; 3038 s->to_write++;
3065 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3039 if (!test_bit(R5_OVERWRITE, &dev->flags))
3066 s.non_overwrite++; 3040 s->non_overwrite++;
3067 } 3041 }
3068 if (dev->written) 3042 if (dev->written)
3069 s.written++; 3043 s->written++;
3070 rdev = rcu_dereference(conf->disks[i].rdev); 3044 rdev = rcu_dereference(conf->disks[i].rdev);
3071 if (blocked_rdev == NULL && 3045 if (rdev) {
3072 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3046 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3073 blocked_rdev = rdev; 3047 &first_bad, &bad_sectors);
3074 atomic_inc(&rdev->nr_pending); 3048 if (s->blocked_rdev == NULL
3049 && (test_bit(Blocked, &rdev->flags)
3050 || is_bad < 0)) {
3051 if (is_bad < 0)
3052 set_bit(BlockedBadBlocks,
3053 &rdev->flags);
3054 s->blocked_rdev = rdev;
3055 atomic_inc(&rdev->nr_pending);
3056 }
3075 } 3057 }
3076 clear_bit(R5_Insync, &dev->flags); 3058 clear_bit(R5_Insync, &dev->flags);
3077 if (!rdev) 3059 if (!rdev)
3078 /* Not in-sync */; 3060 /* Not in-sync */;
3079 else if (test_bit(In_sync, &rdev->flags)) 3061 else if (is_bad) {
3062 /* also not in-sync */
3063 if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3064 /* treat as in-sync, but with a read error
3065 * which we can now try to correct
3066 */
3067 set_bit(R5_Insync, &dev->flags);
3068 set_bit(R5_ReadError, &dev->flags);
3069 }
3070 } else if (test_bit(In_sync, &rdev->flags))
3080 set_bit(R5_Insync, &dev->flags); 3071 set_bit(R5_Insync, &dev->flags);
3081 else { 3072 else {
3082 /* could be in-sync depending on recovery/reshape status */ 3073 /* in sync if before recovery_offset */
3083 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3074 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3084 set_bit(R5_Insync, &dev->flags); 3075 set_bit(R5_Insync, &dev->flags);
3085 } 3076 }
3077 if (test_bit(R5_WriteError, &dev->flags)) {
3078 clear_bit(R5_Insync, &dev->flags);
3079 if (!test_bit(Faulty, &rdev->flags)) {
3080 s->handle_bad_blocks = 1;
3081 atomic_inc(&rdev->nr_pending);
3082 } else
3083 clear_bit(R5_WriteError, &dev->flags);
3084 }
3085 if (test_bit(R5_MadeGood, &dev->flags)) {
3086 if (!test_bit(Faulty, &rdev->flags)) {
3087 s->handle_bad_blocks = 1;
3088 atomic_inc(&rdev->nr_pending);
3089 } else
3090 clear_bit(R5_MadeGood, &dev->flags);
3091 }
3086 if (!test_bit(R5_Insync, &dev->flags)) { 3092 if (!test_bit(R5_Insync, &dev->flags)) {
3087 /* The ReadError flag will just be confusing now */ 3093 /* The ReadError flag will just be confusing now */
3088 clear_bit(R5_ReadError, &dev->flags); 3094 clear_bit(R5_ReadError, &dev->flags);
@@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh)
3091 if (test_bit(R5_ReadError, &dev->flags)) 3097 if (test_bit(R5_ReadError, &dev->flags))
3092 clear_bit(R5_Insync, &dev->flags); 3098 clear_bit(R5_Insync, &dev->flags);
3093 if (!test_bit(R5_Insync, &dev->flags)) { 3099 if (!test_bit(R5_Insync, &dev->flags)) {
3094 s.failed++; 3100 if (s->failed < 2)
3095 s.failed_num = i; 3101 s->failed_num[s->failed] = i;
3102 s->failed++;
3096 } 3103 }
3097 } 3104 }
3105 spin_unlock_irq(&conf->device_lock);
3098 rcu_read_unlock(); 3106 rcu_read_unlock();
3099
3100 if (unlikely(blocked_rdev)) {
3101 if (s.syncing || s.expanding || s.expanded ||
3102 s.to_write || s.written) {
3103 set_bit(STRIPE_HANDLE, &sh->state);
3104 goto unlock;
3105 }
3106 /* There is nothing for the blocked_rdev to block */
3107 rdev_dec_pending(blocked_rdev, conf->mddev);
3108 blocked_rdev = NULL;
3109 }
3110
3111 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3112 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3113 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3114 }
3115
3116 pr_debug("locked=%d uptodate=%d to_read=%d"
3117 " to_write=%d failed=%d failed_num=%d\n",
3118 s.locked, s.uptodate, s.to_read, s.to_write,
3119 s.failed, s.failed_num);
3120 /* check if the array has lost two devices and, if so, some requests might
3121 * need to be failed
3122 */
3123 if (s.failed > 1 && s.to_read+s.to_write+s.written)
3124 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3125 if (s.failed > 1 && s.syncing) {
3126 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3127 clear_bit(STRIPE_SYNCING, &sh->state);
3128 s.syncing = 0;
3129 }
3130
3131 /* might be able to return some write requests if the parity block
3132 * is safe, or on a failed drive
3133 */
3134 dev = &sh->dev[sh->pd_idx];
3135 if ( s.written &&
3136 ((test_bit(R5_Insync, &dev->flags) &&
3137 !test_bit(R5_LOCKED, &dev->flags) &&
3138 test_bit(R5_UPTODATE, &dev->flags)) ||
3139 (s.failed == 1 && s.failed_num == sh->pd_idx)))
3140 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3141
3142 /* Now we might consider reading some blocks, either to check/generate
3143 * parity, or to satisfy requests
3144 * or to load a block that is being partially written.
3145 */
3146 if (s.to_read || s.non_overwrite ||
3147 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3148 handle_stripe_fill5(sh, &s, disks);
3149
3150 /* Now we check to see if any write operations have recently
3151 * completed
3152 */
3153 prexor = 0;
3154 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3155 prexor = 1;
3156 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3157 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3158 sh->reconstruct_state = reconstruct_state_idle;
3159
3160 /* All the 'written' buffers and the parity block are ready to
3161 * be written back to disk
3162 */
3163 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3164 for (i = disks; i--; ) {
3165 dev = &sh->dev[i];
3166 if (test_bit(R5_LOCKED, &dev->flags) &&
3167 (i == sh->pd_idx || dev->written)) {
3168 pr_debug("Writing block %d\n", i);
3169 set_bit(R5_Wantwrite, &dev->flags);
3170 if (prexor)
3171 continue;
3172 if (!test_bit(R5_Insync, &dev->flags) ||
3173 (i == sh->pd_idx && s.failed == 0))
3174 set_bit(STRIPE_INSYNC, &sh->state);
3175 }
3176 }
3177 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3178 dec_preread_active = 1;
3179 }
3180
3181 /* Now to consider new write requests and what else, if anything
3182 * should be read. We do not handle new writes when:
3183 * 1/ A 'write' operation (copy+xor) is already in flight.
3184 * 2/ A 'check' operation is in flight, as it may clobber the parity
3185 * block.
3186 */
3187 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3188 handle_stripe_dirtying5(conf, sh, &s, disks);
3189
3190 /* maybe we need to check and possibly fix the parity for this stripe
3191 * Any reads will already have been scheduled, so we just see if enough
3192 * data is available. The parity check is held off while parity
3193 * dependent operations are in flight.
3194 */
3195 if (sh->check_state ||
3196 (s.syncing && s.locked == 0 &&
3197 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3198 !test_bit(STRIPE_INSYNC, &sh->state)))
3199 handle_parity_checks5(conf, sh, &s, disks);
3200
3201 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3202 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3203 clear_bit(STRIPE_SYNCING, &sh->state);
3204 }
3205
3206 /* If the failed drive is just a ReadError, then we might need to progress
3207 * the repair/check process
3208 */
3209 if (s.failed == 1 && !conf->mddev->ro &&
3210 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
3211 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
3212 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
3213 ) {
3214 dev = &sh->dev[s.failed_num];
3215 if (!test_bit(R5_ReWrite, &dev->flags)) {
3216 set_bit(R5_Wantwrite, &dev->flags);
3217 set_bit(R5_ReWrite, &dev->flags);
3218 set_bit(R5_LOCKED, &dev->flags);
3219 s.locked++;
3220 } else {
3221 /* let's read it back */
3222 set_bit(R5_Wantread, &dev->flags);
3223 set_bit(R5_LOCKED, &dev->flags);
3224 s.locked++;
3225 }
3226 }
3227
3228 /* Finish reconstruct operations initiated by the expansion process */
3229 if (sh->reconstruct_state == reconstruct_state_result) {
3230 struct stripe_head *sh2
3231 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3232 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3233 /* sh cannot be written until sh2 has been read.
3234 * so arrange for sh to be delayed a little
3235 */
3236 set_bit(STRIPE_DELAYED, &sh->state);
3237 set_bit(STRIPE_HANDLE, &sh->state);
3238 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3239 &sh2->state))
3240 atomic_inc(&conf->preread_active_stripes);
3241 release_stripe(sh2);
3242 goto unlock;
3243 }
3244 if (sh2)
3245 release_stripe(sh2);
3246
3247 sh->reconstruct_state = reconstruct_state_idle;
3248 clear_bit(STRIPE_EXPANDING, &sh->state);
3249 for (i = conf->raid_disks; i--; ) {
3250 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3251 set_bit(R5_LOCKED, &sh->dev[i].flags);
3252 s.locked++;
3253 }
3254 }
3255
3256 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3257 !sh->reconstruct_state) {
3258 /* Need to write out all blocks after computing parity */
3259 sh->disks = conf->raid_disks;
3260 stripe_set_idx(sh->sector, conf, 0, sh);
3261 schedule_reconstruction(sh, &s, 1, 1);
3262 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3263 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3264 atomic_dec(&conf->reshape_stripes);
3265 wake_up(&conf->wait_for_overlap);
3266 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3267 }
3268
3269 if (s.expanding && s.locked == 0 &&
3270 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3271 handle_stripe_expansion(conf, sh, NULL);
3272
3273 unlock:
3274 spin_unlock(&sh->lock);
3275
3276 /* wait for this device to become unblocked */
3277 if (unlikely(blocked_rdev))
3278 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3279
3280 if (s.ops_request)
3281 raid_run_ops(sh, s.ops_request);
3282
3283 ops_run_io(sh, &s);
3284
3285 if (dec_preread_active) {
3286 /* We delay this until after ops_run_io so that if make_request
3287 * is waiting on a flush, it won't continue until the writes
3288 * have actually been submitted.
3289 */
3290 atomic_dec(&conf->preread_active_stripes);
3291 if (atomic_read(&conf->preread_active_stripes) <
3292 IO_THRESHOLD)
3293 md_wakeup_thread(conf->mddev->thread);
3294 }
3295 return_io(return_bi);
3296} 3107}
3297 3108
3298static void handle_stripe6(struct stripe_head *sh) 3109static void handle_stripe(struct stripe_head *sh)
3299{ 3110{
3111 struct stripe_head_state s;
3300 raid5_conf_t *conf = sh->raid_conf; 3112 raid5_conf_t *conf = sh->raid_conf;
3113 int i;
3114 int prexor;
3301 int disks = sh->disks; 3115 int disks = sh->disks;
3302 struct bio *return_bi = NULL; 3116 struct r5dev *pdev, *qdev;
3303 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; 3117
3304 struct stripe_head_state s; 3118 clear_bit(STRIPE_HANDLE, &sh->state);
3305 struct r6_state r6s; 3119 if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
3306 struct r5dev *dev, *pdev, *qdev; 3120 /* already being handled, ensure it gets handled
3307 mdk_rdev_t *blocked_rdev = NULL; 3121 * again when current action finishes */
3308 int dec_preread_active = 0; 3122 set_bit(STRIPE_HANDLE, &sh->state);
3123 return;
3124 }
3125
3126 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3127 set_bit(STRIPE_SYNCING, &sh->state);
3128 clear_bit(STRIPE_INSYNC, &sh->state);
3129 }
3130 clear_bit(STRIPE_DELAYED, &sh->state);
3309 3131
3310 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3132 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3311 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3133 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3312 (unsigned long long)sh->sector, sh->state, 3134 (unsigned long long)sh->sector, sh->state,
3313 atomic_read(&sh->count), pd_idx, qd_idx, 3135 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
3314 sh->check_state, sh->reconstruct_state); 3136 sh->check_state, sh->reconstruct_state);
3315 memset(&s, 0, sizeof(s));
3316
3317 spin_lock(&sh->lock);
3318 clear_bit(STRIPE_HANDLE, &sh->state);
3319 clear_bit(STRIPE_DELAYED, &sh->state);
3320
3321 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3322 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3323 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3324 /* Now to look around and see what can be done */
3325
3326 rcu_read_lock();
3327 for (i=disks; i--; ) {
3328 mdk_rdev_t *rdev;
3329 dev = &sh->dev[i];
3330 3137
3331 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3138 analyse_stripe(sh, &s);
3332 i, dev->flags, dev->toread, dev->towrite, dev->written);
3333 /* maybe we can reply to a read
3334 *
3335 * new wantfill requests are only permitted while
3336 * ops_complete_biofill is guaranteed to be inactive
3337 */
3338 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3339 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3340 set_bit(R5_Wantfill, &dev->flags);
3341 3139
3342 /* now count some things */ 3140 if (s.handle_bad_blocks) {
3343 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3141 set_bit(STRIPE_HANDLE, &sh->state);
3344 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3142 goto finish;
3345 if (test_bit(R5_Wantcompute, &dev->flags)) {
3346 s.compute++;
3347 BUG_ON(s.compute > 2);
3348 }
3349
3350 if (test_bit(R5_Wantfill, &dev->flags)) {
3351 s.to_fill++;
3352 } else if (dev->toread)
3353 s.to_read++;
3354 if (dev->towrite) {
3355 s.to_write++;
3356 if (!test_bit(R5_OVERWRITE, &dev->flags))
3357 s.non_overwrite++;
3358 }
3359 if (dev->written)
3360 s.written++;
3361 rdev = rcu_dereference(conf->disks[i].rdev);
3362 if (blocked_rdev == NULL &&
3363 rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3364 blocked_rdev = rdev;
3365 atomic_inc(&rdev->nr_pending);
3366 }
3367 clear_bit(R5_Insync, &dev->flags);
3368 if (!rdev)
3369 /* Not in-sync */;
3370 else if (test_bit(In_sync, &rdev->flags))
3371 set_bit(R5_Insync, &dev->flags);
3372 else {
3373 /* in sync if before recovery_offset */
3374 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3375 set_bit(R5_Insync, &dev->flags);
3376 }
3377 if (!test_bit(R5_Insync, &dev->flags)) {
3378 /* The ReadError flag will just be confusing now */
3379 clear_bit(R5_ReadError, &dev->flags);
3380 clear_bit(R5_ReWrite, &dev->flags);
3381 }
3382 if (test_bit(R5_ReadError, &dev->flags))
3383 clear_bit(R5_Insync, &dev->flags);
3384 if (!test_bit(R5_Insync, &dev->flags)) {
3385 if (s.failed < 2)
3386 r6s.failed_num[s.failed] = i;
3387 s.failed++;
3388 }
3389 } 3143 }
3390 rcu_read_unlock();
3391 3144
3392 if (unlikely(blocked_rdev)) { 3145 if (unlikely(s.blocked_rdev)) {
3393 if (s.syncing || s.expanding || s.expanded || 3146 if (s.syncing || s.expanding || s.expanded ||
3394 s.to_write || s.written) { 3147 s.to_write || s.written) {
3395 set_bit(STRIPE_HANDLE, &sh->state); 3148 set_bit(STRIPE_HANDLE, &sh->state);
3396 goto unlock; 3149 goto finish;
3397 } 3150 }
3398 /* There is nothing for the blocked_rdev to block */ 3151 /* There is nothing for the blocked_rdev to block */
3399 rdev_dec_pending(blocked_rdev, conf->mddev); 3152 rdev_dec_pending(s.blocked_rdev, conf->mddev);
3400 blocked_rdev = NULL; 3153 s.blocked_rdev = NULL;
3401 } 3154 }
3402 3155
3403 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3156 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh)
3408 pr_debug("locked=%d uptodate=%d to_read=%d" 3161 pr_debug("locked=%d uptodate=%d to_read=%d"
3409 " to_write=%d failed=%d failed_num=%d,%d\n", 3162 " to_write=%d failed=%d failed_num=%d,%d\n",
3410 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3163 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3411 r6s.failed_num[0], r6s.failed_num[1]); 3164 s.failed_num[0], s.failed_num[1]);
3412 /* check if the array has lost >2 devices and, if so, some requests 3165 /* check if the array has lost more than max_degraded devices and,
3413 * might need to be failed 3166 * if so, some requests might need to be failed.
3414 */ 3167 */
3415 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3168 if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written)
3416 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3169 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3417 if (s.failed > 2 && s.syncing) { 3170 if (s.failed > conf->max_degraded && s.syncing)
3418 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3171 handle_failed_sync(conf, sh, &s);
3419 clear_bit(STRIPE_SYNCING, &sh->state);
3420 s.syncing = 0;
3421 }
3422 3172
3423 /* 3173 /*
3424 * might be able to return some write requests if the parity blocks 3174 * might be able to return some write requests if the parity blocks
3425 * are safe, or on a failed drive 3175 * are safe, or on a failed drive
3426 */ 3176 */
3427 pdev = &sh->dev[pd_idx]; 3177 pdev = &sh->dev[sh->pd_idx];
3428 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3178 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3429 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3179 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3430 qdev = &sh->dev[qd_idx]; 3180 qdev = &sh->dev[sh->qd_idx];
3431 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3181 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3432 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3182 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3433 3183 || conf->level < 6;
3434 if ( s.written && 3184
3435 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3185 if (s.written &&
3186 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3436 && !test_bit(R5_LOCKED, &pdev->flags) 3187 && !test_bit(R5_LOCKED, &pdev->flags)
3437 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3188 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3438 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3189 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3439 && !test_bit(R5_LOCKED, &qdev->flags) 3190 && !test_bit(R5_LOCKED, &qdev->flags)
3440 && test_bit(R5_UPTODATE, &qdev->flags))))) 3191 && test_bit(R5_UPTODATE, &qdev->flags)))))
3441 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3192 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3442 3193
3443 /* Now we might consider reading some blocks, either to check/generate 3194 /* Now we might consider reading some blocks, either to check/generate
3444 * parity, or to satisfy requests 3195 * parity, or to satisfy requests
3445 * or to load a block that is being partially written. 3196 * or to load a block that is being partially written.
3446 */ 3197 */
3447 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3198 if (s.to_read || s.non_overwrite
3448 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3199 || (conf->level == 6 && s.to_write && s.failed)
3449 handle_stripe_fill6(sh, &s, &r6s, disks); 3200 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3201 handle_stripe_fill(sh, &s, disks);
3450 3202
3451 /* Now we check to see if any write operations have recently 3203 /* Now we check to see if any write operations have recently
3452 * completed 3204 * completed
3453 */ 3205 */
3454 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3206 prexor = 0;
3455 3207 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3208 prexor = 1;
3209 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3210 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3456 sh->reconstruct_state = reconstruct_state_idle; 3211 sh->reconstruct_state = reconstruct_state_idle;
3457 /* All the 'written' buffers and the parity blocks are ready to 3212
3213 /* All the 'written' buffers and the parity block are ready to
3458 * be written back to disk 3214 * be written back to disk
3459 */ 3215 */
3460 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3216 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3461 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3217 BUG_ON(sh->qd_idx >= 0 &&
3218 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3462 for (i = disks; i--; ) { 3219 for (i = disks; i--; ) {
3463 dev = &sh->dev[i]; 3220 struct r5dev *dev = &sh->dev[i];
3464 if (test_bit(R5_LOCKED, &dev->flags) && 3221 if (test_bit(R5_LOCKED, &dev->flags) &&
3465 (i == sh->pd_idx || i == qd_idx || 3222 (i == sh->pd_idx || i == sh->qd_idx ||
3466 dev->written)) { 3223 dev->written)) {
3467 pr_debug("Writing block %d\n", i); 3224 pr_debug("Writing block %d\n", i);
3468 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3469 set_bit(R5_Wantwrite, &dev->flags); 3225 set_bit(R5_Wantwrite, &dev->flags);
3226 if (prexor)
3227 continue;
3470 if (!test_bit(R5_Insync, &dev->flags) || 3228 if (!test_bit(R5_Insync, &dev->flags) ||
3471 ((i == sh->pd_idx || i == qd_idx) && 3229 ((i == sh->pd_idx || i == sh->qd_idx) &&
3472 s.failed == 0)) 3230 s.failed == 0))
3473 set_bit(STRIPE_INSYNC, &sh->state); 3231 set_bit(STRIPE_INSYNC, &sh->state);
3474 } 3232 }
3475 } 3233 }
3476 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3234 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3477 dec_preread_active = 1; 3235 s.dec_preread_active = 1;
3478 } 3236 }
3479 3237
3480 /* Now to consider new write requests and what else, if anything 3238 /* Now to consider new write requests and what else, if anything
3481 * should be read. We do not handle new writes when: 3239 * should be read. We do not handle new writes when:
3482 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3240 * 1/ A 'write' operation (copy+xor) is already in flight.
3483 * 2/ A 'check' operation is in flight, as it may clobber the parity 3241 * 2/ A 'check' operation is in flight, as it may clobber the parity
3484 * block. 3242 * block.
3485 */ 3243 */
3486 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3244 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3487 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3245 handle_stripe_dirtying(conf, sh, &s, disks);
3488 3246
3489 /* maybe we need to check and possibly fix the parity for this stripe 3247 /* maybe we need to check and possibly fix the parity for this stripe
3490 * Any reads will already have been scheduled, so we just see if enough 3248 * Any reads will already have been scheduled, so we just see if enough
@@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh)
3494 if (sh->check_state || 3252 if (sh->check_state ||
3495 (s.syncing && s.locked == 0 && 3253 (s.syncing && s.locked == 0 &&
3496 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3254 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3497 !test_bit(STRIPE_INSYNC, &sh->state))) 3255 !test_bit(STRIPE_INSYNC, &sh->state))) {
3498 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3256 if (conf->level == 6)
3257 handle_parity_checks6(conf, sh, &s, disks);
3258 else
3259 handle_parity_checks5(conf, sh, &s, disks);
3260 }
3499 3261
3500 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3262 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3501 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3263 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3502 clear_bit(STRIPE_SYNCING, &sh->state); 3264 clear_bit(STRIPE_SYNCING, &sh->state);
3503 } 3265 }
3504 3266
3505 /* If the failed drives are just a ReadError, then we might need 3267 /* If the failed drives are just a ReadError, then we might need
3506 * to progress the repair/check process 3268 * to progress the repair/check process
3507 */ 3269 */
3508 if (s.failed <= 2 && !conf->mddev->ro) 3270 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3509 for (i = 0; i < s.failed; i++) { 3271 for (i = 0; i < s.failed; i++) {
3510 dev = &sh->dev[r6s.failed_num[i]]; 3272 struct r5dev *dev = &sh->dev[s.failed_num[i]];
3511 if (test_bit(R5_ReadError, &dev->flags) 3273 if (test_bit(R5_ReadError, &dev->flags)
3512 && !test_bit(R5_LOCKED, &dev->flags) 3274 && !test_bit(R5_LOCKED, &dev->flags)
3513 && test_bit(R5_UPTODATE, &dev->flags) 3275 && test_bit(R5_UPTODATE, &dev->flags)
@@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh)
3526 } 3288 }
3527 } 3289 }
3528 3290
3291
3529 /* Finish reconstruct operations initiated by the expansion process */ 3292 /* Finish reconstruct operations initiated by the expansion process */
3530 if (sh->reconstruct_state == reconstruct_state_result) { 3293 if (sh->reconstruct_state == reconstruct_state_result) {
3294 struct stripe_head *sh_src
3295 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3296 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3297 /* sh cannot be written until sh_src has been read.
3298 * so arrange for sh to be delayed a little
3299 */
3300 set_bit(STRIPE_DELAYED, &sh->state);
3301 set_bit(STRIPE_HANDLE, &sh->state);
3302 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3303 &sh_src->state))
3304 atomic_inc(&conf->preread_active_stripes);
3305 release_stripe(sh_src);
3306 goto finish;
3307 }
3308 if (sh_src)
3309 release_stripe(sh_src);
3310
3531 sh->reconstruct_state = reconstruct_state_idle; 3311 sh->reconstruct_state = reconstruct_state_idle;
3532 clear_bit(STRIPE_EXPANDING, &sh->state); 3312 clear_bit(STRIPE_EXPANDING, &sh->state);
3533 for (i = conf->raid_disks; i--; ) { 3313 for (i = conf->raid_disks; i--; ) {
@@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh)
3539 3319
3540 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3320 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3541 !sh->reconstruct_state) { 3321 !sh->reconstruct_state) {
3542 struct stripe_head *sh2 3322 /* Need to write out all blocks after computing parity */
3543 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3544 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3545 /* sh cannot be written until sh2 has been read.
3546 * so arrange for sh to be delayed a little
3547 */
3548 set_bit(STRIPE_DELAYED, &sh->state);
3549 set_bit(STRIPE_HANDLE, &sh->state);
3550 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3551 &sh2->state))
3552 atomic_inc(&conf->preread_active_stripes);
3553 release_stripe(sh2);
3554 goto unlock;
3555 }
3556 if (sh2)
3557 release_stripe(sh2);
3558
3559 /* Need to write out all blocks after computing P&Q */
3560 sh->disks = conf->raid_disks; 3323 sh->disks = conf->raid_disks;
3561 stripe_set_idx(sh->sector, conf, 0, sh); 3324 stripe_set_idx(sh->sector, conf, 0, sh);
3562 schedule_reconstruction(sh, &s, 1, 1); 3325 schedule_reconstruction(sh, &s, 1, 1);
@@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh)
3569 3332
3570 if (s.expanding && s.locked == 0 && 3333 if (s.expanding && s.locked == 0 &&
3571 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3334 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3572 handle_stripe_expansion(conf, sh, &r6s); 3335 handle_stripe_expansion(conf, sh);
3573
3574 unlock:
3575 spin_unlock(&sh->lock);
3576 3336
3337finish:
3577 /* wait for this device to become unblocked */ 3338 /* wait for this device to become unblocked */
3578 if (unlikely(blocked_rdev)) 3339 if (unlikely(s.blocked_rdev))
3579 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3340 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3341
3342 if (s.handle_bad_blocks)
3343 for (i = disks; i--; ) {
3344 mdk_rdev_t *rdev;
3345 struct r5dev *dev = &sh->dev[i];
3346 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3347 /* We own a safe reference to the rdev */
3348 rdev = conf->disks[i].rdev;
3349 if (!rdev_set_badblocks(rdev, sh->sector,
3350 STRIPE_SECTORS, 0))
3351 md_error(conf->mddev, rdev);
3352 rdev_dec_pending(rdev, conf->mddev);
3353 }
3354 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3355 rdev = conf->disks[i].rdev;
3356 rdev_clear_badblocks(rdev, sh->sector,
3357 STRIPE_SECTORS);
3358 rdev_dec_pending(rdev, conf->mddev);
3359 }
3360 }
3580 3361
3581 if (s.ops_request) 3362 if (s.ops_request)
3582 raid_run_ops(sh, s.ops_request); 3363 raid_run_ops(sh, s.ops_request);
3583 3364
3584 ops_run_io(sh, &s); 3365 ops_run_io(sh, &s);
3585 3366
3586 3367 if (s.dec_preread_active) {
3587 if (dec_preread_active) {
3588 /* We delay this until after ops_run_io so that if make_request 3368 /* We delay this until after ops_run_io so that if make_request
3589 * is waiting on a flush, it won't continue until the writes 3369 * is waiting on a flush, it won't continue until the writes
3590 * have actually been submitted. 3370 * have actually been submitted.
@@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh)
3595 md_wakeup_thread(conf->mddev->thread); 3375 md_wakeup_thread(conf->mddev->thread);
3596 } 3376 }
3597 3377
3598 return_io(return_bi); 3378 return_io(s.return_bi);
3599}
3600 3379
3601static void handle_stripe(struct stripe_head *sh) 3380 clear_bit(STRIPE_ACTIVE, &sh->state);
3602{
3603 if (sh->raid_conf->level == 6)
3604 handle_stripe6(sh);
3605 else
3606 handle_stripe5(sh);
3607} 3381}
3608 3382
3609static void raid5_activate_delayed(raid5_conf_t *conf) 3383static void raid5_activate_delayed(raid5_conf_t *conf)
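
The hunks above fold handle_stripe5() and handle_stripe6() into a single handle_stripe() that first has analyse_stripe() fill a shared struct stripe_head_state and then branches on the RAID level only where the two algorithms genuinely differ. A heavily condensed, compilable sketch of that ordering, using stand-in types and printf placeholders rather than the real kernel helpers, might be:

#include <stdio.h>
#include <string.h>

struct stripe_state {
    int failed, failed_num[2];
    int to_read, to_write, syncing;
    int handle_bad_blocks;
};

struct stripe {
    int level;          /* 5 or 6 */
    int max_degraded;   /* 1 for RAID5, 2 for RAID6 */
};

static void analyse(const struct stripe *sh, struct stripe_state *s)
{
    (void)sh;
    memset(s, 0, sizeof(*s));
    s->failed_num[0] = s->failed_num[1] = -1;
    /* the real analyse_stripe() walks sh->dev[], counting locked, uptodate,
     * to_read, to_write, etc., and records failed devices, blocked rdevs
     * and bad-block conditions into the shared stripe_head_state */
}

static void handle(struct stripe *sh)
{
    struct stripe_state s;

    analyse(sh, &s);

    if (s.handle_bad_blocks)
        return;             /* record/clear bad blocks, look again later */
    if (s.failed > sh->max_degraded && (s.to_read || s.to_write))
        puts("fail outstanding requests");
    /* ... reads, completing clean writes, new writes ... and finally: */
    if (sh->level == 6)
        puts("handle_parity_checks6()");
    else
        puts("handle_parity_checks5()");
}

int main(void)
{
    struct stripe sh = { .level = 6, .max_degraded = 2 };

    handle(&sh);
    return 0;
}
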
@@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3833 rcu_read_lock(); 3607 rcu_read_lock();
3834 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3608 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3835 if (rdev && test_bit(In_sync, &rdev->flags)) { 3609 if (rdev && test_bit(In_sync, &rdev->flags)) {
3610 sector_t first_bad;
3611 int bad_sectors;
3612
3836 atomic_inc(&rdev->nr_pending); 3613 atomic_inc(&rdev->nr_pending);
3837 rcu_read_unlock(); 3614 rcu_read_unlock();
3838 raid_bio->bi_next = (void*)rdev; 3615 raid_bio->bi_next = (void*)rdev;
@@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3840 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3617 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3841 align_bi->bi_sector += rdev->data_offset; 3618 align_bi->bi_sector += rdev->data_offset;
3842 3619
3843 if (!bio_fits_rdev(align_bi)) { 3620 if (!bio_fits_rdev(align_bi) ||
3844 /* too big in some way */ 3621 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3622 &first_bad, &bad_sectors)) {
3623 /* too big in some way, or has a known bad block */
3845 bio_put(align_bi); 3624 bio_put(align_bi);
3846 rdev_dec_pending(rdev, mddev); 3625 rdev_dec_pending(rdev, mddev);
3847 return 0; 3626 return 0;
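
chunk_aligned_read() now also refuses to bypass the stripe cache when the read range overlaps a recorded bad block. A self-contained model of that kind of range check is sketched below; the kernel's is_badblock() additionally signals unacknowledged entries with a negative return (which the new 'is_bad < 0' test in analyse_stripe() above reacts to), a distinction this sketch leaves out.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

struct bad_range { sector_t start; int len; };

/* Return 1 and fill first_bad/bad_sectors if [sector, sector+sectors)
 * overlaps any recorded bad range, 0 otherwise. */
static int check_bad(const struct bad_range *bb, int nr,
                     sector_t sector, int sectors,
                     sector_t *first_bad, int *bad_sectors)
{
    for (int i = 0; i < nr; i++) {
        sector_t lo = bb[i].start, hi = bb[i].start + bb[i].len;

        if (lo < sector + sectors && hi > sector) {
            *first_bad = lo > sector ? lo : sector;
            *bad_sectors = (int)((hi < sector + sectors ?
                                  hi : sector + sectors) - *first_bad);
            return 1;
        }
    }
    return 0;
}

int main(void)
{
    struct bad_range bb[] = { { 1024, 8 } };
    sector_t first_bad;
    int bad_sectors;

    /* An aligned read covering sectors 1020..1035 hits the bad range. */
    if (check_bad(bb, 1, 1020, 16, &first_bad, &bad_sectors))
        printf("bad: first=%llu len=%d\n",
               (unsigned long long)first_bad, bad_sectors);
    return 0;
}
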
@@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4016 } 3795 }
4017 } 3796 }
4018 3797
4019 if (bio_data_dir(bi) == WRITE && 3798 if (rw == WRITE &&
4020 logical_sector >= mddev->suspend_lo && 3799 logical_sector >= mddev->suspend_lo &&
4021 logical_sector < mddev->suspend_hi) { 3800 logical_sector < mddev->suspend_hi) {
4022 release_stripe(sh); 3801 release_stripe(sh);
@@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4034 } 3813 }
4035 3814
4036 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3815 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4037 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 3816 !add_stripe_bio(sh, bi, dd_idx, rw)) {
4038 /* Stripe is busy expanding or 3817 /* Stripe is busy expanding or
4039 * add failed due to overlap. Flush everything 3818 * add failed due to overlap. Flush everything
4040 * and wait a while 3819 * and wait a while
@@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4375 4154
4376 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4155 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4377 4156
4378 spin_lock(&sh->lock); 4157 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4379 set_bit(STRIPE_SYNCING, &sh->state);
4380 clear_bit(STRIPE_INSYNC, &sh->state);
4381 spin_unlock(&sh->lock);
4382 4158
4383 handle_stripe(sh); 4159 handle_stripe(sh);
4384 release_stripe(sh); 4160 release_stripe(sh);
@@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev)
4509 release_stripe(sh); 4285 release_stripe(sh);
4510 cond_resched(); 4286 cond_resched();
4511 4287
4288 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4289 md_check_recovery(mddev);
4290
4512 spin_lock_irq(&conf->device_lock); 4291 spin_lock_irq(&conf->device_lock);
4513 } 4292 }
4514 pr_debug("%d stripes handled\n", handled); 4293 pr_debug("%d stripes handled\n", handled);
@@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
5313 * isn't possible. 5092 * isn't possible.
5314 */ 5093 */
5315 if (!test_bit(Faulty, &rdev->flags) && 5094 if (!test_bit(Faulty, &rdev->flags) &&
5095 mddev->recovery_disabled != conf->recovery_disabled &&
5316 !has_failed(conf) && 5096 !has_failed(conf) &&
5317 number < conf->raid_disks) { 5097 number < conf->raid_disks) {
5318 err = -EBUSY; 5098 err = -EBUSY;
@@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5341 int first = 0; 5121 int first = 0;
5342 int last = conf->raid_disks - 1; 5122 int last = conf->raid_disks - 1;
5343 5123
5124 if (mddev->recovery_disabled == conf->recovery_disabled)
5125 return -EBUSY;
5126
5344 if (has_failed(conf)) 5127 if (has_failed(conf))
5345 /* no point adding a device */ 5128 /* no point adding a device */
5346 return -EINVAL; 5129 return -EINVAL;
@@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev)
5519 if (rdev->raid_disk < 0 && 5302 if (rdev->raid_disk < 0 &&
5520 !test_bit(Faulty, &rdev->flags)) { 5303 !test_bit(Faulty, &rdev->flags)) {
5521 if (raid5_add_disk(mddev, rdev) == 0) { 5304 if (raid5_add_disk(mddev, rdev) == 0) {
5522 char nm[20];
5523 if (rdev->raid_disk 5305 if (rdev->raid_disk
5524 >= conf->previous_raid_disks) { 5306 >= conf->previous_raid_disks) {
5525 set_bit(In_sync, &rdev->flags); 5307 set_bit(In_sync, &rdev->flags);
5526 added_devices++; 5308 added_devices++;
5527 } else 5309 } else
5528 rdev->recovery_offset = 0; 5310 rdev->recovery_offset = 0;
5529 sprintf(nm, "rd%d", rdev->raid_disk); 5311
5530 if (sysfs_create_link(&mddev->kobj, 5312 if (sysfs_link_rdev(mddev, rdev))
5531 &rdev->kobj, nm))
5532 /* Failure here is OK */; 5313 /* Failure here is OK */;
5533 } 5314 }
5534 } else if (rdev->raid_disk >= conf->previous_raid_disks 5315 } else if (rdev->raid_disk >= conf->previous_raid_disks
@@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5624 d++) { 5405 d++) {
5625 mdk_rdev_t *rdev = conf->disks[d].rdev; 5406 mdk_rdev_t *rdev = conf->disks[d].rdev;
5626 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5407 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5627 char nm[20]; 5408 sysfs_unlink_rdev(mddev, rdev);
5628 sprintf(nm, "rd%d", rdev->raid_disk);
5629 sysfs_remove_link(&mddev->kobj, nm);
5630 rdev->raid_disk = -1; 5409 rdev->raid_disk = -1;
5631 } 5410 }
5632 } 5411 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3ca77a2613ba..11b9566184b2 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -6,11 +6,11 @@
6 6
7/* 7/*
8 * 8 *
9 * Each stripe contains one buffer per disc. Each buffer can be in 9 * Each stripe contains one buffer per device. Each buffer can be in
10 * one of a number of states stored in "flags". Changes between 10 * one of a number of states stored in "flags". Changes between
11 * these states happen *almost* exclusively under a per-stripe 11 * these states happen *almost* exclusively under the protection of the
12 * spinlock. Some very specific changes can happen in bi_end_io, and 12 * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
13 * these are not protected by the spin lock. 13 * these are not protected by STRIPE_ACTIVE.
14 * 14 *
15 * The flag bits that are used to represent these states are: 15 * The flag bits that are used to represent these states are:
16 * R5_UPTODATE and R5_LOCKED 16 * R5_UPTODATE and R5_LOCKED
@@ -76,12 +76,10 @@
76 * block and the cached buffer are successfully written, any buffer on 76 * block and the cached buffer are successfully written, any buffer on
77 * a written list can be returned with b_end_io. 77 * a written list can be returned with b_end_io.
78 * 78 *
79 * The write list and read list both act as fifos. The read list is 79 * The write list and read list both act as fifos. The read list,
80 * protected by the device_lock. The write and written lists are 80 * write list and written list are protected by the device_lock.
81 * protected by the stripe lock. The device_lock, which can be 81 * The device_lock is only for list manipulations and will only be
82 * claimed while the stipe lock is held, is only for list 82 * held for a very short time. It can be claimed from interrupts.
83 * manipulations and will only be held for a very short time. It can
84 * be claimed from interrupts.
85 * 83 *
86 * 84 *
87 * Stripes in the stripe cache can be on one of two lists (or on 85 * Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
96 * 94 *
97 * The inactive_list, handle_list and hash bucket lists are all protected by the 95 * The inactive_list, handle_list and hash bucket lists are all protected by the
98 * device_lock. 96 * device_lock.
99 * - stripes on the inactive_list never have their stripe_lock held.
100 * - stripes have a reference counter. If count==0, they are on a list. 97 * - stripes have a reference counter. If count==0, they are on a list.
101 * - If a stripe might need handling, STRIPE_HANDLE is set. 98 * - If a stripe might need handling, STRIPE_HANDLE is set.
102 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on 99 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
116 * attach a request to an active stripe (add_stripe_bh()) 113 * attach a request to an active stripe (add_stripe_bh())
117 * lockdev attach-buffer unlockdev 114 * lockdev attach-buffer unlockdev
118 * handle a stripe (handle_stripe()) 115 * handle a stripe (handle_stripe())
119 * lockstripe clrSTRIPE_HANDLE ... 116 * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
120 * (lockdev check-buffers unlockdev) .. 117 * (lockdev check-buffers unlockdev) ..
121 * change-state .. 118 * change-state ..
122 * record io/ops needed unlockstripe schedule io/ops 119 * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
123 * release an active stripe (release_stripe()) 120 * release an active stripe (release_stripe())
124 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev 121 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
125 * 122 *
@@ -128,8 +125,7 @@
128 * on a cached buffer, and plus one if the stripe is undergoing stripe 125 * on a cached buffer, and plus one if the stripe is undergoing stripe
129 * operations. 126 * operations.
130 * 127 *
131 * Stripe operations are performed outside the stripe lock, 128 * The stripe operations are:
132 * the stripe operations are:
133 * -copying data between the stripe cache and user application buffers 129 * -copying data between the stripe cache and user application buffers
134 * -computing blocks to save a disk access, or to recover a missing block 130 * -computing blocks to save a disk access, or to recover a missing block
135 * -updating the parity on a write operation (reconstruct write and 131 * -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
159 */ 155 */
160 156
161/* 157/*
162 * Operations state - intermediate states that are visible outside of sh->lock 158 * Operations state - intermediate states that are visible outside of
159 * STRIPE_ACTIVE.
163 * In general _idle indicates nothing is running, _run indicates a data 160 * In general _idle indicates nothing is running, _run indicates a data
164 * processing operation is active, and _result means the data processing result 161 * processing operation is active, and _result means the data processing result
165 * is stable and can be acted upon. For simple operations like biofill and 162 * is stable and can be acted upon. For simple operations like biofill and
@@ -209,7 +206,6 @@ struct stripe_head {
209 short ddf_layout;/* use DDF ordering to calculate Q */ 206 short ddf_layout;/* use DDF ordering to calculate Q */
210 unsigned long state; /* state flags */ 207 unsigned long state; /* state flags */
211 atomic_t count; /* nr of active thread/requests */ 208 atomic_t count; /* nr of active thread/requests */
212 spinlock_t lock;
213 int bm_seq; /* sequence number for bitmap flushes */ 209 int bm_seq; /* sequence number for bitmap flushes */
214 int disks; /* disks in stripe */ 210 int disks; /* disks in stripe */
215 enum check_states check_state; 211 enum check_states check_state;
@@ -240,19 +236,20 @@ struct stripe_head {
240}; 236};
241 237
242/* stripe_head_state - collects and tracks the dynamic state of a stripe_head 238/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
243 * for handle_stripe. It is only valid under spin_lock(sh->lock); 239 * for handle_stripe.
244 */ 240 */
245struct stripe_head_state { 241struct stripe_head_state {
246 int syncing, expanding, expanded; 242 int syncing, expanding, expanded;
247 int locked, uptodate, to_read, to_write, failed, written; 243 int locked, uptodate, to_read, to_write, failed, written;
248 int to_fill, compute, req_compute, non_overwrite; 244 int to_fill, compute, req_compute, non_overwrite;
249 int failed_num; 245 int failed_num[2];
246 int p_failed, q_failed;
247 int dec_preread_active;
250 unsigned long ops_request; 248 unsigned long ops_request;
251};
252 249
253/* r6_state - extra state data only relevant to r6 */ 250 struct bio *return_bi;
254struct r6_state { 251 mdk_rdev_t *blocked_rdev;
255 int p_failed, q_failed, failed_num[2]; 252 int handle_bad_blocks;
256}; 253};
257 254
258/* Flags */ 255/* Flags */
@@ -268,14 +265,16 @@ struct r6_state {
268#define R5_ReWrite 9 /* have tried to over-write the readerror */ 265#define R5_ReWrite 9 /* have tried to over-write the readerror */
269 266
270#define R5_Expanded 10 /* This block now has post-expand data */ 267#define R5_Expanded 10 /* This block now has post-expand data */
271#define R5_Wantcompute 11 /* compute_block in progress treat as 268#define R5_Wantcompute 11 /* compute_block in progress treat as
272 * uptodate 269 * uptodate
273 */ 270 */
274#define R5_Wantfill 12 /* dev->toread contains a bio that needs 271#define R5_Wantfill 12 /* dev->toread contains a bio that needs
275 * filling 272 * filling
276 */ 273 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 274#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */ 275#define R5_WantFUA 14 /* Write should be FUA */
276#define R5_WriteError 15 /* got a write error - need to record it */
277#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
279/* 278/*
280 * Write method 279 * Write method
281 */ 280 */
@@ -289,21 +288,25 @@ struct r6_state {
289/* 288/*
290 * Stripe state 289 * Stripe state
291 */ 290 */
292#define STRIPE_HANDLE 2 291enum {
293#define STRIPE_SYNCING 3 292 STRIPE_ACTIVE,
294#define STRIPE_INSYNC 4 293 STRIPE_HANDLE,
295#define STRIPE_PREREAD_ACTIVE 5 294 STRIPE_SYNC_REQUESTED,
296#define STRIPE_DELAYED 6 295 STRIPE_SYNCING,
297#define STRIPE_DEGRADED 7 296 STRIPE_INSYNC,
298#define STRIPE_BIT_DELAY 8 297 STRIPE_PREREAD_ACTIVE,
299#define STRIPE_EXPANDING 9 298 STRIPE_DELAYED,
300#define STRIPE_EXPAND_SOURCE 10 299 STRIPE_DEGRADED,
301#define STRIPE_EXPAND_READY 11 300 STRIPE_BIT_DELAY,
302#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ 301 STRIPE_EXPANDING,
303#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ 302 STRIPE_EXPAND_SOURCE,
304#define STRIPE_BIOFILL_RUN 14 303 STRIPE_EXPAND_READY,
305#define STRIPE_COMPUTE_RUN 15 304 STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
306#define STRIPE_OPS_REQ_PENDING 16 305 STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
306 STRIPE_BIOFILL_RUN,
307 STRIPE_COMPUTE_RUN,
308 STRIPE_OPS_REQ_PENDING,
309};
307 310
308/* 311/*
309 * Operation request flags 312 * Operation request flags
@@ -336,7 +339,7 @@ struct r6_state {
336 * PREREAD_ACTIVE. 339 * PREREAD_ACTIVE.
337 * In stripe_handle, if we find pre-reading is necessary, we do it if 340 * In stripe_handle, if we find pre-reading is necessary, we do it if
338 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. 341 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
339 * HANDLE gets cleared if stripe_handle leave nothing locked. 342 * HANDLE gets cleared if stripe_handle leaves nothing locked.
340 */ 343 */
341 344
342 345
@@ -399,7 +402,7 @@ struct raid5_private_data {
399 * (fresh device added). 402 * (fresh device added).
400 * Cleared when a sync completes. 403 * Cleared when a sync completes.
401 */ 404 */
402 405 int recovery_disabled;
403 /* per cpu variables */ 406 /* per cpu variables */
404 struct raid5_percpu { 407 struct raid5_percpu {
405 struct page *spare_page; /* Used when checking P/Q in raid6 */ 408 struct page *spare_page; /* Used when checking P/Q in raid6 */
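
The comment changes above document the switch from the per-stripe spinlock to the STRIPE_ACTIVE bit: whoever wins test_and_set_bit() owns the stripe for the duration of handle_stripe(), and a loser simply re-flags it with STRIPE_HANDLE so it is looked at again. A minimal user-space model of that convention, using C11 atomics in place of the kernel's bit operations and with illustrative bit numbers, is:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { STRIPE_ACTIVE, STRIPE_HANDLE };   /* bit numbers, illustrative only */

static bool test_and_set_flag(atomic_ulong *state, int bit)
{
    unsigned long mask = 1UL << bit;

    return atomic_fetch_or(state, mask) & mask;
}

static void clear_flag(atomic_ulong *state, int bit)
{
    atomic_fetch_and(state, ~(1UL << bit));
}

static void handle_stripe_model(atomic_ulong *state)
{
    clear_flag(state, STRIPE_HANDLE);
    if (test_and_set_flag(state, STRIPE_ACTIVE)) {
        /* someone else is inside handle_stripe(); make sure the stripe
         * gets looked at again once they finish */
        test_and_set_flag(state, STRIPE_HANDLE);
        return;
    }
    /* ... analyse the stripe and record the I/O and ops it needs ... */
    clear_flag(state, STRIPE_ACTIVE);
}

int main(void)
{
    atomic_ulong state = 0;

    handle_stripe_model(&state);
    printf("state after handling: %#lx\n", (unsigned long)atomic_load(&state));
    return 0;
}
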
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 75cbf4f62fe8..9e65d9e20662 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -245,10 +245,16 @@ struct mdp_superblock_1 {
245 __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ 245 __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
246 __u8 devflags; /* per-device flags. Only one defined...*/ 246 __u8 devflags; /* per-device flags. Only one defined...*/
247#define WriteMostly1 1 /* mask for writemostly flag in above */ 247#define WriteMostly1 1 /* mask for writemostly flag in above */
248 __u8 pad2[64-57]; /* set to 0 when writing */ 248 /* Bad block log. If there are any bad blocks the feature flag is set.
249 * If offset and size are non-zero, that space is reserved and available
250 */
251 __u8 bblog_shift; /* shift from sectors to block size */
252 __le16 bblog_size; /* number of sectors reserved for list */
253 __le32 bblog_offset; /* sector offset from superblock to bblog,
254 * signed - not unsigned */
249 255
250 /* array state information - 64 bytes */ 256 /* array state information - 64 bytes */
251 __le64 utime; /* 40 bits second, 24 btes microseconds */ 257 __le64 utime; /* 40 bits second, 24 bits microseconds */
252 __le64 events; /* incremented when superblock updated */ 258 __le64 events; /* incremented when superblock updated */
253 __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ 259 __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */
254 __le32 sb_csum; /* checksum up to devs[max_dev] */ 260 __le32 sb_csum; /* checksum up to devs[max_dev] */
@@ -270,8 +276,8 @@ struct mdp_superblock_1 {
270 * must be honoured 276 * must be honoured
271 */ 277 */
272#define MD_FEATURE_RESHAPE_ACTIVE 4 278#define MD_FEATURE_RESHAPE_ACTIVE 4
279#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
273 280
274#define MD_FEATURE_ALL (1|2|4) 281#define MD_FEATURE_ALL (1|2|4|8)
275 282
276#endif 283#endif
277
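
The new bblog_shift, bblog_size and bblog_offset fields describe where the per-device bad-block list sits relative to the superblock and how much space is reserved for it. A small host-side sketch of how a reader might interpret them (field widths mirror the struct above; the superblock position and example values are assumptions):

#include <stdint.h>
#include <stdio.h>

/* Host-side stand-ins for the three new superblock fields; on disk they are
 * little-endian (__u8/__le16/__le32) as declared in md_p.h. */
struct bblog_fields {
    uint8_t  shift;    /* bblog_shift: shift from sectors to block size */
    uint16_t size;     /* bblog_size: sectors reserved for the list */
    int32_t  offset;   /* bblog_offset: signed sector offset from superblock */
};

int main(void)
{
    struct bblog_fields b = { .shift = 0, .size = 8, .offset = 16 };
    int64_t sb_sector = 8;              /* example superblock position */

    if (b.offset && b.size) {
        int64_t start = sb_sector + b.offset;   /* offset may be negative */

        printf("bad-block log occupies sectors %lld..%lld\n",
               (long long)start, (long long)(start + b.size - 1));
        printf("MD_FEATURE_BAD_BLOCKS (8) is set while the list is non-empty\n");
    }
    return 0;
}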