author     Linus Torvalds <torvalds@linux-foundation.org>  2011-07-28 08:50:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-07-28 08:50:27 -0400
commit     6140333d3656f62ac7e6a5af87e7fe92cfb8d655 (patch)
tree       d96f7ad2196b4383f5ca4396c956e24c82b2952c
parent     6f56c218666b5c7eff354364357307d18c10058b (diff)
parent     58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (75 commits)
md/raid10: handle further errors during fix_read_error better.
md/raid10: Handle read errors during recovery better.
md/raid10: simplify read error handling during recovery.
md/raid10: record bad blocks due to write errors during resync/recovery.
md/raid10: attempt to fix read errors during resync/check
md/raid10: Handle write errors by updating badblock log.
md/raid10: clear bad-block record when write succeeds.
md/raid10: avoid writing to known bad blocks on known bad drives.
md/raid10 record bad blocks as needed during recovery.
md/raid10: avoid reading known bad blocks during resync/recovery.
md/raid10 - avoid reading from known bad blocks - part 3
md/raid10: avoid reading from known bad blocks - part 2
md/raid10: avoid reading from known bad blocks - part 1
md/raid10: Split handle_read_error out from raid10d.
md/raid10: simplify/reindent some loops.
md/raid5: Clear bad blocks on successful write.
md/raid5. Don't write to known bad block on doubtful devices.
md/raid5: write errors should be recorded as bad blocks if possible.
md/raid5: use bad-block log to improve handling of uncorrectable read errors.
md/raid5: avoid reading from known bad blocks.
...
-rw-r--r--  Documentation/md.txt         29
-rw-r--r--  drivers/md/bitmap.c         137
-rw-r--r--  drivers/md/bitmap.h           5
-rw-r--r--  drivers/md/md.c             871
-rw-r--r--  drivers/md/md.h             110
-rw-r--r--  drivers/md/raid1.c          962
-rw-r--r--  drivers/md/raid1.h           26
-rw-r--r--  drivers/md/raid10.c        1183
-rw-r--r--  drivers/md/raid10.h          21
-rw-r--r--  drivers/md/raid5.c         1015
-rw-r--r--  drivers/md/raid5.h           99
-rw-r--r--  include/linux/raid/md_p.h    14
12 files changed, 3093 insertions, 1379 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index f0eee83ff78a..fc94770f44ab 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -360,18 +360,20 @@ Each directory contains: | |||
360 | A file recording the current state of the device in the array | 360 | A file recording the current state of the device in the array |
361 | which can be a comma separated list of | 361 | which can be a comma separated list of |
362 | faulty - device has been kicked from active use due to | 362 | faulty - device has been kicked from active use due to |
363 | a detected fault | 363 | a detected fault or it has unacknowledged bad |
364 | blocks | ||
364 | in_sync - device is a fully in-sync member of the array | 365 | in_sync - device is a fully in-sync member of the array |
365 | writemostly - device will only be subject to read | 366 | writemostly - device will only be subject to read |
366 | requests if there are no other options. | 367 | requests if there are no other options. |
367 | This applies only to raid1 arrays. | 368 | This applies only to raid1 arrays. |
368 | blocked - device has failed, metadata is "external", | 369 | blocked - device has failed, and the failure hasn't been |
369 | and the failure hasn't been acknowledged yet. | 370 | acknowledged yet by the metadata handler. |
370 | Writes that would write to this device if | 371 | Writes that would write to this device if |
371 | it were not faulty are blocked. | 372 | it were not faulty are blocked. |
372 | spare - device is working, but not a full member. | 373 | spare - device is working, but not a full member. |
373 | This includes spares that are in the process | 374 | This includes spares that are in the process |
374 | of being recovered to | 375 | of being recovered to |
376 | write_error - device has ever seen a write error. | ||
375 | This list may grow in future. | 377 | This list may grow in future. |
376 | This can be written to. | 378 | This can be written to. |
377 | Writing "faulty" simulates a failure on the device. | 379 | Writing "faulty" simulates a failure on the device. |
@@ -379,9 +381,11 @@ Each directory contains: | |||
379 | Writing "writemostly" sets the writemostly flag. | 381 | Writing "writemostly" sets the writemostly flag. |
380 | Writing "-writemostly" clears the writemostly flag. | 382 | Writing "-writemostly" clears the writemostly flag. |
381 | Writing "blocked" sets the "blocked" flag. | 383 | Writing "blocked" sets the "blocked" flag. |
382 | Writing "-blocked" clears the "blocked" flag and allows writes | 384 | Writing "-blocked" clears the "blocked" flags and allows writes |
383 | to complete. | 385 | to complete and possibly simulates an error. |
384 | Writing "in_sync" sets the in_sync flag. | 386 | Writing "in_sync" sets the in_sync flag. |
387 | Writing "write_error" sets writeerrorseen flag. | ||
388 | Writing "-write_error" clears writeerrorseen flag. | ||
385 | 389 | ||
386 | This file responds to select/poll. Any change to 'faulty' | 390 | This file responds to select/poll. Any change to 'faulty' |
387 | or 'blocked' causes an event. | 391 | or 'blocked' causes an event. |
@@ -419,7 +423,6 @@ Each directory contains: | |||
419 | written, it will be rejected. | 423 | written, it will be rejected. |
420 | 424 | ||
421 | recovery_start | 425 | recovery_start |
422 | |||
423 | When the device is not 'in_sync', this records the number of | 426 | When the device is not 'in_sync', this records the number of |
424 | sectors from the start of the device which are known to be | 427 | sectors from the start of the device which are known to be |
425 | correct. This is normally zero, but during a recovery | 428 | correct. This is normally zero, but during a recovery |
@@ -435,6 +438,20 @@ Each directory contains: | |||
435 | Setting this to 'none' is equivalent to setting 'in_sync'. | 438 | Setting this to 'none' is equivalent to setting 'in_sync'. |
436 | Setting to any other value also clears the 'in_sync' flag. | 439 | Setting to any other value also clears the 'in_sync' flag. |
437 | 440 | ||
441 | bad_blocks | ||
442 | This gives the list of all known bad blocks in the form of | ||
443 | start address and length (in sectors respectively). If output | ||
444 | is too big to fit in a page, it will be truncated. Writing | ||
445 | "sector length" to this file adds new acknowledged (i.e. | ||
446 | recorded to disk safely) bad blocks. | ||
447 | |||
448 | unacknowledged_bad_blocks | ||
449 | This gives the list of known-but-not-yet-saved-to-disk bad | ||
450 | blocks in the same form of 'bad_blocks'. If output is too big | ||
451 | to fit in a page, it will be truncated. Writing to this file | ||
452 | adds bad blocks without acknowledging them. This is largely | ||
453 | for testing. | ||
454 | |||
438 | 455 | ||
439 | 456 | ||
440 | An active md device will also contain and entry for each active device | 457 | An active md device will also contain and entry for each active device |
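The two new sysfs files above take "sector length" pairs on write. As a concrete illustration, a minimal userspace sketch of recording an acknowledged bad range; the md device and component-device names below are hypothetical examples, not taken from the patch:

/* Minimal sketch (not part of this patch): write "sector length" to the
 * per-device bad_blocks file documented above.  Paths are examples.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/md0/md/dev-sda1/bad_blocks", "w");

	if (!f) {
		perror("bad_blocks");
		return 1;
	}
	/* mark 8 sectors, starting at sector 4096, as (acknowledged) bad */
	fprintf(f, "%llu %u\n", 4096ULL, 8U);
	return fclose(f) ? 1 : 0;
}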
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd3..0dc6546b77a8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@ | |||
29 | #include "md.h" | 29 | #include "md.h" |
30 | #include "bitmap.h" | 30 | #include "bitmap.h" |
31 | 31 | ||
32 | #include <linux/dm-dirty-log.h> | ||
33 | /* debug macros */ | 32 | /* debug macros */ |
34 | 33 | ||
35 | #define DEBUG 0 | 34 | #define DEBUG 0 |
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon | |||
775 | * 0 or page 1 | 774 | * 0 or page 1 |
776 | */ | 775 | */ |
777 | static inline struct page *filemap_get_page(struct bitmap *bitmap, | 776 | static inline struct page *filemap_get_page(struct bitmap *bitmap, |
778 | unsigned long chunk) | 777 | unsigned long chunk) |
779 | { | 778 | { |
780 | if (bitmap->filemap == NULL) | ||
781 | return NULL; | ||
782 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) | 779 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) |
783 | return NULL; | 780 | return NULL; |
784 | return bitmap->filemap[file_page_index(bitmap, chunk) | 781 | return bitmap->filemap[file_page_index(bitmap, chunk) |
@@ -878,28 +875,19 @@ enum bitmap_page_attr { | |||
878 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, | 875 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, |
879 | enum bitmap_page_attr attr) | 876 | enum bitmap_page_attr attr) |
880 | { | 877 | { |
881 | if (page) | 878 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); |
882 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
883 | else | ||
884 | __set_bit(attr, &bitmap->logattrs); | ||
885 | } | 879 | } |
886 | 880 | ||
887 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, | 881 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, |
888 | enum bitmap_page_attr attr) | 882 | enum bitmap_page_attr attr) |
889 | { | 883 | { |
890 | if (page) | 884 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); |
891 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
892 | else | ||
893 | __clear_bit(attr, &bitmap->logattrs); | ||
894 | } | 885 | } |
895 | 886 | ||
896 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, | 887 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, |
897 | enum bitmap_page_attr attr) | 888 | enum bitmap_page_attr attr) |
898 | { | 889 | { |
899 | if (page) | 890 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); |
900 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
901 | else | ||
902 | return test_bit(attr, &bitmap->logattrs); | ||
903 | } | 891 | } |
904 | 892 | ||
905 | /* | 893 | /* |
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p | |||
912 | static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | 900 | static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) |
913 | { | 901 | { |
914 | unsigned long bit; | 902 | unsigned long bit; |
915 | struct page *page = NULL; | 903 | struct page *page; |
916 | void *kaddr; | 904 | void *kaddr; |
917 | unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); | 905 | unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); |
918 | 906 | ||
919 | if (!bitmap->filemap) { | 907 | if (!bitmap->filemap) |
920 | struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; | 908 | return; |
921 | if (log) | ||
922 | log->type->mark_region(log, chunk); | ||
923 | } else { | ||
924 | 909 | ||
925 | page = filemap_get_page(bitmap, chunk); | 910 | page = filemap_get_page(bitmap, chunk); |
926 | if (!page) | 911 | if (!page) |
927 | return; | 912 | return; |
928 | bit = file_page_offset(bitmap, chunk); | 913 | bit = file_page_offset(bitmap, chunk); |
929 | 914 | ||
930 | /* set the bit */ | 915 | /* set the bit */ |
931 | kaddr = kmap_atomic(page, KM_USER0); | 916 | kaddr = kmap_atomic(page, KM_USER0); |
932 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 917 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
933 | set_bit(bit, kaddr); | 918 | set_bit(bit, kaddr); |
934 | else | 919 | else |
935 | __test_and_set_bit_le(bit, kaddr); | 920 | __set_bit_le(bit, kaddr); |
936 | kunmap_atomic(kaddr, KM_USER0); | 921 | kunmap_atomic(kaddr, KM_USER0); |
937 | PRINTK("set file bit %lu page %lu\n", bit, page->index); | 922 | PRINTK("set file bit %lu page %lu\n", bit, page->index); |
938 | } | ||
939 | /* record page number so it gets flushed to disk when unplug occurs */ | 923 | /* record page number so it gets flushed to disk when unplug occurs */ |
940 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | 924 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); |
941 | } | 925 | } |
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
952 | 936 | ||
953 | if (!bitmap) | 937 | if (!bitmap) |
954 | return; | 938 | return; |
955 | if (!bitmap->filemap) { | ||
956 | /* Must be using a dirty_log */ | ||
957 | struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; | ||
958 | dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); | ||
959 | need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); | ||
960 | if (dirty || need_write) | ||
961 | if (log->type->flush(log)) | ||
962 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
963 | goto out; | ||
964 | } | ||
965 | 939 | ||
966 | /* look at each page to see if there are any set bits that need to be | 940 | /* look at each page to see if there are any set bits that need to be |
967 | * flushed out to disk */ | 941 | * flushed out to disk */ |
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
990 | else | 964 | else |
991 | md_super_wait(bitmap->mddev); | 965 | md_super_wait(bitmap->mddev); |
992 | } | 966 | } |
993 | out: | ||
994 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 967 | if (bitmap->flags & BITMAP_WRITE_ERROR) |
995 | bitmap_file_kick(bitmap); | 968 | bitmap_file_kick(bitmap); |
996 | } | 969 | } |
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1199 | struct page *page = NULL, *lastpage = NULL; | 1172 | struct page *page = NULL, *lastpage = NULL; |
1200 | sector_t blocks; | 1173 | sector_t blocks; |
1201 | void *paddr; | 1174 | void *paddr; |
1202 | struct dm_dirty_log *log = mddev->bitmap_info.log; | ||
1203 | 1175 | ||
1204 | /* Use a mutex to guard daemon_work against | 1176 | /* Use a mutex to guard daemon_work against |
1205 | * bitmap_destroy. | 1177 | * bitmap_destroy. |
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1224 | spin_lock_irqsave(&bitmap->lock, flags); | 1196 | spin_lock_irqsave(&bitmap->lock, flags); |
1225 | for (j = 0; j < bitmap->chunks; j++) { | 1197 | for (j = 0; j < bitmap->chunks; j++) { |
1226 | bitmap_counter_t *bmc; | 1198 | bitmap_counter_t *bmc; |
1227 | if (!bitmap->filemap) { | 1199 | if (!bitmap->filemap) |
1228 | if (!log) | 1200 | /* error or shutdown */ |
1229 | /* error or shutdown */ | 1201 | break; |
1230 | break; | 1202 | |
1231 | } else | 1203 | page = filemap_get_page(bitmap, j); |
1232 | page = filemap_get_page(bitmap, j); | ||
1233 | 1204 | ||
1234 | if (page != lastpage) { | 1205 | if (page != lastpage) { |
1235 | /* skip this page unless it's marked as needing cleaning */ | 1206 | /* skip this page unless it's marked as needing cleaning */ |
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1298 | -1); | 1269 | -1); |
1299 | 1270 | ||
1300 | /* clear the bit */ | 1271 | /* clear the bit */ |
1301 | if (page) { | 1272 | paddr = kmap_atomic(page, KM_USER0); |
1302 | paddr = kmap_atomic(page, KM_USER0); | 1273 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1303 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1274 | clear_bit(file_page_offset(bitmap, j), |
1304 | clear_bit(file_page_offset(bitmap, j), | 1275 | paddr); |
1305 | paddr); | 1276 | else |
1306 | else | 1277 | __clear_bit_le( |
1307 | __test_and_clear_bit_le(file_page_offset(bitmap, j), | 1278 | file_page_offset(bitmap, |
1308 | paddr); | 1279 | j), |
1309 | kunmap_atomic(paddr, KM_USER0); | 1280 | paddr); |
1310 | } else | 1281 | kunmap_atomic(paddr, KM_USER0); |
1311 | log->type->clear_region(log, j); | ||
1312 | } | 1282 | } |
1313 | } else | 1283 | } else |
1314 | j |= PAGE_COUNTER_MASK; | 1284 | j |= PAGE_COUNTER_MASK; |
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1316 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1286 | spin_unlock_irqrestore(&bitmap->lock, flags); |
1317 | 1287 | ||
1318 | /* now sync the final page */ | 1288 | /* now sync the final page */ |
1319 | if (lastpage != NULL || log != NULL) { | 1289 | if (lastpage != NULL) { |
1320 | spin_lock_irqsave(&bitmap->lock, flags); | 1290 | spin_lock_irqsave(&bitmap->lock, flags); |
1321 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { | 1291 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { |
1322 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1292 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); |
1323 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1293 | spin_unlock_irqrestore(&bitmap->lock, flags); |
1324 | if (lastpage) | 1294 | write_page(bitmap, lastpage, 0); |
1325 | write_page(bitmap, lastpage, 0); | ||
1326 | else | ||
1327 | if (log->type->flush(log)) | ||
1328 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
1329 | } else { | 1295 | } else { |
1330 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1296 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); |
1331 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1297 | spin_unlock_irqrestore(&bitmap->lock, flags); |
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev) | |||
1767 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1733 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
1768 | 1734 | ||
1769 | if (!file | 1735 | if (!file |
1770 | && !mddev->bitmap_info.offset | 1736 | && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ |
1771 | && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ | ||
1772 | return 0; | 1737 | return 0; |
1773 | 1738 | ||
1774 | BUG_ON(file && mddev->bitmap_info.offset); | 1739 | BUG_ON(file && mddev->bitmap_info.offset); |
1775 | BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); | ||
1776 | 1740 | ||
1777 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1741 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
1778 | if (!bitmap) | 1742 | if (!bitmap) |
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev) | |||
1863 | int bitmap_load(mddev_t *mddev) | 1827 | int bitmap_load(mddev_t *mddev) |
1864 | { | 1828 | { |
1865 | int err = 0; | 1829 | int err = 0; |
1830 | sector_t start = 0; | ||
1866 | sector_t sector = 0; | 1831 | sector_t sector = 0; |
1867 | struct bitmap *bitmap = mddev->bitmap; | 1832 | struct bitmap *bitmap = mddev->bitmap; |
1868 | 1833 | ||
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev) | |||
1881 | } | 1846 | } |
1882 | bitmap_close_sync(bitmap); | 1847 | bitmap_close_sync(bitmap); |
1883 | 1848 | ||
1884 | if (mddev->bitmap_info.log) { | 1849 | if (mddev->degraded == 0 |
1885 | unsigned long i; | 1850 | || bitmap->events_cleared == mddev->events) |
1886 | struct dm_dirty_log *log = mddev->bitmap_info.log; | 1851 | /* no need to keep dirty bits to optimise a |
1887 | for (i = 0; i < bitmap->chunks; i++) | 1852 | * re-add of a missing device */ |
1888 | if (!log->type->in_sync(log, i, 1)) | 1853 | start = mddev->recovery_cp; |
1889 | bitmap_set_memory_bits(bitmap, | 1854 | |
1890 | (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), | 1855 | err = bitmap_init_from_disk(bitmap, start); |
1891 | 1); | 1856 | |
1892 | } else { | ||
1893 | sector_t start = 0; | ||
1894 | if (mddev->degraded == 0 | ||
1895 | || bitmap->events_cleared == mddev->events) | ||
1896 | /* no need to keep dirty bits to optimise a | ||
1897 | * re-add of a missing device */ | ||
1898 | start = mddev->recovery_cp; | ||
1899 | |||
1900 | err = bitmap_init_from_disk(bitmap, start); | ||
1901 | } | ||
1902 | if (err) | 1857 | if (err) |
1903 | goto out; | 1858 | goto out; |
1904 | 1859 | ||
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891ac..a28f2e5588c6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap { | |||
212 | unsigned long file_pages; /* number of pages in the file */ | 212 | unsigned long file_pages; /* number of pages in the file */ |
213 | int last_page_size; /* bytes in the last page */ | 213 | int last_page_size; /* bytes in the last page */ |
214 | 214 | ||
215 | unsigned long logattrs; /* used when filemap_attr doesn't exist | ||
216 | * because we are working with a dirty_log | ||
217 | */ | ||
218 | |||
219 | unsigned long flags; | 215 | unsigned long flags; |
220 | 216 | ||
221 | int allclean; | 217 | int allclean; |
@@ -237,7 +233,6 @@ struct bitmap { | |||
237 | wait_queue_head_t behind_wait; | 233 | wait_queue_head_t behind_wait; |
238 | 234 | ||
239 | struct sysfs_dirent *sysfs_can_clear; | 235 | struct sysfs_dirent *sysfs_can_clear; |
240 | |||
241 | }; | 236 | }; |
242 | 237 | ||
243 | /* the bitmap API */ | 238 | /* the bitmap API */ |
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70b..8e221a20f5d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
215 | } | 215 | } |
216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); | 216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); |
217 | 217 | ||
218 | void md_trim_bio(struct bio *bio, int offset, int size) | ||
219 | { | ||
220 | /* 'bio' is a cloned bio which we need to trim to match | ||
221 | * the given offset and size. | ||
222 | * This requires adjusting bi_sector, bi_size, and bi_io_vec | ||
223 | */ | ||
224 | int i; | ||
225 | struct bio_vec *bvec; | ||
226 | int sofar = 0; | ||
227 | |||
228 | size <<= 9; | ||
229 | if (offset == 0 && size == bio->bi_size) | ||
230 | return; | ||
231 | |||
232 | bio->bi_sector += offset; | ||
233 | bio->bi_size = size; | ||
234 | offset <<= 9; | ||
235 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
236 | |||
237 | while (bio->bi_idx < bio->bi_vcnt && | ||
238 | bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { | ||
239 | /* remove this whole bio_vec */ | ||
240 | offset -= bio->bi_io_vec[bio->bi_idx].bv_len; | ||
241 | bio->bi_idx++; | ||
242 | } | ||
243 | if (bio->bi_idx < bio->bi_vcnt) { | ||
244 | bio->bi_io_vec[bio->bi_idx].bv_offset += offset; | ||
245 | bio->bi_io_vec[bio->bi_idx].bv_len -= offset; | ||
246 | } | ||
247 | /* avoid any complications with bi_idx being non-zero*/ | ||
248 | if (bio->bi_idx) { | ||
249 | memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, | ||
250 | (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); | ||
251 | bio->bi_vcnt -= bio->bi_idx; | ||
252 | bio->bi_idx = 0; | ||
253 | } | ||
254 | /* Make sure vcnt and last bv are not too big */ | ||
255 | bio_for_each_segment(bvec, bio, i) { | ||
256 | if (sofar + bvec->bv_len > size) | ||
257 | bvec->bv_len = size - sofar; | ||
258 | if (bvec->bv_len == 0) { | ||
259 | bio->bi_vcnt = i; | ||
260 | break; | ||
261 | } | ||
262 | sofar += bvec->bv_len; | ||
263 | } | ||
264 | } | ||
265 | EXPORT_SYMBOL_GPL(md_trim_bio); | ||
266 | |||
218 | /* | 267 | /* |
219 | * We have a system wide 'event count' that is incremented | 268 | * We have a system wide 'event count' that is incremented |
220 | * on any 'interesting' event, and readers of /proc/mdstat | 269 | * on any 'interesting' event, and readers of /proc/mdstat |
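The new md_trim_bio() above narrows a cloned bio to a sector sub-range (both offset and size are in 512-byte sectors). A sketch of the calling pattern the raid personalities use when retrying around a bad region; only bio_clone_mddev() and md_trim_bio() come from the code above, the wrapper name and its parameters are illustrative kernel-context pseudostructure:

/* Illustrative only: read a sub-range of a larger request through a
 * clone.  Assumes kernel context; 'mddev' and 'master' are whatever
 * the caller has in hand.
 */
static struct bio *split_for_retry(struct bio *master, mddev_t *mddev,
				   int offset_sectors, int good_sectors)
{
	struct bio *b = bio_clone_mddev(master, GFP_NOIO, mddev);

	if (!b)
		return NULL;
	/* keep only 'good_sectors' starting 'offset_sectors' into the
	 * original request */
	md_trim_bio(b, offset_sectors, good_sectors);
	return b;
}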
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev) | |||
757 | rdev->sb_start = 0; | 806 | rdev->sb_start = 0; |
758 | rdev->sectors = 0; | 807 | rdev->sectors = 0; |
759 | } | 808 | } |
809 | if (rdev->bb_page) { | ||
810 | put_page(rdev->bb_page); | ||
811 | rdev->bb_page = NULL; | ||
812 | } | ||
760 | } | 813 | } |
761 | 814 | ||
762 | 815 | ||
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1025 | ret = -EINVAL; | 1078 | ret = -EINVAL; |
1026 | 1079 | ||
1027 | bdevname(rdev->bdev, b); | 1080 | bdevname(rdev->bdev, b); |
1028 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1081 | sb = page_address(rdev->sb_page); |
1029 | 1082 | ||
1030 | if (sb->md_magic != MD_SB_MAGIC) { | 1083 | if (sb->md_magic != MD_SB_MAGIC) { |
1031 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | 1084 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", |
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1054 | rdev->preferred_minor = sb->md_minor; | 1107 | rdev->preferred_minor = sb->md_minor; |
1055 | rdev->data_offset = 0; | 1108 | rdev->data_offset = 0; |
1056 | rdev->sb_size = MD_SB_BYTES; | 1109 | rdev->sb_size = MD_SB_BYTES; |
1110 | rdev->badblocks.shift = -1; | ||
1057 | 1111 | ||
1058 | if (sb->level == LEVEL_MULTIPATH) | 1112 | if (sb->level == LEVEL_MULTIPATH) |
1059 | rdev->desc_nr = -1; | 1113 | rdev->desc_nr = -1; |
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1064 | ret = 1; | 1118 | ret = 1; |
1065 | } else { | 1119 | } else { |
1066 | __u64 ev1, ev2; | 1120 | __u64 ev1, ev2; |
1067 | mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); | 1121 | mdp_super_t *refsb = page_address(refdev->sb_page); |
1068 | if (!uuid_equal(refsb, sb)) { | 1122 | if (!uuid_equal(refsb, sb)) { |
1069 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | 1123 | printk(KERN_WARNING "md: %s has different UUID to %s\n", |
1070 | b, bdevname(refdev->bdev,b2)); | 1124 | b, bdevname(refdev->bdev,b2)); |
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1099 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1153 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
1100 | { | 1154 | { |
1101 | mdp_disk_t *desc; | 1155 | mdp_disk_t *desc; |
1102 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); | 1156 | mdp_super_t *sb = page_address(rdev->sb_page); |
1103 | __u64 ev1 = md_event(sb); | 1157 | __u64 ev1 = md_event(sb); |
1104 | 1158 | ||
1105 | rdev->raid_disk = -1; | 1159 | rdev->raid_disk = -1; |
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1230 | 1284 | ||
1231 | rdev->sb_size = MD_SB_BYTES; | 1285 | rdev->sb_size = MD_SB_BYTES; |
1232 | 1286 | ||
1233 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1287 | sb = page_address(rdev->sb_page); |
1234 | 1288 | ||
1235 | memset(sb, 0, sizeof(*sb)); | 1289 | memset(sb, 0, sizeof(*sb)); |
1236 | 1290 | ||
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) | |||
1395 | return cpu_to_le32(csum); | 1449 | return cpu_to_le32(csum); |
1396 | } | 1450 | } |
1397 | 1451 | ||
1452 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
1453 | int acknowledged); | ||
1398 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | 1454 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) |
1399 | { | 1455 | { |
1400 | struct mdp_superblock_1 *sb; | 1456 | struct mdp_superblock_1 *sb; |
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1435 | if (ret) return ret; | 1491 | if (ret) return ret; |
1436 | 1492 | ||
1437 | 1493 | ||
1438 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1494 | sb = page_address(rdev->sb_page); |
1439 | 1495 | ||
1440 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || | 1496 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
1441 | sb->major_version != cpu_to_le32(1) || | 1497 | sb->major_version != cpu_to_le32(1) || |
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1473 | else | 1529 | else |
1474 | rdev->desc_nr = le32_to_cpu(sb->dev_number); | 1530 | rdev->desc_nr = le32_to_cpu(sb->dev_number); |
1475 | 1531 | ||
1532 | if (!rdev->bb_page) { | ||
1533 | rdev->bb_page = alloc_page(GFP_KERNEL); | ||
1534 | if (!rdev->bb_page) | ||
1535 | return -ENOMEM; | ||
1536 | } | ||
1537 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && | ||
1538 | rdev->badblocks.count == 0) { | ||
1539 | /* need to load the bad block list. | ||
1540 | * Currently we limit it to one page. | ||
1541 | */ | ||
1542 | s32 offset; | ||
1543 | sector_t bb_sector; | ||
1544 | u64 *bbp; | ||
1545 | int i; | ||
1546 | int sectors = le16_to_cpu(sb->bblog_size); | ||
1547 | if (sectors > (PAGE_SIZE / 512)) | ||
1548 | return -EINVAL; | ||
1549 | offset = le32_to_cpu(sb->bblog_offset); | ||
1550 | if (offset == 0) | ||
1551 | return -EINVAL; | ||
1552 | bb_sector = (long long)offset; | ||
1553 | if (!sync_page_io(rdev, bb_sector, sectors << 9, | ||
1554 | rdev->bb_page, READ, true)) | ||
1555 | return -EIO; | ||
1556 | bbp = (u64 *)page_address(rdev->bb_page); | ||
1557 | rdev->badblocks.shift = sb->bblog_shift; | ||
1558 | for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { | ||
1559 | u64 bb = le64_to_cpu(*bbp); | ||
1560 | int count = bb & (0x3ff); | ||
1561 | u64 sector = bb >> 10; | ||
1562 | sector <<= sb->bblog_shift; | ||
1563 | count <<= sb->bblog_shift; | ||
1564 | if (bb + 1 == 0) | ||
1565 | break; | ||
1566 | if (md_set_badblocks(&rdev->badblocks, | ||
1567 | sector, count, 1) == 0) | ||
1568 | return -EINVAL; | ||
1569 | } | ||
1570 | } else if (sb->bblog_offset == 0) | ||
1571 | rdev->badblocks.shift = -1; | ||
1572 | |||
1476 | if (!refdev) { | 1573 | if (!refdev) { |
1477 | ret = 1; | 1574 | ret = 1; |
1478 | } else { | 1575 | } else { |
1479 | __u64 ev1, ev2; | 1576 | __u64 ev1, ev2; |
1480 | struct mdp_superblock_1 *refsb = | 1577 | struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); |
1481 | (struct mdp_superblock_1*)page_address(refdev->sb_page); | ||
1482 | 1578 | ||
1483 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || | 1579 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || |
1484 | sb->level != refsb->level || | 1580 | sb->level != refsb->level || |
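The load loop above fixes the on-disk bad-block layout: one little-endian u64 per range, the low 10 bits holding the length and the remaining bits the start sector, both in units of 1<<bblog_shift sectors, with an all-ones word terminating the list. A sketch of that encoding; the helper names are illustrative, not from the patch:

/* Sketch of the bad-block record format implied by super_1_load()
 * above.  A record of ~0ULL ends the list.
 */
#include <linux/types.h>
#include <asm/byteorder.h>

static inline __le64 bb_encode(u64 sector, int len, int bblog_shift)
{
	return cpu_to_le64(((sector >> bblog_shift) << 10) |
			   ((u64)(len >> bblog_shift) & 0x3ff));
}

static inline void bb_decode(__le64 rec, int bblog_shift,
			     u64 *sector, int *len)
{
	u64 bb = le64_to_cpu(rec);

	*sector = (bb >> 10) << bblog_shift;
	*len = (int)(bb & 0x3ff) << bblog_shift;
}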
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1513 | 1609 | ||
1514 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1610 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
1515 | { | 1611 | { |
1516 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1612 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
1517 | __u64 ev1 = le64_to_cpu(sb->events); | 1613 | __u64 ev1 = le64_to_cpu(sb->events); |
1518 | 1614 | ||
1519 | rdev->raid_disk = -1; | 1615 | rdev->raid_disk = -1; |
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1619 | int max_dev, i; | 1715 | int max_dev, i; |
1620 | /* make rdev->sb match mddev and rdev data. */ | 1716 | /* make rdev->sb match mddev and rdev data. */ |
1621 | 1717 | ||
1622 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1718 | sb = page_address(rdev->sb_page); |
1623 | 1719 | ||
1624 | sb->feature_map = 0; | 1720 | sb->feature_map = 0; |
1625 | sb->pad0 = 0; | 1721 | sb->pad0 = 0; |
1626 | sb->recovery_offset = cpu_to_le64(0); | 1722 | sb->recovery_offset = cpu_to_le64(0); |
1627 | memset(sb->pad1, 0, sizeof(sb->pad1)); | 1723 | memset(sb->pad1, 0, sizeof(sb->pad1)); |
1628 | memset(sb->pad2, 0, sizeof(sb->pad2)); | ||
1629 | memset(sb->pad3, 0, sizeof(sb->pad3)); | 1724 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
1630 | 1725 | ||
1631 | sb->utime = cpu_to_le64((__u64)mddev->utime); | 1726 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1665 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); | 1760 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
1666 | } | 1761 | } |
1667 | 1762 | ||
1763 | if (rdev->badblocks.count == 0) | ||
1764 | /* Nothing to do for bad blocks*/ ; | ||
1765 | else if (sb->bblog_offset == 0) | ||
1766 | /* Cannot record bad blocks on this device */ | ||
1767 | md_error(mddev, rdev); | ||
1768 | else { | ||
1769 | struct badblocks *bb = &rdev->badblocks; | ||
1770 | u64 *bbp = (u64 *)page_address(rdev->bb_page); | ||
1771 | u64 *p = bb->page; | ||
1772 | sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); | ||
1773 | if (bb->changed) { | ||
1774 | unsigned seq; | ||
1775 | |||
1776 | retry: | ||
1777 | seq = read_seqbegin(&bb->lock); | ||
1778 | |||
1779 | memset(bbp, 0xff, PAGE_SIZE); | ||
1780 | |||
1781 | for (i = 0 ; i < bb->count ; i++) { | ||
1782 | u64 internal_bb = *p++; | ||
1783 | u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | ||
1784 | | BB_LEN(internal_bb)); | ||
1785 | *bbp++ = cpu_to_le64(store_bb); | ||
1786 | } | ||
1787 | if (read_seqretry(&bb->lock, seq)) | ||
1788 | goto retry; | ||
1789 | |||
1790 | bb->sector = (rdev->sb_start + | ||
1791 | (int)le32_to_cpu(sb->bblog_offset)); | ||
1792 | bb->size = le16_to_cpu(sb->bblog_size); | ||
1793 | bb->changed = 0; | ||
1794 | } | ||
1795 | } | ||
1796 | |||
1668 | max_dev = 0; | 1797 | max_dev = 0; |
1669 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 1798 | list_for_each_entry(rdev2, &mddev->disks, same_set) |
1670 | if (rdev2->desc_nr+1 > max_dev) | 1799 | if (rdev2->desc_nr+1 > max_dev) |
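bb->lock is a seqlock: the sync path above simply retries its copy if a writer raced with it. The matching writer side (md_set_badblocks() itself is not shown in this excerpt) would follow the usual seqlock pairing; a minimal sketch, with the lock flavour assumed:

/* Illustration of the writer half that pairs with the
 * read_seqbegin/read_seqretry loop above; not code from this patch.
 */
static void bb_update(struct badblocks *bb)
{
	write_seqlock_irq(&bb->lock);
	/* ... modify bb->page / bb->count ... */
	bb->changed = 1;
	write_sequnlock_irq(&bb->lock);
}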
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1724 | num_sectors = max_sectors; | 1853 | num_sectors = max_sectors; |
1725 | rdev->sb_start = sb_start; | 1854 | rdev->sb_start = sb_start; |
1726 | } | 1855 | } |
1727 | sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); | 1856 | sb = page_address(rdev->sb_page); |
1728 | sb->data_size = cpu_to_le64(num_sectors); | 1857 | sb->data_size = cpu_to_le64(num_sectors); |
1729 | sb->super_offset = rdev->sb_start; | 1858 | sb->super_offset = rdev->sb_start; |
1730 | sb->sb_csum = calc_sb_1_csum(sb); | 1859 | sb->sb_csum = calc_sb_1_csum(sb); |
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1922 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); | 2051 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
1923 | 2052 | ||
1924 | /* May as well allow recovery to be retried once */ | 2053 | /* May as well allow recovery to be retried once */ |
1925 | mddev->recovery_disabled = 0; | 2054 | mddev->recovery_disabled++; |
1926 | 2055 | ||
1927 | return 0; | 2056 | return 0; |
1928 | 2057 | ||
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1953 | sysfs_remove_link(&rdev->kobj, "block"); | 2082 | sysfs_remove_link(&rdev->kobj, "block"); |
1954 | sysfs_put(rdev->sysfs_state); | 2083 | sysfs_put(rdev->sysfs_state); |
1955 | rdev->sysfs_state = NULL; | 2084 | rdev->sysfs_state = NULL; |
2085 | kfree(rdev->badblocks.page); | ||
2086 | rdev->badblocks.count = 0; | ||
2087 | rdev->badblocks.page = NULL; | ||
1956 | /* We need to delay this, otherwise we can deadlock when | 2088 | /* We need to delay this, otherwise we can deadlock when |
1957 | * writing to 'remove' to "dev/state". We also need | 2089 | * writing to 'remove' to "dev/state". We also need |
1958 | * to delay it due to rcu usage. | 2090 | * to delay it due to rcu usage. |
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) | |||
2127 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); | 2259 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); |
2128 | switch (major_version) { | 2260 | switch (major_version) { |
2129 | case 0: | 2261 | case 0: |
2130 | print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); | 2262 | print_sb_90(page_address(rdev->sb_page)); |
2131 | break; | 2263 | break; |
2132 | case 1: | 2264 | case 1: |
2133 | print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); | 2265 | print_sb_1(page_address(rdev->sb_page)); |
2134 | break; | 2266 | break; |
2135 | } | 2267 | } |
2136 | } else | 2268 | } else |
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) | |||
2194 | mdk_rdev_t *rdev; | 2326 | mdk_rdev_t *rdev; |
2195 | int sync_req; | 2327 | int sync_req; |
2196 | int nospares = 0; | 2328 | int nospares = 0; |
2329 | int any_badblocks_changed = 0; | ||
2197 | 2330 | ||
2198 | repeat: | 2331 | repeat: |
2199 | /* First make sure individual recovery_offsets are correct */ | 2332 | /* First make sure individual recovery_offsets are correct */ |
@@ -2208,8 +2341,18 @@ repeat: | |||
2208 | if (!mddev->persistent) { | 2341 | if (!mddev->persistent) { |
2209 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2342 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); |
2210 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2343 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
2211 | if (!mddev->external) | 2344 | if (!mddev->external) { |
2212 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2345 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); |
2346 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2347 | if (rdev->badblocks.changed) { | ||
2348 | md_ack_all_badblocks(&rdev->badblocks); | ||
2349 | md_error(mddev, rdev); | ||
2350 | } | ||
2351 | clear_bit(Blocked, &rdev->flags); | ||
2352 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2353 | wake_up(&rdev->blocked_wait); | ||
2354 | } | ||
2355 | } | ||
2213 | wake_up(&mddev->sb_wait); | 2356 | wake_up(&mddev->sb_wait); |
2214 | return; | 2357 | return; |
2215 | } | 2358 | } |
@@ -2265,6 +2408,14 @@ repeat: | |||
2265 | MD_BUG(); | 2408 | MD_BUG(); |
2266 | mddev->events --; | 2409 | mddev->events --; |
2267 | } | 2410 | } |
2411 | |||
2412 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2413 | if (rdev->badblocks.changed) | ||
2414 | any_badblocks_changed++; | ||
2415 | if (test_bit(Faulty, &rdev->flags)) | ||
2416 | set_bit(FaultRecorded, &rdev->flags); | ||
2417 | } | ||
2418 | |||
2268 | sync_sbs(mddev, nospares); | 2419 | sync_sbs(mddev, nospares); |
2269 | spin_unlock_irq(&mddev->write_lock); | 2420 | spin_unlock_irq(&mddev->write_lock); |
2270 | 2421 | ||
@@ -2290,6 +2441,13 @@ repeat: | |||
2290 | bdevname(rdev->bdev,b), | 2441 | bdevname(rdev->bdev,b), |
2291 | (unsigned long long)rdev->sb_start); | 2442 | (unsigned long long)rdev->sb_start); |
2292 | rdev->sb_events = mddev->events; | 2443 | rdev->sb_events = mddev->events; |
2444 | if (rdev->badblocks.size) { | ||
2445 | md_super_write(mddev, rdev, | ||
2446 | rdev->badblocks.sector, | ||
2447 | rdev->badblocks.size << 9, | ||
2448 | rdev->bb_page); | ||
2449 | rdev->badblocks.size = 0; | ||
2450 | } | ||
2293 | 2451 | ||
2294 | } else | 2452 | } else |
2295 | dprintk(")\n"); | 2453 | dprintk(")\n"); |
@@ -2313,6 +2471,15 @@ repeat: | |||
2313 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 2471 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
2314 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 2472 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
2315 | 2473 | ||
2474 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2475 | if (test_and_clear_bit(FaultRecorded, &rdev->flags)) | ||
2476 | clear_bit(Blocked, &rdev->flags); | ||
2477 | |||
2478 | if (any_badblocks_changed) | ||
2479 | md_ack_all_badblocks(&rdev->badblocks); | ||
2480 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2481 | wake_up(&rdev->blocked_wait); | ||
2482 | } | ||
2316 | } | 2483 | } |
2317 | 2484 | ||
2318 | /* words written to sysfs files may, or may not, be \n terminated. | 2485 | /* words written to sysfs files may, or may not, be \n terminated. |
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2347 | char *sep = ""; | 2514 | char *sep = ""; |
2348 | size_t len = 0; | 2515 | size_t len = 0; |
2349 | 2516 | ||
2350 | if (test_bit(Faulty, &rdev->flags)) { | 2517 | if (test_bit(Faulty, &rdev->flags) || |
2518 | rdev->badblocks.unacked_exist) { | ||
2351 | len+= sprintf(page+len, "%sfaulty",sep); | 2519 | len+= sprintf(page+len, "%sfaulty",sep); |
2352 | sep = ","; | 2520 | sep = ","; |
2353 | } | 2521 | } |
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2359 | len += sprintf(page+len, "%swrite_mostly",sep); | 2527 | len += sprintf(page+len, "%swrite_mostly",sep); |
2360 | sep = ","; | 2528 | sep = ","; |
2361 | } | 2529 | } |
2362 | if (test_bit(Blocked, &rdev->flags)) { | 2530 | if (test_bit(Blocked, &rdev->flags) || |
2531 | rdev->badblocks.unacked_exist) { | ||
2363 | len += sprintf(page+len, "%sblocked", sep); | 2532 | len += sprintf(page+len, "%sblocked", sep); |
2364 | sep = ","; | 2533 | sep = ","; |
2365 | } | 2534 | } |
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2368 | len += sprintf(page+len, "%sspare", sep); | 2537 | len += sprintf(page+len, "%sspare", sep); |
2369 | sep = ","; | 2538 | sep = ","; |
2370 | } | 2539 | } |
2540 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
2541 | len += sprintf(page+len, "%swrite_error", sep); | ||
2542 | sep = ","; | ||
2543 | } | ||
2371 | return len+sprintf(page+len, "\n"); | 2544 | return len+sprintf(page+len, "\n"); |
2372 | } | 2545 | } |
2373 | 2546 | ||
@@ -2375,13 +2548,15 @@ static ssize_t | |||
2375 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | 2548 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) |
2376 | { | 2549 | { |
2377 | /* can write | 2550 | /* can write |
2378 | * faulty - simulates and error | 2551 | * faulty - simulates an error |
2379 | * remove - disconnects the device | 2552 | * remove - disconnects the device |
2380 | * writemostly - sets write_mostly | 2553 | * writemostly - sets write_mostly |
2381 | * -writemostly - clears write_mostly | 2554 | * -writemostly - clears write_mostly |
2382 | * blocked - sets the Blocked flag | 2555 | * blocked - sets the Blocked flags |
2383 | * -blocked - clears the Blocked flag | 2556 | * -blocked - clears the Blocked and possibly simulates an error |
2384 | * insync - sets Insync providing device isn't active | 2557 | * insync - sets Insync providing device isn't active |
2558 | * write_error - sets WriteErrorSeen | ||
2559 | * -write_error - clears WriteErrorSeen | ||
2385 | */ | 2560 | */ |
2386 | int err = -EINVAL; | 2561 | int err = -EINVAL; |
2387 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { | 2562 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2408 | set_bit(Blocked, &rdev->flags); | 2583 | set_bit(Blocked, &rdev->flags); |
2409 | err = 0; | 2584 | err = 0; |
2410 | } else if (cmd_match(buf, "-blocked")) { | 2585 | } else if (cmd_match(buf, "-blocked")) { |
2586 | if (!test_bit(Faulty, &rdev->flags) && | ||
2587 | test_bit(BlockedBadBlocks, &rdev->flags)) { | ||
2588 | /* metadata handler doesn't understand badblocks, | ||
2589 | * so we need to fail the device | ||
2590 | */ | ||
2591 | md_error(rdev->mddev, rdev); | ||
2592 | } | ||
2411 | clear_bit(Blocked, &rdev->flags); | 2593 | clear_bit(Blocked, &rdev->flags); |
2594 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2412 | wake_up(&rdev->blocked_wait); | 2595 | wake_up(&rdev->blocked_wait); |
2413 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2596 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2414 | md_wakeup_thread(rdev->mddev->thread); | 2597 | md_wakeup_thread(rdev->mddev->thread); |
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2417 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { | 2600 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
2418 | set_bit(In_sync, &rdev->flags); | 2601 | set_bit(In_sync, &rdev->flags); |
2419 | err = 0; | 2602 | err = 0; |
2603 | } else if (cmd_match(buf, "write_error")) { | ||
2604 | set_bit(WriteErrorSeen, &rdev->flags); | ||
2605 | err = 0; | ||
2606 | } else if (cmd_match(buf, "-write_error")) { | ||
2607 | clear_bit(WriteErrorSeen, &rdev->flags); | ||
2608 | err = 0; | ||
2420 | } | 2609 | } |
2421 | if (!err) | 2610 | if (!err) |
2422 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2611 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2459 | { | 2648 | { |
2460 | char *e; | 2649 | char *e; |
2461 | int err; | 2650 | int err; |
2462 | char nm[20]; | ||
2463 | int slot = simple_strtoul(buf, &e, 10); | 2651 | int slot = simple_strtoul(buf, &e, 10); |
2464 | if (strncmp(buf, "none", 4)==0) | 2652 | if (strncmp(buf, "none", 4)==0) |
2465 | slot = -1; | 2653 | slot = -1; |
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2482 | hot_remove_disk(rdev->mddev, rdev->raid_disk); | 2670 | hot_remove_disk(rdev->mddev, rdev->raid_disk); |
2483 | if (err) | 2671 | if (err) |
2484 | return err; | 2672 | return err; |
2485 | sprintf(nm, "rd%d", rdev->raid_disk); | 2673 | sysfs_unlink_rdev(rdev->mddev, rdev); |
2486 | sysfs_remove_link(&rdev->mddev->kobj, nm); | ||
2487 | rdev->raid_disk = -1; | 2674 | rdev->raid_disk = -1; |
2488 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2675 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2489 | md_wakeup_thread(rdev->mddev->thread); | 2676 | md_wakeup_thread(rdev->mddev->thread); |
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2522 | return err; | 2709 | return err; |
2523 | } else | 2710 | } else |
2524 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2711 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
2525 | sprintf(nm, "rd%d", rdev->raid_disk); | 2712 | if (sysfs_link_rdev(rdev->mddev, rdev)) |
2526 | if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) | ||
2527 | /* failure here is OK */; | 2713 | /* failure here is OK */; |
2528 | /* don't wakeup anyone, leave that to userspace. */ | 2714 | /* don't wakeup anyone, leave that to userspace. */ |
2529 | } else { | 2715 | } else { |
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le | |||
2712 | static struct rdev_sysfs_entry rdev_recovery_start = | 2898 | static struct rdev_sysfs_entry rdev_recovery_start = |
2713 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); | 2899 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); |
2714 | 2900 | ||
2901 | |||
2902 | static ssize_t | ||
2903 | badblocks_show(struct badblocks *bb, char *page, int unack); | ||
2904 | static ssize_t | ||
2905 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); | ||
2906 | |||
2907 | static ssize_t bb_show(mdk_rdev_t *rdev, char *page) | ||
2908 | { | ||
2909 | return badblocks_show(&rdev->badblocks, page, 0); | ||
2910 | } | ||
2911 | static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
2912 | { | ||
2913 | int rv = badblocks_store(&rdev->badblocks, page, len, 0); | ||
2914 | /* Maybe that ack was all we needed */ | ||
2915 | if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) | ||
2916 | wake_up(&rdev->blocked_wait); | ||
2917 | return rv; | ||
2918 | } | ||
2919 | static struct rdev_sysfs_entry rdev_bad_blocks = | ||
2920 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); | ||
2921 | |||
2922 | |||
2923 | static ssize_t ubb_show(mdk_rdev_t *rdev, char *page) | ||
2924 | { | ||
2925 | return badblocks_show(&rdev->badblocks, page, 1); | ||
2926 | } | ||
2927 | static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
2928 | { | ||
2929 | return badblocks_store(&rdev->badblocks, page, len, 1); | ||
2930 | } | ||
2931 | static struct rdev_sysfs_entry rdev_unack_bad_blocks = | ||
2932 | __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); | ||
2933 | |||
2715 | static struct attribute *rdev_default_attrs[] = { | 2934 | static struct attribute *rdev_default_attrs[] = { |
2716 | &rdev_state.attr, | 2935 | &rdev_state.attr, |
2717 | &rdev_errors.attr, | 2936 | &rdev_errors.attr, |
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = { | |||
2719 | &rdev_offset.attr, | 2938 | &rdev_offset.attr, |
2720 | &rdev_size.attr, | 2939 | &rdev_size.attr, |
2721 | &rdev_recovery_start.attr, | 2940 | &rdev_recovery_start.attr, |
2941 | &rdev_bad_blocks.attr, | ||
2942 | &rdev_unack_bad_blocks.attr, | ||
2722 | NULL, | 2943 | NULL, |
2723 | }; | 2944 | }; |
2724 | static ssize_t | 2945 | static ssize_t |
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = { | |||
2782 | .default_attrs = rdev_default_attrs, | 3003 | .default_attrs = rdev_default_attrs, |
2783 | }; | 3004 | }; |
2784 | 3005 | ||
2785 | void md_rdev_init(mdk_rdev_t *rdev) | 3006 | int md_rdev_init(mdk_rdev_t *rdev) |
2786 | { | 3007 | { |
2787 | rdev->desc_nr = -1; | 3008 | rdev->desc_nr = -1; |
2788 | rdev->saved_raid_disk = -1; | 3009 | rdev->saved_raid_disk = -1; |
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev) | |||
2792 | rdev->sb_events = 0; | 3013 | rdev->sb_events = 0; |
2793 | rdev->last_read_error.tv_sec = 0; | 3014 | rdev->last_read_error.tv_sec = 0; |
2794 | rdev->last_read_error.tv_nsec = 0; | 3015 | rdev->last_read_error.tv_nsec = 0; |
3016 | rdev->sb_loaded = 0; | ||
3017 | rdev->bb_page = NULL; | ||
2795 | atomic_set(&rdev->nr_pending, 0); | 3018 | atomic_set(&rdev->nr_pending, 0); |
2796 | atomic_set(&rdev->read_errors, 0); | 3019 | atomic_set(&rdev->read_errors, 0); |
2797 | atomic_set(&rdev->corrected_errors, 0); | 3020 | atomic_set(&rdev->corrected_errors, 0); |
2798 | 3021 | ||
2799 | INIT_LIST_HEAD(&rdev->same_set); | 3022 | INIT_LIST_HEAD(&rdev->same_set); |
2800 | init_waitqueue_head(&rdev->blocked_wait); | 3023 | init_waitqueue_head(&rdev->blocked_wait); |
3024 | |||
3025 | /* Add space to store bad block list. | ||
3026 | * This reserves the space even on arrays where it cannot | ||
3027 | * be used - I wonder if that matters | ||
3028 | */ | ||
3029 | rdev->badblocks.count = 0; | ||
3030 | rdev->badblocks.shift = 0; | ||
3031 | rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
3032 | seqlock_init(&rdev->badblocks.lock); | ||
3033 | if (rdev->badblocks.page == NULL) | ||
3034 | return -ENOMEM; | ||
3035 | |||
3036 | return 0; | ||
2801 | } | 3037 | } |
2802 | EXPORT_SYMBOL_GPL(md_rdev_init); | 3038 | EXPORT_SYMBOL_GPL(md_rdev_init); |
2803 | /* | 3039 | /* |
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2823 | return ERR_PTR(-ENOMEM); | 3059 | return ERR_PTR(-ENOMEM); |
2824 | } | 3060 | } |
2825 | 3061 | ||
2826 | md_rdev_init(rdev); | 3062 | err = md_rdev_init(rdev); |
2827 | if ((err = alloc_disk_sb(rdev))) | 3063 | if (err) |
3064 | goto abort_free; | ||
3065 | err = alloc_disk_sb(rdev); | ||
3066 | if (err) | ||
2828 | goto abort_free; | 3067 | goto abort_free; |
2829 | 3068 | ||
2830 | err = lock_rdev(rdev, newdev, super_format == -2); | 3069 | err = lock_rdev(rdev, newdev, super_format == -2); |
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2860 | goto abort_free; | 3099 | goto abort_free; |
2861 | } | 3100 | } |
2862 | } | 3101 | } |
3102 | if (super_format == -1) | ||
3103 | /* hot-add for 0.90, or non-persistent: so no badblocks */ | ||
3104 | rdev->badblocks.shift = -1; | ||
2863 | 3105 | ||
2864 | return rdev; | 3106 | return rdev; |
2865 | 3107 | ||
2866 | abort_free: | 3108 | abort_free: |
2867 | if (rdev->sb_page) { | 3109 | if (rdev->bdev) |
2868 | if (rdev->bdev) | 3110 | unlock_rdev(rdev); |
2869 | unlock_rdev(rdev); | 3111 | free_disk_sb(rdev); |
2870 | free_disk_sb(rdev); | 3112 | kfree(rdev->badblocks.page); |
2871 | } | ||
2872 | kfree(rdev); | 3113 | kfree(rdev); |
2873 | return ERR_PTR(err); | 3114 | return ERR_PTR(err); |
2874 | } | 3115 | } |
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3149 | } | 3390 | } |
3150 | 3391 | ||
3151 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3392 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3152 | char nm[20]; | ||
3153 | if (rdev->raid_disk < 0) | 3393 | if (rdev->raid_disk < 0) |
3154 | continue; | 3394 | continue; |
3155 | if (rdev->new_raid_disk >= mddev->raid_disks) | 3395 | if (rdev->new_raid_disk >= mddev->raid_disks) |
3156 | rdev->new_raid_disk = -1; | 3396 | rdev->new_raid_disk = -1; |
3157 | if (rdev->new_raid_disk == rdev->raid_disk) | 3397 | if (rdev->new_raid_disk == rdev->raid_disk) |
3158 | continue; | 3398 | continue; |
3159 | sprintf(nm, "rd%d", rdev->raid_disk); | 3399 | sysfs_unlink_rdev(mddev, rdev); |
3160 | sysfs_remove_link(&mddev->kobj, nm); | ||
3161 | } | 3400 | } |
3162 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3401 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3163 | if (rdev->raid_disk < 0) | 3402 | if (rdev->raid_disk < 0) |
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3168 | if (rdev->raid_disk < 0) | 3407 | if (rdev->raid_disk < 0) |
3169 | clear_bit(In_sync, &rdev->flags); | 3408 | clear_bit(In_sync, &rdev->flags); |
3170 | else { | 3409 | else { |
3171 | char nm[20]; | 3410 | if (sysfs_link_rdev(mddev, rdev)) |
3172 | sprintf(nm, "rd%d", rdev->raid_disk); | 3411 | printk(KERN_WARNING "md: cannot register rd%d" |
3173 | if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | 3412 | " for %s after level change\n", |
3174 | printk("md: cannot register %s for %s after level change\n", | 3413 | rdev->raid_disk, mdname(mddev)); |
3175 | nm, mdname(mddev)); | ||
3176 | } | 3414 | } |
3177 | } | 3415 | } |
3178 | 3416 | ||
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev) | |||
4504 | } | 4742 | } |
4505 | 4743 | ||
4506 | if (mddev->bio_set == NULL) | 4744 | if (mddev->bio_set == NULL) |
4507 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); | 4745 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, |
4746 | sizeof(mddev_t *)); | ||
4508 | 4747 | ||
4509 | spin_lock(&pers_lock); | 4748 | spin_lock(&pers_lock); |
4510 | pers = find_pers(mddev->level, mddev->clevel); | 4749 | pers = find_pers(mddev->level, mddev->clevel); |
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev) | |||
4621 | smp_wmb(); | 4860 | smp_wmb(); |
4622 | mddev->ready = 1; | 4861 | mddev->ready = 1; |
4623 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4862 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4624 | if (rdev->raid_disk >= 0) { | 4863 | if (rdev->raid_disk >= 0) |
4625 | char nm[20]; | 4864 | if (sysfs_link_rdev(mddev, rdev)) |
4626 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
4627 | if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | ||
4628 | /* failure here is OK */; | 4865 | /* failure here is OK */; |
4629 | } | ||
4630 | 4866 | ||
4631 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4867 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
4632 | 4868 | ||
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4854 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 5090 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
4855 | 5091 | ||
4856 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5092 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4857 | if (rdev->raid_disk >= 0) { | 5093 | if (rdev->raid_disk >= 0) |
4858 | char nm[20]; | 5094 | sysfs_unlink_rdev(mddev, rdev); |
4859 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
4860 | sysfs_remove_link(&mddev->kobj, nm); | ||
4861 | } | ||
4862 | 5095 | ||
4863 | set_capacity(disk, 0); | 5096 | set_capacity(disk, 0); |
4864 | mutex_unlock(&mddev->open_mutex); | 5097 | mutex_unlock(&mddev->open_mutex); |
@@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
6198 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 6431 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
6199 | return; | 6432 | return; |
6200 | 6433 | ||
6201 | if (mddev->external) | 6434 | if (!mddev->pers || !mddev->pers->error_handler) |
6202 | set_bit(Blocked, &rdev->flags); | ||
6203 | /* | ||
6204 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | ||
6205 | mdname(mddev), | ||
6206 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | ||
6207 | __builtin_return_address(0),__builtin_return_address(1), | ||
6208 | __builtin_return_address(2),__builtin_return_address(3)); | ||
6209 | */ | ||
6210 | if (!mddev->pers) | ||
6211 | return; | ||
6212 | if (!mddev->pers->error_handler) | ||
6213 | return; | 6435 | return; |
6214 | mddev->pers->error_handler(mddev,rdev); | 6436 | mddev->pers->error_handler(mddev,rdev); |
6215 | if (mddev->degraded) | 6437 | if (mddev->degraded) |
@@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev) | |||
6933 | atomic_add(sectors, &mddev->recovery_active); | 7155 | atomic_add(sectors, &mddev->recovery_active); |
6934 | } | 7156 | } |
6935 | 7157 | ||
7158 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
7159 | break; | ||
7160 | |||
6936 | j += sectors; | 7161 | j += sectors; |
6937 | if (j>1) mddev->curr_resync = j; | 7162 | if (j>1) mddev->curr_resync = j; |
6938 | mddev->curr_mark_cnt = io_sectors; | 7163 | mddev->curr_mark_cnt = io_sectors; |
6939 | if (last_check == 0) | 7164 | if (last_check == 0) |
6940 | /* this is the earliers that rebuilt will be | 7165 | /* this is the earliest that rebuild will be |
6941 | * visible in /proc/mdstat | 7166 | * visible in /proc/mdstat |
6942 | */ | 7167 | */ |
6943 | md_new_event(mddev); | 7168 | md_new_event(mddev); |
@@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev) | |||
6946 | continue; | 7171 | continue; |
6947 | 7172 | ||
6948 | last_check = io_sectors; | 7173 | last_check = io_sectors; |
6949 | |||
6950 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6951 | break; | ||
6952 | |||
6953 | repeat: | 7174 | repeat: |
6954 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { | 7175 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { |
6955 | /* step marks */ | 7176 | /* step marks */ |
@@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7067 | atomic_read(&rdev->nr_pending)==0) { | 7288 | atomic_read(&rdev->nr_pending)==0) { |
7068 | if (mddev->pers->hot_remove_disk( | 7289 | if (mddev->pers->hot_remove_disk( |
7069 | mddev, rdev->raid_disk)==0) { | 7290 | mddev, rdev->raid_disk)==0) { |
7070 | char nm[20]; | 7291 | sysfs_unlink_rdev(mddev, rdev); |
7071 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7072 | sysfs_remove_link(&mddev->kobj, nm); | ||
7073 | rdev->raid_disk = -1; | 7292 | rdev->raid_disk = -1; |
7074 | } | 7293 | } |
7075 | } | 7294 | } |
7076 | 7295 | ||
7077 | if (mddev->degraded && !mddev->recovery_disabled) { | 7296 | if (mddev->degraded) { |
7078 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7297 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7079 | if (rdev->raid_disk >= 0 && | 7298 | if (rdev->raid_disk >= 0 && |
7080 | !test_bit(In_sync, &rdev->flags) && | 7299 | !test_bit(In_sync, &rdev->flags) && |
7081 | !test_bit(Faulty, &rdev->flags) && | 7300 | !test_bit(Faulty, &rdev->flags)) |
7082 | !test_bit(Blocked, &rdev->flags)) | ||
7083 | spares++; | 7301 | spares++; |
7084 | if (rdev->raid_disk < 0 | 7302 | if (rdev->raid_disk < 0 |
7085 | && !test_bit(Faulty, &rdev->flags)) { | 7303 | && !test_bit(Faulty, &rdev->flags)) { |
7086 | rdev->recovery_offset = 0; | 7304 | rdev->recovery_offset = 0; |
7087 | if (mddev->pers-> | 7305 | if (mddev->pers-> |
7088 | hot_add_disk(mddev, rdev) == 0) { | 7306 | hot_add_disk(mddev, rdev) == 0) { |
7089 | char nm[20]; | 7307 | if (sysfs_link_rdev(mddev, rdev)) |
7090 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
7091 | if (sysfs_create_link(&mddev->kobj, | ||
7092 | &rdev->kobj, nm)) | ||
7093 | /* failure here is OK */; | 7308 | /* failure here is OK */; |
7094 | spares++; | 7309 | spares++; |
7095 | md_new_event(mddev); | 7310 | md_new_event(mddev); |
@@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev) | |||
7138 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 7353 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
7139 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 7354 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
7140 | md_new_event(mddev); | 7355 | md_new_event(mddev); |
7356 | if (mddev->event_work.func) | ||
7357 | queue_work(md_misc_wq, &mddev->event_work); | ||
7141 | } | 7358 | } |
7142 | 7359 | ||
7143 | /* | 7360 | /* |
@@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev) | |||
7170 | if (mddev->bitmap) | 7387 | if (mddev->bitmap) |
7171 | bitmap_daemon_work(mddev); | 7388 | bitmap_daemon_work(mddev); |
7172 | 7389 | ||
7173 | if (mddev->ro) | ||
7174 | return; | ||
7175 | |||
7176 | if (signal_pending(current)) { | 7390 | if (signal_pending(current)) { |
7177 | if (mddev->pers->sync_request && !mddev->external) { | 7391 | if (mddev->pers->sync_request && !mddev->external) { |
7178 | printk(KERN_INFO "md: %s in immediate safe mode\n", | 7392 | printk(KERN_INFO "md: %s in immediate safe mode\n", |
@@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev) | |||
7209 | atomic_read(&rdev->nr_pending)==0) { | 7423 | atomic_read(&rdev->nr_pending)==0) { |
7210 | if (mddev->pers->hot_remove_disk( | 7424 | if (mddev->pers->hot_remove_disk( |
7211 | mddev, rdev->raid_disk)==0) { | 7425 | mddev, rdev->raid_disk)==0) { |
7212 | char nm[20]; | 7426 | sysfs_unlink_rdev(mddev, rdev); |
7213 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7214 | sysfs_remove_link(&mddev->kobj, nm); | ||
7215 | rdev->raid_disk = -1; | 7427 | rdev->raid_disk = -1; |
7216 | } | 7428 | } |
7217 | } | 7429 | } |
@@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
7331 | { | 7543 | { |
7332 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 7544 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
7333 | wait_event_timeout(rdev->blocked_wait, | 7545 | wait_event_timeout(rdev->blocked_wait, |
7334 | !test_bit(Blocked, &rdev->flags), | 7546 | !test_bit(Blocked, &rdev->flags) && |
7547 | !test_bit(BlockedBadBlocks, &rdev->flags), | ||
7335 | msecs_to_jiffies(5000)); | 7548 | msecs_to_jiffies(5000)); |
7336 | rdev_dec_pending(rdev, mddev); | 7549 | rdev_dec_pending(rdev, mddev); |
7337 | } | 7550 | } |
7338 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); | 7551 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
7339 | 7552 | ||
7553 | |||
7554 | /* Bad block management. | ||
7555 | * We can record which blocks on each device are 'bad' and so just | ||
7556 | * fail those blocks, or that stripe, rather than the whole device. | ||
7557 | * Entries in the bad-block table are 64bits wide. This comprises: | ||
7558 | * Length of bad-range, in sectors: 0-511 for lengths 1-512 | ||
7559 | * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) | ||
7560 | * A 'shift' can be set so that larger blocks are tracked and | ||
7561 | * consequently larger devices can be covered. | ||
7562 | * 'Acknowledged' flag - 1 bit, the most significant bit. | ||
7563 | * | ||
7564 | * Locking of the bad-block table uses a seqlock so md_is_badblock | ||
7565 | * might need to retry if it is very unlucky. | ||
7566 | * We will sometimes want to check for bad blocks in a bi_end_io function, | ||
7567 | * so we use the write_seqlock_irq variant. | ||
7568 | * | ||
7569 | * When looking for a bad block we specify a range and want to | ||
7570 | * know if any block in the range is bad. So we binary-search | ||
7571 | * to the last range that starts at-or-before the given endpoint, | ||
7572 | * (or "before the sector after the target range") | ||
7573 | * then see if it ends after the given start. | ||
7574 | * We return | ||
7575 | * 0 if there are no known bad blocks in the range | ||
7576 | * 1 if there are known bad blocks which are all acknowledged | ||
7577 | * -1 if there are bad blocks which have not yet been acknowledged in metadata. | ||
7578 | * plus the start/length of the first bad section we overlap. | ||
7579 | */ | ||
7580 | int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, | ||
7581 | sector_t *first_bad, int *bad_sectors) | ||
7582 | { | ||
7583 | int hi; | ||
7584 | int lo = 0; | ||
7585 | u64 *p = bb->page; | ||
7586 | int rv = 0; | ||
7587 | sector_t target = s + sectors; | ||
7588 | unsigned seq; | ||
7589 | |||
7590 | if (bb->shift > 0) { | ||
7591 | /* round the start down, and the end up */ | ||
7592 | s >>= bb->shift; | ||
7593 | target += (1<<bb->shift) - 1; | ||
7594 | target >>= bb->shift; | ||
7595 | sectors = target - s; | ||
7596 | } | ||
7597 | /* 'target' is now the first block after the bad range */ | ||
7598 | |||
7599 | retry: | ||
7600 | seq = read_seqbegin(&bb->lock); | ||
7601 | |||
7602 | hi = bb->count; | ||
7603 | |||
7604 | /* Binary search between lo and hi for 'target' | ||
7605 | * i.e. for the last range that starts before 'target' | ||
7606 | */ | ||
7607 | /* INVARIANT: ranges before 'lo' and at-or-after 'hi' | ||
7608 | * are known not to be the last range before target. | ||
7609 | * VARIANT: hi-lo is the number of possible | ||
7610 | * ranges, and decreases until it reaches 1 | ||
7611 | */ | ||
7612 | while (hi - lo > 1) { | ||
7613 | int mid = (lo + hi) / 2; | ||
7614 | sector_t a = BB_OFFSET(p[mid]); | ||
7615 | if (a < target) | ||
7616 | /* This could still be the one, earlier ranges | ||
7617 | * could not. */ | ||
7618 | lo = mid; | ||
7619 | else | ||
7620 | /* This and later ranges are definitely out. */ | ||
7621 | hi = mid; | ||
7622 | } | ||
7623 | /* 'lo' might be the last that started before target, but 'hi' isn't */ | ||
7624 | if (hi > lo) { | ||
7625 | /* need to check all ranges that end after 's' to see if | ||
7626 | * any are unacknowledged. | ||
7627 | */ | ||
7628 | while (lo >= 0 && | ||
7629 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
7630 | if (BB_OFFSET(p[lo]) < target) { | ||
7631 | /* starts before the end, and finishes after | ||
7632 | * the start, so they must overlap | ||
7633 | */ | ||
7634 | if (rv != -1 && BB_ACK(p[lo])) | ||
7635 | rv = 1; | ||
7636 | else | ||
7637 | rv = -1; | ||
7638 | *first_bad = BB_OFFSET(p[lo]); | ||
7639 | *bad_sectors = BB_LEN(p[lo]); | ||
7640 | } | ||
7641 | lo--; | ||
7642 | } | ||
7643 | } | ||
7644 | |||
7645 | if (read_seqretry(&bb->lock, seq)) | ||
7646 | goto retry; | ||
7647 | |||
7648 | return rv; | ||
7649 | } | ||
7650 | EXPORT_SYMBOL_GPL(md_is_badblock); | ||
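The return convention above is easiest to verify in isolation. Below is a userspace sketch of the same binary search and back-scan (the seqlock retry and the shift rounding are dropped; the BB_* helpers match the macros this series adds to md.h):

    /* Standalone sketch of md_is_badblock()'s lookup convention. */
    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t u64;
    typedef uint64_t sector_t;

    #define BB_OFFSET(x) (((x) & 0x7FFFFFFFFFFFFE00ULL) >> 9)
    #define BB_LEN(x)    (((x) & 0x1FFULL) + 1)
    #define BB_ACK(x)    (!!((x) & 0x8000000000000000ULL))
    #define BB_MAKE(a, l, ack) (((u64)(a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

    static int is_badblock_sketch(const u64 *p, int count, sector_t s,
                                  int sectors, sector_t *first_bad,
                                  int *bad_sectors)
    {
        sector_t target = s + sectors; /* first sector after the query range */
        int lo = 0, hi = count, rv = 0;

        while (hi - lo > 1) {          /* last range starting before target */
            int mid = (lo + hi) / 2;
            if (BB_OFFSET(p[mid]) < target)
                lo = mid;
            else
                hi = mid;
        }
        if (hi > lo)                   /* scan back while ranges end after s */
            while (lo >= 0 && BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
                if (BB_OFFSET(p[lo]) < target) {   /* overlap */
                    rv = (rv != -1 && BB_ACK(p[lo])) ? 1 : -1;
                    *first_bad = BB_OFFSET(p[lo]);
                    *bad_sectors = BB_LEN(p[lo]);
                }
                lo--;
            }
        return rv;
    }

    int main(void)
    {
        /* (start, len, acked), sorted by start as the kernel table is */
        u64 tab[] = {
            BB_MAKE(100, 8, 1),
            BB_MAKE(120, 4, 0),
            BB_MAKE(200, 16, 1),
        };
        sector_t first_bad;
        int bad_sectors;

        /* query [118, 126): overlaps only the unacknowledged (120, 4) */
        assert(is_badblock_sketch(tab, 3, 118, 8, &first_bad, &bad_sectors) == -1);
        assert(first_bad == 120 && bad_sectors == 4);

        /* query [100, 104): overlaps only the acknowledged (100, 8) */
        assert(is_badblock_sketch(tab, 3, 100, 4, &first_bad, &bad_sectors) == 1);
        return 0;
    }

Note that the back-scan, not the binary search, reports the overlap: it walks left from the last candidate while ranges still end after the query start, so the final first_bad/bad_sectors describe the earliest overlapping range.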
7651 | |||
7652 | /* | ||
7653 | * Add a range of bad blocks to the table. | ||
7654 | * This might extend the table, or might contract it | ||
7655 | * if two adjacent ranges can be merged. | ||
7656 | * We binary-search to find the 'insertion' point, then | ||
7657 | * decide how best to handle it. | ||
7658 | */ | ||
7659 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
7660 | int acknowledged) | ||
7661 | { | ||
7662 | u64 *p; | ||
7663 | int lo, hi; | ||
7664 | int rv = 1; | ||
7665 | |||
7666 | if (bb->shift < 0) | ||
7667 | /* badblocks are disabled */ | ||
7668 | return 0; | ||
7669 | |||
7670 | if (bb->shift) { | ||
7671 | /* round the start down, and the end up */ | ||
7672 | sector_t next = s + sectors; | ||
7673 | s >>= bb->shift; | ||
7674 | next += (1<<bb->shift) - 1; | ||
7675 | next >>= bb->shift; | ||
7676 | sectors = next - s; | ||
7677 | } | ||
7678 | |||
7679 | write_seqlock_irq(&bb->lock); | ||
7680 | |||
7681 | p = bb->page; | ||
7682 | lo = 0; | ||
7683 | hi = bb->count; | ||
7684 | /* Find the last range that starts at-or-before 's' */ | ||
7685 | while (hi - lo > 1) { | ||
7686 | int mid = (lo + hi) / 2; | ||
7687 | sector_t a = BB_OFFSET(p[mid]); | ||
7688 | if (a <= s) | ||
7689 | lo = mid; | ||
7690 | else | ||
7691 | hi = mid; | ||
7692 | } | ||
7693 | if (hi > lo && BB_OFFSET(p[lo]) > s) | ||
7694 | hi = lo; | ||
7695 | |||
7696 | if (hi > lo) { | ||
7697 | /* we found a range that might merge with the start | ||
7698 | * of our new range | ||
7699 | */ | ||
7700 | sector_t a = BB_OFFSET(p[lo]); | ||
7701 | sector_t e = a + BB_LEN(p[lo]); | ||
7702 | int ack = BB_ACK(p[lo]); | ||
7703 | if (e >= s) { | ||
7704 | /* Yes, we can merge with a previous range */ | ||
7705 | if (s == a && s + sectors >= e) | ||
7706 | /* new range covers old */ | ||
7707 | ack = acknowledged; | ||
7708 | else | ||
7709 | ack = ack && acknowledged; | ||
7710 | |||
7711 | if (e < s + sectors) | ||
7712 | e = s + sectors; | ||
7713 | if (e - a <= BB_MAX_LEN) { | ||
7714 | p[lo] = BB_MAKE(a, e-a, ack); | ||
7715 | s = e; | ||
7716 | } else { | ||
7717 | /* does not all fit in one range, | ||
7718 | * make p[lo] maximal | ||
7719 | */ | ||
7720 | if (BB_LEN(p[lo]) != BB_MAX_LEN) | ||
7721 | p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
7722 | s = a + BB_MAX_LEN; | ||
7723 | } | ||
7724 | sectors = e - s; | ||
7725 | } | ||
7726 | } | ||
7727 | if (sectors && hi < bb->count) { | ||
7728 | /* 'hi' points to the first range that starts after 's'. | ||
7729 | * Maybe we can merge with the start of that range */ | ||
7730 | sector_t a = BB_OFFSET(p[hi]); | ||
7731 | sector_t e = a + BB_LEN(p[hi]); | ||
7732 | int ack = BB_ACK(p[hi]); | ||
7733 | if (a <= s + sectors) { | ||
7734 | /* merging is possible */ | ||
7735 | if (e <= s + sectors) { | ||
7736 | /* full overlap */ | ||
7737 | e = s + sectors; | ||
7738 | ack = acknowledged; | ||
7739 | } else | ||
7740 | ack = ack && acknowledged; | ||
7741 | |||
7742 | a = s; | ||
7743 | if (e - a <= BB_MAX_LEN) { | ||
7744 | p[hi] = BB_MAKE(a, e-a, ack); | ||
7745 | s = e; | ||
7746 | } else { | ||
7747 | p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
7748 | s = a + BB_MAX_LEN; | ||
7749 | } | ||
7750 | sectors = e - s; | ||
7751 | lo = hi; | ||
7752 | hi++; | ||
7753 | } | ||
7754 | } | ||
7755 | if (sectors == 0 && hi < bb->count) { | ||
7756 | /* we might be able to combine lo and hi */ | ||
7757 | /* Note: 's' is at the end of 'lo' */ | ||
7758 | sector_t a = BB_OFFSET(p[hi]); | ||
7759 | int lolen = BB_LEN(p[lo]); | ||
7760 | int hilen = BB_LEN(p[hi]); | ||
7761 | int newlen = lolen + hilen - (s - a); | ||
7762 | if (s >= a && newlen < BB_MAX_LEN) { | ||
7763 | /* yes, we can combine them */ | ||
7764 | int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); | ||
7765 | p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); | ||
7766 | memmove(p + hi, p + hi + 1, | ||
7767 | (bb->count - hi - 1) * 8); | ||
7768 | bb->count--; | ||
7769 | } | ||
7770 | } | ||
7771 | while (sectors) { | ||
7772 | /* didn't merge (all of it). | ||
7773 | * Need to add a range just before 'hi' */ | ||
7774 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
7775 | /* No room for more */ | ||
7776 | rv = 0; | ||
7777 | break; | ||
7778 | } else { | ||
7779 | int this_sectors = sectors; | ||
7780 | memmove(p + hi + 1, p + hi, | ||
7781 | (bb->count - hi) * 8); | ||
7782 | bb->count++; | ||
7783 | |||
7784 | if (this_sectors > BB_MAX_LEN) | ||
7785 | this_sectors = BB_MAX_LEN; | ||
7786 | p[hi] = BB_MAKE(s, this_sectors, acknowledged); | ||
7787 | sectors -= this_sectors; | ||
7788 | s += this_sectors; | ||
7789 | } | ||
7790 | } | ||
7791 | |||
7792 | bb->changed = 1; | ||
7793 | if (!acknowledged) | ||
7794 | bb->unacked_exist = 1; | ||
7795 | write_sequnlock_irq(&bb->lock); | ||
7796 | |||
7797 | return rv; | ||
7798 | } | ||
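A worked example of the front-merge path: with the table holding a single acknowledged range (start 100, length 8) and a caller adding (104, 10) unacknowledged, the search leaves 'lo' on that entry; it ends at 108 >= 104, so the two merge into (100, 14), the combined entry becomes unacknowledged (ack && acknowledged is 0), and 'sectors' drops to 0 so no extra slot is consumed. Only when a merged run would exceed BB_MAX_LEN (512 sectors) does the tail spill into further entries via the final while loop.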
7799 | |||
7800 | int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
7801 | int acknowledged) | ||
7802 | { | ||
7803 | int rv = md_set_badblocks(&rdev->badblocks, | ||
7804 | s + rdev->data_offset, sectors, acknowledged); | ||
7805 | if (rv) { | ||
7806 | /* Make sure they get written out promptly */ | ||
7807 | set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); | ||
7808 | md_wakeup_thread(rdev->mddev->thread); | ||
7809 | } | ||
7810 | return rv; | ||
7811 | } | ||
7812 | EXPORT_SYMBOL_GPL(rdev_set_badblocks); | ||
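The expected consumer is a personality's write-error path: confine the failure to a bad-block record, and only fail the whole device when the table is full. A sketch of that pattern (a hypothetical helper, not the literal raid1 code, which sets WriteErrorSeen in its end_io handler and records the blocks later from its daemon thread):

	/* Sketch: confine a failed write to a bad-block record; only if
	 * the table is full (rdev_set_badblocks() returns 0) escalate to
	 * failing the whole device.
	 */
	static void handle_write_error(mddev_t *mddev, mdk_rdev_t *rdev,
				       sector_t sector, int sectors)
	{
		set_bit(WriteErrorSeen, &rdev->flags);
		if (!rdev_set_badblocks(rdev, sector, sectors, 0))
			md_error(mddev, rdev);
	}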
7813 | |||
7814 | /* | ||
7815 | * Remove a range of bad blocks from the table. | ||
7816 | * This may involve extending the table if we split a region, | ||
7817 | * but it must not fail. So if the table becomes full, we just | ||
7818 | * drop the remove request. | ||
7819 | */ | ||
7820 | static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) | ||
7821 | { | ||
7822 | u64 *p; | ||
7823 | int lo, hi; | ||
7824 | sector_t target = s + sectors; | ||
7825 | int rv = 0; | ||
7826 | |||
7827 | if (bb->shift > 0) { | ||
7828 | /* When clearing we round the start up and the end down. | ||
7829 | * This should not matter as the shift should align with | ||
7830 | * the block size and no rounding should ever be needed. | ||
7831 | * However it is better to think a block is bad when it | ||
7832 | * isn't than to think a block is not bad when it is. | ||
7833 | */ | ||
7834 | s += (1<<bb->shift) - 1; | ||
7835 | s >>= bb->shift; | ||
7836 | target >>= bb->shift; | ||
7837 | sectors = target - s; | ||
7838 | } | ||
7839 | |||
7840 | write_seqlock_irq(&bb->lock); | ||
7841 | |||
7842 | p = bb->page; | ||
7843 | lo = 0; | ||
7844 | hi = bb->count; | ||
7845 | /* Find the last range that starts before 'target' */ | ||
7846 | while (hi - lo > 1) { | ||
7847 | int mid = (lo + hi) / 2; | ||
7848 | sector_t a = BB_OFFSET(p[mid]); | ||
7849 | if (a < target) | ||
7850 | lo = mid; | ||
7851 | else | ||
7852 | hi = mid; | ||
7853 | } | ||
7854 | if (hi > lo) { | ||
7855 | /* p[lo] is the last range that could overlap the | ||
7856 | * current range. Earlier ranges could also overlap, | ||
7857 | * but only this one can overlap the end of the range. | ||
7858 | */ | ||
7859 | if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { | ||
7860 | /* Partial overlap, leave the tail of this range */ | ||
7861 | int ack = BB_ACK(p[lo]); | ||
7862 | sector_t a = BB_OFFSET(p[lo]); | ||
7863 | sector_t end = a + BB_LEN(p[lo]); | ||
7864 | |||
7865 | if (a < s) { | ||
7866 | /* we need to split this range */ | ||
7867 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
7868 | rv = 0; | ||
7869 | goto out; | ||
7870 | } | ||
7871 | memmove(p+lo+1, p+lo, (bb->count - lo) * 8); | ||
7872 | bb->count++; | ||
7873 | p[lo] = BB_MAKE(a, s-a, ack); | ||
7874 | lo++; | ||
7875 | } | ||
7876 | p[lo] = BB_MAKE(target, end - target, ack); | ||
7877 | /* there is no longer an overlap */ | ||
7878 | hi = lo; | ||
7879 | lo--; | ||
7880 | } | ||
7881 | while (lo >= 0 && | ||
7882 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
7883 | /* This range does overlap */ | ||
7884 | if (BB_OFFSET(p[lo]) < s) { | ||
7885 | /* Keep the early parts of this range. */ | ||
7886 | int ack = BB_ACK(p[lo]); | ||
7887 | sector_t start = BB_OFFSET(p[lo]); | ||
7888 | p[lo] = BB_MAKE(start, s - start, ack); | ||
7889 | /* now 'lo' doesn't overlap, so.. */ | ||
7890 | break; | ||
7891 | } | ||
7892 | lo--; | ||
7893 | } | ||
7894 | /* 'lo' is strictly before, 'hi' is strictly after, | ||
7895 | * anything between needs to be discarded | ||
7896 | */ | ||
7897 | if (hi - lo > 1) { | ||
7898 | memmove(p+lo+1, p+hi, (bb->count - hi) * 8); | ||
7899 | bb->count -= (hi - lo - 1); | ||
7900 | } | ||
7901 | } | ||
7902 | |||
7903 | bb->changed = 1; | ||
7904 | out: | ||
7905 | write_sequnlock_irq(&bb->lock); | ||
7906 | return rv; | ||
7907 | } | ||
7908 | |||
7909 | int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors) | ||
7910 | { | ||
7911 | return md_clear_badblocks(&rdev->badblocks, | ||
7912 | s + rdev->data_offset, | ||
7913 | sectors); | ||
7914 | } | ||
7915 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | ||
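To see the split case: clearing (104, 4) from a table whose only entry is the acknowledged (100, 16) duplicates the entry, truncates the original to (100, 4), and rewrites the copy to cover the surviving tail (108, 8). A clear in the middle of a range therefore consumes one extra slot, and when the table is already full the remove request is simply dropped, per the comment above; it is always safe to leave a block marked bad.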
7916 | |||
7917 | /* | ||
7918 | * Acknowledge all bad blocks in a list. | ||
7919 | * This only succeeds if ->changed is clear. It is used by | ||
7920 | * in-kernel metadata updates | ||
7921 | */ | ||
7922 | void md_ack_all_badblocks(struct badblocks *bb) | ||
7923 | { | ||
7924 | if (bb->page == NULL || bb->changed) | ||
7925 | /* no point even trying */ | ||
7926 | return; | ||
7927 | write_seqlock_irq(&bb->lock); | ||
7928 | |||
7929 | if (bb->changed == 0) { | ||
7930 | u64 *p = bb->page; | ||
7931 | int i; | ||
7932 | for (i = 0; i < bb->count ; i++) { | ||
7933 | if (!BB_ACK(p[i])) { | ||
7934 | sector_t start = BB_OFFSET(p[i]); | ||
7935 | int len = BB_LEN(p[i]); | ||
7936 | p[i] = BB_MAKE(start, len, 1); | ||
7937 | } | ||
7938 | } | ||
7939 | bb->unacked_exist = 0; | ||
7940 | } | ||
7941 | write_sequnlock_irq(&bb->lock); | ||
7942 | } | ||
7943 | EXPORT_SYMBOL_GPL(md_ack_all_badblocks); | ||
7944 | |||
7945 | /* sysfs access to bad-blocks list. | ||
7946 | * We present two files. | ||
7947 | * 'bad-blocks' lists sector numbers and lengths of ranges that | ||
7948 | * are recorded as bad. The list is truncated to fit within | ||
7949 | * the one-page limit of sysfs. | ||
7950 | * Writing "sector length" to this file adds an acknowledged | ||
7951 | * bad block to the list. | ||
7952 | * 'unacknowledged-bad-blocks' lists bad blocks that have not yet | ||
7953 | * been acknowledged. Writing to this file adds bad blocks | ||
7954 | * without acknowledging them. This is largely for testing. | ||
7955 | */ | ||
7956 | |||
7957 | static ssize_t | ||
7958 | badblocks_show(struct badblocks *bb, char *page, int unack) | ||
7959 | { | ||
7960 | size_t len; | ||
7961 | int i; | ||
7962 | u64 *p = bb->page; | ||
7963 | unsigned seq; | ||
7964 | |||
7965 | if (bb->shift < 0) | ||
7966 | return 0; | ||
7967 | |||
7968 | retry: | ||
7969 | seq = read_seqbegin(&bb->lock); | ||
7970 | |||
7971 | len = 0; | ||
7972 | i = 0; | ||
7973 | |||
7974 | while (len < PAGE_SIZE && i < bb->count) { | ||
7975 | sector_t s = BB_OFFSET(p[i]); | ||
7976 | unsigned int length = BB_LEN(p[i]); | ||
7977 | int ack = BB_ACK(p[i]); | ||
7978 | i++; | ||
7979 | |||
7980 | if (unack && ack) | ||
7981 | continue; | ||
7982 | |||
7983 | len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", | ||
7984 | (unsigned long long)s << bb->shift, | ||
7985 | length << bb->shift); | ||
7986 | } | ||
7987 | if (unack && len == 0) | ||
7988 | bb->unacked_exist = 0; | ||
7989 | |||
7990 | if (read_seqretry(&bb->lock, seq)) | ||
7991 | goto retry; | ||
7992 | |||
7993 | return len; | ||
7994 | } | ||
7995 | |||
7996 | #define DO_DEBUG 1 | ||
7997 | |||
7998 | static ssize_t | ||
7999 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) | ||
8000 | { | ||
8001 | unsigned long long sector; | ||
8002 | int length; | ||
8003 | char newline; | ||
8004 | #ifdef DO_DEBUG | ||
8005 | /* Allow clearing via sysfs *only* for testing/debugging. | ||
8006 | * Normally only a successful write may clear a badblock | ||
8007 | */ | ||
8008 | int clear = 0; | ||
8009 | if (page[0] == '-') { | ||
8010 | clear = 1; | ||
8011 | page++; | ||
8012 | } | ||
8013 | #endif /* DO_DEBUG */ | ||
8014 | |||
8015 | switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { | ||
8016 | case 3: | ||
8017 | if (newline != '\n') | ||
8018 | return -EINVAL; | ||
8019 | case 2: | ||
8020 | if (length <= 0) | ||
8021 | return -EINVAL; | ||
8022 | break; | ||
8023 | default: | ||
8024 | return -EINVAL; | ||
8025 | } | ||
8026 | |||
8027 | #ifdef DO_DEBUG | ||
8028 | if (clear) { | ||
8029 | md_clear_badblocks(bb, sector, length); | ||
8030 | return len; | ||
8031 | } | ||
8032 | #endif /* DO_DEBUG */ | ||
8033 | if (md_set_badblocks(bb, sector, length, !unack)) | ||
8034 | return len; | ||
8035 | else | ||
8036 | return -ENOSPC; | ||
8037 | } | ||
8038 | |||
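The store format is line-oriented, "sector length\n", so the file can be driven from userspace. A sketch (the sysfs path is illustrative only, since the per-rdev directory name depends on the array and member device; with DO_DEBUG compiled in, a leading '-' clears the range instead of adding it):

    /* Sketch: record an 8-sector bad range at sector 123456 via the
     * rdev's bad-blocks sysfs file.  Path is hypothetical.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/block/md0/md/dev-sdb1/bad_blocks", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* matches the sscanf() format in badblocks_store() above */
        if (dprintf(fd, "%llu %d\n", 123456ULL, 8) < 0)
            perror("write");
        close(fd);
        return 0;
    }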
7340 | static int md_notify_reboot(struct notifier_block *this, | 8039 | static int md_notify_reboot(struct notifier_block *this, |
7341 | unsigned long code, void *x) | 8040 | unsigned long code, void *x) |
7342 | { | 8041 | { |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c26c7a08ae6..1e586bb4452e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -29,6 +29,13 @@ | |||
29 | typedef struct mddev_s mddev_t; | 29 | typedef struct mddev_s mddev_t; |
30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
31 | 31 | ||
32 | /* Bad block numbers are stored sorted in a single page. | ||
33 | * 64bits is used for each block or extent. | ||
34 | * 54 bits are sector number, 9 bits are extent size, | ||
35 | * 1 bit is an 'acknowledged' flag. | ||
36 | */ | ||
37 | #define MD_MAX_BADBLOCKS (PAGE_SIZE/8) | ||
38 | |||
32 | /* | 39 | /* |
33 | * MD's 'extended' device | 40 | * MD's 'extended' device |
34 | */ | 41 | */ |
@@ -48,7 +55,7 @@ struct mdk_rdev_s | |||
48 | struct block_device *meta_bdev; | 55 | struct block_device *meta_bdev; |
49 | struct block_device *bdev; /* block device handle */ | 56 | struct block_device *bdev; /* block device handle */ |
50 | 57 | ||
51 | struct page *sb_page; | 58 | struct page *sb_page, *bb_page; |
52 | int sb_loaded; | 59 | int sb_loaded; |
53 | __u64 sb_events; | 60 | __u64 sb_events; |
54 | sector_t data_offset; /* start of data in array */ | 61 | sector_t data_offset; /* start of data in array */ |
@@ -74,9 +81,29 @@ struct mdk_rdev_s | |||
74 | #define In_sync 2 /* device is in_sync with rest of array */ | 81 | #define In_sync 2 /* device is in_sync with rest of array */ |
75 | #define WriteMostly 4 /* Avoid reading if at all possible */ | 82 | #define WriteMostly 4 /* Avoid reading if at all possible */ |
76 | #define AutoDetected 7 /* added by auto-detect */ | 83 | #define AutoDetected 7 /* added by auto-detect */ |
77 | #define Blocked 8 /* An error occurred on an externally | 84 | #define Blocked 8 /* An error occurred but has not yet |
78 | * managed array, don't allow writes | 85 | * been acknowledged by the metadata |
86 | * handler, so don't allow writes | ||
79 | * until it is cleared */ | 87 | * until it is cleared */ |
88 | #define WriteErrorSeen 9 /* A write error has been seen on this | ||
89 | * device | ||
90 | */ | ||
91 | #define FaultRecorded 10 /* Intermediate state for clearing | ||
92 | * Blocked. The Fault is/will-be | ||
93 | * recorded in the metadata, but that | ||
94 | * metadata hasn't been stored safely | ||
95 | * on disk yet. | ||
96 | */ | ||
97 | #define BlockedBadBlocks 11 /* A writer is blocked because it | ||
98 | * found an unacknowledged bad-block. | ||
99 | * This can safely be cleared at any | ||
100 | * time, and the writer will re-check. | ||
101 | * It may be set at any time, and at | ||
102 | * worst the writer will timeout and | ||
103 | * re-check. So setting it as | ||
104 | * accurately as possible is good, but | ||
105 | * not absolutely critical. | ||
106 | */ | ||
80 | wait_queue_head_t blocked_wait; | 107 | wait_queue_head_t blocked_wait; |
81 | 108 | ||
82 | int desc_nr; /* descriptor index in the superblock */ | 109 | int desc_nr; /* descriptor index in the superblock */ |
@@ -111,8 +138,54 @@ struct mdk_rdev_s | |||
111 | 138 | ||
112 | struct sysfs_dirent *sysfs_state; /* handle for 'state' | 139 | struct sysfs_dirent *sysfs_state; /* handle for 'state' |
113 | * sysfs entry */ | 140 | * sysfs entry */ |
141 | |||
142 | struct badblocks { | ||
143 | int count; /* count of bad blocks */ | ||
144 | int unacked_exist; /* there probably are unacknowledged | ||
145 | * bad blocks. This is only cleared | ||
146 | * when a read discovers none | ||
147 | */ | ||
148 | int shift; /* shift from sectors to block size | ||
149 | * a -ve shift means badblocks are | ||
150 | * disabled.*/ | ||
151 | u64 *page; /* badblock list */ | ||
152 | int changed; | ||
153 | seqlock_t lock; | ||
154 | |||
155 | sector_t sector; | ||
156 | sector_t size; /* in sectors */ | ||
157 | } badblocks; | ||
114 | }; | 158 | }; |
115 | 159 | ||
160 | #define BB_LEN_MASK (0x00000000000001FFULL) | ||
161 | #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) | ||
162 | #define BB_ACK_MASK (0x8000000000000000ULL) | ||
163 | #define BB_MAX_LEN 512 | ||
164 | #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) | ||
165 | #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) | ||
166 | #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) | ||
167 | #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) | ||
168 | |||
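To make the packing arithmetic concrete, here is a userspace round-trip of the macros above (u64 is typedef'd locally; the (l)-1 in BB_MAKE is why lengths 1-512 fit in the 9 length bits):

    /* Round-trip check of the 64-bit bad-block entry encoding. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t u64;

    #define BB_LEN_MASK    (0x00000000000001FFULL)
    #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
    #define BB_ACK_MASK    (0x8000000000000000ULL)
    #define BB_OFFSET(x)   (((x) & BB_OFFSET_MASK) >> 9)
    #define BB_LEN(x)      (((x) & BB_LEN_MASK) + 1)
    #define BB_ACK(x)      (!!((x) & BB_ACK_MASK))
    #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

    int main(void)
    {
        /* a 17-sector unacknowledged range starting at sector 123456 */
        u64 e = BB_MAKE(123456ULL, 17, 0);

        assert(BB_OFFSET(e) == 123456); /* 54-bit start, stored << 9 */
        assert(BB_LEN(e) == 17);        /* lengths 1..512 stored as 0..511 */
        assert(!BB_ACK(e));

        e |= BB_ACK_MASK;               /* acknowledging sets the top bit */
        assert(BB_ACK(e));
        printf("entry = %#llx\n", (unsigned long long)e);
        return 0;
    }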
169 | extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, | ||
170 | sector_t *first_bad, int *bad_sectors); | ||
171 | static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
172 | sector_t *first_bad, int *bad_sectors) | ||
173 | { | ||
174 | if (unlikely(rdev->badblocks.count)) { | ||
175 | int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, | ||
176 | sectors, | ||
177 | first_bad, bad_sectors); | ||
178 | if (rv) | ||
179 | *first_bad -= rdev->data_offset; | ||
180 | return rv; | ||
181 | } | ||
182 | return 0; | ||
183 | } | ||
184 | extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
185 | int acknowledged); | ||
186 | extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors); | ||
187 | extern void md_ack_all_badblocks(struct badblocks *bb); | ||
188 | |||
116 | struct mddev_s | 189 | struct mddev_s |
117 | { | 190 | { |
118 | void *private; | 191 | void *private; |
@@ -239,9 +312,12 @@ struct mddev_s | |||
239 | #define MD_RECOVERY_FROZEN 9 | 312 | #define MD_RECOVERY_FROZEN 9 |
240 | 313 | ||
241 | unsigned long recovery; | 314 | unsigned long recovery; |
242 | int recovery_disabled; /* if we detect that recovery | 315 | /* If a RAID personality determines that recovery (of a particular |
243 | * will always fail, set this | 316 | * device) will fail due to a read error on the source device, it |
244 | * so we don't loop trying */ | 317 | * takes a copy of this number and does not attempt recovery again |
318 | * until this number changes. | ||
319 | */ | ||
320 | int recovery_disabled; | ||
245 | 321 | ||
246 | int in_sync; /* know to not need resync */ | 322 | int in_sync; /* know to not need resync */ |
247 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so | 323 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so |
@@ -304,11 +380,6 @@ struct mddev_s | |||
304 | * hot-adding a bitmap. It should | 380 | * hot-adding a bitmap. It should |
305 | * eventually be settable by sysfs. | 381 | * eventually be settable by sysfs. |
306 | */ | 382 | */ |
307 | /* When md is serving under dm, it might use a | ||
308 | * dirty_log to store the bits. | ||
309 | */ | ||
310 | struct dm_dirty_log *log; | ||
311 | |||
312 | struct mutex mutex; | 383 | struct mutex mutex; |
313 | unsigned long chunksize; | 384 | unsigned long chunksize; |
314 | unsigned long daemon_sleep; /* how many jiffies between updates? */ | 385 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
@@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev) | |||
413 | return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; | 484 | return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; |
414 | } | 485 | } |
415 | 486 | ||
487 | static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev) | ||
488 | { | ||
489 | char nm[20]; | ||
490 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
491 | return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | ||
492 | } | ||
493 | |||
494 | static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev) | ||
495 | { | ||
496 | char nm[20]; | ||
497 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
498 | sysfs_remove_link(&mddev->kobj, nm); | ||
499 | } | ||
500 | |||
416 | /* | 501 | /* |
417 | * iterates through some rdev ringlist. It's safe to remove the | 502 | * iterates through some rdev ringlist. It's safe to remove the |
418 | * current 'rdev'. Don't touch 'tmp' though. | 503 | * current 'rdev'. Don't touch 'tmp' though. |
@@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev); | |||
505 | extern int md_run(mddev_t *mddev); | 590 | extern int md_run(mddev_t *mddev); |
506 | extern void md_stop(mddev_t *mddev); | 591 | extern void md_stop(mddev_t *mddev); |
507 | extern void md_stop_writes(mddev_t *mddev); | 592 | extern void md_stop_writes(mddev_t *mddev); |
508 | extern void md_rdev_init(mdk_rdev_t *rdev); | 593 | extern int md_rdev_init(mdk_rdev_t *rdev); |
509 | 594 | ||
510 | extern void mddev_suspend(mddev_t *mddev); | 595 | extern void mddev_suspend(mddev_t *mddev); |
511 | extern void mddev_resume(mddev_t *mddev); | 596 | extern void mddev_resume(mddev_t *mddev); |
@@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
514 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | 599 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
515 | mddev_t *mddev); | 600 | mddev_t *mddev); |
516 | extern int mddev_check_plugged(mddev_t *mddev); | 601 | extern int mddev_check_plugged(mddev_t *mddev); |
602 | extern void md_trim_bio(struct bio *bio, int offset, int size); | ||
517 | #endif /* _MD_MD_H */ | 603 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f7431b6d8447..32323f0afd89 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -35,16 +35,13 @@ | |||
35 | #include <linux/delay.h> | 35 | #include <linux/delay.h> |
36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/seq_file.h> | 37 | #include <linux/seq_file.h> |
38 | #include <linux/ratelimit.h> | ||
38 | #include "md.h" | 39 | #include "md.h" |
39 | #include "raid1.h" | 40 | #include "raid1.h" |
40 | #include "bitmap.h" | 41 | #include "bitmap.h" |
41 | 42 | ||
42 | #define DEBUG 0 | 43 | #define DEBUG 0 |
43 | #if DEBUG | 44 | #define PRINTK(x...) do { if (DEBUG) printk(x); } while (0) |
44 | #define PRINTK(x...) printk(x) | ||
45 | #else | ||
46 | #define PRINTK(x...) | ||
47 | #endif | ||
48 | 45 | ||
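The do { if (DEBUG) ... } while (0) form keeps PRINTK usable as a single statement and, unlike the old #if version, lets the compiler typecheck the format arguments even when DEBUG is 0 (the dead branch is then eliminated). A userspace sketch of the pitfall the wrapper avoids, with printf standing in for printk:

    #include <stdio.h>

    #define DEBUG 0
    /* bare-if version: looks like one statement, but is not else-safe */
    #define PRINTK_BARE(x...) if (DEBUG) printf(x)
    /* the form the patch adopts: always one statement */
    #define PRINTK(x...) do { if (DEBUG) printf(x); } while (0)

    static int work(int error)
    {
        if (error)
            PRINTK_BARE("failed\n");
        else        /* binds to the hidden `if (DEBUG)`, not `if (error)`! */
            return -1;
        return 0;
    }

    int main(void)
    {
        /* with DEBUG == 0: work(1) returns -1 and work(0) returns 0,
         * i.e. the error path took the branch meant for "no error" */
        printf("work(1)=%d work(0)=%d\n", work(1), work(0));
        return 0;
    }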
49 | /* | 46 | /* |
50 | * Number of guaranteed r1bios in case of extreme VM load: | 47 | * Number of guaranteed r1bios in case of extreme VM load: |
@@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
166 | 163 | ||
167 | for (i = 0; i < conf->raid_disks; i++) { | 164 | for (i = 0; i < conf->raid_disks; i++) { |
168 | struct bio **bio = r1_bio->bios + i; | 165 | struct bio **bio = r1_bio->bios + i; |
169 | if (*bio && *bio != IO_BLOCKED) | 166 | if (!BIO_SPECIAL(*bio)) |
170 | bio_put(*bio); | 167 | bio_put(*bio); |
171 | *bio = NULL; | 168 | *bio = NULL; |
172 | } | 169 | } |
@@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio) | |||
176 | { | 173 | { |
177 | conf_t *conf = r1_bio->mddev->private; | 174 | conf_t *conf = r1_bio->mddev->private; |
178 | 175 | ||
179 | /* | ||
180 | * Wake up any possible resync thread that waits for the device | ||
181 | * to go idle. | ||
182 | */ | ||
183 | allow_barrier(conf); | ||
184 | |||
185 | put_all_bios(conf, r1_bio); | 176 | put_all_bios(conf, r1_bio); |
186 | mempool_free(r1_bio, conf->r1bio_pool); | 177 | mempool_free(r1_bio, conf->r1bio_pool); |
187 | } | 178 | } |
@@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
222 | * operation and are ready to return a success/failure code to the buffer | 213 | * operation and are ready to return a success/failure code to the buffer |
223 | * cache layer. | 214 | * cache layer. |
224 | */ | 215 | */ |
216 | static void call_bio_endio(r1bio_t *r1_bio) | ||
217 | { | ||
218 | struct bio *bio = r1_bio->master_bio; | ||
219 | int done; | ||
220 | conf_t *conf = r1_bio->mddev->private; | ||
221 | |||
222 | if (bio->bi_phys_segments) { | ||
223 | unsigned long flags; | ||
224 | spin_lock_irqsave(&conf->device_lock, flags); | ||
225 | bio->bi_phys_segments--; | ||
226 | done = (bio->bi_phys_segments == 0); | ||
227 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
228 | } else | ||
229 | done = 1; | ||
230 | |||
231 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) | ||
232 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
233 | if (done) { | ||
234 | bio_endio(bio, 0); | ||
235 | /* | ||
236 | * Wake up any possible resync thread that waits for the device | ||
237 | * to go idle. | ||
238 | */ | ||
239 | allow_barrier(conf); | ||
240 | } | ||
241 | } | ||
242 | |||
225 | static void raid_end_bio_io(r1bio_t *r1_bio) | 243 | static void raid_end_bio_io(r1bio_t *r1_bio) |
226 | { | 244 | { |
227 | struct bio *bio = r1_bio->master_bio; | 245 | struct bio *bio = r1_bio->master_bio; |
@@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
234 | (unsigned long long) bio->bi_sector + | 252 | (unsigned long long) bio->bi_sector + |
235 | (bio->bi_size >> 9) - 1); | 253 | (bio->bi_size >> 9) - 1); |
236 | 254 | ||
237 | bio_endio(bio, | 255 | call_bio_endio(r1_bio); |
238 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
239 | } | 256 | } |
240 | free_r1bio(r1_bio); | 257 | free_r1bio(r1_bio); |
241 | } | 258 | } |
@@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
287 | * oops, read error: | 304 | * oops, read error: |
288 | */ | 305 | */ |
289 | char b[BDEVNAME_SIZE]; | 306 | char b[BDEVNAME_SIZE]; |
290 | if (printk_ratelimit()) | 307 | printk_ratelimited( |
291 | printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", | 308 | KERN_ERR "md/raid1:%s: %s: " |
292 | mdname(conf->mddev), | 309 | "rescheduling sector %llu\n", |
293 | bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); | 310 | mdname(conf->mddev), |
311 | bdevname(conf->mirrors[mirror].rdev->bdev, | ||
312 | b), | ||
313 | (unsigned long long)r1_bio->sector); | ||
314 | set_bit(R1BIO_ReadError, &r1_bio->state); | ||
294 | reschedule_retry(r1_bio); | 315 | reschedule_retry(r1_bio); |
295 | } | 316 | } |
296 | 317 | ||
297 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 318 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); |
298 | } | 319 | } |
299 | 320 | ||
321 | static void close_write(r1bio_t *r1_bio) | ||
322 | { | ||
323 | /* it really is the end of this request */ | ||
324 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
325 | /* free extra copy of the data pages */ | ||
326 | int i = r1_bio->behind_page_count; | ||
327 | while (i--) | ||
328 | safe_put_page(r1_bio->behind_bvecs[i].bv_page); | ||
329 | kfree(r1_bio->behind_bvecs); | ||
330 | r1_bio->behind_bvecs = NULL; | ||
331 | } | ||
332 | /* clear the bitmap if all writes complete successfully */ | ||
333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
334 | r1_bio->sectors, | ||
335 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
336 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
337 | md_write_end(r1_bio->mddev); | ||
338 | } | ||
339 | |||
300 | static void r1_bio_write_done(r1bio_t *r1_bio) | 340 | static void r1_bio_write_done(r1bio_t *r1_bio) |
301 | { | 341 | { |
302 | if (atomic_dec_and_test(&r1_bio->remaining)) | 342 | if (!atomic_dec_and_test(&r1_bio->remaining)) |
303 | { | 343 | return; |
304 | /* it really is the end of this request */ | 344 | |
305 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | 345 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
306 | /* free extra copy of the data pages */ | 346 | reschedule_retry(r1_bio); |
307 | int i = r1_bio->behind_page_count; | 347 | else { |
308 | while (i--) | 348 | close_write(r1_bio); |
309 | safe_put_page(r1_bio->behind_pages[i]); | 349 | if (test_bit(R1BIO_MadeGood, &r1_bio->state)) |
310 | kfree(r1_bio->behind_pages); | 350 | reschedule_retry(r1_bio); |
311 | r1_bio->behind_pages = NULL; | 351 | else |
312 | } | 352 | raid_end_bio_io(r1_bio); |
313 | /* clear the bitmap if all writes complete successfully */ | ||
314 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
315 | r1_bio->sectors, | ||
316 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
317 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
318 | md_write_end(r1_bio->mddev); | ||
319 | raid_end_bio_io(r1_bio); | ||
320 | } | 353 | } |
321 | } | 354 | } |
322 | 355 | ||
@@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
336 | /* | 369 | /* |
337 | * 'one mirror IO has finished' event handler: | 370 | * 'one mirror IO has finished' event handler: |
338 | */ | 371 | */ |
339 | r1_bio->bios[mirror] = NULL; | ||
340 | to_put = bio; | ||
341 | if (!uptodate) { | 372 | if (!uptodate) { |
342 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 373 | set_bit(WriteErrorSeen, |
343 | /* an I/O failed, we can't clear the bitmap */ | 374 | &conf->mirrors[mirror].rdev->flags); |
344 | set_bit(R1BIO_Degraded, &r1_bio->state); | 375 | set_bit(R1BIO_WriteError, &r1_bio->state); |
345 | } else | 376 | } else { |
346 | /* | 377 | /* |
347 | * Set R1BIO_Uptodate in our master bio, so that we | 378 | * Set R1BIO_Uptodate in our master bio, so that we |
348 | * will return a good error code to the higher | 379 | * will return a good error code to the higher |
@@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
353 | * to user-side. So if something waits for IO, then it | 384 | * to user-side. So if something waits for IO, then it |
354 | * will wait for the 'master' bio. | 385 | * will wait for the 'master' bio. |
355 | */ | 386 | */ |
387 | sector_t first_bad; | ||
388 | int bad_sectors; | ||
389 | |||
390 | r1_bio->bios[mirror] = NULL; | ||
391 | to_put = bio; | ||
356 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 392 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
357 | 393 | ||
394 | /* Maybe we can clear some bad blocks. */ | ||
395 | if (is_badblock(conf->mirrors[mirror].rdev, | ||
396 | r1_bio->sector, r1_bio->sectors, | ||
397 | &first_bad, &bad_sectors)) { | ||
398 | r1_bio->bios[mirror] = IO_MADE_GOOD; | ||
399 | set_bit(R1BIO_MadeGood, &r1_bio->state); | ||
400 | } | ||
401 | } | ||
402 | |||
358 | update_head_pos(mirror, r1_bio); | 403 | update_head_pos(mirror, r1_bio); |
359 | 404 | ||
360 | if (behind) { | 405 | if (behind) { |
@@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
377 | (unsigned long long) mbio->bi_sector, | 422 | (unsigned long long) mbio->bi_sector, |
378 | (unsigned long long) mbio->bi_sector + | 423 | (unsigned long long) mbio->bi_sector + |
379 | (mbio->bi_size >> 9) - 1); | 424 | (mbio->bi_size >> 9) - 1); |
380 | bio_endio(mbio, 0); | 425 | call_bio_endio(r1_bio); |
381 | } | 426 | } |
382 | } | 427 | } |
383 | } | 428 | } |
384 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 429 | if (r1_bio->bios[mirror] == NULL) |
430 | rdev_dec_pending(conf->mirrors[mirror].rdev, | ||
431 | conf->mddev); | ||
385 | 432 | ||
386 | /* | 433 | /* |
387 | * Let's see if all mirrored write operations have finished | 434 | * Let's see if all mirrored write operations have finished |
@@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
408 | * | 455 | * |
409 | * The rdev for the device selected will have nr_pending incremented. | 456 | * The rdev for the device selected will have nr_pending incremented. |
410 | */ | 457 | */ |
411 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | 458 | static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) |
412 | { | 459 | { |
413 | const sector_t this_sector = r1_bio->sector; | 460 | const sector_t this_sector = r1_bio->sector; |
414 | const int sectors = r1_bio->sectors; | 461 | int sectors; |
462 | int best_good_sectors; | ||
415 | int start_disk; | 463 | int start_disk; |
416 | int best_disk; | 464 | int best_disk; |
417 | int i; | 465 | int i; |
@@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
426 | * We take the first readable disk when above the resync window. | 474 | * We take the first readable disk when above the resync window. |
427 | */ | 475 | */ |
428 | retry: | 476 | retry: |
477 | sectors = r1_bio->sectors; | ||
429 | best_disk = -1; | 478 | best_disk = -1; |
430 | best_dist = MaxSector; | 479 | best_dist = MaxSector; |
480 | best_good_sectors = 0; | ||
481 | |||
431 | if (conf->mddev->recovery_cp < MaxSector && | 482 | if (conf->mddev->recovery_cp < MaxSector && |
432 | (this_sector + sectors >= conf->next_resync)) { | 483 | (this_sector + sectors >= conf->next_resync)) { |
433 | choose_first = 1; | 484 | choose_first = 1; |
@@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
439 | 490 | ||
440 | for (i = 0 ; i < conf->raid_disks ; i++) { | 491 | for (i = 0 ; i < conf->raid_disks ; i++) { |
441 | sector_t dist; | 492 | sector_t dist; |
493 | sector_t first_bad; | ||
494 | int bad_sectors; | ||
495 | |||
442 | int disk = start_disk + i; | 496 | int disk = start_disk + i; |
443 | if (disk >= conf->raid_disks) | 497 | if (disk >= conf->raid_disks) |
444 | disk -= conf->raid_disks; | 498 | disk -= conf->raid_disks; |
@@ -461,6 +515,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
461 | /* This is a reasonable device to use. It might | 515 | /* This is a reasonable device to use. It might |
462 | * even be best. | 516 | * even be best. |
463 | */ | 517 | */ |
518 | if (is_badblock(rdev, this_sector, sectors, | ||
519 | &first_bad, &bad_sectors)) { | ||
520 | if (best_dist < MaxSector) | ||
521 | /* already have a better device */ | ||
522 | continue; | ||
523 | if (first_bad <= this_sector) { | ||
524 | /* cannot read here. If this is the 'primary' | ||
525 | * device, then we must not read beyond | ||
526 | * bad_sectors from another device.. | ||
527 | */ | ||
528 | bad_sectors -= (this_sector - first_bad); | ||
529 | if (choose_first && sectors > bad_sectors) | ||
530 | sectors = bad_sectors; | ||
531 | if (best_good_sectors > sectors) | ||
532 | best_good_sectors = sectors; | ||
533 | |||
534 | } else { | ||
535 | sector_t good_sectors = first_bad - this_sector; | ||
536 | if (good_sectors > best_good_sectors) { | ||
537 | best_good_sectors = good_sectors; | ||
538 | best_disk = disk; | ||
539 | } | ||
540 | if (choose_first) | ||
541 | break; | ||
542 | } | ||
543 | continue; | ||
544 | } else | ||
545 | best_good_sectors = sectors; | ||
546 | |||
464 | dist = abs(this_sector - conf->mirrors[disk].head_position); | 547 | dist = abs(this_sector - conf->mirrors[disk].head_position); |
465 | if (choose_first | 548 | if (choose_first |
466 | /* Don't change to another disk for sequential reads */ | 549 | /* Don't change to another disk for sequential reads */ |
@@ -489,10 +572,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
489 | rdev_dec_pending(rdev, conf->mddev); | 572 | rdev_dec_pending(rdev, conf->mddev); |
490 | goto retry; | 573 | goto retry; |
491 | } | 574 | } |
575 | sectors = best_good_sectors; | ||
492 | conf->next_seq_sect = this_sector + sectors; | 576 | conf->next_seq_sect = this_sector + sectors; |
493 | conf->last_used = best_disk; | 577 | conf->last_used = best_disk; |
494 | } | 578 | } |
495 | rcu_read_unlock(); | 579 | rcu_read_unlock(); |
580 | *max_sectors = sectors; | ||
496 | 581 | ||
497 | return best_disk; | 582 | return best_disk; |
498 | } | 583 | } |
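A concrete instance of the clamping above: for a 64-sector read at sector 1000, a candidate disk with a bad range starting at sector 1040 offers good_sectors = 40; if it ends up as best_disk, read_balance() returns *max_sectors = 40 and the caller must issue the remaining 24 sectors as a second, separately balanced read (the read_again loop in make_request() below).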
@@ -672,30 +757,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) | |||
672 | { | 757 | { |
673 | int i; | 758 | int i; |
674 | struct bio_vec *bvec; | 759 | struct bio_vec *bvec; |
675 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), | 760 | struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), |
676 | GFP_NOIO); | 761 | GFP_NOIO); |
677 | if (unlikely(!pages)) | 762 | if (unlikely(!bvecs)) |
678 | return; | 763 | return; |
679 | 764 | ||
680 | bio_for_each_segment(bvec, bio, i) { | 765 | bio_for_each_segment(bvec, bio, i) { |
681 | pages[i] = alloc_page(GFP_NOIO); | 766 | bvecs[i] = *bvec; |
682 | if (unlikely(!pages[i])) | 767 | bvecs[i].bv_page = alloc_page(GFP_NOIO); |
768 | if (unlikely(!bvecs[i].bv_page)) | ||
683 | goto do_sync_io; | 769 | goto do_sync_io; |
684 | memcpy(kmap(pages[i]) + bvec->bv_offset, | 770 | memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, |
685 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | 771 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); |
686 | kunmap(pages[i]); | 772 | kunmap(bvecs[i].bv_page); |
687 | kunmap(bvec->bv_page); | 773 | kunmap(bvec->bv_page); |
688 | } | 774 | } |
689 | r1_bio->behind_pages = pages; | 775 | r1_bio->behind_bvecs = bvecs; |
690 | r1_bio->behind_page_count = bio->bi_vcnt; | 776 | r1_bio->behind_page_count = bio->bi_vcnt; |
691 | set_bit(R1BIO_BehindIO, &r1_bio->state); | 777 | set_bit(R1BIO_BehindIO, &r1_bio->state); |
692 | return; | 778 | return; |
693 | 779 | ||
694 | do_sync_io: | 780 | do_sync_io: |
695 | for (i = 0; i < bio->bi_vcnt; i++) | 781 | for (i = 0; i < bio->bi_vcnt; i++) |
696 | if (pages[i]) | 782 | if (bvecs[i].bv_page) |
697 | put_page(pages[i]); | 783 | put_page(bvecs[i].bv_page); |
698 | kfree(pages); | 784 | kfree(bvecs); |
699 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 785 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
700 | } | 786 | } |
701 | 787 | ||
@@ -705,7 +791,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
705 | mirror_info_t *mirror; | 791 | mirror_info_t *mirror; |
706 | r1bio_t *r1_bio; | 792 | r1bio_t *r1_bio; |
707 | struct bio *read_bio; | 793 | struct bio *read_bio; |
708 | int i, targets = 0, disks; | 794 | int i, disks; |
709 | struct bitmap *bitmap; | 795 | struct bitmap *bitmap; |
710 | unsigned long flags; | 796 | unsigned long flags; |
711 | const int rw = bio_data_dir(bio); | 797 | const int rw = bio_data_dir(bio); |
@@ -713,6 +799,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
713 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); | 799 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
714 | mdk_rdev_t *blocked_rdev; | 800 | mdk_rdev_t *blocked_rdev; |
715 | int plugged; | 801 | int plugged; |
802 | int first_clone; | ||
803 | int sectors_handled; | ||
804 | int max_sectors; | ||
716 | 805 | ||
717 | /* | 806 | /* |
718 | * Register the new request and wait if the reconstruction | 807 | * Register the new request and wait if the reconstruction |
@@ -759,11 +848,24 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
759 | r1_bio->mddev = mddev; | 848 | r1_bio->mddev = mddev; |
760 | r1_bio->sector = bio->bi_sector; | 849 | r1_bio->sector = bio->bi_sector; |
761 | 850 | ||
851 | /* We might need to issue multiple reads to different | ||
852 | * devices if there are bad blocks around, so we keep | ||
853 | * track of the number of reads in bio->bi_phys_segments. | ||
854 | * If this is 0, there is only one r1_bio and no locking | ||
855 | * will be needed when requests complete. If it is | ||
856 | * non-zero, then it is the number of not-completed requests. | ||
857 | */ | ||
858 | bio->bi_phys_segments = 0; | ||
859 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
860 | |||
762 | if (rw == READ) { | 861 | if (rw == READ) { |
763 | /* | 862 | /* |
764 | * read balancing logic: | 863 | * read balancing logic: |
765 | */ | 864 | */ |
766 | int rdisk = read_balance(conf, r1_bio); | 865 | int rdisk; |
866 | |||
867 | read_again: | ||
868 | rdisk = read_balance(conf, r1_bio, &max_sectors); | ||
767 | 869 | ||
768 | if (rdisk < 0) { | 870 | if (rdisk < 0) { |
769 | /* couldn't find anywhere to read from */ | 871 | /* couldn't find anywhere to read from */ |
@@ -784,6 +886,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
784 | r1_bio->read_disk = rdisk; | 886 | r1_bio->read_disk = rdisk; |
785 | 887 | ||
786 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 888 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
889 | md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, | ||
890 | max_sectors); | ||
787 | 891 | ||
788 | r1_bio->bios[rdisk] = read_bio; | 892 | r1_bio->bios[rdisk] = read_bio; |
789 | 893 | ||
@@ -793,16 +897,52 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
793 | read_bio->bi_rw = READ | do_sync; | 897 | read_bio->bi_rw = READ | do_sync; |
794 | read_bio->bi_private = r1_bio; | 898 | read_bio->bi_private = r1_bio; |
795 | 899 | ||
796 | generic_make_request(read_bio); | 900 | if (max_sectors < r1_bio->sectors) { |
901 | /* could not read all from this device, so we will | ||
902 | * need another r1_bio. | ||
903 | */ | ||
904 | |||
905 | sectors_handled = (r1_bio->sector + max_sectors | ||
906 | - bio->bi_sector); | ||
907 | r1_bio->sectors = max_sectors; | ||
908 | spin_lock_irq(&conf->device_lock); | ||
909 | if (bio->bi_phys_segments == 0) | ||
910 | bio->bi_phys_segments = 2; | ||
911 | else | ||
912 | bio->bi_phys_segments++; | ||
913 | spin_unlock_irq(&conf->device_lock); | ||
914 | /* Cannot call generic_make_request directly | ||
915 | * as that will be queued in __make_request | ||
916 | * and subsequent mempool_alloc might block waiting | ||
917 | * for it. So hand bio over to raid1d. | ||
918 | */ | ||
919 | reschedule_retry(r1_bio); | ||
920 | |||
921 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
922 | |||
923 | r1_bio->master_bio = bio; | ||
924 | r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
925 | r1_bio->state = 0; | ||
926 | r1_bio->mddev = mddev; | ||
927 | r1_bio->sector = bio->bi_sector + sectors_handled; | ||
928 | goto read_again; | ||
929 | } else | ||
930 | generic_make_request(read_bio); | ||
797 | return 0; | 931 | return 0; |
798 | } | 932 | } |
799 | 933 | ||
800 | /* | 934 | /* |
801 | * WRITE: | 935 | * WRITE: |
802 | */ | 936 | */ |
803 | /* first select target devices under spinlock and | 937 | /* first select target devices under rcu_lock and |
804 | * inc refcount on their rdev. Record them by setting | 938 | * inc refcount on their rdev. Record them by setting |
805 | * bios[x] to bio | 939 | * bios[x] to bio |
940 | * If there are known/acknowledged bad blocks on any device on | ||
941 | * which we have seen a write error, we want to avoid writing those | ||
942 | * blocks. | ||
943 | * This potentially requires several writes to write around | ||
944 | * the bad blocks. Each set of writes gets its own r1bio | ||
945 | * with a set of bios attached. | ||
806 | */ | 946 | */ |
807 | plugged = mddev_check_plugged(mddev); | 947 | plugged = mddev_check_plugged(mddev); |
808 | 948 | ||
@@ -810,6 +950,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
810 | retry_write: | 950 | retry_write: |
811 | blocked_rdev = NULL; | 951 | blocked_rdev = NULL; |
812 | rcu_read_lock(); | 952 | rcu_read_lock(); |
953 | max_sectors = r1_bio->sectors; | ||
813 | for (i = 0; i < disks; i++) { | 954 | for (i = 0; i < disks; i++) { |
814 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 955 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
815 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 956 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { |
@@ -817,17 +958,56 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
817 | blocked_rdev = rdev; | 958 | blocked_rdev = rdev; |
818 | break; | 959 | break; |
819 | } | 960 | } |
820 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 961 | r1_bio->bios[i] = NULL; |
821 | atomic_inc(&rdev->nr_pending); | 962 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
822 | if (test_bit(Faulty, &rdev->flags)) { | 963 | set_bit(R1BIO_Degraded, &r1_bio->state); |
964 | continue; | ||
965 | } | ||
966 | |||
967 | atomic_inc(&rdev->nr_pending); | ||
968 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
969 | sector_t first_bad; | ||
970 | int bad_sectors; | ||
971 | int is_bad; | ||
972 | |||
973 | is_bad = is_badblock(rdev, r1_bio->sector, | ||
974 | max_sectors, | ||
975 | &first_bad, &bad_sectors); | ||
976 | if (is_bad < 0) { | ||
977 | /* mustn't write here until the bad block is | ||
978 | * acknowledged */ | ||
979 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
980 | blocked_rdev = rdev; | ||
981 | break; | ||
982 | } | ||
983 | if (is_bad && first_bad <= r1_bio->sector) { | ||
984 | /* Cannot write here at all */ | ||
985 | bad_sectors -= (r1_bio->sector - first_bad); | ||
986 | if (bad_sectors < max_sectors) | ||
987 | /* mustn't write more than bad_sectors | ||
988 | * to other devices yet | ||
989 | */ | ||
990 | max_sectors = bad_sectors; | ||
823 | rdev_dec_pending(rdev, mddev); | 991 | rdev_dec_pending(rdev, mddev); |
824 | r1_bio->bios[i] = NULL; | 992 | /* We don't set R1BIO_Degraded as that |
825 | } else { | 993 | * only applies if the disk is |
826 | r1_bio->bios[i] = bio; | 994 | * missing, so it might be re-added, |
827 | targets++; | 995 | * and we want to know to recover this |
996 | * chunk. | ||
997 | * In this case the device is here, | ||
998 | * and the fact that this chunk is not | ||
999 | * in-sync is recorded in the bad | ||
1000 | * block log | ||
1001 | */ | ||
1002 | continue; | ||
828 | } | 1003 | } |
829 | } else | 1004 | if (is_bad) { |
830 | r1_bio->bios[i] = NULL; | 1005 | int good_sectors = first_bad - r1_bio->sector; |
1006 | if (good_sectors < max_sectors) | ||
1007 | max_sectors = good_sectors; | ||
1008 | } | ||
1009 | } | ||
1010 | r1_bio->bios[i] = bio; | ||
831 | } | 1011 | } |
832 | rcu_read_unlock(); | 1012 | rcu_read_unlock(); |
833 | 1013 | ||
@@ -838,51 +1018,57 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
838 | for (j = 0; j < i; j++) | 1018 | for (j = 0; j < i; j++) |
839 | if (r1_bio->bios[j]) | 1019 | if (r1_bio->bios[j]) |
840 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1020 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
841 | 1021 | r1_bio->state = 0; |
842 | allow_barrier(conf); | 1022 | allow_barrier(conf); |
843 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1023 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
844 | wait_barrier(conf); | 1024 | wait_barrier(conf); |
845 | goto retry_write; | 1025 | goto retry_write; |
846 | } | 1026 | } |
847 | 1027 | ||
848 | BUG_ON(targets == 0); /* we never fail the last device */ | 1028 | if (max_sectors < r1_bio->sectors) { |
849 | 1029 | /* We are splitting this write into multiple parts, so | |
850 | if (targets < conf->raid_disks) { | 1030 | * we need to prepare for allocating another r1_bio. |
851 | /* array is degraded, we will not clear the bitmap | 1031 | */ |
852 | * on I/O completion (see raid1_end_write_request) */ | 1032 | r1_bio->sectors = max_sectors; |
853 | set_bit(R1BIO_Degraded, &r1_bio->state); | 1033 | spin_lock_irq(&conf->device_lock); |
1034 | if (bio->bi_phys_segments == 0) | ||
1035 | bio->bi_phys_segments = 2; | ||
1036 | else | ||
1037 | bio->bi_phys_segments++; | ||
1038 | spin_unlock_irq(&conf->device_lock); | ||
854 | } | 1039 | } |
855 | 1040 | sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; | |
856 | /* do behind I/O ? | ||
857 | * Not if there are too many, or cannot allocate memory, | ||
858 | * or a reader on WriteMostly is waiting for behind writes | ||
859 | * to flush */ | ||
860 | if (bitmap && | ||
861 | (atomic_read(&bitmap->behind_writes) | ||
862 | < mddev->bitmap_info.max_write_behind) && | ||
863 | !waitqueue_active(&bitmap->behind_wait)) | ||
864 | alloc_behind_pages(bio, r1_bio); | ||
865 | 1041 | ||
866 | atomic_set(&r1_bio->remaining, 1); | 1042 | atomic_set(&r1_bio->remaining, 1); |
867 | atomic_set(&r1_bio->behind_remaining, 0); | 1043 | atomic_set(&r1_bio->behind_remaining, 0); |
868 | 1044 | ||
869 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, | 1045 | first_clone = 1; |
870 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
871 | for (i = 0; i < disks; i++) { | 1046 | for (i = 0; i < disks; i++) { |
872 | struct bio *mbio; | 1047 | struct bio *mbio; |
873 | if (!r1_bio->bios[i]) | 1048 | if (!r1_bio->bios[i]) |
874 | continue; | 1049 | continue; |
875 | 1050 | ||
876 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1051 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
877 | r1_bio->bios[i] = mbio; | 1052 | md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); |
878 | 1053 | ||
879 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 1054 | if (first_clone) { |
880 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1055 | /* do behind I/O ? |
881 | mbio->bi_end_io = raid1_end_write_request; | 1056 | * Not if there are too many, or cannot |
882 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; | 1057 | * allocate memory, or a reader on WriteMostly |
883 | mbio->bi_private = r1_bio; | 1058 | * is waiting for behind writes to flush */ |
884 | 1059 | if (bitmap && | |
885 | if (r1_bio->behind_pages) { | 1060 | (atomic_read(&bitmap->behind_writes) |
1061 | < mddev->bitmap_info.max_write_behind) && | ||
1062 | !waitqueue_active(&bitmap->behind_wait)) | ||
1063 | alloc_behind_pages(mbio, r1_bio); | ||
1064 | |||
1065 | bitmap_startwrite(bitmap, r1_bio->sector, | ||
1066 | r1_bio->sectors, | ||
1067 | test_bit(R1BIO_BehindIO, | ||
1068 | &r1_bio->state)); | ||
1069 | first_clone = 0; | ||
1070 | } | ||
1071 | if (r1_bio->behind_bvecs) { | ||
886 | struct bio_vec *bvec; | 1072 | struct bio_vec *bvec; |
887 | int j; | 1073 | int j; |
888 | 1074 | ||
@@ -894,11 +1080,20 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
894 | * them all | 1080 | * them all |
895 | */ | 1081 | */ |
896 | __bio_for_each_segment(bvec, mbio, j, 0) | 1082 | __bio_for_each_segment(bvec, mbio, j, 0) |
897 | bvec->bv_page = r1_bio->behind_pages[j]; | 1083 | bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; |
898 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | 1084 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) |
899 | atomic_inc(&r1_bio->behind_remaining); | 1085 | atomic_inc(&r1_bio->behind_remaining); |
900 | } | 1086 | } |
901 | 1087 | ||
1088 | r1_bio->bios[i] = mbio; | ||
1089 | |||
1090 | mbio->bi_sector = (r1_bio->sector + | ||
1091 | conf->mirrors[i].rdev->data_offset); | ||
1092 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1093 | mbio->bi_end_io = raid1_end_write_request; | ||
1094 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; | ||
1095 | mbio->bi_private = r1_bio; | ||
1096 | |||
902 | atomic_inc(&r1_bio->remaining); | 1097 | atomic_inc(&r1_bio->remaining); |
903 | spin_lock_irqsave(&conf->device_lock, flags); | 1098 | spin_lock_irqsave(&conf->device_lock, flags); |
904 | bio_list_add(&conf->pending_bio_list, mbio); | 1099 | bio_list_add(&conf->pending_bio_list, mbio); |
@@ -909,6 +1104,19 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
909 | /* In case raid1d snuck in to freeze_array */ | 1104 | /* In case raid1d snuck in to freeze_array */ |
910 | wake_up(&conf->wait_barrier); | 1105 | wake_up(&conf->wait_barrier); |
911 | 1106 | ||
1107 | if (sectors_handled < (bio->bi_size >> 9)) { | ||
1108 | /* We need another r1_bio. It has already been counted | ||
1109 | * in bio->bi_phys_segments | ||
1110 | */ | ||
1111 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
1112 | r1_bio->master_bio = bio; | ||
1113 | r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
1114 | r1_bio->state = 0; | ||
1115 | r1_bio->mddev = mddev; | ||
1116 | r1_bio->sector = bio->bi_sector + sectors_handled; | ||
1117 | goto retry_write; | ||
1118 | } | ||
1119 | |||
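The split-and-retry path above relies on one counter protocol: the first split sets bi_phys_segments to 2 (the part just issued plus the remainder), every further split increments it, and the master bio completes only when the count drains to zero. A toy model of that accounting (userspace C; "parts" stands in for bio->bi_phys_segments, values invented):

#include <stdio.h>

int main(void)
{
	int total = 200;	/* (bio->bi_size >> 9) */
	int handled = 0;	/* sectors_handled */
	int parts = 0;		/* bio->bi_phys_segments analogue */

	while (handled < total) {
		int chunk = 64;	/* max_sectors after bad-block clipping */
		if (chunk > total - handled)
			chunk = total - handled;
		if (handled + chunk < total)
			/* more parts will follow: count this one */
			parts = parts ? parts + 1 : 2;
		printf("r1_bio covers sectors %d..%d of the master bio\n",
		       handled, handled + chunk - 1);
		handled += chunk;
	}
	/* completion side: each part decrements; endio fires at zero */
	printf("outstanding parts recorded: %d\n", parts);
	return 0;
}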
912 | if (do_sync || !bitmap || !plugged) | 1120 | if (do_sync || !bitmap || !plugged) |
913 | md_wakeup_thread(mddev->thread); | 1121 | md_wakeup_thread(mddev->thread); |
914 | 1122 | ||
@@ -952,9 +1160,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
952 | * However don't try a recovery from this drive as | 1160 | * However don't try a recovery from this drive as |
953 | * it is very likely to fail. | 1161 | * it is very likely to fail. |
954 | */ | 1162 | */ |
955 | mddev->recovery_disabled = 1; | 1163 | conf->recovery_disabled = mddev->recovery_disabled; |
956 | return; | 1164 | return; |
957 | } | 1165 | } |
1166 | set_bit(Blocked, &rdev->flags); | ||
958 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1167 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
959 | unsigned long flags; | 1168 | unsigned long flags; |
960 | spin_lock_irqsave(&conf->device_lock, flags); | 1169 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -1027,7 +1236,7 @@ static int raid1_spare_active(mddev_t *mddev) | |||
1027 | && !test_bit(Faulty, &rdev->flags) | 1236 | && !test_bit(Faulty, &rdev->flags) |
1028 | && !test_and_set_bit(In_sync, &rdev->flags)) { | 1237 | && !test_and_set_bit(In_sync, &rdev->flags)) { |
1029 | count++; | 1238 | count++; |
1030 | sysfs_notify_dirent(rdev->sysfs_state); | 1239 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
1031 | } | 1240 | } |
1032 | } | 1241 | } |
1033 | spin_lock_irqsave(&conf->device_lock, flags); | 1242 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -1048,6 +1257,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1048 | int first = 0; | 1257 | int first = 0; |
1049 | int last = mddev->raid_disks - 1; | 1258 | int last = mddev->raid_disks - 1; |
1050 | 1259 | ||
1260 | if (mddev->recovery_disabled == conf->recovery_disabled) | ||
1261 | return -EBUSY; | ||
1262 | |||
1051 | if (rdev->raid_disk >= 0) | 1263 | if (rdev->raid_disk >= 0) |
1052 | first = last = rdev->raid_disk; | 1264 | first = last = rdev->raid_disk; |
1053 | 1265 | ||
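The -EBUSY test above is a small handshake: error() latches mddev->recovery_disabled into conf->recovery_disabled when a last-working-device failure makes recovery pointless, and raid1_add_disk() refuses spares while the two values still match. A minimal sketch of the idea, under the assumption that the md core bumps the mddev-side counter when it next aborts recovery (structures reduced to plain counters, names hypothetical):

#include <stdio.h>

struct mddev_model { int recovery_disabled; };
struct conf_model  { int recovery_disabled; };

/* error path: remember the epoch in which recovery became hopeless */
static void disable_recovery(struct mddev_model *m, struct conf_model *c)
{
	c->recovery_disabled = m->recovery_disabled;
}

/* hot-add path: refuse while still in the same epoch */
static int can_add_spare(struct mddev_model *m, struct conf_model *c)
{
	return m->recovery_disabled != c->recovery_disabled;
}

int main(void)
{
	struct mddev_model m = { .recovery_disabled = 1 };
	struct conf_model  c = { .recovery_disabled = 0 };

	disable_recovery(&m, &c);
	printf("add spare now: %s\n", can_add_spare(&m, &c) ? "ok" : "-EBUSY");
	m.recovery_disabled++;	/* core aborted recovery again */
	printf("add spare later: %s\n", can_add_spare(&m, &c) ? "ok" : "-EBUSY");
	return 0;
}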
@@ -1103,7 +1315,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1103 | * is not possible. | 1315 | * is not possible. |
1104 | */ | 1316 | */ |
1105 | if (!test_bit(Faulty, &rdev->flags) && | 1317 | if (!test_bit(Faulty, &rdev->flags) && |
1106 | !mddev->recovery_disabled && | 1318 | mddev->recovery_disabled != conf->recovery_disabled && |
1107 | mddev->degraded < conf->raid_disks) { | 1319 | mddev->degraded < conf->raid_disks) { |
1108 | err = -EBUSY; | 1320 | err = -EBUSY; |
1109 | goto abort; | 1321 | goto abort; |
@@ -1155,6 +1367,8 @@ static void end_sync_write(struct bio *bio, int error) | |||
1155 | conf_t *conf = mddev->private; | 1367 | conf_t *conf = mddev->private; |
1156 | int i; | 1368 | int i; |
1157 | int mirror=0; | 1369 | int mirror=0; |
1370 | sector_t first_bad; | ||
1371 | int bad_sectors; | ||
1158 | 1372 | ||
1159 | for (i = 0; i < conf->raid_disks; i++) | 1373 | for (i = 0; i < conf->raid_disks; i++) |
1160 | if (r1_bio->bios[i] == bio) { | 1374 | if (r1_bio->bios[i] == bio) { |
@@ -1172,18 +1386,48 @@ static void end_sync_write(struct bio *bio, int error) | |||
1172 | s += sync_blocks; | 1386 | s += sync_blocks; |
1173 | sectors_to_go -= sync_blocks; | 1387 | sectors_to_go -= sync_blocks; |
1174 | } while (sectors_to_go > 0); | 1388 | } while (sectors_to_go > 0); |
1175 | md_error(mddev, conf->mirrors[mirror].rdev); | 1389 | set_bit(WriteErrorSeen, |
1176 | } | 1390 | &conf->mirrors[mirror].rdev->flags); |
1391 | set_bit(R1BIO_WriteError, &r1_bio->state); | ||
1392 | } else if (is_badblock(conf->mirrors[mirror].rdev, | ||
1393 | r1_bio->sector, | ||
1394 | r1_bio->sectors, | ||
1395 | &first_bad, &bad_sectors) && | ||
1396 | !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, | ||
1397 | r1_bio->sector, | ||
1398 | r1_bio->sectors, | ||
1399 | &first_bad, &bad_sectors) | ||
1400 | ) | ||
1401 | set_bit(R1BIO_MadeGood, &r1_bio->state); | ||
1177 | 1402 | ||
1178 | update_head_pos(mirror, r1_bio); | 1403 | update_head_pos(mirror, r1_bio); |
1179 | 1404 | ||
1180 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 1405 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
1181 | sector_t s = r1_bio->sectors; | 1406 | int s = r1_bio->sectors; |
1182 | put_buf(r1_bio); | 1407 | if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
1183 | md_done_sync(mddev, s, uptodate); | 1408 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
1409 | reschedule_retry(r1_bio); | ||
1410 | else { | ||
1411 | put_buf(r1_bio); | ||
1412 | md_done_sync(mddev, s, uptodate); | ||
1413 | } | ||
1184 | } | 1414 | } |
1185 | } | 1415 | } |
1186 | 1416 | ||
1417 | static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
1418 | int sectors, struct page *page, int rw) | ||
1419 | { | ||
1420 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
1421 | /* success */ | ||
1422 | return 1; | ||
1423 | if (rw == WRITE) | ||
1424 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1425 | /* need to record an error - either for the block or the device */ | ||
1426 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
1427 | md_error(rdev->mddev, rdev); | ||
1428 | return 0; | ||
1429 | } | ||
1430 | |||
1187 | static int fix_sync_read_error(r1bio_t *r1_bio) | 1431 | static int fix_sync_read_error(r1bio_t *r1_bio) |
1188 | { | 1432 | { |
1189 | /* Try some synchronous reads of other devices to get | 1433 | /* Try some synchronous reads of other devices to get |
@@ -1193,6 +1437,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1193 | * We don't need to freeze the array, because being in an | 1437 | * We don't need to freeze the array, because being in an |
1194 | * active sync request, there is no normal IO, and | 1438 | * active sync request, there is no normal IO, and |
1195 | * no overlapping syncs. | 1439 | * no overlapping syncs. |
1440 | * We don't need to check is_badblock() again as we | ||
1441 | * made sure that anything with a bad block in range | ||
1442 | * will have bi_end_io clear. | ||
1196 | */ | 1443 | */ |
1197 | mddev_t *mddev = r1_bio->mddev; | 1444 | mddev_t *mddev = r1_bio->mddev; |
1198 | conf_t *conf = mddev->private; | 1445 | conf_t *conf = mddev->private; |
@@ -1217,9 +1464,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1217 | * active, and resync is currently active | 1464 | * active, and resync is currently active |
1218 | */ | 1465 | */ |
1219 | rdev = conf->mirrors[d].rdev; | 1466 | rdev = conf->mirrors[d].rdev; |
1220 | if (sync_page_io(rdev, | 1467 | if (sync_page_io(rdev, sect, s<<9, |
1221 | sect, | ||
1222 | s<<9, | ||
1223 | bio->bi_io_vec[idx].bv_page, | 1468 | bio->bi_io_vec[idx].bv_page, |
1224 | READ, false)) { | 1469 | READ, false)) { |
1225 | success = 1; | 1470 | success = 1; |
@@ -1233,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1233 | 1478 | ||
1234 | if (!success) { | 1479 | if (!success) { |
1235 | char b[BDEVNAME_SIZE]; | 1480 | char b[BDEVNAME_SIZE]; |
1236 | /* Cannot read from anywhere, array is toast */ | 1481 | int abort = 0; |
1237 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | 1482 | /* Cannot read from anywhere, this block is lost. |
1483 | * Record a bad block on each device. If that doesn't | ||
1484 | * work just disable and interrupt the recovery. | ||
1485 | * Don't fail devices as that won't really help. | ||
1486 | */ | ||
1238 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | 1487 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" |
1239 | " for block %llu\n", | 1488 | " for block %llu\n", |
1240 | mdname(mddev), | 1489 | mdname(mddev), |
1241 | bdevname(bio->bi_bdev, b), | 1490 | bdevname(bio->bi_bdev, b), |
1242 | (unsigned long long)r1_bio->sector); | 1491 | (unsigned long long)r1_bio->sector); |
1243 | md_done_sync(mddev, r1_bio->sectors, 0); | 1492 | for (d = 0; d < conf->raid_disks; d++) { |
1244 | put_buf(r1_bio); | 1493 | rdev = conf->mirrors[d].rdev; |
1245 | return 0; | 1494 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
1495 | continue; | ||
1496 | if (!rdev_set_badblocks(rdev, sect, s, 0)) | ||
1497 | abort = 1; | ||
1498 | } | ||
1499 | if (abort) { | ||
1500 | mddev->recovery_disabled = 1; | ||
1501 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1502 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1503 | put_buf(r1_bio); | ||
1504 | return 0; | ||
1505 | } | ||
1506 | /* Try next page */ | ||
1507 | sectors -= s; | ||
1508 | sect += s; | ||
1509 | idx++; | ||
1510 | continue; | ||
1246 | } | 1511 | } |
1247 | 1512 | ||
1248 | start = d; | 1513 | start = d; |
@@ -1254,16 +1519,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1254 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1519 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
1255 | continue; | 1520 | continue; |
1256 | rdev = conf->mirrors[d].rdev; | 1521 | rdev = conf->mirrors[d].rdev; |
1257 | if (sync_page_io(rdev, | 1522 | if (r1_sync_page_io(rdev, sect, s, |
1258 | sect, | 1523 | bio->bi_io_vec[idx].bv_page, |
1259 | s<<9, | 1524 | WRITE) == 0) { |
1260 | bio->bi_io_vec[idx].bv_page, | ||
1261 | WRITE, false) == 0) { | ||
1262 | r1_bio->bios[d]->bi_end_io = NULL; | 1525 | r1_bio->bios[d]->bi_end_io = NULL; |
1263 | rdev_dec_pending(rdev, mddev); | 1526 | rdev_dec_pending(rdev, mddev); |
1264 | md_error(mddev, rdev); | 1527 | } |
1265 | } else | ||
1266 | atomic_add(s, &rdev->corrected_errors); | ||
1267 | } | 1528 | } |
1268 | d = start; | 1529 | d = start; |
1269 | while (d != r1_bio->read_disk) { | 1530 | while (d != r1_bio->read_disk) { |
@@ -1273,12 +1534,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1273 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1534 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
1274 | continue; | 1535 | continue; |
1275 | rdev = conf->mirrors[d].rdev; | 1536 | rdev = conf->mirrors[d].rdev; |
1276 | if (sync_page_io(rdev, | 1537 | if (r1_sync_page_io(rdev, sect, s, |
1277 | sect, | 1538 | bio->bi_io_vec[idx].bv_page, |
1278 | s<<9, | 1539 | READ) != 0) |
1279 | bio->bi_io_vec[idx].bv_page, | 1540 | atomic_add(s, &rdev->corrected_errors); |
1280 | READ, false) == 0) | ||
1281 | md_error(mddev, rdev); | ||
1282 | } | 1541 | } |
1283 | sectors -= s; | 1542 | sectors -= s; |
1284 | sect += s; | 1543 | sect += s; |
@@ -1420,7 +1679,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | |||
1420 | * | 1679 | * |
1421 | * 1. Retries failed read operations on working mirrors. | 1680 | * 1. Retries failed read operations on working mirrors. |
1422 | * 2. Updates the raid superblock when problems are encountered. | 1681 | * 2. Updates the raid superblock when problems are encountered. |
1423 | * 3. Performs writes following reads for array syncronising. | 1682 | * 3. Performs writes following reads for array synchronising. |
1424 | */ | 1683 | */ |
1425 | 1684 | ||
1426 | static void fix_read_error(conf_t *conf, int read_disk, | 1685 | static void fix_read_error(conf_t *conf, int read_disk, |
@@ -1443,9 +1702,14 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1443 | * which is the thread that might remove | 1702 | * which is the thread that might remove |
1444 | * a device. If raid1d ever becomes multi-threaded.... | 1703 | * a device. If raid1d ever becomes multi-threaded.... |
1445 | */ | 1704 | */ |
1705 | sector_t first_bad; | ||
1706 | int bad_sectors; | ||
1707 | |||
1446 | rdev = conf->mirrors[d].rdev; | 1708 | rdev = conf->mirrors[d].rdev; |
1447 | if (rdev && | 1709 | if (rdev && |
1448 | test_bit(In_sync, &rdev->flags) && | 1710 | test_bit(In_sync, &rdev->flags) && |
1711 | is_badblock(rdev, sect, s, | ||
1712 | &first_bad, &bad_sectors) == 0 && | ||
1449 | sync_page_io(rdev, sect, s<<9, | 1713 | sync_page_io(rdev, sect, s<<9, |
1450 | conf->tmppage, READ, false)) | 1714 | conf->tmppage, READ, false)) |
1451 | success = 1; | 1715 | success = 1; |
@@ -1457,8 +1721,10 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1457 | } while (!success && d != read_disk); | 1721 | } while (!success && d != read_disk); |
1458 | 1722 | ||
1459 | if (!success) { | 1723 | if (!success) { |
1460 | /* Cannot read from anywhere -- bye bye array */ | 1724 | /* Cannot read from anywhere - mark it bad */ |
1461 | md_error(mddev, conf->mirrors[read_disk].rdev); | 1725 | mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev; |
1726 | if (!rdev_set_badblocks(rdev, sect, s, 0)) | ||
1727 | md_error(mddev, rdev); | ||
1462 | break; | 1728 | break; |
1463 | } | 1729 | } |
1464 | /* write it back and re-read */ | 1730 | /* write it back and re-read */ |
@@ -1469,13 +1735,9 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1469 | d--; | 1735 | d--; |
1470 | rdev = conf->mirrors[d].rdev; | 1736 | rdev = conf->mirrors[d].rdev; |
1471 | if (rdev && | 1737 | if (rdev && |
1472 | test_bit(In_sync, &rdev->flags)) { | 1738 | test_bit(In_sync, &rdev->flags)) |
1473 | if (sync_page_io(rdev, sect, s<<9, | 1739 | r1_sync_page_io(rdev, sect, s, |
1474 | conf->tmppage, WRITE, false) | 1740 | conf->tmppage, WRITE); |
1475 | == 0) | ||
1476 | /* Well, this device is dead */ | ||
1477 | md_error(mddev, rdev); | ||
1478 | } | ||
1479 | } | 1741 | } |
1480 | d = start; | 1742 | d = start; |
1481 | while (d != read_disk) { | 1743 | while (d != read_disk) { |
@@ -1486,12 +1748,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1486 | rdev = conf->mirrors[d].rdev; | 1748 | rdev = conf->mirrors[d].rdev; |
1487 | if (rdev && | 1749 | if (rdev && |
1488 | test_bit(In_sync, &rdev->flags)) { | 1750 | test_bit(In_sync, &rdev->flags)) { |
1489 | if (sync_page_io(rdev, sect, s<<9, | 1751 | if (r1_sync_page_io(rdev, sect, s, |
1490 | conf->tmppage, READ, false) | 1752 | conf->tmppage, READ)) { |
1491 | == 0) | ||
1492 | /* Well, this device is dead */ | ||
1493 | md_error(mddev, rdev); | ||
1494 | else { | ||
1495 | atomic_add(s, &rdev->corrected_errors); | 1753 | atomic_add(s, &rdev->corrected_errors); |
1496 | printk(KERN_INFO | 1754 | printk(KERN_INFO |
1497 | "md/raid1:%s: read error corrected " | 1755 | "md/raid1:%s: read error corrected " |
@@ -1508,21 +1766,255 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1508 | } | 1766 | } |
1509 | } | 1767 | } |
1510 | 1768 | ||
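After fix_read_error() finds one mirror that still reads the block, it walks the remaining mirrors in a descending ring starting just below the good device: one pass writes the known-good data back over the suspect sectors, a second identical pass re-reads to verify, and r1_sync_page_io() (shown earlier) converts any failure into a bad-block record rather than failing the device. The ring traversal is the easy part to get wrong; a standalone model of just the visiting order (values invented for illustration):

#include <stdio.h>

int main(void)
{
	int raid_disks = 4;
	int read_disk = 1;	/* mirror that returned the bad read */
	int start = 3;		/* mirror the good data was read from */
	int d = start;

	/* pass 1 writes the good data back; pass 2 re-reads to verify */
	while (d != read_disk) {
		if (d == 0)
			d = raid_disks;
		d--;
		printf("write-back (then re-read) on mirror %d\n", d);
	}
	return 0;
}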
1769 | static void bi_complete(struct bio *bio, int error) | ||
1770 | { | ||
1771 | complete((struct completion *)bio->bi_private); | ||
1772 | } | ||
1773 | |||
1774 | static int submit_bio_wait(int rw, struct bio *bio) | ||
1775 | { | ||
1776 | struct completion event; | ||
1777 | rw |= REQ_SYNC; | ||
1778 | |||
1779 | init_completion(&event); | ||
1780 | bio->bi_private = &event; | ||
1781 | bio->bi_end_io = bi_complete; | ||
1782 | submit_bio(rw, bio); | ||
1783 | wait_for_completion(&event); | ||
1784 | |||
1785 | return test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1786 | } | ||
1787 | |||
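submit_bio_wait() above is the stock turn-async-into-sync pattern: point bi_private at an on-stack completion, let the end_io callback fire it, and sleep until then. The same shape in portable C, with a pthread condition variable standing in for the kernel's struct completion (illustrative only; the thread plays the block layer's role):

#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete(struct completion *c)	/* the "bi_end_io" side */
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static struct completion event;

static void *io_thread(void *arg)	/* stands in for the block layer */
{
	(void)arg;
	complete(&event);		/* "bi_end_io" firing */
	return NULL;
}

int main(void)
{
	pthread_t t;

	init_completion(&event);
	pthread_create(&t, NULL, io_thread, NULL);	/* "submit_bio" */
	wait_for_completion(&event);			/* block until done */
	pthread_join(t, NULL);
	printf("I/O complete\n");
	return 0;
}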
1788 | static int narrow_write_error(r1bio_t *r1_bio, int i) | ||
1789 | { | ||
1790 | mddev_t *mddev = r1_bio->mddev; | ||
1791 | conf_t *conf = mddev->private; | ||
1792 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
1793 | int vcnt, idx; | ||
1794 | struct bio_vec *vec; | ||
1795 | |||
1796 | /* bio has the data to be written to device 'i' where | ||
1797 | * we just recently had a write error. | ||
1798 | * We repeatedly clone the bio and trim down to one block, | ||
1799 | * then try the write. Where the write fails we record | ||
1800 | * a bad block. | ||
1801 | * It is conceivable that the bio doesn't exactly align with | ||
1802 | * blocks. We must handle this somehow. | ||
1803 | * | ||
1804 | * We currently own a reference on the rdev. | ||
1805 | */ | ||
1806 | |||
1807 | int block_sectors; | ||
1808 | sector_t sector; | ||
1809 | int sectors; | ||
1810 | int sect_to_write = r1_bio->sectors; | ||
1811 | int ok = 1; | ||
1812 | |||
1813 | if (rdev->badblocks.shift < 0) | ||
1814 | return 0; | ||
1815 | |||
1816 | block_sectors = 1 << rdev->badblocks.shift; | ||
1817 | sector = r1_bio->sector; | ||
1818 | sectors = ((sector + block_sectors) | ||
1819 | & ~(sector_t)(block_sectors - 1)) | ||
1820 | - sector; | ||
1821 | |||
1822 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
1823 | vcnt = r1_bio->behind_page_count; | ||
1824 | vec = r1_bio->behind_bvecs; | ||
1825 | idx = 0; | ||
1826 | while (vec[idx].bv_page == NULL) | ||
1827 | idx++; | ||
1828 | } else { | ||
1829 | vcnt = r1_bio->master_bio->bi_vcnt; | ||
1830 | vec = r1_bio->master_bio->bi_io_vec; | ||
1831 | idx = r1_bio->master_bio->bi_idx; | ||
1832 | } | ||
1833 | while (sect_to_write) { | ||
1834 | struct bio *wbio; | ||
1835 | if (sectors > sect_to_write) | ||
1836 | sectors = sect_to_write; | ||
1837 | /* Write at 'sector' for 'sectors' */ | ||
1838 | |||
1839 | wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); | ||
1840 | memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); | ||
1841 | wbio->bi_sector = r1_bio->sector; | ||
1842 | wbio->bi_rw = WRITE; | ||
1843 | wbio->bi_vcnt = vcnt; | ||
1844 | wbio->bi_size = r1_bio->sectors << 9; | ||
1845 | wbio->bi_idx = idx; | ||
1846 | |||
1847 | md_trim_bio(wbio, sector - r1_bio->sector, sectors); | ||
1848 | wbio->bi_sector += rdev->data_offset; | ||
1849 | wbio->bi_bdev = rdev->bdev; | ||
1850 | if (submit_bio_wait(WRITE, wbio) == 0) | ||
1851 | /* failure! */ | ||
1852 | ok = rdev_set_badblocks(rdev, sector, | ||
1853 | sectors, 0) | ||
1854 | && ok; | ||
1855 | |||
1856 | bio_put(wbio); | ||
1857 | sect_to_write -= sectors; | ||
1858 | sector += sectors; | ||
1859 | sectors = block_sectors; | ||
1860 | } | ||
1861 | return ok; | ||
1862 | } | ||
1863 | |||
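narrow_write_error() above retries the failed write one bad-block-table granule at a time, so a failure can be recorded at the finest grain the table supports. The only fiddly part is the first chunk, which must end on a granule boundary: rounding sector up to the next multiple of block_sectors gives its length. A worked standalone model of the chunking (values made up; only the arithmetic is taken from the hunk):

#include <stdio.h>

int main(void)
{
	int block_sectors = 8;		/* 1 << rdev->badblocks.shift */
	long long sector = 1003;	/* r1_bio->sector */
	int sect_to_write = 30;		/* r1_bio->sectors */

	/* first chunk: up to the next block_sectors boundary -> 5 */
	int sectors = (int)(((sector + block_sectors)
			     & ~(long long)(block_sectors - 1)) - sector);

	while (sect_to_write) {
		if (sectors > sect_to_write)
			sectors = sect_to_write;
		/* clone+trim the bio and write [sector, sector+sectors);
		 * on failure, record exactly this range as bad */
		printf("write %lld..%lld (%d sectors)\n",
		       sector, sector + sectors - 1, sectors);
		sect_to_write -= sectors;
		sector += sectors;
		sectors = block_sectors;   /* later chunks are aligned */
	}
	return 0;
}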
1864 | static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio) | ||
1865 | { | ||
1866 | int m; | ||
1867 | int s = r1_bio->sectors; | ||
1868 | for (m = 0; m < conf->raid_disks ; m++) { | ||
1869 | mdk_rdev_t *rdev = conf->mirrors[m].rdev; | ||
1870 | struct bio *bio = r1_bio->bios[m]; | ||
1871 | if (bio->bi_end_io == NULL) | ||
1872 | continue; | ||
1873 | if (test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
1874 | test_bit(R1BIO_MadeGood, &r1_bio->state)) { | ||
1875 | rdev_clear_badblocks(rdev, r1_bio->sector, s); | ||
1876 | } | ||
1877 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
1878 | test_bit(R1BIO_WriteError, &r1_bio->state)) { | ||
1879 | if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) | ||
1880 | md_error(conf->mddev, rdev); | ||
1881 | } | ||
1882 | } | ||
1883 | put_buf(r1_bio); | ||
1884 | md_done_sync(conf->mddev, s, 1); | ||
1885 | } | ||
1886 | |||
1887 | static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio) | ||
1888 | { | ||
1889 | int m; | ||
1890 | for (m = 0; m < conf->raid_disks ; m++) | ||
1891 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | ||
1892 | mdk_rdev_t *rdev = conf->mirrors[m].rdev; | ||
1893 | rdev_clear_badblocks(rdev, | ||
1894 | r1_bio->sector, | ||
1895 | r1_bio->sectors); | ||
1896 | rdev_dec_pending(rdev, conf->mddev); | ||
1897 | } else if (r1_bio->bios[m] != NULL) { | ||
1898 | /* This drive got a write error. We need to | ||
1899 | * narrow down and record precise write | ||
1900 | * errors. | ||
1901 | */ | ||
1902 | if (!narrow_write_error(r1_bio, m)) { | ||
1903 | md_error(conf->mddev, | ||
1904 | conf->mirrors[m].rdev); | ||
1905 | /* an I/O failed, we can't clear the bitmap */ | ||
1906 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
1907 | } | ||
1908 | rdev_dec_pending(conf->mirrors[m].rdev, | ||
1909 | conf->mddev); | ||
1910 | } | ||
1911 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | ||
1912 | close_write(r1_bio); | ||
1913 | raid_end_bio_io(r1_bio); | ||
1914 | } | ||
1915 | |||
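handle_write_finished() can key off the bios[] slots because the pointers double as a tri-state: NULL (nothing to do), the small-integer sentinels IO_BLOCKED (1) and IO_MADE_GOOD (2), or a real bio whose write failed and needs narrowing. The BIO_SPECIAL() test in raid1.h guards bio_put() against the sentinels. A compact model of the same trick (not the kernel macros, just the encoding):

#include <stdio.h>

struct bio { int dummy; };

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(b) ((unsigned long)(b) <= 2)

static void handle_slot(struct bio *b)
{
	if (b == NULL)
		printf("slot empty: nothing to do\n");
	else if (b == IO_MADE_GOOD)
		printf("write over a known-bad block worked: clear record\n");
	else if (b == IO_BLOCKED)
		printf("slot deliberately skipped\n");
	else
		printf("real bio: write failed, narrow and record\n");
	if (!BIO_SPECIAL(b))	/* only real bios may be put */
		printf("  ... bio_put()\n");
}

int main(void)
{
	struct bio real;

	handle_slot(NULL);
	handle_slot(IO_MADE_GOOD);
	handle_slot(IO_BLOCKED);
	handle_slot(&real);
	return 0;
}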
1916 | static void handle_read_error(conf_t *conf, r1bio_t *r1_bio) | ||
1917 | { | ||
1918 | int disk; | ||
1919 | int max_sectors; | ||
1920 | mddev_t *mddev = conf->mddev; | ||
1921 | struct bio *bio; | ||
1922 | char b[BDEVNAME_SIZE]; | ||
1923 | mdk_rdev_t *rdev; | ||
1924 | |||
1925 | clear_bit(R1BIO_ReadError, &r1_bio->state); | ||
1926 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
1927 | * the block and we can fix it. | ||
1928 | * We freeze all other IO, and try reading the block from | ||
1929 | * other devices. When we find one, we re-write | ||
1930 | * and check whether that fixes the read error. | ||
1931 | * This is all done synchronously while the array is | ||
1932 | * frozen | ||
1933 | */ | ||
1934 | if (mddev->ro == 0) { | ||
1935 | freeze_array(conf); | ||
1936 | fix_read_error(conf, r1_bio->read_disk, | ||
1937 | r1_bio->sector, r1_bio->sectors); | ||
1938 | unfreeze_array(conf); | ||
1939 | } else | ||
1940 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1941 | |||
1942 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1943 | bdevname(bio->bi_bdev, b); | ||
1944 | read_more: | ||
1945 | disk = read_balance(conf, r1_bio, &max_sectors); | ||
1946 | if (disk == -1) { | ||
1947 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | ||
1948 | " read error for block %llu\n", | ||
1949 | mdname(mddev), b, (unsigned long long)r1_bio->sector); | ||
1950 | raid_end_bio_io(r1_bio); | ||
1951 | } else { | ||
1952 | const unsigned long do_sync | ||
1953 | = r1_bio->master_bio->bi_rw & REQ_SYNC; | ||
1954 | if (bio) { | ||
1955 | r1_bio->bios[r1_bio->read_disk] = | ||
1956 | mddev->ro ? IO_BLOCKED : NULL; | ||
1957 | bio_put(bio); | ||
1958 | } | ||
1959 | r1_bio->read_disk = disk; | ||
1960 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | ||
1961 | md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); | ||
1962 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
1963 | rdev = conf->mirrors[disk].rdev; | ||
1964 | printk_ratelimited(KERN_ERR | ||
1965 | "md/raid1:%s: redirecting sector %llu" | ||
1966 | " to other mirror: %s\n", | ||
1967 | mdname(mddev), | ||
1968 | (unsigned long long)r1_bio->sector, | ||
1969 | bdevname(rdev->bdev, b)); | ||
1970 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
1971 | bio->bi_bdev = rdev->bdev; | ||
1972 | bio->bi_end_io = raid1_end_read_request; | ||
1973 | bio->bi_rw = READ | do_sync; | ||
1974 | bio->bi_private = r1_bio; | ||
1975 | if (max_sectors < r1_bio->sectors) { | ||
1976 | /* Drat - have to split this up more */ | ||
1977 | struct bio *mbio = r1_bio->master_bio; | ||
1978 | int sectors_handled = (r1_bio->sector + max_sectors | ||
1979 | - mbio->bi_sector); | ||
1980 | r1_bio->sectors = max_sectors; | ||
1981 | spin_lock_irq(&conf->device_lock); | ||
1982 | if (mbio->bi_phys_segments == 0) | ||
1983 | mbio->bi_phys_segments = 2; | ||
1984 | else | ||
1985 | mbio->bi_phys_segments++; | ||
1986 | spin_unlock_irq(&conf->device_lock); | ||
1987 | generic_make_request(bio); | ||
1988 | bio = NULL; | ||
1989 | |||
1990 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
1991 | |||
1992 | r1_bio->master_bio = mbio; | ||
1993 | r1_bio->sectors = (mbio->bi_size >> 9) | ||
1994 | - sectors_handled; | ||
1995 | r1_bio->state = 0; | ||
1996 | set_bit(R1BIO_ReadError, &r1_bio->state); | ||
1997 | r1_bio->mddev = mddev; | ||
1998 | r1_bio->sector = mbio->bi_sector + sectors_handled; | ||
1999 | |||
2000 | goto read_more; | ||
2001 | } else | ||
2002 | generic_make_request(bio); | ||
2003 | } | ||
2004 | } | ||
2005 | |||
1511 | static void raid1d(mddev_t *mddev) | 2006 | static void raid1d(mddev_t *mddev) |
1512 | { | 2007 | { |
1513 | r1bio_t *r1_bio; | 2008 | r1bio_t *r1_bio; |
1514 | struct bio *bio; | ||
1515 | unsigned long flags; | 2009 | unsigned long flags; |
1516 | conf_t *conf = mddev->private; | 2010 | conf_t *conf = mddev->private; |
1517 | struct list_head *head = &conf->retry_list; | 2011 | struct list_head *head = &conf->retry_list; |
1518 | mdk_rdev_t *rdev; | ||
1519 | struct blk_plug plug; | 2012 | struct blk_plug plug; |
1520 | 2013 | ||
1521 | md_check_recovery(mddev); | 2014 | md_check_recovery(mddev); |
1522 | 2015 | ||
1523 | blk_start_plug(&plug); | 2016 | blk_start_plug(&plug); |
1524 | for (;;) { | 2017 | for (;;) { |
1525 | char b[BDEVNAME_SIZE]; | ||
1526 | 2018 | ||
1527 | if (atomic_read(&mddev->plug_cnt) == 0) | 2019 | if (atomic_read(&mddev->plug_cnt) == 0) |
1528 | flush_pending_writes(conf); | 2020 | flush_pending_writes(conf); |
@@ -1539,62 +2031,26 @@ static void raid1d(mddev_t *mddev) | |||
1539 | 2031 | ||
1540 | mddev = r1_bio->mddev; | 2032 | mddev = r1_bio->mddev; |
1541 | conf = mddev->private; | 2033 | conf = mddev->private; |
1542 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) | 2034 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { |
1543 | sync_request_write(mddev, r1_bio); | 2035 | if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
1544 | else { | 2036 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
1545 | int disk; | 2037 | handle_sync_write_finished(conf, r1_bio); |
1546 | 2038 | else | |
1547 | /* we got a read error. Maybe the drive is bad. Maybe just | 2039 | sync_request_write(mddev, r1_bio); |
1548 | * the block and we can fix it. | 2040 | } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
1549 | * We freeze all other IO, and try reading the block from | 2041 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
1550 | * other devices. When we find one, we re-write | 2042 | handle_write_finished(conf, r1_bio); |
1551 | * and check it that fixes the read error. | 2043 | else if (test_bit(R1BIO_ReadError, &r1_bio->state)) |
1552 | * This is all done synchronously while the array is | 2044 | handle_read_error(conf, r1_bio); |
1553 | * frozen | 2045 | else |
2046 | /* just a partial read to be scheduled from separate | ||
2047 | * context | ||
1554 | */ | 2048 | */ |
1555 | if (mddev->ro == 0) { | 2049 | generic_make_request(r1_bio->bios[r1_bio->read_disk]); |
1556 | freeze_array(conf); | 2050 | |
1557 | fix_read_error(conf, r1_bio->read_disk, | ||
1558 | r1_bio->sector, | ||
1559 | r1_bio->sectors); | ||
1560 | unfreeze_array(conf); | ||
1561 | } else | ||
1562 | md_error(mddev, | ||
1563 | conf->mirrors[r1_bio->read_disk].rdev); | ||
1564 | |||
1565 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1566 | if ((disk=read_balance(conf, r1_bio)) == -1) { | ||
1567 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | ||
1568 | " read error for block %llu\n", | ||
1569 | mdname(mddev), | ||
1570 | bdevname(bio->bi_bdev,b), | ||
1571 | (unsigned long long)r1_bio->sector); | ||
1572 | raid_end_bio_io(r1_bio); | ||
1573 | } else { | ||
1574 | const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; | ||
1575 | r1_bio->bios[r1_bio->read_disk] = | ||
1576 | mddev->ro ? IO_BLOCKED : NULL; | ||
1577 | r1_bio->read_disk = disk; | ||
1578 | bio_put(bio); | ||
1579 | bio = bio_clone_mddev(r1_bio->master_bio, | ||
1580 | GFP_NOIO, mddev); | ||
1581 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
1582 | rdev = conf->mirrors[disk].rdev; | ||
1583 | if (printk_ratelimit()) | ||
1584 | printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" | ||
1585 | " other mirror: %s\n", | ||
1586 | mdname(mddev), | ||
1587 | (unsigned long long)r1_bio->sector, | ||
1588 | bdevname(rdev->bdev,b)); | ||
1589 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
1590 | bio->bi_bdev = rdev->bdev; | ||
1591 | bio->bi_end_io = raid1_end_read_request; | ||
1592 | bio->bi_rw = READ | do_sync; | ||
1593 | bio->bi_private = r1_bio; | ||
1594 | generic_make_request(bio); | ||
1595 | } | ||
1596 | } | ||
1597 | cond_resched(); | 2051 | cond_resched(); |
2052 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
2053 | md_check_recovery(mddev); | ||
1598 | } | 2054 | } |
1599 | blk_finish_plug(&plug); | 2055 | blk_finish_plug(&plug); |
1600 | } | 2056 | } |
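After this refactor raid1d is a thin dispatcher: each retried r1_bio carries state bits and the loop routes it to the matching handler instead of open-coding read-error recovery inline. The decision tree, reduced to a standalone shape (flag values arbitrary; ordering taken from the hunk above):

#include <stdio.h>

#define R1BIO_IsSync     (1 << 0)
#define R1BIO_ReadError  (1 << 1)
#define R1BIO_MadeGood   (1 << 2)
#define R1BIO_WriteError (1 << 3)

static const char *dispatch(unsigned state)
{
	if (state & R1BIO_IsSync)
		return (state & (R1BIO_MadeGood | R1BIO_WriteError))
			? "handle_sync_write_finished"
			: "sync_request_write";
	if (state & (R1BIO_MadeGood | R1BIO_WriteError))
		return "handle_write_finished";
	if (state & R1BIO_ReadError)
		return "handle_read_error";
	return "resubmit partial read";
}

int main(void)
{
	printf("%s\n", dispatch(R1BIO_IsSync | R1BIO_MadeGood));
	printf("%s\n", dispatch(R1BIO_WriteError));
	printf("%s\n", dispatch(R1BIO_ReadError));
	printf("%s\n", dispatch(0));
	return 0;
}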
@@ -1636,6 +2092,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1636 | int write_targets = 0, read_targets = 0; | 2092 | int write_targets = 0, read_targets = 0; |
1637 | sector_t sync_blocks; | 2093 | sector_t sync_blocks; |
1638 | int still_degraded = 0; | 2094 | int still_degraded = 0; |
2095 | int good_sectors = RESYNC_SECTORS; | ||
2096 | int min_bad = 0; /* number of sectors that are bad in all devices */ | ||
1639 | 2097 | ||
1640 | if (!conf->r1buf_pool) | 2098 | if (!conf->r1buf_pool) |
1641 | if (init_resync(conf)) | 2099 | if (init_resync(conf)) |
@@ -1723,36 +2181,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1723 | 2181 | ||
1724 | rdev = rcu_dereference(conf->mirrors[i].rdev); | 2182 | rdev = rcu_dereference(conf->mirrors[i].rdev); |
1725 | if (rdev == NULL || | 2183 | if (rdev == NULL || |
1726 | test_bit(Faulty, &rdev->flags)) { | 2184 | test_bit(Faulty, &rdev->flags)) { |
1727 | still_degraded = 1; | 2185 | still_degraded = 1; |
1728 | continue; | ||
1729 | } else if (!test_bit(In_sync, &rdev->flags)) { | 2186 | } else if (!test_bit(In_sync, &rdev->flags)) { |
1730 | bio->bi_rw = WRITE; | 2187 | bio->bi_rw = WRITE; |
1731 | bio->bi_end_io = end_sync_write; | 2188 | bio->bi_end_io = end_sync_write; |
1732 | write_targets ++; | 2189 | write_targets ++; |
1733 | } else { | 2190 | } else { |
1734 | /* may need to read from here */ | 2191 | /* may need to read from here */ |
1735 | bio->bi_rw = READ; | 2192 | sector_t first_bad = MaxSector; |
1736 | bio->bi_end_io = end_sync_read; | 2193 | int bad_sectors; |
1737 | if (test_bit(WriteMostly, &rdev->flags)) { | 2194 | |
1738 | if (wonly < 0) | 2195 | if (is_badblock(rdev, sector_nr, good_sectors, |
1739 | wonly = i; | 2196 | &first_bad, &bad_sectors)) { |
1740 | } else { | 2197 | if (first_bad > sector_nr) |
1741 | if (disk < 0) | 2198 | good_sectors = first_bad - sector_nr; |
1742 | disk = i; | 2199 | else { |
2200 | bad_sectors -= (sector_nr - first_bad); | ||
2201 | if (min_bad == 0 || | ||
2202 | min_bad > bad_sectors) | ||
2203 | min_bad = bad_sectors; | ||
2204 | } | ||
2205 | } | ||
2206 | if (sector_nr < first_bad) { | ||
2207 | if (test_bit(WriteMostly, &rdev->flags)) { | ||
2208 | if (wonly < 0) | ||
2209 | wonly = i; | ||
2210 | } else { | ||
2211 | if (disk < 0) | ||
2212 | disk = i; | ||
2213 | } | ||
2214 | bio->bi_rw = READ; | ||
2215 | bio->bi_end_io = end_sync_read; | ||
2216 | read_targets++; | ||
1743 | } | 2217 | } |
1744 | read_targets++; | ||
1745 | } | 2218 | } |
1746 | atomic_inc(&rdev->nr_pending); | 2219 | if (bio->bi_end_io) { |
1747 | bio->bi_sector = sector_nr + rdev->data_offset; | 2220 | atomic_inc(&rdev->nr_pending); |
1748 | bio->bi_bdev = rdev->bdev; | 2221 | bio->bi_sector = sector_nr + rdev->data_offset; |
1749 | bio->bi_private = r1_bio; | 2222 | bio->bi_bdev = rdev->bdev; |
2223 | bio->bi_private = r1_bio; | ||
2224 | } | ||
1750 | } | 2225 | } |
1751 | rcu_read_unlock(); | 2226 | rcu_read_unlock(); |
1752 | if (disk < 0) | 2227 | if (disk < 0) |
1753 | disk = wonly; | 2228 | disk = wonly; |
1754 | r1_bio->read_disk = disk; | 2229 | r1_bio->read_disk = disk; |
1755 | 2230 | ||
2231 | if (read_targets == 0 && min_bad > 0) { | ||
2232 | /* These sectors are bad on all InSync devices, so we | ||
2233 | * need to mark them bad on all write targets | ||
2234 | */ | ||
2235 | int ok = 1; | ||
2236 | for (i = 0 ; i < conf->raid_disks ; i++) | ||
2237 | if (r1_bio->bios[i]->bi_end_io == end_sync_write) { | ||
2238 | mdk_rdev_t *rdev = | ||
2239 | rcu_dereference(conf->mirrors[i].rdev); | ||
2240 | ok = rdev_set_badblocks(rdev, sector_nr, | ||
2241 | min_bad, 0 | ||
2242 | ) && ok; | ||
2243 | } | ||
2244 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
2245 | *skipped = 1; | ||
2246 | put_buf(r1_bio); | ||
2247 | |||
2248 | if (!ok) { | ||
2249 | /* Cannot record the badblocks, so need to | ||
2250 | * abort the resync. | ||
2251 | * If there are multiple read targets, could just | ||
2252 | * fail the really bad ones ??? | ||
2253 | */ | ||
2254 | conf->recovery_disabled = mddev->recovery_disabled; | ||
2255 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
2256 | return 0; | ||
2257 | } else | ||
2258 | return min_bad; | ||
2259 | |||
2260 | } | ||
2261 | if (min_bad > 0 && min_bad < good_sectors) { | ||
2262 | /* only resync enough to reach the next bad->good | ||
2263 | * transition */ | ||
2264 | good_sectors = min_bad; | ||
2265 | } | ||
2266 | |||
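During resync the per-device bad-block ranges are folded into two numbers: good_sectors, the longest clean prefix readable from at least one in-sync device, and min_bad, the stretch that is bad everywhere (which is then written off as bad on all write targets rather than resynced). A standalone fold over two example devices, each modeled with a single bad range (layout invented for illustration):

#include <stdio.h>

#define RESYNC_SECTORS 128
#define MAX_SECTOR ((long long)1 << 62)

struct dev { long long bb_start; int bb_len; };

int main(void)
{
	long long sector_nr = 1000;
	int good_sectors = RESYNC_SECTORS;
	int min_bad = 0;	/* sectors bad on every in-sync device */
	struct dev devs[] = {
		{ 1040, 16 },	/* bad range ahead of sector_nr */
		{  990, 20 },	/* bad range covering sector_nr */
	};
	int i;

	for (i = 0; i < 2; i++) {
		long long first_bad = MAX_SECTOR;
		int bad_sectors = 0;

		if (devs[i].bb_start < sector_nr + good_sectors &&
		    devs[i].bb_start + devs[i].bb_len > sector_nr) {
			first_bad = devs[i].bb_start;
			bad_sectors = devs[i].bb_len;
			if (first_bad > sector_nr)
				good_sectors = (int)(first_bad - sector_nr);
			else {
				bad_sectors -= (int)(sector_nr - first_bad);
				if (min_bad == 0 || min_bad > bad_sectors)
					min_bad = bad_sectors;
			}
		}
		/* device is a read target only if sector_nr < first_bad */
	}
	if (min_bad > 0 && min_bad < good_sectors)
		good_sectors = min_bad;	/* stop at next bad->good edge */
	printf("good_sectors=%d min_bad=%d\n", good_sectors, min_bad);
	return 0;
}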
1756 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) | 2267 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) |
1757 | /* extra read targets are also write targets */ | 2268 | /* extra read targets are also write targets */ |
1758 | write_targets += read_targets-1; | 2269 | write_targets += read_targets-1; |
@@ -1769,6 +2280,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1769 | 2280 | ||
1770 | if (max_sector > mddev->resync_max) | 2281 | if (max_sector > mddev->resync_max) |
1771 | max_sector = mddev->resync_max; /* Don't do IO beyond here */ | 2282 | max_sector = mddev->resync_max; /* Don't do IO beyond here */ |
2283 | if (max_sector > sector_nr + good_sectors) | ||
2284 | max_sector = sector_nr + good_sectors; | ||
1772 | nr_sectors = 0; | 2285 | nr_sectors = 0; |
1773 | sync_blocks = 0; | 2286 | sync_blocks = 0; |
1774 | do { | 2287 | do { |
@@ -2154,18 +2667,13 @@ static int raid1_reshape(mddev_t *mddev) | |||
2154 | for (d = d2 = 0; d < conf->raid_disks; d++) { | 2667 | for (d = d2 = 0; d < conf->raid_disks; d++) { |
2155 | mdk_rdev_t *rdev = conf->mirrors[d].rdev; | 2668 | mdk_rdev_t *rdev = conf->mirrors[d].rdev; |
2156 | if (rdev && rdev->raid_disk != d2) { | 2669 | if (rdev && rdev->raid_disk != d2) { |
2157 | char nm[20]; | 2670 | sysfs_unlink_rdev(mddev, rdev); |
2158 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
2159 | sysfs_remove_link(&mddev->kobj, nm); | ||
2160 | rdev->raid_disk = d2; | 2671 | rdev->raid_disk = d2; |
2161 | sprintf(nm, "rd%d", rdev->raid_disk); | 2672 | sysfs_unlink_rdev(mddev, rdev); |
2162 | sysfs_remove_link(&mddev->kobj, nm); | 2673 | if (sysfs_link_rdev(mddev, rdev)) |
2163 | if (sysfs_create_link(&mddev->kobj, | ||
2164 | &rdev->kobj, nm)) | ||
2165 | printk(KERN_WARNING | 2674 | printk(KERN_WARNING |
2166 | "md/raid1:%s: cannot register " | 2675 | "md/raid1:%s: cannot register rd%d\n", |
2167 | "%s\n", | 2676 | mdname(mddev), rdev->raid_disk); |
2168 | mdname(mddev), nm); | ||
2169 | } | 2677 | } |
2170 | if (rdev) | 2678 | if (rdev) |
2171 | newmirrors[d2++].rdev = rdev; | 2679 | newmirrors[d2++].rdev = rdev; |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e743a64fac4f..e0d676b48974 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -48,6 +48,12 @@ struct r1_private_data_s { | |||
48 | * (fresh device added). | 48 | * (fresh device added). |
49 | * Cleared when a sync completes. | 49 | * Cleared when a sync completes. |
50 | */ | 50 | */ |
51 | int recovery_disabled; /* when the same as | ||
52 | * mddev->recovery_disabled | ||
53 | * we don't allow recovery | ||
54 | * to be attempted as we | ||
55 | * expect a read error | ||
56 | */ | ||
51 | 57 | ||
52 | wait_queue_head_t wait_barrier; | 58 | wait_queue_head_t wait_barrier; |
53 | 59 | ||
@@ -95,7 +101,7 @@ struct r1bio_s { | |||
95 | 101 | ||
96 | struct list_head retry_list; | 102 | struct list_head retry_list; |
97 | /* Next two are only valid when R1BIO_BehindIO is set */ | 103 | /* Next two are only valid when R1BIO_BehindIO is set */ |
98 | struct page **behind_pages; | 104 | struct bio_vec *behind_bvecs; |
99 | int behind_page_count; | 105 | int behind_page_count; |
100 | /* | 106 | /* |
101 | * if the IO is in WRITE direction, then multiple bios are used. | 107 | * if the IO is in WRITE direction, then multiple bios are used. |
@@ -110,13 +116,24 @@ struct r1bio_s { | |||
110 | * correct the read error. To keep track of bad blocks on a per-bio | 116 | * correct the read error. To keep track of bad blocks on a per-bio |
111 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | 117 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer |
112 | */ | 118 | */ |
113 | #define IO_BLOCKED ((struct bio*)1) | 119 | #define IO_BLOCKED ((struct bio *)1) |
120 | /* When we successfully write to a known bad-block, we need to remove the | ||
121 | * bad-block marking which must be done from process context. So we record | ||
122 | * the success by setting bios[n] to IO_MADE_GOOD | ||
123 | */ | ||
124 | #define IO_MADE_GOOD ((struct bio *)2) | ||
125 | |||
126 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
114 | 127 | ||
115 | /* bits for r1bio.state */ | 128 | /* bits for r1bio.state */ |
116 | #define R1BIO_Uptodate 0 | 129 | #define R1BIO_Uptodate 0 |
117 | #define R1BIO_IsSync 1 | 130 | #define R1BIO_IsSync 1 |
118 | #define R1BIO_Degraded 2 | 131 | #define R1BIO_Degraded 2 |
119 | #define R1BIO_BehindIO 3 | 132 | #define R1BIO_BehindIO 3 |
133 | /* Set ReadError on bios that experience a readerror so that | ||
134 | * raid1d knows what to do with them. | ||
135 | */ | ||
136 | #define R1BIO_ReadError 4 | ||
120 | /* For write-behind requests, we call bi_end_io when | 137 | /* For write-behind requests, we call bi_end_io when |
121 | * the last non-write-behind device completes, providing | 138 | * the last non-write-behind device completes, providing |
122 | * any write was successful. Otherwise we call when | 139 | * any write was successful. Otherwise we call when |
@@ -125,6 +142,11 @@ struct r1bio_s { | |||
125 | * Record that bi_end_io was called with this flag... | 142 | * Record that bi_end_io was called with this flag... |
126 | */ | 143 | */ |
127 | #define R1BIO_Returned 6 | 144 | #define R1BIO_Returned 6 |
145 | /* If a write for this request means we can clear some | ||
146 | * known-bad-block records, we set this flag | ||
147 | */ | ||
148 | #define R1BIO_MadeGood 7 | ||
149 | #define R1BIO_WriteError 8 | ||
128 | 150 | ||
129 | extern int md_raid1_congested(mddev_t *mddev, int bits); | 151 | extern int md_raid1_congested(mddev_t *mddev, int bits); |
130 | 152 | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e846688962f..8b29cd4f01c8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
23 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
24 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
25 | #include <linux/ratelimit.h> | ||
25 | #include "md.h" | 26 | #include "md.h" |
26 | #include "raid10.h" | 27 | #include "raid10.h" |
27 | #include "raid0.h" | 28 | #include "raid0.h" |
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
123 | for (j = 0 ; j < nalloc; j++) { | 124 | for (j = 0 ; j < nalloc; j++) { |
124 | bio = r10_bio->devs[j].bio; | 125 | bio = r10_bio->devs[j].bio; |
125 | for (i = 0; i < RESYNC_PAGES; i++) { | 126 | for (i = 0; i < RESYNC_PAGES; i++) { |
126 | page = alloc_page(gfp_flags); | 127 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, |
128 | &conf->mddev->recovery)) { | ||
129 | /* we can share bv_page's during recovery */ | ||
130 | struct bio *rbio = r10_bio->devs[0].bio; | ||
131 | page = rbio->bi_io_vec[i].bv_page; | ||
132 | get_page(page); | ||
133 | } else | ||
134 | page = alloc_page(gfp_flags); | ||
127 | if (unlikely(!page)) | 135 | if (unlikely(!page)) |
128 | goto out_free_pages; | 136 | goto out_free_pages; |
129 | 137 | ||
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | |||
173 | 181 | ||
174 | for (i = 0; i < conf->copies; i++) { | 182 | for (i = 0; i < conf->copies; i++) { |
175 | struct bio **bio = & r10_bio->devs[i].bio; | 183 | struct bio **bio = & r10_bio->devs[i].bio; |
176 | if (*bio && *bio != IO_BLOCKED) | 184 | if (!BIO_SPECIAL(*bio)) |
177 | bio_put(*bio); | 185 | bio_put(*bio); |
178 | *bio = NULL; | 186 | *bio = NULL; |
179 | } | 187 | } |
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
183 | { | 191 | { |
184 | conf_t *conf = r10_bio->mddev->private; | 192 | conf_t *conf = r10_bio->mddev->private; |
185 | 193 | ||
186 | /* | ||
187 | * Wake up any possible resync thread that waits for the device | ||
188 | * to go idle. | ||
189 | */ | ||
190 | allow_barrier(conf); | ||
191 | |||
192 | put_all_bios(conf, r10_bio); | 194 | put_all_bios(conf, r10_bio); |
193 | mempool_free(r10_bio, conf->r10bio_pool); | 195 | mempool_free(r10_bio, conf->r10bio_pool); |
194 | } | 196 | } |
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
227 | static void raid_end_bio_io(r10bio_t *r10_bio) | 229 | static void raid_end_bio_io(r10bio_t *r10_bio) |
228 | { | 230 | { |
229 | struct bio *bio = r10_bio->master_bio; | 231 | struct bio *bio = r10_bio->master_bio; |
232 | int done; | ||
233 | conf_t *conf = r10_bio->mddev->private; | ||
230 | 234 | ||
231 | bio_endio(bio, | 235 | if (bio->bi_phys_segments) { |
232 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | 236 | unsigned long flags; |
237 | spin_lock_irqsave(&conf->device_lock, flags); | ||
238 | bio->bi_phys_segments--; | ||
239 | done = (bio->bi_phys_segments == 0); | ||
240 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
241 | } else | ||
242 | done = 1; | ||
243 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
244 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
245 | if (done) { | ||
246 | bio_endio(bio, 0); | ||
247 | /* | ||
248 | * Wake up any possible resync thread that waits for the device | ||
249 | * to go idle. | ||
250 | */ | ||
251 | allow_barrier(conf); | ||
252 | } | ||
233 | free_r10bio(r10_bio); | 253 | free_r10bio(r10_bio); |
234 | } | 254 | } |
235 | 255 | ||
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) | |||
244 | r10_bio->devs[slot].addr + (r10_bio->sectors); | 264 | r10_bio->devs[slot].addr + (r10_bio->sectors); |
245 | } | 265 | } |
246 | 266 | ||
267 | /* | ||
268 | * Find the disk number which triggered given bio | ||
269 | */ | ||
270 | static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, | ||
271 | struct bio *bio, int *slotp) | ||
272 | { | ||
273 | int slot; | ||
274 | |||
275 | for (slot = 0; slot < conf->copies; slot++) | ||
276 | if (r10_bio->devs[slot].bio == bio) | ||
277 | break; | ||
278 | |||
279 | BUG_ON(slot == conf->copies); | ||
280 | update_head_pos(slot, r10_bio); | ||
281 | |||
282 | if (slotp) | ||
283 | *slotp = slot; | ||
284 | return r10_bio->devs[slot].devnum; | ||
285 | } | ||
286 | |||
247 | static void raid10_end_read_request(struct bio *bio, int error) | 287 | static void raid10_end_read_request(struct bio *bio, int error) |
248 | { | 288 | { |
249 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 289 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
@@ -277,34 +317,45 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
277 | * oops, read error - keep the refcount on the rdev | 317 | * oops, read error - keep the refcount on the rdev |
278 | */ | 318 | */ |
279 | char b[BDEVNAME_SIZE]; | 319 | char b[BDEVNAME_SIZE]; |
280 | if (printk_ratelimit()) | 320 | printk_ratelimited(KERN_ERR |
281 | printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", | 321 | "md/raid10:%s: %s: rescheduling sector %llu\n", |
282 | mdname(conf->mddev), | 322 | mdname(conf->mddev), |
283 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | 323 | bdevname(conf->mirrors[dev].rdev->bdev, b), |
324 | (unsigned long long)r10_bio->sector); | ||
325 | set_bit(R10BIO_ReadError, &r10_bio->state); | ||
284 | reschedule_retry(r10_bio); | 326 | reschedule_retry(r10_bio); |
285 | } | 327 | } |
286 | } | 328 | } |
287 | 329 | ||
330 | static void close_write(r10bio_t *r10_bio) | ||
331 | { | ||
332 | /* clear the bitmap if all writes complete successfully */ | ||
333 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | ||
334 | r10_bio->sectors, | ||
335 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
336 | 0); | ||
337 | md_write_end(r10_bio->mddev); | ||
338 | } | ||
339 | |||
288 | static void raid10_end_write_request(struct bio *bio, int error) | 340 | static void raid10_end_write_request(struct bio *bio, int error) |
289 | { | 341 | { |
290 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 342 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
291 | r10bio_t *r10_bio = bio->bi_private; | 343 | r10bio_t *r10_bio = bio->bi_private; |
292 | int slot, dev; | 344 | int dev; |
345 | int dec_rdev = 1; | ||
293 | conf_t *conf = r10_bio->mddev->private; | 346 | conf_t *conf = r10_bio->mddev->private; |
347 | int slot; | ||
294 | 348 | ||
295 | for (slot = 0; slot < conf->copies; slot++) | 349 | dev = find_bio_disk(conf, r10_bio, bio, &slot); |
296 | if (r10_bio->devs[slot].bio == bio) | ||
297 | break; | ||
298 | dev = r10_bio->devs[slot].devnum; | ||
299 | 350 | ||
300 | /* | 351 | /* |
301 | * this branch is our 'one mirror IO has finished' event handler: | 352 | * this branch is our 'one mirror IO has finished' event handler: |
302 | */ | 353 | */ |
303 | if (!uptodate) { | 354 | if (!uptodate) { |
304 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | 355 | set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); |
305 | /* an I/O failed, we can't clear the bitmap */ | 356 | set_bit(R10BIO_WriteError, &r10_bio->state); |
306 | set_bit(R10BIO_Degraded, &r10_bio->state); | 357 | dec_rdev = 0; |
307 | } else | 358 | } else { |
308 | /* | 359 | /* |
309 | * Set R10BIO_Uptodate in our master bio, so that | 360 | * Set R10BIO_Uptodate in our master bio, so that |
310 | * we will return a good error code to the higher | 361 | * we will return a good error code to the higher |
@@ -314,9 +365,22 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
314 | * user-side. So if something waits for IO, then it will | 365 | * user-side. So if something waits for IO, then it will |
315 | * wait for the 'master' bio. | 366 | * wait for the 'master' bio. |
316 | */ | 367 | */ |
368 | sector_t first_bad; | ||
369 | int bad_sectors; | ||
370 | |||
317 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 371 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
318 | 372 | ||
319 | update_head_pos(slot, r10_bio); | 373 | /* Maybe we can clear some bad blocks. */ |
374 | if (is_badblock(conf->mirrors[dev].rdev, | ||
375 | r10_bio->devs[slot].addr, | ||
376 | r10_bio->sectors, | ||
377 | &first_bad, &bad_sectors)) { | ||
378 | bio_put(bio); | ||
379 | r10_bio->devs[slot].bio = IO_MADE_GOOD; | ||
380 | dec_rdev = 0; | ||
381 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
382 | } | ||
383 | } | ||
320 | 384 | ||
321 | /* | 385 | /* |
322 | * | 386 | * |
@@ -324,16 +388,18 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
324 | * already. | 388 | * already. |
325 | */ | 389 | */ |
326 | if (atomic_dec_and_test(&r10_bio->remaining)) { | 390 | if (atomic_dec_and_test(&r10_bio->remaining)) { |
327 | /* clear the bitmap if all writes complete successfully */ | 391 | if (test_bit(R10BIO_WriteError, &r10_bio->state)) |
328 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | 392 | reschedule_retry(r10_bio); |
329 | r10_bio->sectors, | 393 | else { |
330 | !test_bit(R10BIO_Degraded, &r10_bio->state), | 394 | close_write(r10_bio); |
331 | 0); | 395 | if (test_bit(R10BIO_MadeGood, &r10_bio->state)) |
332 | md_write_end(r10_bio->mddev); | 396 | reschedule_retry(r10_bio); |
333 | raid_end_bio_io(r10_bio); | 397 | else |
398 | raid_end_bio_io(r10_bio); | ||
399 | } | ||
334 | } | 400 | } |
335 | 401 | if (dec_rdev) | |
336 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | 402 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); |
337 | } | 403 | } |
338 | 404 | ||
339 | 405 | ||
@@ -484,11 +550,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
484 | * FIXME: possibly should rethink readbalancing and do it differently | 550 | * FIXME: possibly should rethink readbalancing and do it differently |
485 | * depending on near_copies / far_copies geometry. | 551 | * depending on near_copies / far_copies geometry. |
486 | */ | 552 | */ |
487 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 553 | static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) |
488 | { | 554 | { |
489 | const sector_t this_sector = r10_bio->sector; | 555 | const sector_t this_sector = r10_bio->sector; |
490 | int disk, slot; | 556 | int disk, slot; |
491 | const int sectors = r10_bio->sectors; | 557 | int sectors = r10_bio->sectors; |
558 | int best_good_sectors; | ||
492 | sector_t new_distance, best_dist; | 559 | sector_t new_distance, best_dist; |
493 | mdk_rdev_t *rdev; | 560 | mdk_rdev_t *rdev; |
494 | int do_balance; | 561 | int do_balance; |
@@ -497,8 +564,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
497 | raid10_find_phys(conf, r10_bio); | 564 | raid10_find_phys(conf, r10_bio); |
498 | rcu_read_lock(); | 565 | rcu_read_lock(); |
499 | retry: | 566 | retry: |
567 | sectors = r10_bio->sectors; | ||
500 | best_slot = -1; | 568 | best_slot = -1; |
501 | best_dist = MaxSector; | 569 | best_dist = MaxSector; |
570 | best_good_sectors = 0; | ||
502 | do_balance = 1; | 571 | do_balance = 1; |
503 | /* | 572 | /* |
504 | * Check if we can balance. We can balance on the whole | 573 | * Check if we can balance. We can balance on the whole |
@@ -511,6 +580,10 @@ retry: | |||
511 | do_balance = 0; | 580 | do_balance = 0; |
512 | 581 | ||
513 | for (slot = 0; slot < conf->copies ; slot++) { | 582 | for (slot = 0; slot < conf->copies ; slot++) { |
583 | sector_t first_bad; | ||
584 | int bad_sectors; | ||
585 | sector_t dev_sector; | ||
586 | |||
514 | if (r10_bio->devs[slot].bio == IO_BLOCKED) | 587 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
515 | continue; | 588 | continue; |
516 | disk = r10_bio->devs[slot].devnum; | 589 | disk = r10_bio->devs[slot].devnum; |
@@ -520,6 +593,37 @@ retry: | |||
520 | if (!test_bit(In_sync, &rdev->flags)) | 593 | if (!test_bit(In_sync, &rdev->flags)) |
521 | continue; | 594 | continue; |
522 | 595 | ||
596 | dev_sector = r10_bio->devs[slot].addr; | ||
597 | if (is_badblock(rdev, dev_sector, sectors, | ||
598 | &first_bad, &bad_sectors)) { | ||
599 | if (best_dist < MaxSector) | ||
600 | /* Already have a better slot */ | ||
601 | continue; | ||
602 | if (first_bad <= dev_sector) { | ||
603 | /* Cannot read here. If this is the | ||
604 | * 'primary' device, then we must not read | ||
605 | * beyond 'bad_sectors' from another device. | ||
606 | */ | ||
607 | bad_sectors -= (dev_sector - first_bad); | ||
608 | if (!do_balance && sectors > bad_sectors) | ||
609 | sectors = bad_sectors; | ||
610 | if (best_good_sectors > sectors) | ||
611 | best_good_sectors = sectors; | ||
612 | } else { | ||
613 | sector_t good_sectors = | ||
614 | first_bad - dev_sector; | ||
615 | if (good_sectors > best_good_sectors) { | ||
616 | best_good_sectors = good_sectors; | ||
617 | best_slot = slot; | ||
618 | } | ||
619 | if (!do_balance) | ||
620 | /* Must read from here */ | ||
621 | break; | ||
622 | } | ||
623 | continue; | ||
624 | } else | ||
625 | best_good_sectors = sectors; | ||
626 | |||
523 | if (!do_balance) | 627 | if (!do_balance) |
524 | break; | 628 | break; |
525 | 629 | ||
@@ -561,6 +665,7 @@ retry: | |||
561 | } else | 665 | } else |
562 | disk = -1; | 666 | disk = -1; |
563 | rcu_read_unlock(); | 667 | rcu_read_unlock(); |
668 | *max_sectors = best_good_sectors; | ||
564 | 669 | ||
565 | return disk; | 670 | return disk; |
566 | } | 671 | } |
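The raid10 read_balance() change mirrors raid1: a slot whose first bad block lies beyond dev_sector is still usable for the clean prefix, and among such slots the one with the most good sectors wins; the caller gets the usable length back through *max_sectors and splits the read if it falls short. The core comparison, reduced to a standalone pick-the-best loop (geometry and distance balancing ignored, data invented):

#include <stdio.h>

struct slot { long long dev_sector; long long first_bad; };

int main(void)
{
	int sectors = 64;		/* requested read length */
	int best_slot = -1;
	int best_good_sectors = 0;
	struct slot slots[] = {
		{ 2000, 2016 },	/* 16 good sectors before first bad */
		{ 5000, 5040 },	/* 40 good sectors before first bad */
	};
	int i;

	for (i = 0; i < 2; i++) {
		long long good = slots[i].first_bad - slots[i].dev_sector;

		if (good >= sectors) {		/* whole read is clean */
			best_slot = i;
			best_good_sectors = sectors;
			break;
		}
		if ((int)good > best_good_sectors) {
			best_good_sectors = (int)good;
			best_slot = i;
		}
	}
	/* caller: *max_sectors = best_good_sectors; maybe split the read */
	printf("slot %d, %d usable sectors\n", best_slot, best_good_sectors);
	return 0;
}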
@@ -734,6 +839,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
734 | unsigned long flags; | 839 | unsigned long flags; |
735 | mdk_rdev_t *blocked_rdev; | 840 | mdk_rdev_t *blocked_rdev; |
736 | int plugged; | 841 | int plugged; |
842 | int sectors_handled; | ||
843 | int max_sectors; | ||
737 | 844 | ||
738 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 845 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
739 | md_flush_request(mddev, bio); | 846 | md_flush_request(mddev, bio); |
@@ -808,12 +915,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
808 | r10_bio->sector = bio->bi_sector; | 915 | r10_bio->sector = bio->bi_sector; |
809 | r10_bio->state = 0; | 916 | r10_bio->state = 0; |
810 | 917 | ||
918 | /* We might need to issue multiple reads to different | ||
919 | * devices if there are bad blocks around, so we keep | ||
920 | * track of the number of reads in bio->bi_phys_segments. | ||
921 | * If this is 0, there is only one r10_bio and no locking | ||
922 | * will be needed when the request completes. If it is | ||
923 | * non-zero, then it is the number of not-completed requests. | ||
924 | */ | ||
925 | bio->bi_phys_segments = 0; | ||
926 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
927 | |||
811 | if (rw == READ) { | 928 | if (rw == READ) { |
812 | /* | 929 | /* |
813 | * read balancing logic: | 930 | * read balancing logic: |
814 | */ | 931 | */ |
815 | int disk = read_balance(conf, r10_bio); | 932 | int disk; |
816 | int slot = r10_bio->read_slot; | 933 | int slot; |
934 | |||
935 | read_again: | ||
936 | disk = read_balance(conf, r10_bio, &max_sectors); | ||
937 | slot = r10_bio->read_slot; | ||
817 | if (disk < 0) { | 938 | if (disk < 0) { |
818 | raid_end_bio_io(r10_bio); | 939 | raid_end_bio_io(r10_bio); |
819 | return 0; | 940 | return 0; |
@@ -821,6 +942,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
821 | mirror = conf->mirrors + disk; | 942 | mirror = conf->mirrors + disk; |
822 | 943 | ||
823 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 944 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
945 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, | ||
946 | max_sectors); | ||
824 | 947 | ||
825 | r10_bio->devs[slot].bio = read_bio; | 948 | r10_bio->devs[slot].bio = read_bio; |
826 | 949 | ||
@@ -831,7 +954,37 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
831 | read_bio->bi_rw = READ | do_sync; | 954 | read_bio->bi_rw = READ | do_sync; |
832 | read_bio->bi_private = r10_bio; | 955 | read_bio->bi_private = r10_bio; |
833 | 956 | ||
834 | generic_make_request(read_bio); | 957 | if (max_sectors < r10_bio->sectors) { |
958 | /* Could not read all from this device, so we will | ||
959 | * need another r10_bio. | ||
960 | */ | ||
961 | sectors_handled = (r10_bio->sector + max_sectors | ||
962 | - bio->bi_sector); | ||
963 | r10_bio->sectors = max_sectors; | ||
964 | spin_lock_irq(&conf->device_lock); | ||
965 | if (bio->bi_phys_segments == 0) | ||
966 | bio->bi_phys_segments = 2; | ||
967 | else | ||
968 | bio->bi_phys_segments++; | ||
969 | spin_unlock_irq(&conf->device_lock); | ||
970 | /* Cannot call generic_make_request directly | ||
971 | * as that will be queued in __generic_make_request | ||
972 | * and subsequent mempool_alloc might block | ||
973 | * waiting for it. so hand bio over to raid10d. | ||
974 | */ | ||
975 | reschedule_retry(r10_bio); | ||
976 | |||
977 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
978 | |||
979 | r10_bio->master_bio = bio; | ||
980 | r10_bio->sectors = ((bio->bi_size >> 9) | ||
981 | - sectors_handled); | ||
982 | r10_bio->state = 0; | ||
983 | r10_bio->mddev = mddev; | ||
984 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
985 | goto read_again; | ||
986 | } else | ||
987 | generic_make_request(read_bio); | ||
835 | return 0; | 988 | return 0; |
836 | } | 989 | } |
837 | 990 | ||
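The bi_phys_segments convention described in the comment above can be modelled in isolation: zero means exactly one outstanding r10_bio, the first split makes the count explicit at 2, and each further split increments it. A minimal sketch, assuming a plain counter in place of the real struct bio field (which the kernel updates under conf->device_lock):

```c
/* Userspace model of the bi_phys_segments accounting for split requests.
 * Names are illustrative; this is not the kernel structure. */
#include <assert.h>
#include <stdio.h>

struct master { int phys_segments; };	/* 0 => exactly one sub-request */

static void split(struct master *m)
{
	/* first split turns "implicitly one" into an explicit count of 2 */
	if (m->phys_segments == 0)
		m->phys_segments = 2;
	else
		m->phys_segments++;
}

static int complete_one(struct master *m)
{
	/* returns 1 when the master bio may be ended */
	if (m->phys_segments == 0)
		return 1;		/* the only sub-request finished */
	return --m->phys_segments == 0;
}

int main(void)
{
	struct master m = { 0 };

	split(&m);	/* request crossed a bad block once... */
	split(&m);	/* ...and again: three sub-requests in total */
	assert(m.phys_segments == 3);

	assert(!complete_one(&m));
	assert(!complete_one(&m));
	assert(complete_one(&m));	/* last completion ends the master */
	printf("master completed after 3 sub-requests\n");
	return 0;
}
```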
@@ -841,13 +994,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
841 | /* first select target devices under rcu_lock and | 994 | /* first select target devices under rcu_lock and |
842 | * inc refcount on their rdev. Record them by setting | 995 | * inc refcount on their rdev. Record them by setting |
843 | * bios[x] to bio | 996 | * bios[x] to bio |
997 | * If there are known/acknowledged bad blocks on any device | ||
998 | * on which we have seen a write error, we want to avoid | ||
999 | * writing to those blocks. This potentially requires several | ||
1000 | * writes to write around the bad blocks. Each set of writes | ||
1001 | * gets its own r10_bio with a set of bios attached. The number | ||
1002 | * of r10_bios is recorded in bio->bi_phys_segments just as with | ||
1003 | * the read case. | ||
844 | */ | 1004 | */ |
845 | plugged = mddev_check_plugged(mddev); | 1005 | plugged = mddev_check_plugged(mddev); |
846 | 1006 | ||
847 | raid10_find_phys(conf, r10_bio); | 1007 | raid10_find_phys(conf, r10_bio); |
848 | retry_write: | 1008 | retry_write: |
849 | blocked_rdev = NULL; | 1009 | blocked_rdev = NULL; |
850 | rcu_read_lock(); | 1010 | rcu_read_lock(); |
1011 | max_sectors = r10_bio->sectors; | ||
1012 | |||
851 | for (i = 0; i < conf->copies; i++) { | 1013 | for (i = 0; i < conf->copies; i++) { |
852 | int d = r10_bio->devs[i].devnum; | 1014 | int d = r10_bio->devs[i].devnum; |
853 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); | 1015 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); |
@@ -856,13 +1018,55 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
856 | blocked_rdev = rdev; | 1018 | blocked_rdev = rdev; |
857 | break; | 1019 | break; |
858 | } | 1020 | } |
859 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 1021 | r10_bio->devs[i].bio = NULL; |
860 | atomic_inc(&rdev->nr_pending); | 1022 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
861 | r10_bio->devs[i].bio = bio; | ||
862 | } else { | ||
863 | r10_bio->devs[i].bio = NULL; | ||
864 | set_bit(R10BIO_Degraded, &r10_bio->state); | 1023 | set_bit(R10BIO_Degraded, &r10_bio->state); |
1024 | continue; | ||
865 | } | 1025 | } |
1026 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
1027 | sector_t first_bad; | ||
1028 | sector_t dev_sector = r10_bio->devs[i].addr; | ||
1029 | int bad_sectors; | ||
1030 | int is_bad; | ||
1031 | |||
1032 | is_bad = is_badblock(rdev, dev_sector, | ||
1033 | max_sectors, | ||
1034 | &first_bad, &bad_sectors); | ||
1035 | if (is_bad < 0) { | ||
1036 | /* Mustn't write here until the bad block | ||
1037 | * is acknowledged | ||
1038 | */ | ||
1039 | atomic_inc(&rdev->nr_pending); | ||
1040 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
1041 | blocked_rdev = rdev; | ||
1042 | break; | ||
1043 | } | ||
1044 | if (is_bad && first_bad <= dev_sector) { | ||
1045 | /* Cannot write here at all */ | ||
1046 | bad_sectors -= (dev_sector - first_bad); | ||
1047 | if (bad_sectors < max_sectors) | ||
1048 | /* Mustn't write more than bad_sectors | ||
1049 | * to other devices yet | ||
1050 | */ | ||
1051 | max_sectors = bad_sectors; | ||
1052 | /* We don't set R10BIO_Degraded as that | ||
1053 | * only applies if the disk is missing, | ||
1054 | * so it might be re-added, and we want to | ||
1055 | * know to recover this chunk. | ||
1056 | * In this case the device is here, and the | ||
1057 | * fact that this chunk is not in-sync is | ||
1058 | * recorded in the bad block log. | ||
1059 | */ | ||
1060 | continue; | ||
1061 | } | ||
1062 | if (is_bad) { | ||
1063 | int good_sectors = first_bad - dev_sector; | ||
1064 | if (good_sectors < max_sectors) | ||
1065 | max_sectors = good_sectors; | ||
1066 | } | ||
1067 | } | ||
1068 | r10_bio->devs[i].bio = bio; | ||
1069 | atomic_inc(&rdev->nr_pending); | ||
866 | } | 1070 | } |
867 | rcu_read_unlock(); | 1071 | rcu_read_unlock(); |
868 | 1072 | ||
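The write path above reacts to three distinct is_badblock() outcomes: a negative result (unacknowledged bad blocks) blocks the write entirely, a bad run covering the start skips the device and caps what the other devices may write, and a bad run further in clamps the write to the good prefix. A userspace sketch of that decision — the enum, helper and values are illustrative:

```c
/* Sketch of the three-way is_badblock() handling in the write path.
 * is_bad mimics md's convention: -1 unacknowledged, 1 acknowledged
 * bad blocks in range, 0 clean. */
#include <stdio.h>

typedef unsigned long long sector_t;

enum write_decision { WRITE_BLOCKED, SKIP_DEVICE, WRITE_PREFIX, WRITE_ALL };

static enum write_decision decide(int is_bad, sector_t first_bad,
				  int bad_sectors, sector_t dev_sector,
				  int *max_sectors)
{
	if (is_bad < 0)
		return WRITE_BLOCKED;	/* wait for metadata to acknowledge */
	if (is_bad == 0)
		return WRITE_ALL;
	if (first_bad <= dev_sector) {
		/* cannot write here at all; other devices must not get
		 * more than the remaining bad run either */
		int remaining = bad_sectors - (int)(dev_sector - first_bad);
		if (remaining < *max_sectors)
			*max_sectors = remaining;
		return SKIP_DEVICE;
	}
	/* writable prefix up to the bad run */
	int good = (int)(first_bad - dev_sector);
	if (good < *max_sectors)
		*max_sectors = good;
	return WRITE_PREFIX;
}

int main(void)
{
	int max_sectors = 16;
	enum write_decision d = decide(1, 108, 8, 100, &max_sectors);

	printf("decision=%d, max_sectors clamped to %d\n", d, max_sectors);
	return 0;
}
```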
@@ -882,8 +1086,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
882 | goto retry_write; | 1086 | goto retry_write; |
883 | } | 1087 | } |
884 | 1088 | ||
1089 | if (max_sectors < r10_bio->sectors) { | ||
1090 | /* We are splitting this into multiple parts, so | ||
1091 | * we need to prepare for allocating another r10_bio. | ||
1092 | */ | ||
1093 | r10_bio->sectors = max_sectors; | ||
1094 | spin_lock_irq(&conf->device_lock); | ||
1095 | if (bio->bi_phys_segments == 0) | ||
1096 | bio->bi_phys_segments = 2; | ||
1097 | else | ||
1098 | bio->bi_phys_segments++; | ||
1099 | spin_unlock_irq(&conf->device_lock); | ||
1100 | } | ||
1101 | sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; | ||
1102 | |||
885 | atomic_set(&r10_bio->remaining, 1); | 1103 | atomic_set(&r10_bio->remaining, 1); |
886 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | 1104 | bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); |
887 | 1105 | ||
888 | for (i = 0; i < conf->copies; i++) { | 1106 | for (i = 0; i < conf->copies; i++) { |
889 | struct bio *mbio; | 1107 | struct bio *mbio; |
@@ -892,10 +1110,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
892 | continue; | 1110 | continue; |
893 | 1111 | ||
894 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1112 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1113 | md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, | ||
1114 | max_sectors); | ||
895 | r10_bio->devs[i].bio = mbio; | 1115 | r10_bio->devs[i].bio = mbio; |
896 | 1116 | ||
897 | mbio->bi_sector = r10_bio->devs[i].addr+ | 1117 | mbio->bi_sector = (r10_bio->devs[i].addr+ |
898 | conf->mirrors[d].rdev->data_offset; | 1118 | conf->mirrors[d].rdev->data_offset); |
899 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1119 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
900 | mbio->bi_end_io = raid10_end_write_request; | 1120 | mbio->bi_end_io = raid10_end_write_request; |
901 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1121 | mbio->bi_rw = WRITE | do_sync | do_fua; |
@@ -920,6 +1140,21 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
920 | /* In case raid10d snuck in to freeze_array */ | 1140 | /* In case raid10d snuck in to freeze_array */ |
921 | wake_up(&conf->wait_barrier); | 1141 | wake_up(&conf->wait_barrier); |
922 | 1142 | ||
1143 | if (sectors_handled < (bio->bi_size >> 9)) { | ||
1144 | /* We need another r10_bio. It has already been counted | ||
1145 | * in bio->bi_phys_segments. | ||
1146 | */ | ||
1147 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
1148 | |||
1149 | r10_bio->master_bio = bio; | ||
1150 | r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
1151 | |||
1152 | r10_bio->mddev = mddev; | ||
1153 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
1154 | r10_bio->state = 0; | ||
1155 | goto retry_write; | ||
1156 | } | ||
1157 | |||
923 | if (do_sync || !mddev->bitmap || !plugged) | 1158 | if (do_sync || !mddev->bitmap || !plugged) |
924 | md_wakeup_thread(mddev->thread); | 1159 | md_wakeup_thread(mddev->thread); |
925 | return 0; | 1160 | return 0; |
@@ -949,6 +1184,30 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
949 | seq_printf(seq, "]"); | 1184 | seq_printf(seq, "]"); |
950 | } | 1185 | } |
951 | 1186 | ||
1187 | /* check if there are enough drives for | ||
1188 | * every block to appear on at least one. | ||
1189 | * Don't consider the device numbered 'ignore' | ||
1190 | * as we might be about to remove it. | ||
1191 | */ | ||
1192 | static int enough(conf_t *conf, int ignore) | ||
1193 | { | ||
1194 | int first = 0; | ||
1195 | |||
1196 | do { | ||
1197 | int n = conf->copies; | ||
1198 | int cnt = 0; | ||
1199 | while (n--) { | ||
1200 | if (conf->mirrors[first].rdev && | ||
1201 | first != ignore) | ||
1202 | cnt++; | ||
1203 | first = (first+1) % conf->raid_disks; | ||
1204 | } | ||
1205 | if (cnt == 0) | ||
1206 | return 0; | ||
1207 | } while (first != 0); | ||
1208 | return 1; | ||
1209 | } | ||
1210 | |||
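enough() walks the disks round-robin in groups of conf->copies and requires a usable member in every group. A standalone model with plain arrays — assuming a near layout with raid_disks divisible by copies, so each group is a fixed mirror set:

```c
/* Userspace model of enough(); a plain int array stands in for
 * conf->mirrors[].rdev presence. */
#include <stdio.h>

static int enough(const int *present, int raid_disks, int copies, int ignore)
{
	int first = 0;

	do {
		int n = copies, cnt = 0;

		while (n--) {
			if (present[first] && first != ignore)
				cnt++;
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;	/* a group lost all its copies */
	} while (first != 0);
	return 1;
}

int main(void)
{
	/* 4 disks, 2 near copies: {0,1} and {2,3} mirror each other */
	int present[4] = { 1, 0, 1, 1 };

	printf("%d\n", enough(present, 4, 2, -1));	/* 1: disk 0 covers 1 */
	printf("%d\n", enough(present, 4, 2, 0));	/* 0: removing 0 loses data */
	return 0;
}
```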
952 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1211 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
953 | { | 1212 | { |
954 | char b[BDEVNAME_SIZE]; | 1213 | char b[BDEVNAME_SIZE]; |
@@ -961,13 +1220,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
961 | * else mark the drive as failed | 1220 | * else mark the drive as failed |
962 | */ | 1221 | */ |
963 | if (test_bit(In_sync, &rdev->flags) | 1222 | if (test_bit(In_sync, &rdev->flags) |
964 | && conf->raid_disks-mddev->degraded == 1) | 1223 | && !enough(conf, rdev->raid_disk)) |
965 | /* | 1224 | /* |
966 | * Don't fail the drive, just return an IO error. | 1225 | * Don't fail the drive, just return an IO error. |
967 | * The test should really be more sophisticated than | ||
968 | * "working_disks == 1", but it isn't critical, and | ||
969 | * can wait until we do more sophisticated "is the drive | ||
970 | * really dead" tests... | ||
971 | */ | 1226 | */ |
972 | return; | 1227 | return; |
973 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1228 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
@@ -980,6 +1235,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
980 | */ | 1235 | */ |
981 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1236 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
982 | } | 1237 | } |
1238 | set_bit(Blocked, &rdev->flags); | ||
983 | set_bit(Faulty, &rdev->flags); | 1239 | set_bit(Faulty, &rdev->flags); |
984 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1240 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
985 | printk(KERN_ALERT | 1241 | printk(KERN_ALERT |
@@ -1022,27 +1278,6 @@ static void close_sync(conf_t *conf) | |||
1022 | conf->r10buf_pool = NULL; | 1278 | conf->r10buf_pool = NULL; |
1023 | } | 1279 | } |
1024 | 1280 | ||
1025 | /* check if there are enough drives for | ||
1026 | * every block to appear on atleast one | ||
1027 | */ | ||
1028 | static int enough(conf_t *conf) | ||
1029 | { | ||
1030 | int first = 0; | ||
1031 | |||
1032 | do { | ||
1033 | int n = conf->copies; | ||
1034 | int cnt = 0; | ||
1035 | while (n--) { | ||
1036 | if (conf->mirrors[first].rdev) | ||
1037 | cnt++; | ||
1038 | first = (first+1) % conf->raid_disks; | ||
1039 | } | ||
1040 | if (cnt == 0) | ||
1041 | return 0; | ||
1042 | } while (first != 0); | ||
1043 | return 1; | ||
1044 | } | ||
1045 | |||
1046 | static int raid10_spare_active(mddev_t *mddev) | 1281 | static int raid10_spare_active(mddev_t *mddev) |
1047 | { | 1282 | { |
1048 | int i; | 1283 | int i; |
@@ -1078,7 +1313,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1078 | conf_t *conf = mddev->private; | 1313 | conf_t *conf = mddev->private; |
1079 | int err = -EEXIST; | 1314 | int err = -EEXIST; |
1080 | int mirror; | 1315 | int mirror; |
1081 | mirror_info_t *p; | ||
1082 | int first = 0; | 1316 | int first = 0; |
1083 | int last = conf->raid_disks - 1; | 1317 | int last = conf->raid_disks - 1; |
1084 | 1318 | ||
@@ -1087,44 +1321,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1087 | * very different from resync | 1321 | * very different from resync |
1088 | */ | 1322 | */ |
1089 | return -EBUSY; | 1323 | return -EBUSY; |
1090 | if (!enough(conf)) | 1324 | if (!enough(conf, -1)) |
1091 | return -EINVAL; | 1325 | return -EINVAL; |
1092 | 1326 | ||
1093 | if (rdev->raid_disk >= 0) | 1327 | if (rdev->raid_disk >= 0) |
1094 | first = last = rdev->raid_disk; | 1328 | first = last = rdev->raid_disk; |
1095 | 1329 | ||
1096 | if (rdev->saved_raid_disk >= 0 && | 1330 | if (rdev->saved_raid_disk >= first && |
1097 | rdev->saved_raid_disk >= first && | ||
1098 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1331 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
1099 | mirror = rdev->saved_raid_disk; | 1332 | mirror = rdev->saved_raid_disk; |
1100 | else | 1333 | else |
1101 | mirror = first; | 1334 | mirror = first; |
1102 | for ( ; mirror <= last ; mirror++) | 1335 | for ( ; mirror <= last ; mirror++) { |
1103 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1336 | mirror_info_t *p = &conf->mirrors[mirror]; |
1104 | 1337 | if (p->recovery_disabled == mddev->recovery_disabled) | |
1105 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1338 | continue; |
1106 | rdev->data_offset << 9); | 1339 | if (p->rdev) |
1107 | /* as we don't honour merge_bvec_fn, we must | 1340 | continue; |
1108 | * never risk violating it, so limit | ||
1109 | * ->max_segments to one lying with a single | ||
1110 | * page, as a one page request is never in | ||
1111 | * violation. | ||
1112 | */ | ||
1113 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1114 | blk_queue_max_segments(mddev->queue, 1); | ||
1115 | blk_queue_segment_boundary(mddev->queue, | ||
1116 | PAGE_CACHE_SIZE - 1); | ||
1117 | } | ||
1118 | 1341 | ||
1119 | p->head_position = 0; | 1342 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1120 | rdev->raid_disk = mirror; | 1343 | rdev->data_offset << 9); |
1121 | err = 0; | 1344 | /* as we don't honour merge_bvec_fn, we must |
1122 | if (rdev->saved_raid_disk != mirror) | 1345 | * never risk violating it, so limit |
1123 | conf->fullsync = 1; | 1346 | * ->max_segments to one lying with a single |
1124 | rcu_assign_pointer(p->rdev, rdev); | 1347 | * page, as a one page request is never in |
1125 | break; | 1348 | * violation. |
1349 | */ | ||
1350 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1351 | blk_queue_max_segments(mddev->queue, 1); | ||
1352 | blk_queue_segment_boundary(mddev->queue, | ||
1353 | PAGE_CACHE_SIZE - 1); | ||
1126 | } | 1354 | } |
1127 | 1355 | ||
1356 | p->head_position = 0; | ||
1357 | rdev->raid_disk = mirror; | ||
1358 | err = 0; | ||
1359 | if (rdev->saved_raid_disk != mirror) | ||
1360 | conf->fullsync = 1; | ||
1361 | rcu_assign_pointer(p->rdev, rdev); | ||
1362 | break; | ||
1363 | } | ||
1364 | |||
1128 | md_integrity_add_rdev(rdev, mddev); | 1365 | md_integrity_add_rdev(rdev, mddev); |
1129 | print_conf(conf); | 1366 | print_conf(conf); |
1130 | return err; | 1367 | return err; |
@@ -1149,7 +1386,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number) | |||
1149 | * is not possible. | 1386 | * is not possible. |
1150 | */ | 1387 | */ |
1151 | if (!test_bit(Faulty, &rdev->flags) && | 1388 | if (!test_bit(Faulty, &rdev->flags) && |
1152 | enough(conf)) { | 1389 | mddev->recovery_disabled != p->recovery_disabled && |
1390 | enough(conf, -1)) { | ||
1153 | err = -EBUSY; | 1391 | err = -EBUSY; |
1154 | goto abort; | 1392 | goto abort; |
1155 | } | 1393 | } |
@@ -1174,24 +1412,18 @@ static void end_sync_read(struct bio *bio, int error) | |||
1174 | { | 1412 | { |
1175 | r10bio_t *r10_bio = bio->bi_private; | 1413 | r10bio_t *r10_bio = bio->bi_private; |
1176 | conf_t *conf = r10_bio->mddev->private; | 1414 | conf_t *conf = r10_bio->mddev->private; |
1177 | int i,d; | 1415 | int d; |
1178 | 1416 | ||
1179 | for (i=0; i<conf->copies; i++) | 1417 | d = find_bio_disk(conf, r10_bio, bio, NULL); |
1180 | if (r10_bio->devs[i].bio == bio) | ||
1181 | break; | ||
1182 | BUG_ON(i == conf->copies); | ||
1183 | update_head_pos(i, r10_bio); | ||
1184 | d = r10_bio->devs[i].devnum; | ||
1185 | 1418 | ||
1186 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1419 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1187 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1420 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
1188 | else { | 1421 | else |
1422 | /* The write handler will notice the lack of | ||
1423 | * R10BIO_Uptodate and record any errors etc | ||
1424 | */ | ||
1189 | atomic_add(r10_bio->sectors, | 1425 | atomic_add(r10_bio->sectors, |
1190 | &conf->mirrors[d].rdev->corrected_errors); | 1426 | &conf->mirrors[d].rdev->corrected_errors); |
1191 | if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | ||
1192 | md_error(r10_bio->mddev, | ||
1193 | conf->mirrors[d].rdev); | ||
1194 | } | ||
1195 | 1427 | ||
1196 | /* for reconstruct, we always reschedule after a read. | 1428 | /* for reconstruct, we always reschedule after a read. |
1197 | * for resync, only after all reads | 1429 | * for resync, only after all reads |
@@ -1206,40 +1438,60 @@ static void end_sync_read(struct bio *bio, int error) | |||
1206 | } | 1438 | } |
1207 | } | 1439 | } |
1208 | 1440 | ||
1209 | static void end_sync_write(struct bio *bio, int error) | 1441 | static void end_sync_request(r10bio_t *r10_bio) |
1210 | { | 1442 | { |
1211 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1212 | r10bio_t *r10_bio = bio->bi_private; | ||
1213 | mddev_t *mddev = r10_bio->mddev; | 1443 | mddev_t *mddev = r10_bio->mddev; |
1214 | conf_t *conf = mddev->private; | ||
1215 | int i,d; | ||
1216 | |||
1217 | for (i = 0; i < conf->copies; i++) | ||
1218 | if (r10_bio->devs[i].bio == bio) | ||
1219 | break; | ||
1220 | d = r10_bio->devs[i].devnum; | ||
1221 | 1444 | ||
1222 | if (!uptodate) | ||
1223 | md_error(mddev, conf->mirrors[d].rdev); | ||
1224 | |||
1225 | update_head_pos(i, r10_bio); | ||
1226 | |||
1227 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1228 | while (atomic_dec_and_test(&r10_bio->remaining)) { | 1445 | while (atomic_dec_and_test(&r10_bio->remaining)) { |
1229 | if (r10_bio->master_bio == NULL) { | 1446 | if (r10_bio->master_bio == NULL) { |
1230 | /* the primary of several recovery bios */ | 1447 | /* the primary of several recovery bios */ |
1231 | sector_t s = r10_bio->sectors; | 1448 | sector_t s = r10_bio->sectors; |
1232 | put_buf(r10_bio); | 1449 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
1450 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
1451 | reschedule_retry(r10_bio); | ||
1452 | else | ||
1453 | put_buf(r10_bio); | ||
1233 | md_done_sync(mddev, s, 1); | 1454 | md_done_sync(mddev, s, 1); |
1234 | break; | 1455 | break; |
1235 | } else { | 1456 | } else { |
1236 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; | 1457 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; |
1237 | put_buf(r10_bio); | 1458 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
1459 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
1460 | reschedule_retry(r10_bio); | ||
1461 | else | ||
1462 | put_buf(r10_bio); | ||
1238 | r10_bio = r10_bio2; | 1463 | r10_bio = r10_bio2; |
1239 | } | 1464 | } |
1240 | } | 1465 | } |
1241 | } | 1466 | } |
1242 | 1467 | ||
1468 | static void end_sync_write(struct bio *bio, int error) | ||
1469 | { | ||
1470 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1471 | r10bio_t *r10_bio = bio->bi_private; | ||
1472 | mddev_t *mddev = r10_bio->mddev; | ||
1473 | conf_t *conf = mddev->private; | ||
1474 | int d; | ||
1475 | sector_t first_bad; | ||
1476 | int bad_sectors; | ||
1477 | int slot; | ||
1478 | |||
1479 | d = find_bio_disk(conf, r10_bio, bio, &slot); | ||
1480 | |||
1481 | if (!uptodate) { | ||
1482 | set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); | ||
1483 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
1484 | } else if (is_badblock(conf->mirrors[d].rdev, | ||
1485 | r10_bio->devs[slot].addr, | ||
1486 | r10_bio->sectors, | ||
1487 | &first_bad, &bad_sectors)) | ||
1488 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
1489 | |||
1490 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1491 | |||
1492 | end_sync_request(r10_bio); | ||
1493 | } | ||
1494 | |||
1243 | /* | 1495 | /* |
1244 | * Note: sync and recover are handled very differently for raid10 | 1496 | * Note: sync and recover are handled very differently for raid10 |
1245 | * This code is for resync. | 1497 | * This code is for resync. |
@@ -1299,11 +1551,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | |||
1299 | if (j == vcnt) | 1551 | if (j == vcnt) |
1300 | continue; | 1552 | continue; |
1301 | mddev->resync_mismatches += r10_bio->sectors; | 1553 | mddev->resync_mismatches += r10_bio->sectors; |
1554 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | ||
1555 | /* Don't fix anything. */ | ||
1556 | continue; | ||
1302 | } | 1557 | } |
1303 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | 1558 | /* Ok, we need to write this bio, either to correct an |
1304 | /* Don't fix anything. */ | 1559 | * inconsistency or to correct an unreadable block. |
1305 | continue; | ||
1306 | /* Ok, we need to write this bio | ||
1307 | * First we need to fixup bv_offset, bv_len and | 1560 | * First we need to fixup bv_offset, bv_len and |
1308 | * bi_vecs, as the read request might have corrupted these | 1561 | * bi_vecs, as the read request might have corrupted these |
1309 | */ | 1562 | */ |
@@ -1355,32 +1608,107 @@ done: | |||
1355 | * The second for writing. | 1608 | * The second for writing. |
1356 | * | 1609 | * |
1357 | */ | 1610 | */ |
1611 | static void fix_recovery_read_error(r10bio_t *r10_bio) | ||
1612 | { | ||
1613 | /* We got a read error during recovery. | ||
1614 | * We repeat the read in smaller page-sized sections. | ||
1615 | * If a read succeeds, write it to the new device or record | ||
1616 | * a bad block if we cannot. | ||
1617 | * If a read fails, record a bad block on both old and | ||
1618 | * new devices. | ||
1619 | */ | ||
1620 | mddev_t *mddev = r10_bio->mddev; | ||
1621 | conf_t *conf = mddev->private; | ||
1622 | struct bio *bio = r10_bio->devs[0].bio; | ||
1623 | sector_t sect = 0; | ||
1624 | int sectors = r10_bio->sectors; | ||
1625 | int idx = 0; | ||
1626 | int dr = r10_bio->devs[0].devnum; | ||
1627 | int dw = r10_bio->devs[1].devnum; | ||
1628 | |||
1629 | while (sectors) { | ||
1630 | int s = sectors; | ||
1631 | mdk_rdev_t *rdev; | ||
1632 | sector_t addr; | ||
1633 | int ok; | ||
1634 | |||
1635 | if (s > (PAGE_SIZE>>9)) | ||
1636 | s = PAGE_SIZE >> 9; | ||
1637 | |||
1638 | rdev = conf->mirrors[dr].rdev; | ||
1639 | addr = r10_bio->devs[0].addr + sect; | ||
1640 | ok = sync_page_io(rdev, | ||
1641 | addr, | ||
1642 | s << 9, | ||
1643 | bio->bi_io_vec[idx].bv_page, | ||
1644 | READ, false); | ||
1645 | if (ok) { | ||
1646 | rdev = conf->mirrors[dw].rdev; | ||
1647 | addr = r10_bio->devs[1].addr + sect; | ||
1648 | ok = sync_page_io(rdev, | ||
1649 | addr, | ||
1650 | s << 9, | ||
1651 | bio->bi_io_vec[idx].bv_page, | ||
1652 | WRITE, false); | ||
1653 | if (!ok) | ||
1654 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1655 | } | ||
1656 | if (!ok) { | ||
1657 | /* We don't worry if we cannot set a bad block - | ||
1658 | * it really is bad so there is no loss in not | ||
1659 | * recording it yet | ||
1660 | */ | ||
1661 | rdev_set_badblocks(rdev, addr, s, 0); | ||
1662 | |||
1663 | if (rdev != conf->mirrors[dw].rdev) { | ||
1664 | /* need bad block on destination too */ | ||
1665 | mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev; | ||
1666 | addr = r10_bio->devs[1].addr + sect; | ||
1667 | ok = rdev_set_badblocks(rdev2, addr, s, 0); | ||
1668 | if (!ok) { | ||
1669 | /* just abort the recovery */ | ||
1670 | printk(KERN_NOTICE | ||
1671 | "md/raid10:%s: recovery aborted" | ||
1672 | " due to read error\n", | ||
1673 | mdname(mddev)); | ||
1674 | |||
1675 | conf->mirrors[dw].recovery_disabled | ||
1676 | = mddev->recovery_disabled; | ||
1677 | set_bit(MD_RECOVERY_INTR, | ||
1678 | &mddev->recovery); | ||
1679 | break; | ||
1680 | } | ||
1681 | } | ||
1682 | } | ||
1683 | |||
1684 | sectors -= s; | ||
1685 | sect += s; | ||
1686 | idx++; | ||
1687 | } | ||
1688 | } | ||
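fix_recovery_read_error() above steps through the failed range one page at a time, so a single unreadable sector only poisons its page-sized chunk. The chunking alone, assuming 4KiB pages (PAGE_SIZE >> 9 == 8 sectors), looks like this:

```c
/* Sketch of the page-at-a-time chunking in fix_recovery_read_error().
 * PAGE_SECTORS assumes a 4KiB page; values are illustrative. */
#include <stdio.h>

#define PAGE_SECTORS 8	/* PAGE_SIZE >> 9 on a 4KiB-page system */

int main(void)
{
	int sectors = 20;			/* r10_bio->sectors */
	unsigned long long sect = 0;		/* offset within the range */
	int idx = 0;				/* bio vec / page index */

	while (sectors) {
		int s = sectors > PAGE_SECTORS ? PAGE_SECTORS : sectors;

		/* here the kernel reads page 'idx' from the source device
		 * and, on success, writes it to the device being rebuilt */
		printf("step %d: %d sectors at offset %llu\n", idx, s, sect);

		sectors -= s;
		sect += s;
		idx++;
	}
	return 0;	/* prints steps of 8, 8 and 4 sectors */
}
```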
1358 | 1689 | ||
1359 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1690 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
1360 | { | 1691 | { |
1361 | conf_t *conf = mddev->private; | 1692 | conf_t *conf = mddev->private; |
1362 | int i, d; | 1693 | int d; |
1363 | struct bio *bio, *wbio; | 1694 | struct bio *wbio; |
1364 | 1695 | ||
1696 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { | ||
1697 | fix_recovery_read_error(r10_bio); | ||
1698 | end_sync_request(r10_bio); | ||
1699 | return; | ||
1700 | } | ||
1365 | 1701 | ||
1366 | /* move the pages across to the second bio | 1702 | /* |
1703 | * share the pages with the first bio | ||
1367 | * and submit the write request | 1704 | * and submit the write request |
1368 | */ | 1705 | */ |
1369 | bio = r10_bio->devs[0].bio; | ||
1370 | wbio = r10_bio->devs[1].bio; | 1706 | wbio = r10_bio->devs[1].bio; |
1371 | for (i=0; i < wbio->bi_vcnt; i++) { | ||
1372 | struct page *p = bio->bi_io_vec[i].bv_page; | ||
1373 | bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; | ||
1374 | wbio->bi_io_vec[i].bv_page = p; | ||
1375 | } | ||
1376 | d = r10_bio->devs[1].devnum; | 1707 | d = r10_bio->devs[1].devnum; |
1377 | 1708 | ||
1378 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 1709 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
1379 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | 1710 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); |
1380 | if (test_bit(R10BIO_Uptodate, &r10_bio->state)) | 1711 | generic_make_request(wbio); |
1381 | generic_make_request(wbio); | ||
1382 | else | ||
1383 | bio_endio(wbio, -EIO); | ||
1384 | } | 1712 | } |
1385 | 1713 | ||
1386 | 1714 | ||
@@ -1421,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1421 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | 1749 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); |
1422 | } | 1750 | } |
1423 | 1751 | ||
1752 | static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
1753 | int sectors, struct page *page, int rw) | ||
1754 | { | ||
1755 | sector_t first_bad; | ||
1756 | int bad_sectors; | ||
1757 | |||
1758 | if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) | ||
1759 | && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) | ||
1760 | return -1; | ||
1761 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
1762 | /* success */ | ||
1763 | return 1; | ||
1764 | if (rw == WRITE) | ||
1765 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1766 | /* need to record an error - either for the block or the device */ | ||
1767 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
1768 | md_error(rdev->mddev, rdev); | ||
1769 | return 0; | ||
1770 | } | ||
1771 | |||
1424 | /* | 1772 | /* |
1425 | * This is a kernel thread which: | 1773 | * This is a kernel thread which: |
1426 | * | 1774 | * |
@@ -1476,10 +1824,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1476 | 1824 | ||
1477 | rcu_read_lock(); | 1825 | rcu_read_lock(); |
1478 | do { | 1826 | do { |
1827 | sector_t first_bad; | ||
1828 | int bad_sectors; | ||
1829 | |||
1479 | d = r10_bio->devs[sl].devnum; | 1830 | d = r10_bio->devs[sl].devnum; |
1480 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1831 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1481 | if (rdev && | 1832 | if (rdev && |
1482 | test_bit(In_sync, &rdev->flags)) { | 1833 | test_bit(In_sync, &rdev->flags) && |
1834 | is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, | ||
1835 | &first_bad, &bad_sectors) == 0) { | ||
1483 | atomic_inc(&rdev->nr_pending); | 1836 | atomic_inc(&rdev->nr_pending); |
1484 | rcu_read_unlock(); | 1837 | rcu_read_unlock(); |
1485 | success = sync_page_io(rdev, | 1838 | success = sync_page_io(rdev, |
@@ -1499,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1499 | rcu_read_unlock(); | 1852 | rcu_read_unlock(); |
1500 | 1853 | ||
1501 | if (!success) { | 1854 | if (!success) { |
1502 | /* Cannot read from anywhere -- bye bye array */ | 1855 | /* Cannot read from anywhere, just mark the block |
1856 | * as bad on the first device to discourage future | ||
1857 | * reads. | ||
1858 | */ | ||
1503 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; | 1859 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; |
1504 | md_error(mddev, conf->mirrors[dn].rdev); | 1860 | rdev = conf->mirrors[dn].rdev; |
1861 | |||
1862 | if (!rdev_set_badblocks( | ||
1863 | rdev, | ||
1864 | r10_bio->devs[r10_bio->read_slot].addr | ||
1865 | + sect, | ||
1866 | s, 0)) | ||
1867 | md_error(mddev, rdev); | ||
1505 | break; | 1868 | break; |
1506 | } | 1869 | } |
1507 | 1870 | ||
@@ -1516,80 +1879,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1516 | sl--; | 1879 | sl--; |
1517 | d = r10_bio->devs[sl].devnum; | 1880 | d = r10_bio->devs[sl].devnum; |
1518 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1881 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1519 | if (rdev && | 1882 | if (!rdev || |
1520 | test_bit(In_sync, &rdev->flags)) { | 1883 | !test_bit(In_sync, &rdev->flags)) |
1521 | atomic_inc(&rdev->nr_pending); | 1884 | continue; |
1522 | rcu_read_unlock(); | 1885 | |
1523 | atomic_add(s, &rdev->corrected_errors); | 1886 | atomic_inc(&rdev->nr_pending); |
1524 | if (sync_page_io(rdev, | 1887 | rcu_read_unlock(); |
1525 | r10_bio->devs[sl].addr + | 1888 | if (r10_sync_page_io(rdev, |
1526 | sect, | 1889 | r10_bio->devs[sl].addr + |
1527 | s<<9, conf->tmppage, WRITE, false) | 1890 | sect, |
1528 | == 0) { | 1891 | s<<9, conf->tmppage, WRITE) |
1529 | /* Well, this device is dead */ | 1892 | == 0) { |
1530 | printk(KERN_NOTICE | 1893 | /* Well, this device is dead */ |
1531 | "md/raid10:%s: read correction " | 1894 | printk(KERN_NOTICE |
1532 | "write failed" | 1895 | "md/raid10:%s: read correction " |
1533 | " (%d sectors at %llu on %s)\n", | 1896 | "write failed" |
1534 | mdname(mddev), s, | 1897 | " (%d sectors at %llu on %s)\n", |
1535 | (unsigned long long)( | 1898 | mdname(mddev), s, |
1536 | sect + rdev->data_offset), | 1899 | (unsigned long long)( |
1537 | bdevname(rdev->bdev, b)); | 1900 | sect + rdev->data_offset), |
1538 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 1901 | bdevname(rdev->bdev, b)); |
1539 | "drive\n", | 1902 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
1540 | mdname(mddev), | 1903 | "drive\n", |
1541 | bdevname(rdev->bdev, b)); | 1904 | mdname(mddev), |
1542 | md_error(mddev, rdev); | 1905 | bdevname(rdev->bdev, b)); |
1543 | } | ||
1544 | rdev_dec_pending(rdev, mddev); | ||
1545 | rcu_read_lock(); | ||
1546 | } | 1906 | } |
1907 | rdev_dec_pending(rdev, mddev); | ||
1908 | rcu_read_lock(); | ||
1547 | } | 1909 | } |
1548 | sl = start; | 1910 | sl = start; |
1549 | while (sl != r10_bio->read_slot) { | 1911 | while (sl != r10_bio->read_slot) { |
1912 | char b[BDEVNAME_SIZE]; | ||
1550 | 1913 | ||
1551 | if (sl==0) | 1914 | if (sl==0) |
1552 | sl = conf->copies; | 1915 | sl = conf->copies; |
1553 | sl--; | 1916 | sl--; |
1554 | d = r10_bio->devs[sl].devnum; | 1917 | d = r10_bio->devs[sl].devnum; |
1555 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1918 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1556 | if (rdev && | 1919 | if (!rdev || |
1557 | test_bit(In_sync, &rdev->flags)) { | 1920 | !test_bit(In_sync, &rdev->flags)) |
1558 | char b[BDEVNAME_SIZE]; | 1921 | continue; |
1559 | atomic_inc(&rdev->nr_pending); | ||
1560 | rcu_read_unlock(); | ||
1561 | if (sync_page_io(rdev, | ||
1562 | r10_bio->devs[sl].addr + | ||
1563 | sect, | ||
1564 | s<<9, conf->tmppage, | ||
1565 | READ, false) == 0) { | ||
1566 | /* Well, this device is dead */ | ||
1567 | printk(KERN_NOTICE | ||
1568 | "md/raid10:%s: unable to read back " | ||
1569 | "corrected sectors" | ||
1570 | " (%d sectors at %llu on %s)\n", | ||
1571 | mdname(mddev), s, | ||
1572 | (unsigned long long)( | ||
1573 | sect + rdev->data_offset), | ||
1574 | bdevname(rdev->bdev, b)); | ||
1575 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", | ||
1576 | mdname(mddev), | ||
1577 | bdevname(rdev->bdev, b)); | ||
1578 | |||
1579 | md_error(mddev, rdev); | ||
1580 | } else { | ||
1581 | printk(KERN_INFO | ||
1582 | "md/raid10:%s: read error corrected" | ||
1583 | " (%d sectors at %llu on %s)\n", | ||
1584 | mdname(mddev), s, | ||
1585 | (unsigned long long)( | ||
1586 | sect + rdev->data_offset), | ||
1587 | bdevname(rdev->bdev, b)); | ||
1588 | } | ||
1589 | 1922 | ||
1590 | rdev_dec_pending(rdev, mddev); | 1923 | atomic_inc(&rdev->nr_pending); |
1591 | rcu_read_lock(); | 1924 | rcu_read_unlock(); |
1925 | switch (r10_sync_page_io(rdev, | ||
1926 | r10_bio->devs[sl].addr + | ||
1927 | sect, | ||
1928 | s<<9, conf->tmppage, | ||
1929 | READ)) { | ||
1930 | case 0: | ||
1931 | /* Well, this device is dead */ | ||
1932 | printk(KERN_NOTICE | ||
1933 | "md/raid10:%s: unable to read back " | ||
1934 | "corrected sectors" | ||
1935 | " (%d sectors at %llu on %s)\n", | ||
1936 | mdname(mddev), s, | ||
1937 | (unsigned long long)( | ||
1938 | sect + rdev->data_offset), | ||
1939 | bdevname(rdev->bdev, b)); | ||
1940 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | ||
1941 | "drive\n", | ||
1942 | mdname(mddev), | ||
1943 | bdevname(rdev->bdev, b)); | ||
1944 | break; | ||
1945 | case 1: | ||
1946 | printk(KERN_INFO | ||
1947 | "md/raid10:%s: read error corrected" | ||
1948 | " (%d sectors at %llu on %s)\n", | ||
1949 | mdname(mddev), s, | ||
1950 | (unsigned long long)( | ||
1951 | sect + rdev->data_offset), | ||
1952 | bdevname(rdev->bdev, b)); | ||
1953 | atomic_add(s, &rdev->corrected_errors); | ||
1592 | } | 1954 | } |
1955 | |||
1956 | rdev_dec_pending(rdev, mddev); | ||
1957 | rcu_read_lock(); | ||
1593 | } | 1958 | } |
1594 | rcu_read_unlock(); | 1959 | rcu_read_unlock(); |
1595 | 1960 | ||
@@ -1598,21 +1963,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1598 | } | 1963 | } |
1599 | } | 1964 | } |
1600 | 1965 | ||
1966 | static void bi_complete(struct bio *bio, int error) | ||
1967 | { | ||
1968 | complete((struct completion *)bio->bi_private); | ||
1969 | } | ||
1970 | |||
1971 | static int submit_bio_wait(int rw, struct bio *bio) | ||
1972 | { | ||
1973 | struct completion event; | ||
1974 | rw |= REQ_SYNC; | ||
1975 | |||
1976 | init_completion(&event); | ||
1977 | bio->bi_private = &event; | ||
1978 | bio->bi_end_io = bi_complete; | ||
1979 | submit_bio(rw, bio); | ||
1980 | wait_for_completion(&event); | ||
1981 | |||
1982 | return test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1983 | } | ||
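submit_bio_wait() above is the classic submit-then-block-on-a-completion pattern: the end_io callback fires the completion the submitter sleeps on. A userspace analogue using pthreads in place of the kernel completion API — the names mirror the kernel's but are reimplemented here as a sketch:

```c
/* Userspace analogue of submit_bio_wait(): async "I/O" on another
 * thread signals a completion the submitter blocks on. */
#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *fake_io(void *arg)
{
	complete(arg);		/* the end_io callback firing */
	return NULL;
}

int main(void)
{
	struct completion event = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, fake_io, &event);	/* submit_bio() */
	wait_for_completion(&event);			/* block until done */
	pthread_join(t, NULL);
	printf("I/O complete\n");
	return 0;
}
```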
1984 | |||
1985 | static int narrow_write_error(r10bio_t *r10_bio, int i) | ||
1986 | { | ||
1987 | struct bio *bio = r10_bio->master_bio; | ||
1988 | mddev_t *mddev = r10_bio->mddev; | ||
1989 | conf_t *conf = mddev->private; | ||
1990 | mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; | ||
1991 | /* bio has the data to be written to slot 'i' where | ||
1992 | * we just recently had a write error. | ||
1993 | * We repeatedly clone the bio and trim down to one block, | ||
1994 | * then try the write. Where the write fails we record | ||
1995 | * a bad block. | ||
1996 | * It is conceivable that the bio doesn't exactly align with | ||
1997 | * blocks. We must handle this. | ||
1998 | * | ||
1999 | * We currently own a reference to the rdev. | ||
2000 | */ | ||
2001 | |||
2002 | int block_sectors; | ||
2003 | sector_t sector; | ||
2004 | int sectors; | ||
2005 | int sect_to_write = r10_bio->sectors; | ||
2006 | int ok = 1; | ||
2007 | |||
2008 | if (rdev->badblocks.shift < 0) | ||
2009 | return 0; | ||
2010 | |||
2011 | block_sectors = 1 << rdev->badblocks.shift; | ||
2012 | sector = r10_bio->sector; | ||
2013 | sectors = ((r10_bio->sector + block_sectors) | ||
2014 | & ~(sector_t)(block_sectors - 1)) | ||
2015 | - sector; | ||
2016 | |||
2017 | while (sect_to_write) { | ||
2018 | struct bio *wbio; | ||
2019 | if (sectors > sect_to_write) | ||
2020 | sectors = sect_to_write; | ||
2021 | /* Write at 'sector' for 'sectors' */ | ||
2022 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | ||
2023 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); | ||
2024 | wbio->bi_sector = (r10_bio->devs[i].addr+ | ||
2025 | rdev->data_offset+ | ||
2026 | (sector - r10_bio->sector)); | ||
2027 | wbio->bi_bdev = rdev->bdev; | ||
2028 | if (submit_bio_wait(WRITE, wbio) == 0) | ||
2029 | /* Failure! */ | ||
2030 | ok = rdev_set_badblocks(rdev, sector, | ||
2031 | sectors, 0) | ||
2032 | && ok; | ||
2033 | |||
2034 | bio_put(wbio); | ||
2035 | sect_to_write -= sectors; | ||
2036 | sector += sectors; | ||
2037 | sectors = block_sectors; | ||
2038 | } | ||
2039 | return ok; | ||
2040 | } | ||
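The alignment arithmetic at the top of narrow_write_error() sizes the first sub-write so that every later one starts on a badblocks.shift boundary, keeping recorded bad blocks aligned to the recording unit. A worked example with illustrative values:

```c
/* Worked example of narrow_write_error()'s alignment arithmetic. */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	int shift = 3;				/* rdev->badblocks.shift */
	int block_sectors = 1 << shift;		/* 8-sector recording unit */
	sector_t sector = 100;			/* r10_bio->sector */
	int sect_to_write = 20;			/* r10_bio->sectors */

	/* distance from 'sector' up to the next 8-sector boundary */
	int sectors = (int)(((sector + block_sectors)
			     & ~(sector_t)(block_sectors - 1)) - sector);

	while (sect_to_write) {
		if (sectors > sect_to_write)
			sectors = sect_to_write;
		printf("write %d sectors at %llu\n", sectors, sector);
		sect_to_write -= sectors;
		sector += sectors;
		sectors = block_sectors;	/* full blocks from now on */
	}
	/* prints: 4 at 100, 8 at 104, 8 at 112 */
	return 0;
}
```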
2041 | |||
2042 | static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | ||
2043 | { | ||
2044 | int slot = r10_bio->read_slot; | ||
2045 | int mirror = r10_bio->devs[slot].devnum; | ||
2046 | struct bio *bio; | ||
2047 | conf_t *conf = mddev->private; | ||
2048 | mdk_rdev_t *rdev; | ||
2049 | char b[BDEVNAME_SIZE]; | ||
2050 | unsigned long do_sync; | ||
2051 | int max_sectors; | ||
2052 | |||
2053 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
2054 | * the block and we can fix it. | ||
2055 | * We freeze all other IO, and try reading the block from | ||
2056 | * other devices. When we find one, we re-write | ||
2057 | * and check whether that fixes the read error. | ||
2058 | * This is all done synchronously while the array is | ||
2059 | * frozen. | ||
2060 | */ | ||
2061 | if (mddev->ro == 0) { | ||
2062 | freeze_array(conf); | ||
2063 | fix_read_error(conf, mddev, r10_bio); | ||
2064 | unfreeze_array(conf); | ||
2065 | } | ||
2066 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
2067 | |||
2068 | bio = r10_bio->devs[slot].bio; | ||
2069 | bdevname(bio->bi_bdev, b); | ||
2070 | r10_bio->devs[slot].bio = | ||
2071 | mddev->ro ? IO_BLOCKED : NULL; | ||
2072 | read_more: | ||
2073 | mirror = read_balance(conf, r10_bio, &max_sectors); | ||
2074 | if (mirror == -1) { | ||
2075 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
2076 | " read error for block %llu\n", | ||
2077 | mdname(mddev), b, | ||
2078 | (unsigned long long)r10_bio->sector); | ||
2079 | raid_end_bio_io(r10_bio); | ||
2080 | bio_put(bio); | ||
2081 | return; | ||
2082 | } | ||
2083 | |||
2084 | do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
2085 | if (bio) | ||
2086 | bio_put(bio); | ||
2087 | slot = r10_bio->read_slot; | ||
2088 | rdev = conf->mirrors[mirror].rdev; | ||
2089 | printk_ratelimited( | ||
2090 | KERN_ERR | ||
2091 | "md/raid10:%s: %s: redirecting" | ||
2092 | "sector %llu to another mirror\n", | ||
2093 | mdname(mddev), | ||
2094 | bdevname(rdev->bdev, b), | ||
2095 | (unsigned long long)r10_bio->sector); | ||
2096 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
2097 | GFP_NOIO, mddev); | ||
2098 | md_trim_bio(bio, | ||
2099 | r10_bio->sector - bio->bi_sector, | ||
2100 | max_sectors); | ||
2101 | r10_bio->devs[slot].bio = bio; | ||
2102 | bio->bi_sector = r10_bio->devs[slot].addr | ||
2103 | + rdev->data_offset; | ||
2104 | bio->bi_bdev = rdev->bdev; | ||
2105 | bio->bi_rw = READ | do_sync; | ||
2106 | bio->bi_private = r10_bio; | ||
2107 | bio->bi_end_io = raid10_end_read_request; | ||
2108 | if (max_sectors < r10_bio->sectors) { | ||
2109 | /* Drat - have to split this up more */ | ||
2110 | struct bio *mbio = r10_bio->master_bio; | ||
2111 | int sectors_handled = | ||
2112 | r10_bio->sector + max_sectors | ||
2113 | - mbio->bi_sector; | ||
2114 | r10_bio->sectors = max_sectors; | ||
2115 | spin_lock_irq(&conf->device_lock); | ||
2116 | if (mbio->bi_phys_segments == 0) | ||
2117 | mbio->bi_phys_segments = 2; | ||
2118 | else | ||
2119 | mbio->bi_phys_segments++; | ||
2120 | spin_unlock_irq(&conf->device_lock); | ||
2121 | generic_make_request(bio); | ||
2122 | bio = NULL; | ||
2123 | |||
2124 | r10_bio = mempool_alloc(conf->r10bio_pool, | ||
2125 | GFP_NOIO); | ||
2126 | r10_bio->master_bio = mbio; | ||
2127 | r10_bio->sectors = (mbio->bi_size >> 9) | ||
2128 | - sectors_handled; | ||
2129 | r10_bio->state = 0; | ||
2130 | set_bit(R10BIO_ReadError, | ||
2131 | &r10_bio->state); | ||
2132 | r10_bio->mddev = mddev; | ||
2133 | r10_bio->sector = mbio->bi_sector | ||
2134 | + sectors_handled; | ||
2135 | |||
2136 | goto read_more; | ||
2137 | } else | ||
2138 | generic_make_request(bio); | ||
2139 | } | ||
2140 | |||
2141 | static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) | ||
2142 | { | ||
2143 | /* Some sort of write request has finished and it | ||
2144 | * succeeded in writing where we thought there was a | ||
2145 | * bad block. So forget the bad block. | ||
2146 | * Or possibly it failed and we need to record | ||
2147 | * a bad block. | ||
2148 | */ | ||
2149 | int m; | ||
2150 | mdk_rdev_t *rdev; | ||
2151 | |||
2152 | if (test_bit(R10BIO_IsSync, &r10_bio->state) || | ||
2153 | test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
2154 | for (m = 0; m < conf->copies; m++) { | ||
2155 | int dev = r10_bio->devs[m].devnum; | ||
2156 | rdev = conf->mirrors[dev].rdev; | ||
2157 | if (r10_bio->devs[m].bio == NULL) | ||
2158 | continue; | ||
2159 | if (test_bit(BIO_UPTODATE, | ||
2160 | &r10_bio->devs[m].bio->bi_flags)) { | ||
2161 | rdev_clear_badblocks( | ||
2162 | rdev, | ||
2163 | r10_bio->devs[m].addr, | ||
2164 | r10_bio->sectors); | ||
2165 | } else { | ||
2166 | if (!rdev_set_badblocks( | ||
2167 | rdev, | ||
2168 | r10_bio->devs[m].addr, | ||
2169 | r10_bio->sectors, 0)) | ||
2170 | md_error(conf->mddev, rdev); | ||
2171 | } | ||
2172 | } | ||
2173 | put_buf(r10_bio); | ||
2174 | } else { | ||
2175 | for (m = 0; m < conf->copies; m++) { | ||
2176 | int dev = r10_bio->devs[m].devnum; | ||
2177 | struct bio *bio = r10_bio->devs[m].bio; | ||
2178 | rdev = conf->mirrors[dev].rdev; | ||
2179 | if (bio == IO_MADE_GOOD) { | ||
2180 | rdev_clear_badblocks( | ||
2181 | rdev, | ||
2182 | r10_bio->devs[m].addr, | ||
2183 | r10_bio->sectors); | ||
2184 | rdev_dec_pending(rdev, conf->mddev); | ||
2185 | } else if (bio != NULL && | ||
2186 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { | ||
2187 | if (!narrow_write_error(r10_bio, m)) { | ||
2188 | md_error(conf->mddev, rdev); | ||
2189 | set_bit(R10BIO_Degraded, | ||
2190 | &r10_bio->state); | ||
2191 | } | ||
2192 | rdev_dec_pending(rdev, conf->mddev); | ||
2193 | } | ||
2194 | } | ||
2195 | if (test_bit(R10BIO_WriteError, | ||
2196 | &r10_bio->state)) | ||
2197 | close_write(r10_bio); | ||
2198 | raid_end_bio_io(r10_bio); | ||
2199 | } | ||
2200 | } | ||
2201 | |||
1601 | static void raid10d(mddev_t *mddev) | 2202 | static void raid10d(mddev_t *mddev) |
1602 | { | 2203 | { |
1603 | r10bio_t *r10_bio; | 2204 | r10bio_t *r10_bio; |
1604 | struct bio *bio; | ||
1605 | unsigned long flags; | 2205 | unsigned long flags; |
1606 | conf_t *conf = mddev->private; | 2206 | conf_t *conf = mddev->private; |
1607 | struct list_head *head = &conf->retry_list; | 2207 | struct list_head *head = &conf->retry_list; |
1608 | mdk_rdev_t *rdev; | ||
1609 | struct blk_plug plug; | 2208 | struct blk_plug plug; |
1610 | 2209 | ||
1611 | md_check_recovery(mddev); | 2210 | md_check_recovery(mddev); |
1612 | 2211 | ||
1613 | blk_start_plug(&plug); | 2212 | blk_start_plug(&plug); |
1614 | for (;;) { | 2213 | for (;;) { |
1615 | char b[BDEVNAME_SIZE]; | ||
1616 | 2214 | ||
1617 | flush_pending_writes(conf); | 2215 | flush_pending_writes(conf); |
1618 | 2216 | ||
@@ -1628,64 +2226,26 @@ static void raid10d(mddev_t *mddev) | |||
1628 | 2226 | ||
1629 | mddev = r10_bio->mddev; | 2227 | mddev = r10_bio->mddev; |
1630 | conf = mddev->private; | 2228 | conf = mddev->private; |
1631 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) | 2229 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
2230 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
2231 | handle_write_completed(conf, r10_bio); | ||
2232 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) | ||
1632 | sync_request_write(mddev, r10_bio); | 2233 | sync_request_write(mddev, r10_bio); |
1633 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 2234 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1634 | recovery_request_write(mddev, r10_bio); | 2235 | recovery_request_write(mddev, r10_bio); |
2236 | else if (test_bit(R10BIO_ReadError, &r10_bio->state)) | ||
2237 | handle_read_error(mddev, r10_bio); | ||
1635 | else { | 2238 | else { |
1636 | int slot = r10_bio->read_slot; | 2239 | /* just a partial read to be scheduled from a |
1637 | int mirror = r10_bio->devs[slot].devnum; | 2240 | * separate context |
1638 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
1639 | * the block and we can fix it. | ||
1640 | * We freeze all other IO, and try reading the block from | ||
1641 | * other devices. When we find one, we re-write | ||
1642 | * and check it that fixes the read error. | ||
1643 | * This is all done synchronously while the array is | ||
1644 | * frozen. | ||
1645 | */ | 2241 | */ |
1646 | if (mddev->ro == 0) { | 2242 | int slot = r10_bio->read_slot; |
1647 | freeze_array(conf); | 2243 | generic_make_request(r10_bio->devs[slot].bio); |
1648 | fix_read_error(conf, mddev, r10_bio); | ||
1649 | unfreeze_array(conf); | ||
1650 | } | ||
1651 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
1652 | |||
1653 | bio = r10_bio->devs[slot].bio; | ||
1654 | r10_bio->devs[slot].bio = | ||
1655 | mddev->ro ? IO_BLOCKED : NULL; | ||
1656 | mirror = read_balance(conf, r10_bio); | ||
1657 | if (mirror == -1) { | ||
1658 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
1659 | " read error for block %llu\n", | ||
1660 | mdname(mddev), | ||
1661 | bdevname(bio->bi_bdev,b), | ||
1662 | (unsigned long long)r10_bio->sector); | ||
1663 | raid_end_bio_io(r10_bio); | ||
1664 | bio_put(bio); | ||
1665 | } else { | ||
1666 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
1667 | bio_put(bio); | ||
1668 | slot = r10_bio->read_slot; | ||
1669 | rdev = conf->mirrors[mirror].rdev; | ||
1670 | if (printk_ratelimit()) | ||
1671 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" | ||
1672 | " another mirror\n", | ||
1673 | mdname(mddev), | ||
1674 | bdevname(rdev->bdev,b), | ||
1675 | (unsigned long long)r10_bio->sector); | ||
1676 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
1677 | GFP_NOIO, mddev); | ||
1678 | r10_bio->devs[slot].bio = bio; | ||
1679 | bio->bi_sector = r10_bio->devs[slot].addr | ||
1680 | + rdev->data_offset; | ||
1681 | bio->bi_bdev = rdev->bdev; | ||
1682 | bio->bi_rw = READ | do_sync; | ||
1683 | bio->bi_private = r10_bio; | ||
1684 | bio->bi_end_io = raid10_end_read_request; | ||
1685 | generic_make_request(bio); | ||
1686 | } | ||
1687 | } | 2244 | } |
2245 | |||
1688 | cond_resched(); | 2246 | cond_resched(); |
2247 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
2248 | md_check_recovery(mddev); | ||
1689 | } | 2249 | } |
1690 | blk_finish_plug(&plug); | 2250 | blk_finish_plug(&plug); |
1691 | } | 2251 | } |
@@ -1746,7 +2306,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1746 | int i; | 2306 | int i; |
1747 | int max_sync; | 2307 | int max_sync; |
1748 | sector_t sync_blocks; | 2308 | sector_t sync_blocks; |
1749 | |||
1750 | sector_t sectors_skipped = 0; | 2309 | sector_t sectors_skipped = 0; |
1751 | int chunks_skipped = 0; | 2310 | int chunks_skipped = 0; |
1752 | 2311 | ||
@@ -1828,7 +2387,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1828 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); | 2387 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); |
1829 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 2388 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
1830 | /* recovery... the complicated one */ | 2389 | /* recovery... the complicated one */ |
1831 | int j, k; | 2390 | int j; |
1832 | r10_bio = NULL; | 2391 | r10_bio = NULL; |
1833 | 2392 | ||
1834 | for (i=0 ; i<conf->raid_disks; i++) { | 2393 | for (i=0 ; i<conf->raid_disks; i++) { |
@@ -1836,6 +2395,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1836 | r10bio_t *rb2; | 2395 | r10bio_t *rb2; |
1837 | sector_t sect; | 2396 | sector_t sect; |
1838 | int must_sync; | 2397 | int must_sync; |
2398 | int any_working; | ||
1839 | 2399 | ||
1840 | if (conf->mirrors[i].rdev == NULL || | 2400 | if (conf->mirrors[i].rdev == NULL || |
1841 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) | 2401 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) |
@@ -1887,19 +2447,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1887 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | 2447 | must_sync = bitmap_start_sync(mddev->bitmap, sect, |
1888 | &sync_blocks, still_degraded); | 2448 | &sync_blocks, still_degraded); |
1889 | 2449 | ||
2450 | any_working = 0; | ||
1890 | for (j=0; j<conf->copies;j++) { | 2451 | for (j=0; j<conf->copies;j++) { |
2452 | int k; | ||
1891 | int d = r10_bio->devs[j].devnum; | 2453 | int d = r10_bio->devs[j].devnum; |
2454 | sector_t from_addr, to_addr; | ||
2455 | mdk_rdev_t *rdev; | ||
2456 | sector_t sector, first_bad; | ||
2457 | int bad_sectors; | ||
1892 | if (!conf->mirrors[d].rdev || | 2458 | if (!conf->mirrors[d].rdev || |
1893 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) | 2459 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) |
1894 | continue; | 2460 | continue; |
1895 | /* This is where we read from */ | 2461 | /* This is where we read from */ |
2462 | any_working = 1; | ||
2463 | rdev = conf->mirrors[d].rdev; | ||
2464 | sector = r10_bio->devs[j].addr; | ||
2465 | |||
2466 | if (is_badblock(rdev, sector, max_sync, | ||
2467 | &first_bad, &bad_sectors)) { | ||
2468 | if (first_bad > sector) | ||
2469 | max_sync = first_bad - sector; | ||
2470 | else { | ||
2471 | bad_sectors -= (sector | ||
2472 | - first_bad); | ||
2473 | if (max_sync > bad_sectors) | ||
2474 | max_sync = bad_sectors; | ||
2475 | continue; | ||
2476 | } | ||
2477 | } | ||
1896 | bio = r10_bio->devs[0].bio; | 2478 | bio = r10_bio->devs[0].bio; |
1897 | bio->bi_next = biolist; | 2479 | bio->bi_next = biolist; |
1898 | biolist = bio; | 2480 | biolist = bio; |
1899 | bio->bi_private = r10_bio; | 2481 | bio->bi_private = r10_bio; |
1900 | bio->bi_end_io = end_sync_read; | 2482 | bio->bi_end_io = end_sync_read; |
1901 | bio->bi_rw = READ; | 2483 | bio->bi_rw = READ; |
1902 | bio->bi_sector = r10_bio->devs[j].addr + | 2484 | from_addr = r10_bio->devs[j].addr; |
2485 | bio->bi_sector = from_addr + | ||
1903 | conf->mirrors[d].rdev->data_offset; | 2486 | conf->mirrors[d].rdev->data_offset; |
1904 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2487 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1905 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2488 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
@@ -1916,26 +2499,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1916 | bio->bi_private = r10_bio; | 2499 | bio->bi_private = r10_bio; |
1917 | bio->bi_end_io = end_sync_write; | 2500 | bio->bi_end_io = end_sync_write; |
1918 | bio->bi_rw = WRITE; | 2501 | bio->bi_rw = WRITE; |
1919 | bio->bi_sector = r10_bio->devs[k].addr + | 2502 | to_addr = r10_bio->devs[k].addr; |
2503 | bio->bi_sector = to_addr + | ||
1920 | conf->mirrors[i].rdev->data_offset; | 2504 | conf->mirrors[i].rdev->data_offset; |
1921 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | 2505 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1922 | 2506 | ||
1923 | r10_bio->devs[0].devnum = d; | 2507 | r10_bio->devs[0].devnum = d; |
2508 | r10_bio->devs[0].addr = from_addr; | ||
1924 | r10_bio->devs[1].devnum = i; | 2509 | r10_bio->devs[1].devnum = i; |
2510 | r10_bio->devs[1].addr = to_addr; | ||
1925 | 2511 | ||
1926 | break; | 2512 | break; |
1927 | } | 2513 | } |
1928 | if (j == conf->copies) { | 2514 | if (j == conf->copies) { |
1929 | /* Cannot recover, so abort the recovery */ | 2515 | /* Cannot recover, so abort the recovery or |
2516 | * record a bad block */ | ||
1930 | put_buf(r10_bio); | 2517 | put_buf(r10_bio); |
1931 | if (rb2) | 2518 | if (rb2) |
1932 | atomic_dec(&rb2->remaining); | 2519 | atomic_dec(&rb2->remaining); |
1933 | r10_bio = rb2; | 2520 | r10_bio = rb2; |
1934 | if (!test_and_set_bit(MD_RECOVERY_INTR, | 2521 | if (any_working) { |
1935 | &mddev->recovery)) | 2522 | /* problem is that there are bad blocks |
1936 | printk(KERN_INFO "md/raid10:%s: insufficient " | 2523 | * on other device(s) |
1937 | "working devices for recovery.\n", | 2524 | */ |
1938 | mdname(mddev)); | 2525 | int k; |
2526 | for (k = 0; k < conf->copies; k++) | ||
2527 | if (r10_bio->devs[k].devnum == i) | ||
2528 | break; | ||
2529 | if (!rdev_set_badblocks( | ||
2530 | conf->mirrors[i].rdev, | ||
2531 | r10_bio->devs[k].addr, | ||
2532 | max_sync, 0)) | ||
2533 | any_working = 0; | ||
2534 | } | ||
2535 | if (!any_working) { | ||
2536 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
2537 | &mddev->recovery)) | ||
2538 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
2539 | "working devices for recovery.\n", | ||
2540 | mdname(mddev)); | ||
2541 | conf->mirrors[i].recovery_disabled | ||
2542 | = mddev->recovery_disabled; | ||
2543 | } | ||
1939 | break; | 2544 | break; |
1940 | } | 2545 | } |
1941 | } | 2546 | } |
@@ -1979,12 +2584,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1979 | 2584 | ||
1980 | for (i=0; i<conf->copies; i++) { | 2585 | for (i=0; i<conf->copies; i++) { |
1981 | int d = r10_bio->devs[i].devnum; | 2586 | int d = r10_bio->devs[i].devnum; |
2587 | sector_t first_bad, sector; | ||
2588 | int bad_sectors; | ||
2589 | |||
1982 | bio = r10_bio->devs[i].bio; | 2590 | bio = r10_bio->devs[i].bio; |
1983 | bio->bi_end_io = NULL; | 2591 | bio->bi_end_io = NULL; |
1984 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 2592 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
1985 | if (conf->mirrors[d].rdev == NULL || | 2593 | if (conf->mirrors[d].rdev == NULL || |
1986 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) | 2594 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) |
1987 | continue; | 2595 | continue; |
2596 | sector = r10_bio->devs[i].addr; | ||
2597 | if (is_badblock(conf->mirrors[d].rdev, | ||
2598 | sector, max_sync, | ||
2599 | &first_bad, &bad_sectors)) { | ||
2600 | if (first_bad > sector) | ||
2601 | max_sync = first_bad - sector; | ||
2602 | else { | ||
2603 | bad_sectors -= (sector - first_bad); | ||
2604 | if (max_sync > bad_sectors) | ||
2605 | max_sync = bad_sectors; | ||
2606 | continue; | ||
2607 | } | ||
2608 | } | ||
1988 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2609 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
1989 | atomic_inc(&r10_bio->remaining); | 2610 | atomic_inc(&r10_bio->remaining); |
1990 | bio->bi_next = biolist; | 2611 | bio->bi_next = biolist; |
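
The is_badblock() check above narrows the resync window rather than failing the whole operation: when the bad range starts beyond sector, max_sync shrinks so the read stops just short of it; when the window already begins inside the bad range, this copy is skipped and max_sync is capped at the bad sectors still ahead, so another copy may supply the data. A small userspace model of that arithmetic (all names are ours):

    #include <stdio.h>

    typedef unsigned long long sector_t;

    /* Returns the clamped window; *skip is set when this copy is unreadable. */
    static sector_t clamp_resync(sector_t sector, sector_t max_sync,
                                 sector_t first_bad, sector_t bad_sectors,
                                 int *skip)
    {
        *skip = 0;
        if (first_bad > sector)
            return first_bad - sector;       /* stop just before the bad range */
        bad_sectors -= (sector - first_bad); /* bad sectors still ahead of us */
        if (max_sync > bad_sectors)
            max_sync = bad_sectors;          /* another copy may cover the rest */
        *skip = 1;                           /* do not read this copy */
        return max_sync;
    }

    int main(void)
    {
        int skip;
        sector_t ms = clamp_resync(100, 64, 120, 8, &skip);
        printf("max_sync=%llu skip=%d\n", ms, skip); /* 20, 0 */
        ms = clamp_resync(100, 64, 96, 16, &skip);
        printf("max_sync=%llu skip=%d\n", ms, skip); /* 12, 1 */
        return 0;
    }
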
@@ -1992,7 +2613,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1992 | bio->bi_private = r10_bio; | 2613 | bio->bi_private = r10_bio; |
1993 | bio->bi_end_io = end_sync_read; | 2614 | bio->bi_end_io = end_sync_read; |
1994 | bio->bi_rw = READ; | 2615 | bio->bi_rw = READ; |
1995 | bio->bi_sector = r10_bio->devs[i].addr + | 2616 | bio->bi_sector = sector + |
1996 | conf->mirrors[d].rdev->data_offset; | 2617 | conf->mirrors[d].rdev->data_offset; |
1997 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2618 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1998 | count++; | 2619 | count++; |
@@ -2079,7 +2700,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
2079 | return sectors_skipped + nr_sectors; | 2700 | return sectors_skipped + nr_sectors; |
2080 | giveup: | 2701 | giveup: |
2081 | /* There is nowhere to write, so all non-sync | 2702 | /* There is nowhere to write, so all non-sync |
2082 | * drives must be failed, so try the next chunk... | 2703 | * drives must be failed or in resync, all drives |
2704 | * have a bad block, so try the next chunk... | ||
2083 | */ | 2705 | */ |
2084 | if (sector_nr + max_sync < max_sector) | 2706 | if (sector_nr + max_sync < max_sector) |
2085 | max_sector = sector_nr + max_sync; | 2707 | max_sector = sector_nr + max_sync; |
@@ -2249,6 +2871,7 @@ static int run(mddev_t *mddev) | |||
2249 | (conf->raid_disks / conf->near_copies)); | 2871 | (conf->raid_disks / conf->near_copies)); |
2250 | 2872 | ||
2251 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2873 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2874 | |||
2252 | disk_idx = rdev->raid_disk; | 2875 | disk_idx = rdev->raid_disk; |
2253 | if (disk_idx >= conf->raid_disks | 2876 | if (disk_idx >= conf->raid_disks |
2254 | || disk_idx < 0) | 2877 | || disk_idx < 0) |
@@ -2271,7 +2894,7 @@ static int run(mddev_t *mddev) | |||
2271 | disk->head_position = 0; | 2894 | disk->head_position = 0; |
2272 | } | 2895 | } |
2273 | /* need to check that every block has at least one working mirror */ | 2896 | /* need to check that every block has at least one working mirror */ |
2274 | if (!enough(conf)) { | 2897 | if (!enough(conf, -1)) { |
2275 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 2898 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
2276 | mdname(mddev)); | 2899 | mdname(mddev)); |
2277 | goto out_free_conf; | 2900 | goto out_free_conf; |
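
enough() now takes a disk index to treat as absent (-1 means none), so callers can ask whether every block would still have a live copy if that disk were gone. A userspace model of the coverage walk, assuming the 'near' layout where the copies of each block sit on consecutive disks:

    #include <stdio.h>

    /* Model only: working[i] says disk i is usable; near_copies consecutive
     * disks (mod raid_disks) hold the copies of one block. */
    static int enough_model(int raid_disks, int near_copies,
                            const int *working, int ignore)
    {
        int first = 0;
        do {
            int n = near_copies, cnt = 0;
            while (n--) {
                if (working[first] && first != ignore)
                    cnt++;
                first = (first + 1) % raid_disks;
            }
            if (cnt == 0)
                return 0;       /* some block would lose its last copy */
        } while (first != 0);
        return 1;
    }

    int main(void)
    {
        int working[4] = { 1, 0, 1, 1 };
        printf("%d %d\n", enough_model(4, 2, working, -1),
               enough_model(4, 2, working, 0)); /* prints: 1 0 */
        return 0;
    }
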
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 944b1104d3b4..79cb52a0d4a2 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t; | |||
6 | struct mirror_info { | 6 | struct mirror_info { |
7 | mdk_rdev_t *rdev; | 7 | mdk_rdev_t *rdev; |
8 | sector_t head_position; | 8 | sector_t head_position; |
9 | int recovery_disabled; /* matches | ||
10 | * mddev->recovery_disabled | ||
11 | * when we shouldn't try | ||
12 | * recovering this device. | ||
13 | */ | ||
9 | }; | 14 | }; |
10 | 15 | ||
11 | typedef struct r10bio_s r10bio_t; | 16 | typedef struct r10bio_s r10bio_t; |
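
The new recovery_disabled field acts as a generation counter: when recovery onto this mirror fails, it is set equal to the array-wide mddev->recovery_disabled, and while the two match md will not retry recovery for this device; a later bump of the array-wide value re-enables it. A sketch of the pattern (illustrative, not kernel code):

    struct array_gen  { int recovery_disabled; };
    struct mirror_gen { int recovery_disabled; };

    static int may_retry_recovery(const struct array_gen *a,
                                  const struct mirror_gen *m)
    {
        return m->recovery_disabled != a->recovery_disabled;
    }

    static void note_recovery_failed(const struct array_gen *a,
                                     struct mirror_gen *m)
    {
        m->recovery_disabled = a->recovery_disabled; /* match: stop retrying */
    }
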
@@ -113,10 +118,26 @@ struct r10bio_s { | |||
113 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | 118 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer |
114 | */ | 119 | */ |
115 | #define IO_BLOCKED ((struct bio*)1) | 120 | #define IO_BLOCKED ((struct bio*)1) |
121 | /* When we successfully write to a known bad-block, we need to remove the | ||
122 | * bad-block marking which must be done from process context. So we record | ||
123 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
124 | */ | ||
125 | #define IO_MADE_GOOD ((struct bio *)2) | ||
126 | |||
127 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
116 | 128 | ||
117 | /* bits for r10bio.state */ | 129 | /* bits for r10bio.state */ |
118 | #define R10BIO_Uptodate 0 | 130 | #define R10BIO_Uptodate 0 |
119 | #define R10BIO_IsSync 1 | 131 | #define R10BIO_IsSync 1 |
120 | #define R10BIO_IsRecover 2 | 132 | #define R10BIO_IsRecover 2 |
121 | #define R10BIO_Degraded 3 | 133 | #define R10BIO_Degraded 3 |
134 | /* Set ReadError on bios that experience a read error | ||
135 | * so that raid10d knows what to do with them. | ||
136 | */ | ||
137 | #define R10BIO_ReadError 4 | ||
138 | /* If a write for this request means we can clear some | ||
139 | * known-bad-block records, we set this flag. | ||
140 | */ | ||
141 | #define R10BIO_MadeGood 5 | ||
142 | #define R10BIO_WriteError 6 | ||
122 | #endif | 143 | #endif |
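
IO_BLOCKED and IO_MADE_GOOD lean on the fact that 1 and 2 can never be real pointer values, so a struct bio * slot encodes two out-of-band states alongside a genuine bio, and BIO_SPECIAL() tests for either. A standalone demonstration of the sentinel trick:

    #include <assert.h>
    #include <stdlib.h>

    struct bio { int dummy; };

    #define IO_BLOCKED   ((struct bio *)1)
    #define IO_MADE_GOOD ((struct bio *)2)
    #define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

    int main(void)
    {
        struct bio *slot = IO_MADE_GOOD;
        assert(BIO_SPECIAL(slot));          /* sentinel, not submittable */
        slot = malloc(sizeof(*slot));
        assert(slot && !BIO_SPECIAL(slot)); /* real allocations compare > 2 */
        free(slot);
        return 0;
    }
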
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b72edf35ec54..dbae459fb02d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | 52 | #include <linux/cpu.h> |
53 | #include <linux/slab.h> | 53 | #include <linux/slab.h> |
54 | #include <linux/ratelimit.h> | ||
54 | #include "md.h" | 55 | #include "md.h" |
55 | #include "raid5.h" | 56 | #include "raid5.h" |
56 | #include "raid0.h" | 57 | #include "raid0.h" |
@@ -96,8 +97,6 @@ | |||
96 | #define __inline__ | 97 | #define __inline__ |
97 | #endif | 98 | #endif |
98 | 99 | ||
99 | #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) | ||
100 | |||
101 | /* | 100 | /* |
102 | * We maintain a biased count of active stripes in the bottom 16 bits of | 101 | * We maintain a biased count of active stripes in the bottom 16 bits of |
103 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | 102 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits |
@@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
341 | (unsigned long long)sh->sector, i, dev->toread, | 340 | (unsigned long long)sh->sector, i, dev->toread, |
342 | dev->read, dev->towrite, dev->written, | 341 | dev->read, dev->towrite, dev->written, |
343 | test_bit(R5_LOCKED, &dev->flags)); | 342 | test_bit(R5_LOCKED, &dev->flags)); |
344 | BUG(); | 343 | WARN_ON(1); |
345 | } | 344 | } |
346 | dev->flags = 0; | 345 | dev->flags = 0; |
347 | raid5_build_block(sh, i, previous); | 346 | raid5_build_block(sh, i, previous); |
@@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
527 | atomic_inc(&rdev->nr_pending); | 526 | atomic_inc(&rdev->nr_pending); |
528 | rcu_read_unlock(); | 527 | rcu_read_unlock(); |
529 | 528 | ||
529 | /* We have already checked bad blocks for reads. Now | ||
530 | * need to check for writes. | ||
531 | */ | ||
532 | while ((rw & WRITE) && rdev && | ||
533 | test_bit(WriteErrorSeen, &rdev->flags)) { | ||
534 | sector_t first_bad; | ||
535 | int bad_sectors; | ||
536 | int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, | ||
537 | &first_bad, &bad_sectors); | ||
538 | if (!bad) | ||
539 | break; | ||
540 | |||
541 | if (bad < 0) { | ||
542 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
543 | if (!conf->mddev->external && | ||
544 | conf->mddev->flags) { | ||
545 | /* It is very unlikely, but we might | ||
546 | * still need to write out the | ||
547 | * bad block log - better give it | ||
548 | * a chance */ | ||
549 | md_check_recovery(conf->mddev); | ||
550 | } | ||
551 | md_wait_for_blocked_rdev(rdev, conf->mddev); | ||
552 | } else { | ||
553 | /* Acknowledged bad block - skip the write */ | ||
554 | rdev_dec_pending(rdev, conf->mddev); | ||
555 | rdev = NULL; | ||
556 | } | ||
557 | } | ||
558 | |||
530 | if (rdev) { | 559 | if (rdev) { |
531 | if (s->syncing || s->expanding || s->expanded) | 560 | if (s->syncing || s->expanding || s->expanded) |
532 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 561 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
@@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
548 | bi->bi_io_vec[0].bv_offset = 0; | 577 | bi->bi_io_vec[0].bv_offset = 0; |
549 | bi->bi_size = STRIPE_SIZE; | 578 | bi->bi_size = STRIPE_SIZE; |
550 | bi->bi_next = NULL; | 579 | bi->bi_next = NULL; |
551 | if ((rw & WRITE) && | ||
552 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
553 | atomic_add(STRIPE_SECTORS, | ||
554 | &rdev->corrected_errors); | ||
555 | generic_make_request(bi); | 580 | generic_make_request(bi); |
556 | } else { | 581 | } else { |
557 | if (rw & WRITE) | 582 | if (rw & WRITE) |
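
The new loop in ops_run_io() gives writes their own bad-block policy: is_badblock() returning a negative value means the range overlaps an unacknowledged bad block, so the thread must wait until the bad-block log is safely on disk before writing; a non-negative hit means the range is already acknowledged bad and the write can be skipped outright. A compact sketch of the decision (enum names are ours):

    enum bb_state { BB_NONE, BB_ACKED, BB_UNACKED };
    enum write_action { DO_WRITE, SKIP_WRITE, WAIT_FOR_LOG };

    static enum write_action write_policy(enum bb_state bb, int write_error_seen)
    {
        if (!write_error_seen || bb == BB_NONE)
            return DO_WRITE;     /* no recorded problem in this range */
        if (bb == BB_UNACKED)
            return WAIT_FOR_LOG; /* mirrors is_badblock() < 0 above */
        return SKIP_WRITE;       /* acknowledged: data already known lost */
    }
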
@@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1020 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1045 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
1021 | struct bio *wbi; | 1046 | struct bio *wbi; |
1022 | 1047 | ||
1023 | spin_lock(&sh->lock); | 1048 | spin_lock_irq(&sh->raid_conf->device_lock); |
1024 | chosen = dev->towrite; | 1049 | chosen = dev->towrite; |
1025 | dev->towrite = NULL; | 1050 | dev->towrite = NULL; |
1026 | BUG_ON(dev->written); | 1051 | BUG_ON(dev->written); |
1027 | wbi = dev->written = chosen; | 1052 | wbi = dev->written = chosen; |
1028 | spin_unlock(&sh->lock); | 1053 | spin_unlock_irq(&sh->raid_conf->device_lock); |
1029 | 1054 | ||
1030 | while (wbi && wbi->bi_sector < | 1055 | while (wbi && wbi->bi_sector < |
1031 | dev->sector + STRIPE_SECTORS) { | 1056 | dev->sector + STRIPE_SECTORS) { |
@@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1315 | static int grow_one_stripe(raid5_conf_t *conf) | 1340 | static int grow_one_stripe(raid5_conf_t *conf) |
1316 | { | 1341 | { |
1317 | struct stripe_head *sh; | 1342 | struct stripe_head *sh; |
1318 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1343 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); |
1319 | if (!sh) | 1344 | if (!sh) |
1320 | return 0; | 1345 | return 0; |
1321 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); | 1346 | |
1322 | sh->raid_conf = conf; | 1347 | sh->raid_conf = conf; |
1323 | spin_lock_init(&sh->lock); | ||
1324 | #ifdef CONFIG_MULTICORE_RAID456 | 1348 | #ifdef CONFIG_MULTICORE_RAID456 |
1325 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1349 | init_waitqueue_head(&sh->ops.wait_for_ops); |
1326 | #endif | 1350 | #endif |
@@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1435 | return -ENOMEM; | 1459 | return -ENOMEM; |
1436 | 1460 | ||
1437 | for (i = conf->max_nr_stripes; i; i--) { | 1461 | for (i = conf->max_nr_stripes; i; i--) { |
1438 | nsh = kmem_cache_alloc(sc, GFP_KERNEL); | 1462 | nsh = kmem_cache_zalloc(sc, GFP_KERNEL); |
1439 | if (!nsh) | 1463 | if (!nsh) |
1440 | break; | 1464 | break; |
1441 | 1465 | ||
1442 | memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); | ||
1443 | |||
1444 | nsh->raid_conf = conf; | 1466 | nsh->raid_conf = conf; |
1445 | spin_lock_init(&nsh->lock); | ||
1446 | #ifdef CONFIG_MULTICORE_RAID456 | 1467 | #ifdef CONFIG_MULTICORE_RAID456 |
1447 | init_waitqueue_head(&nsh->ops.wait_for_ops); | 1468 | init_waitqueue_head(&nsh->ops.wait_for_ops); |
1448 | #endif | 1469 | #endif |
@@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1587 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1608 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1588 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1609 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
1589 | rdev = conf->disks[i].rdev; | 1610 | rdev = conf->disks[i].rdev; |
1590 | printk_rl(KERN_INFO "md/raid:%s: read error corrected" | 1611 | printk_ratelimited( |
1591 | " (%lu sectors at %llu on %s)\n", | 1612 | KERN_INFO |
1592 | mdname(conf->mddev), STRIPE_SECTORS, | 1613 | "md/raid:%s: read error corrected" |
1593 | (unsigned long long)(sh->sector | 1614 | " (%lu sectors at %llu on %s)\n", |
1594 | + rdev->data_offset), | 1615 | mdname(conf->mddev), STRIPE_SECTORS, |
1595 | bdevname(rdev->bdev, b)); | 1616 | (unsigned long long)(sh->sector |
1617 | + rdev->data_offset), | ||
1618 | bdevname(rdev->bdev, b)); | ||
1619 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
1596 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1620 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
1597 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1621 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
1598 | } | 1622 | } |
@@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1606 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1630 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
1607 | atomic_inc(&rdev->read_errors); | 1631 | atomic_inc(&rdev->read_errors); |
1608 | if (conf->mddev->degraded >= conf->max_degraded) | 1632 | if (conf->mddev->degraded >= conf->max_degraded) |
1609 | printk_rl(KERN_WARNING | 1633 | printk_ratelimited( |
1610 | "md/raid:%s: read error not correctable " | 1634 | KERN_WARNING |
1611 | "(sector %llu on %s).\n", | 1635 | "md/raid:%s: read error not correctable " |
1612 | mdname(conf->mddev), | 1636 | "(sector %llu on %s).\n", |
1613 | (unsigned long long)(sh->sector | 1637 | mdname(conf->mddev), |
1614 | + rdev->data_offset), | 1638 | (unsigned long long)(sh->sector |
1615 | bdn); | 1639 | + rdev->data_offset), |
1640 | bdn); | ||
1616 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1641 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
1617 | /* Oh, no!!! */ | 1642 | /* Oh, no!!! */ |
1618 | printk_rl(KERN_WARNING | 1643 | printk_ratelimited( |
1619 | "md/raid:%s: read error NOT corrected!! " | 1644 | KERN_WARNING |
1620 | "(sector %llu on %s).\n", | 1645 | "md/raid:%s: read error NOT corrected!! " |
1621 | mdname(conf->mddev), | 1646 | "(sector %llu on %s).\n", |
1622 | (unsigned long long)(sh->sector | 1647 | mdname(conf->mddev), |
1623 | + rdev->data_offset), | 1648 | (unsigned long long)(sh->sector |
1624 | bdn); | 1649 | + rdev->data_offset), |
1650 | bdn); | ||
1625 | else if (atomic_read(&rdev->read_errors) | 1651 | else if (atomic_read(&rdev->read_errors) |
1626 | > conf->max_nr_stripes) | 1652 | > conf->max_nr_stripes) |
1627 | printk(KERN_WARNING | 1653 | printk(KERN_WARNING |
@@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1649 | raid5_conf_t *conf = sh->raid_conf; | 1675 | raid5_conf_t *conf = sh->raid_conf; |
1650 | int disks = sh->disks, i; | 1676 | int disks = sh->disks, i; |
1651 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1677 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1678 | sector_t first_bad; | ||
1679 | int bad_sectors; | ||
1652 | 1680 | ||
1653 | for (i=0 ; i<disks; i++) | 1681 | for (i=0 ; i<disks; i++) |
1654 | if (bi == &sh->dev[i].req) | 1682 | if (bi == &sh->dev[i].req) |
@@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1662 | return; | 1690 | return; |
1663 | } | 1691 | } |
1664 | 1692 | ||
1665 | if (!uptodate) | 1693 | if (!uptodate) { |
1666 | md_error(conf->mddev, conf->disks[i].rdev); | 1694 | set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); |
1695 | set_bit(R5_WriteError, &sh->dev[i].flags); | ||
1696 | } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, | ||
1697 | &first_bad, &bad_sectors)) | ||
1698 | set_bit(R5_MadeGood, &sh->dev[i].flags); | ||
1667 | 1699 | ||
1668 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1700 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); |
1669 | 1701 | ||
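
Note the policy shift in raid5_end_write_request(): instead of calling md_error() from the completion path, it only latches state - WriteErrorSeen on the rdev plus R5_WriteError on the stripe slot for a failure, or R5_MadeGood when a write over a previously-bad range succeeded - and leaves the decision to record a bad block, clear one, or fail the device to process context. A sketch of that split (flag values and helper are ours):

    #include <stdio.h>

    #define R5_WriteError (1u << 0)
    #define R5_MadeGood   (1u << 1)

    /* completion context: record what happened, decide nothing */
    static unsigned complete_write(int uptodate, int range_was_bad)
    {
        if (!uptodate)
            return R5_WriteError; /* raid5d may set a bad block or fail dev */
        if (range_was_bad)
            return R5_MadeGood;   /* raid5d may clear the bad-block record */
        return 0;
    }

    int main(void)
    {
        printf("%u %u %u\n", complete_write(0, 0),
               complete_write(1, 1), complete_write(1, 0)); /* 1 2 0 */
        return 0;
    }
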
@@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1710 | */ | 1742 | */ |
1711 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1743 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1712 | } | 1744 | } |
1745 | set_bit(Blocked, &rdev->flags); | ||
1713 | set_bit(Faulty, &rdev->flags); | 1746 | set_bit(Faulty, &rdev->flags); |
1714 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1747 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1715 | printk(KERN_ALERT | 1748 | printk(KERN_ALERT |
@@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1760 | /* | 1793 | /* |
1761 | * Select the parity disk based on the user selected algorithm. | 1794 | * Select the parity disk based on the user selected algorithm. |
1762 | */ | 1795 | */ |
1763 | pd_idx = qd_idx = ~0; | 1796 | pd_idx = qd_idx = -1; |
1764 | switch(conf->level) { | 1797 | switch(conf->level) { |
1765 | case 4: | 1798 | case 4: |
1766 | pd_idx = data_disks; | 1799 | pd_idx = data_disks; |
@@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2143 | raid5_conf_t *conf = sh->raid_conf; | 2176 | raid5_conf_t *conf = sh->raid_conf; |
2144 | int firstwrite=0; | 2177 | int firstwrite=0; |
2145 | 2178 | ||
2146 | pr_debug("adding bh b#%llu to stripe s#%llu\n", | 2179 | pr_debug("adding bi b#%llu to stripe s#%llu\n", |
2147 | (unsigned long long)bi->bi_sector, | 2180 | (unsigned long long)bi->bi_sector, |
2148 | (unsigned long long)sh->sector); | 2181 | (unsigned long long)sh->sector); |
2149 | 2182 | ||
2150 | 2183 | ||
2151 | spin_lock(&sh->lock); | ||
2152 | spin_lock_irq(&conf->device_lock); | 2184 | spin_lock_irq(&conf->device_lock); |
2153 | if (forwrite) { | 2185 | if (forwrite) { |
2154 | bip = &sh->dev[dd_idx].towrite; | 2186 | bip = &sh->dev[dd_idx].towrite; |
@@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2169 | bi->bi_next = *bip; | 2201 | bi->bi_next = *bip; |
2170 | *bip = bi; | 2202 | *bip = bi; |
2171 | bi->bi_phys_segments++; | 2203 | bi->bi_phys_segments++; |
2172 | spin_unlock_irq(&conf->device_lock); | ||
2173 | spin_unlock(&sh->lock); | ||
2174 | |||
2175 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
2176 | (unsigned long long)bi->bi_sector, | ||
2177 | (unsigned long long)sh->sector, dd_idx); | ||
2178 | |||
2179 | if (conf->mddev->bitmap && firstwrite) { | ||
2180 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
2181 | STRIPE_SECTORS, 0); | ||
2182 | sh->bm_seq = conf->seq_flush+1; | ||
2183 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
2184 | } | ||
2185 | 2204 | ||
2186 | if (forwrite) { | 2205 | if (forwrite) { |
2187 | /* check if page is covered */ | 2206 | /* check if page is covered */ |
@@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2196 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2215 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
2197 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2216 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
2198 | } | 2217 | } |
2218 | spin_unlock_irq(&conf->device_lock); | ||
2219 | |||
2220 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
2221 | (unsigned long long)(*bip)->bi_sector, | ||
2222 | (unsigned long long)sh->sector, dd_idx); | ||
2223 | |||
2224 | if (conf->mddev->bitmap && firstwrite) { | ||
2225 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
2226 | STRIPE_SECTORS, 0); | ||
2227 | sh->bm_seq = conf->seq_flush+1; | ||
2228 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
2229 | } | ||
2199 | return 1; | 2230 | return 1; |
2200 | 2231 | ||
2201 | overlap: | 2232 | overlap: |
2202 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | 2233 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); |
2203 | spin_unlock_irq(&conf->device_lock); | 2234 | spin_unlock_irq(&conf->device_lock); |
2204 | spin_unlock(&sh->lock); | ||
2205 | return 0; | 2235 | return 0; |
2206 | } | 2236 | } |
2207 | 2237 | ||
@@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
2238 | rcu_read_lock(); | 2268 | rcu_read_lock(); |
2239 | rdev = rcu_dereference(conf->disks[i].rdev); | 2269 | rdev = rcu_dereference(conf->disks[i].rdev); |
2240 | if (rdev && test_bit(In_sync, &rdev->flags)) | 2270 | if (rdev && test_bit(In_sync, &rdev->flags)) |
2241 | /* multiple read failures in one stripe */ | 2271 | atomic_inc(&rdev->nr_pending); |
2242 | md_error(conf->mddev, rdev); | 2272 | else |
2273 | rdev = NULL; | ||
2243 | rcu_read_unlock(); | 2274 | rcu_read_unlock(); |
2275 | if (rdev) { | ||
2276 | if (!rdev_set_badblocks( | ||
2277 | rdev, | ||
2278 | sh->sector, | ||
2279 | STRIPE_SECTORS, 0)) | ||
2280 | md_error(conf->mddev, rdev); | ||
2281 | rdev_dec_pending(rdev, conf->mddev); | ||
2282 | } | ||
2244 | } | 2283 | } |
2245 | spin_lock_irq(&conf->device_lock); | 2284 | spin_lock_irq(&conf->device_lock); |
2246 | /* fail all writes first */ | 2285 | /* fail all writes first */ |
@@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
2308 | if (bitmap_end) | 2347 | if (bitmap_end) |
2309 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | 2348 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
2310 | STRIPE_SECTORS, 0, 0); | 2349 | STRIPE_SECTORS, 0, 0); |
2350 | /* If we were in the middle of a write the parity block might | ||
2351 | * still be locked - so just clear all R5_LOCKED flags | ||
2352 | */ | ||
2353 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
2311 | } | 2354 | } |
2312 | 2355 | ||
2313 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | 2356 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) |
@@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
2315 | md_wakeup_thread(conf->mddev->thread); | 2358 | md_wakeup_thread(conf->mddev->thread); |
2316 | } | 2359 | } |
2317 | 2360 | ||
2318 | /* fetch_block5 - checks the given member device to see if its data needs | 2361 | static void |
2319 | * to be read or computed to satisfy a request. | 2362 | handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh, |
2320 | * | 2363 | struct stripe_head_state *s) |
2321 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
2322 | * 0 to tell the loop in handle_stripe_fill5 to continue | ||
2323 | */ | ||
2324 | static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | ||
2325 | int disk_idx, int disks) | ||
2326 | { | ||
2327 | struct r5dev *dev = &sh->dev[disk_idx]; | ||
2328 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; | ||
2329 | |||
2330 | /* is the data in this block needed, and can we get it? */ | ||
2331 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
2332 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
2333 | (dev->toread || | ||
2334 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
2335 | s->syncing || s->expanding || | ||
2336 | (s->failed && | ||
2337 | (failed_dev->toread || | ||
2338 | (failed_dev->towrite && | ||
2339 | !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { | ||
2340 | /* We would like to get this block, possibly by computing it, | ||
2341 | * otherwise read it if the backing disk is insync | ||
2342 | */ | ||
2343 | if ((s->uptodate == disks - 1) && | ||
2344 | (s->failed && disk_idx == s->failed_num)) { | ||
2345 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2346 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2347 | set_bit(R5_Wantcompute, &dev->flags); | ||
2348 | sh->ops.target = disk_idx; | ||
2349 | sh->ops.target2 = -1; | ||
2350 | s->req_compute = 1; | ||
2351 | /* Careful: from this point on 'uptodate' is in the eye | ||
2352 | * of raid_run_ops which services 'compute' operations | ||
2353 | * before writes. R5_Wantcompute flags a block that will | ||
2354 | * be R5_UPTODATE by the time it is needed for a | ||
2355 | * subsequent operation. | ||
2356 | */ | ||
2357 | s->uptodate++; | ||
2358 | return 1; /* uptodate + compute == disks */ | ||
2359 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2360 | set_bit(R5_LOCKED, &dev->flags); | ||
2361 | set_bit(R5_Wantread, &dev->flags); | ||
2362 | s->locked++; | ||
2363 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, | ||
2364 | s->syncing); | ||
2365 | } | ||
2366 | } | ||
2367 | |||
2368 | return 0; | ||
2369 | } | ||
2370 | |||
2371 | /** | ||
2372 | * handle_stripe_fill5 - read or compute data to satisfy pending requests. | ||
2373 | */ | ||
2374 | static void handle_stripe_fill5(struct stripe_head *sh, | ||
2375 | struct stripe_head_state *s, int disks) | ||
2376 | { | 2364 | { |
2365 | int abort = 0; | ||
2377 | int i; | 2366 | int i; |
2378 | 2367 | ||
2379 | /* look for blocks to read/compute, skip this if a compute | 2368 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); |
2380 | * is already in flight, or if the stripe contents are in the | 2369 | clear_bit(STRIPE_SYNCING, &sh->state); |
2381 | * midst of changing due to a write | 2370 | s->syncing = 0; |
2371 | /* There is nothing more to do for sync/check/repair. | ||
2372 | * For recover we need to record a bad block on all | ||
2373 | * non-sync devices, or abort the recovery | ||
2382 | */ | 2374 | */ |
2383 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2375 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) |
2384 | !sh->reconstruct_state) | 2376 | return; |
2385 | for (i = disks; i--; ) | 2377 | /* During recovery devices cannot be removed, so locking and |
2386 | if (fetch_block5(sh, s, i, disks)) | 2378 | * refcounting of rdevs is not needed |
2387 | break; | 2379 | */ |
2388 | set_bit(STRIPE_HANDLE, &sh->state); | 2380 | for (i = 0; i < conf->raid_disks; i++) { |
2381 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
2382 | if (!rdev | ||
2383 | || test_bit(Faulty, &rdev->flags) | ||
2384 | || test_bit(In_sync, &rdev->flags)) | ||
2385 | continue; | ||
2386 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
2387 | STRIPE_SECTORS, 0)) | ||
2388 | abort = 1; | ||
2389 | } | ||
2390 | if (abort) { | ||
2391 | conf->recovery_disabled = conf->mddev->recovery_disabled; | ||
2392 | set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); | ||
2393 | } | ||
2389 | } | 2394 | } |
2390 | 2395 | ||
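
handle_failed_sync() separates the two reasons a sync can fail: check/repair has nothing to persist, but during recovery every device that is neither faulty nor in-sync is a recovery target whose unrecovered range must be remembered, so a bad block is recorded on each of them and recovery is aborted only if some record cannot be stored. A sketch of the all-or-abort loop (set_bb() stands in for rdev_set_badblocks()):

    struct dev { int present, faulty, in_sync; };

    static int set_bb(struct dev *d) { (void)d; return 1; } /* 1 = stored */

    /* Returns 1 when recovery must be aborted and disabled. */
    static int record_failed_recovery(struct dev *devs, int n)
    {
        int must_abort = 0;
        for (int i = 0; i < n; i++) {
            struct dev *d = &devs[i];
            if (!d->present || d->faulty || d->in_sync)
                continue;       /* only recovery targets matter here */
            if (!set_bb(d))
                must_abort = 1; /* log full: cannot remember the hole */
        }
        return must_abort;
    }
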
2391 | /* fetch_block6 - checks the given member device to see if its data needs | 2396 | /* fetch_block - checks the given member device to see if its data needs |
2392 | * to be read or computed to satisfy a request. | 2397 | * to be read or computed to satisfy a request. |
2393 | * | 2398 | * |
2394 | * Returns 1 when no more member devices need to be checked, otherwise returns | 2399 | * Returns 1 when no more member devices need to be checked, otherwise returns |
2395 | * 0 to tell the loop in handle_stripe_fill6 to continue | 2400 | * 0 to tell the loop in handle_stripe_fill to continue |
2396 | */ | 2401 | */ |
2397 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | 2402 | static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, |
2398 | struct r6_state *r6s, int disk_idx, int disks) | 2403 | int disk_idx, int disks) |
2399 | { | 2404 | { |
2400 | struct r5dev *dev = &sh->dev[disk_idx]; | 2405 | struct r5dev *dev = &sh->dev[disk_idx]; |
2401 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], | 2406 | struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], |
2402 | &sh->dev[r6s->failed_num[1]] }; | 2407 | &sh->dev[s->failed_num[1]] }; |
2403 | 2408 | ||
2409 | /* is the data in this block needed, and can we get it? */ | ||
2404 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2410 | if (!test_bit(R5_LOCKED, &dev->flags) && |
2405 | !test_bit(R5_UPTODATE, &dev->flags) && | 2411 | !test_bit(R5_UPTODATE, &dev->flags) && |
2406 | (dev->toread || | 2412 | (dev->toread || |
2407 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 2413 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2408 | s->syncing || s->expanding || | 2414 | s->syncing || s->expanding || |
2409 | (s->failed >= 1 && | 2415 | (s->failed >= 1 && fdev[0]->toread) || |
2410 | (fdev[0]->toread || s->to_write)) || | 2416 | (s->failed >= 2 && fdev[1]->toread) || |
2411 | (s->failed >= 2 && | 2417 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && |
2412 | (fdev[1]->toread || s->to_write)))) { | 2418 | !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || |
2419 | (sh->raid_conf->level == 6 && s->failed && s->to_write))) { | ||
2413 | /* we would like to get this block, possibly by computing it, | 2420 | /* we would like to get this block, possibly by computing it, |
2414 | * otherwise read it if the backing disk is insync | 2421 | * otherwise read it if the backing disk is insync |
2415 | */ | 2422 | */ |
2416 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | 2423 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); |
2417 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | 2424 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); |
2418 | if ((s->uptodate == disks - 1) && | 2425 | if ((s->uptodate == disks - 1) && |
2419 | (s->failed && (disk_idx == r6s->failed_num[0] || | 2426 | (s->failed && (disk_idx == s->failed_num[0] || |
2420 | disk_idx == r6s->failed_num[1]))) { | 2427 | disk_idx == s->failed_num[1]))) { |
2421 | /* have disk failed, and we're requested to fetch it; | 2428 | /* have disk failed, and we're requested to fetch it; |
2422 | * do compute it | 2429 | * do compute it |
2423 | */ | 2430 | */ |
@@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
2429 | sh->ops.target = disk_idx; | 2436 | sh->ops.target = disk_idx; |
2430 | sh->ops.target2 = -1; /* no 2nd target */ | 2437 | sh->ops.target2 = -1; /* no 2nd target */ |
2431 | s->req_compute = 1; | 2438 | s->req_compute = 1; |
2439 | /* Careful: from this point on 'uptodate' is in the eye | ||
2440 | * of raid_run_ops which services 'compute' operations | ||
2441 | * before writes. R5_Wantcompute flags a block that will | ||
2442 | * be R5_UPTODATE by the time it is needed for a | ||
2443 | * subsequent operation. | ||
2444 | */ | ||
2432 | s->uptodate++; | 2445 | s->uptodate++; |
2433 | return 1; | 2446 | return 1; |
2434 | } else if (s->uptodate == disks-2 && s->failed >= 2) { | 2447 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
@@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
2469 | } | 2482 | } |
2470 | 2483 | ||
2471 | /** | 2484 | /** |
2472 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | 2485 | * handle_stripe_fill - read or compute data to satisfy pending requests. |
2473 | */ | 2486 | */ |
2474 | static void handle_stripe_fill6(struct stripe_head *sh, | 2487 | static void handle_stripe_fill(struct stripe_head *sh, |
2475 | struct stripe_head_state *s, struct r6_state *r6s, | 2488 | struct stripe_head_state *s, |
2476 | int disks) | 2489 | int disks) |
2477 | { | 2490 | { |
2478 | int i; | 2491 | int i; |
2479 | 2492 | ||
@@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh, | |||
2484 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2497 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
2485 | !sh->reconstruct_state) | 2498 | !sh->reconstruct_state) |
2486 | for (i = disks; i--; ) | 2499 | for (i = disks; i--; ) |
2487 | if (fetch_block6(sh, s, r6s, i, disks)) | 2500 | if (fetch_block(sh, s, i, disks)) |
2488 | break; | 2501 | break; |
2489 | set_bit(STRIPE_HANDLE, &sh->state); | 2502 | set_bit(STRIPE_HANDLE, &sh->state); |
2490 | } | 2503 | } |
@@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf, | |||
2540 | md_wakeup_thread(conf->mddev->thread); | 2553 | md_wakeup_thread(conf->mddev->thread); |
2541 | } | 2554 | } |
2542 | 2555 | ||
2543 | static void handle_stripe_dirtying5(raid5_conf_t *conf, | 2556 | static void handle_stripe_dirtying(raid5_conf_t *conf, |
2544 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | 2557 | struct stripe_head *sh, |
2558 | struct stripe_head_state *s, | ||
2559 | int disks) | ||
2545 | { | 2560 | { |
2546 | int rmw = 0, rcw = 0, i; | 2561 | int rmw = 0, rcw = 0, i; |
2547 | for (i = disks; i--; ) { | 2562 | if (conf->max_degraded == 2) { |
2563 | /* RAID6 requires 'rcw' in current implementation | ||
2564 | * Calculate the real rcw later - for now fake it | ||
2565 | * to look like rcw is cheaper | ||
2566 | */ | ||
2567 | rcw = 1; rmw = 2; | ||
2568 | } else for (i = disks; i--; ) { | ||
2548 | /* would I have to read this buffer for read_modify_write */ | 2569 | /* would I have to read this buffer for read_modify_write */ |
2549 | struct r5dev *dev = &sh->dev[i]; | 2570 | struct r5dev *dev = &sh->dev[i]; |
2550 | if ((dev->towrite || i == sh->pd_idx) && | 2571 | if ((dev->towrite || i == sh->pd_idx) && |
@@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2591 | } | 2612 | } |
2592 | } | 2613 | } |
2593 | } | 2614 | } |
2594 | if (rcw <= rmw && rcw > 0) | 2615 | if (rcw <= rmw && rcw > 0) { |
2595 | /* want reconstruct write, but need to get some data */ | 2616 | /* want reconstruct write, but need to get some data */ |
2617 | rcw = 0; | ||
2596 | for (i = disks; i--; ) { | 2618 | for (i = disks; i--; ) { |
2597 | struct r5dev *dev = &sh->dev[i]; | 2619 | struct r5dev *dev = &sh->dev[i]; |
2598 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | 2620 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
2599 | i != sh->pd_idx && | 2621 | i != sh->pd_idx && i != sh->qd_idx && |
2600 | !test_bit(R5_LOCKED, &dev->flags) && | 2622 | !test_bit(R5_LOCKED, &dev->flags) && |
2601 | !(test_bit(R5_UPTODATE, &dev->flags) || | 2623 | !(test_bit(R5_UPTODATE, &dev->flags) || |
2602 | test_bit(R5_Wantcompute, &dev->flags)) && | 2624 | test_bit(R5_Wantcompute, &dev->flags))) { |
2603 | test_bit(R5_Insync, &dev->flags)) { | 2625 | rcw++; |
2626 | if (!test_bit(R5_Insync, &dev->flags)) | ||
2627 | continue; /* it's a failed drive */ | ||
2604 | if ( | 2628 | if ( |
2605 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 2629 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
2606 | pr_debug("Read_old block " | 2630 | pr_debug("Read_old block " |
@@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2614 | } | 2638 | } |
2615 | } | 2639 | } |
2616 | } | 2640 | } |
2641 | } | ||
2617 | /* now if nothing is locked, and if we have enough data, | 2642 | /* now if nothing is locked, and if we have enough data, |
2618 | * we can start a write request | 2643 | * we can start a write request |
2619 | */ | 2644 | */ |
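
The merged handle_stripe_dirtying() keeps the RAID5 heuristic - count the reads a read-modify-write would need against those a reconstruct-write would need and take the cheaper - while for RAID6 (max_degraded == 2) it fakes rcw = 1; rmw = 2 to force the reconstruct path, since rmw with two parity blocks is not implemented. A simplified userspace model of the counting (it ignores the locking and overwrite/compute flags the real code also checks):

    #include <stdio.h>

    struct blk { int has_write, is_parity, uptodate; };

    static void count_reads(const struct blk *b, int n, int *rmw, int *rcw)
    {
        *rmw = *rcw = 0;
        for (int i = 0; i < n; i++) {
            /* rmw reads old data for written blocks plus old parity */
            if ((b[i].has_write || b[i].is_parity) && !b[i].uptodate)
                (*rmw)++;
            /* rcw reads every untouched data block */
            if (!b[i].has_write && !b[i].is_parity && !b[i].uptodate)
                (*rcw)++;
        }
    }

    int main(void)
    {
        struct blk stripe[5] = {
            { 1, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 1, 0 },
        };
        int rmw, rcw;
        count_reads(stripe, 5, &rmw, &rcw);
        printf("rmw=%d rcw=%d -> %s\n", rmw, rcw,
               rcw <= rmw ? "reconstruct-write" : "read-modify-write");
        return 0;
    }
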
@@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2630 | schedule_reconstruction(sh, s, rcw == 0, 0); | 2655 | schedule_reconstruction(sh, s, rcw == 0, 0); |
2631 | } | 2656 | } |
2632 | 2657 | ||
2633 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | ||
2634 | struct stripe_head *sh, struct stripe_head_state *s, | ||
2635 | struct r6_state *r6s, int disks) | ||
2636 | { | ||
2637 | int rcw = 0, pd_idx = sh->pd_idx, i; | ||
2638 | int qd_idx = sh->qd_idx; | ||
2639 | |||
2640 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2641 | for (i = disks; i--; ) { | ||
2642 | struct r5dev *dev = &sh->dev[i]; | ||
2643 | /* check if we haven't enough data */ | ||
2644 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | ||
2645 | i != pd_idx && i != qd_idx && | ||
2646 | !test_bit(R5_LOCKED, &dev->flags) && | ||
2647 | !(test_bit(R5_UPTODATE, &dev->flags) || | ||
2648 | test_bit(R5_Wantcompute, &dev->flags))) { | ||
2649 | rcw++; | ||
2650 | if (!test_bit(R5_Insync, &dev->flags)) | ||
2651 | continue; /* it's a failed drive */ | ||
2652 | |||
2653 | if ( | ||
2654 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2655 | pr_debug("Read_old stripe %llu " | ||
2656 | "block %d for Reconstruct\n", | ||
2657 | (unsigned long long)sh->sector, i); | ||
2658 | set_bit(R5_LOCKED, &dev->flags); | ||
2659 | set_bit(R5_Wantread, &dev->flags); | ||
2660 | s->locked++; | ||
2661 | } else { | ||
2662 | pr_debug("Request delayed stripe %llu " | ||
2663 | "block %d for Reconstruct\n", | ||
2664 | (unsigned long long)sh->sector, i); | ||
2665 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2666 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2667 | } | ||
2668 | } | ||
2669 | } | ||
2670 | /* now if nothing is locked, and if we have enough data, we can start a | ||
2671 | * write request | ||
2672 | */ | ||
2673 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | ||
2674 | s->locked == 0 && rcw == 0 && | ||
2675 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
2676 | schedule_reconstruction(sh, s, 1, 0); | ||
2677 | } | ||
2678 | } | ||
2679 | |||
2680 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2658 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
2681 | struct stripe_head_state *s, int disks) | 2659 | struct stripe_head_state *s, int disks) |
2682 | { | 2660 | { |
@@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2695 | s->uptodate--; | 2673 | s->uptodate--; |
2696 | break; | 2674 | break; |
2697 | } | 2675 | } |
2698 | dev = &sh->dev[s->failed_num]; | 2676 | dev = &sh->dev[s->failed_num[0]]; |
2699 | /* fall through */ | 2677 | /* fall through */ |
2700 | case check_state_compute_result: | 2678 | case check_state_compute_result: |
2701 | sh->check_state = check_state_idle; | 2679 | sh->check_state = check_state_idle; |
@@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2767 | 2745 | ||
2768 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2746 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
2769 | struct stripe_head_state *s, | 2747 | struct stripe_head_state *s, |
2770 | struct r6_state *r6s, int disks) | 2748 | int disks) |
2771 | { | 2749 | { |
2772 | int pd_idx = sh->pd_idx; | 2750 | int pd_idx = sh->pd_idx; |
2773 | int qd_idx = sh->qd_idx; | 2751 | int qd_idx = sh->qd_idx; |
@@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2786 | switch (sh->check_state) { | 2764 | switch (sh->check_state) { |
2787 | case check_state_idle: | 2765 | case check_state_idle: |
2788 | /* start a new check operation if there are < 2 failures */ | 2766 | /* start a new check operation if there are < 2 failures */ |
2789 | if (s->failed == r6s->q_failed) { | 2767 | if (s->failed == s->q_failed) { |
2790 | /* The only possible failed device holds Q, so it | 2768 | /* The only possible failed device holds Q, so it |
2791 | * makes sense to check P (If anything else were failed, | 2769 | * makes sense to check P (If anything else were failed, |
2792 | * we would have used P to recreate it). | 2770 | * we would have used P to recreate it). |
2793 | */ | 2771 | */ |
2794 | sh->check_state = check_state_run; | 2772 | sh->check_state = check_state_run; |
2795 | } | 2773 | } |
2796 | if (!r6s->q_failed && s->failed < 2) { | 2774 | if (!s->q_failed && s->failed < 2) { |
2797 | /* Q is not failed, and we didn't use it to generate | 2775 | /* Q is not failed, and we didn't use it to generate |
2798 | * anything, so it makes sense to check it | 2776 | * anything, so it makes sense to check it |
2799 | */ | 2777 | */ |
@@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2835 | */ | 2813 | */ |
2836 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | 2814 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ |
2837 | if (s->failed == 2) { | 2815 | if (s->failed == 2) { |
2838 | dev = &sh->dev[r6s->failed_num[1]]; | 2816 | dev = &sh->dev[s->failed_num[1]]; |
2839 | s->locked++; | 2817 | s->locked++; |
2840 | set_bit(R5_LOCKED, &dev->flags); | 2818 | set_bit(R5_LOCKED, &dev->flags); |
2841 | set_bit(R5_Wantwrite, &dev->flags); | 2819 | set_bit(R5_Wantwrite, &dev->flags); |
2842 | } | 2820 | } |
2843 | if (s->failed >= 1) { | 2821 | if (s->failed >= 1) { |
2844 | dev = &sh->dev[r6s->failed_num[0]]; | 2822 | dev = &sh->dev[s->failed_num[0]]; |
2845 | s->locked++; | 2823 | s->locked++; |
2846 | set_bit(R5_LOCKED, &dev->flags); | 2824 | set_bit(R5_LOCKED, &dev->flags); |
2847 | set_bit(R5_Wantwrite, &dev->flags); | 2825 | set_bit(R5_Wantwrite, &dev->flags); |
@@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2928 | } | 2906 | } |
2929 | } | 2907 | } |
2930 | 2908 | ||
2931 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | 2909 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) |
2932 | struct r6_state *r6s) | ||
2933 | { | 2910 | { |
2934 | int i; | 2911 | int i; |
2935 | 2912 | ||
@@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2971 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2948 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
2972 | for (j = 0; j < conf->raid_disks; j++) | 2949 | for (j = 0; j < conf->raid_disks; j++) |
2973 | if (j != sh2->pd_idx && | 2950 | if (j != sh2->pd_idx && |
2974 | (!r6s || j != sh2->qd_idx) && | 2951 | j != sh2->qd_idx && |
2975 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | 2952 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) |
2976 | break; | 2953 | break; |
2977 | if (j == conf->raid_disks) { | 2954 | if (j == conf->raid_disks) { |
@@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
3006 | * | 2983 | * |
3007 | */ | 2984 | */ |
3008 | 2985 | ||
3009 | static void handle_stripe5(struct stripe_head *sh) | 2986 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) |
3010 | { | 2987 | { |
3011 | raid5_conf_t *conf = sh->raid_conf; | 2988 | raid5_conf_t *conf = sh->raid_conf; |
3012 | int disks = sh->disks, i; | 2989 | int disks = sh->disks; |
3013 | struct bio *return_bi = NULL; | ||
3014 | struct stripe_head_state s; | ||
3015 | struct r5dev *dev; | 2990 | struct r5dev *dev; |
3016 | mdk_rdev_t *blocked_rdev = NULL; | 2991 | int i; |
3017 | int prexor; | ||
3018 | int dec_preread_active = 0; | ||
3019 | 2992 | ||
3020 | memset(&s, 0, sizeof(s)); | 2993 | memset(s, 0, sizeof(*s)); |
3021 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " | ||
3022 | "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, | ||
3023 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, | ||
3024 | sh->reconstruct_state); | ||
3025 | 2994 | ||
3026 | spin_lock(&sh->lock); | 2995 | s->syncing = test_bit(STRIPE_SYNCING, &sh->state); |
3027 | clear_bit(STRIPE_HANDLE, &sh->state); | 2996 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
3028 | clear_bit(STRIPE_DELAYED, &sh->state); | 2997 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
3029 | 2998 | s->failed_num[0] = -1; | |
3030 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2999 | s->failed_num[1] = -1; |
3031 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
3032 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
3033 | 3000 | ||
3034 | /* Now to look around and see what can be done */ | 3001 | /* Now to look around and see what can be done */ |
3035 | rcu_read_lock(); | 3002 | rcu_read_lock(); |
3003 | spin_lock_irq(&conf->device_lock); | ||
3036 | for (i=disks; i--; ) { | 3004 | for (i=disks; i--; ) { |
3037 | mdk_rdev_t *rdev; | 3005 | mdk_rdev_t *rdev; |
3006 | sector_t first_bad; | ||
3007 | int bad_sectors; | ||
3008 | int is_bad = 0; | ||
3038 | 3009 | ||
3039 | dev = &sh->dev[i]; | 3010 | dev = &sh->dev[i]; |
3040 | 3011 | ||
3041 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3012 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3042 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3013 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
3043 | dev->towrite, dev->written); | 3014 | /* maybe we can reply to a read |
3044 | |||
3045 | /* maybe we can request a biofill operation | ||
3046 | * | 3015 | * |
3047 | * new wantfill requests are only permitted while | 3016 | * new wantfill requests are only permitted while |
3048 | * ops_complete_biofill is guaranteed to be inactive | 3017 | * ops_complete_biofill is guaranteed to be inactive |
@@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3052 | set_bit(R5_Wantfill, &dev->flags); | 3021 | set_bit(R5_Wantfill, &dev->flags); |
3053 | 3022 | ||
3054 | /* now count some things */ | 3023 | /* now count some things */ |
3055 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3024 | if (test_bit(R5_LOCKED, &dev->flags)) |
3056 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3025 | s->locked++; |
3057 | if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; | 3026 | if (test_bit(R5_UPTODATE, &dev->flags)) |
3027 | s->uptodate++; | ||
3028 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
3029 | s->compute++; | ||
3030 | BUG_ON(s->compute > 2); | ||
3031 | } | ||
3058 | 3032 | ||
3059 | if (test_bit(R5_Wantfill, &dev->flags)) | 3033 | if (test_bit(R5_Wantfill, &dev->flags)) |
3060 | s.to_fill++; | 3034 | s->to_fill++; |
3061 | else if (dev->toread) | 3035 | else if (dev->toread) |
3062 | s.to_read++; | 3036 | s->to_read++; |
3063 | if (dev->towrite) { | 3037 | if (dev->towrite) { |
3064 | s.to_write++; | 3038 | s->to_write++; |
3065 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | 3039 | if (!test_bit(R5_OVERWRITE, &dev->flags)) |
3066 | s.non_overwrite++; | 3040 | s->non_overwrite++; |
3067 | } | 3041 | } |
3068 | if (dev->written) | 3042 | if (dev->written) |
3069 | s.written++; | 3043 | s->written++; |
3070 | rdev = rcu_dereference(conf->disks[i].rdev); | 3044 | rdev = rcu_dereference(conf->disks[i].rdev); |
3071 | if (blocked_rdev == NULL && | 3045 | if (rdev) { |
3072 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 3046 | is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, |
3073 | blocked_rdev = rdev; | 3047 | &first_bad, &bad_sectors); |
3074 | atomic_inc(&rdev->nr_pending); | 3048 | if (s->blocked_rdev == NULL |
3049 | && (test_bit(Blocked, &rdev->flags) | ||
3050 | || is_bad < 0)) { | ||
3051 | if (is_bad < 0) | ||
3052 | set_bit(BlockedBadBlocks, | ||
3053 | &rdev->flags); | ||
3054 | s->blocked_rdev = rdev; | ||
3055 | atomic_inc(&rdev->nr_pending); | ||
3056 | } | ||
3075 | } | 3057 | } |
3076 | clear_bit(R5_Insync, &dev->flags); | 3058 | clear_bit(R5_Insync, &dev->flags); |
3077 | if (!rdev) | 3059 | if (!rdev) |
3078 | /* Not in-sync */; | 3060 | /* Not in-sync */; |
3079 | else if (test_bit(In_sync, &rdev->flags)) | 3061 | else if (is_bad) { |
3062 | /* also not in-sync */ | ||
3063 | if (!test_bit(WriteErrorSeen, &rdev->flags)) { | ||
3064 | /* treat as in-sync, but with a read error | ||
3065 | * which we can now try to correct | ||
3066 | */ | ||
3067 | set_bit(R5_Insync, &dev->flags); | ||
3068 | set_bit(R5_ReadError, &dev->flags); | ||
3069 | } | ||
3070 | } else if (test_bit(In_sync, &rdev->flags)) | ||
3080 | set_bit(R5_Insync, &dev->flags); | 3071 | set_bit(R5_Insync, &dev->flags); |
3081 | else { | 3072 | else { |
3082 | /* could be in-sync depending on recovery/reshape status */ | 3073 | /* in sync if before recovery_offset */ |
3083 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | 3074 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) |
3084 | set_bit(R5_Insync, &dev->flags); | 3075 | set_bit(R5_Insync, &dev->flags); |
3085 | } | 3076 | } |
3077 | if (test_bit(R5_WriteError, &dev->flags)) { | ||
3078 | clear_bit(R5_Insync, &dev->flags); | ||
3079 | if (!test_bit(Faulty, &rdev->flags)) { | ||
3080 | s->handle_bad_blocks = 1; | ||
3081 | atomic_inc(&rdev->nr_pending); | ||
3082 | } else | ||
3083 | clear_bit(R5_WriteError, &dev->flags); | ||
3084 | } | ||
3085 | if (test_bit(R5_MadeGood, &dev->flags)) { | ||
3086 | if (!test_bit(Faulty, &rdev->flags)) { | ||
3087 | s->handle_bad_blocks = 1; | ||
3088 | atomic_inc(&rdev->nr_pending); | ||
3089 | } else | ||
3090 | clear_bit(R5_MadeGood, &dev->flags); | ||
3091 | } | ||
3086 | if (!test_bit(R5_Insync, &dev->flags)) { | 3092 | if (!test_bit(R5_Insync, &dev->flags)) { |
3087 | /* The ReadError flag will just be confusing now */ | 3093 | /* The ReadError flag will just be confusing now */ |
3088 | clear_bit(R5_ReadError, &dev->flags); | 3094 | clear_bit(R5_ReadError, &dev->flags); |
@@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3091 | if (test_bit(R5_ReadError, &dev->flags)) | 3097 | if (test_bit(R5_ReadError, &dev->flags)) |
3092 | clear_bit(R5_Insync, &dev->flags); | 3098 | clear_bit(R5_Insync, &dev->flags); |
3093 | if (!test_bit(R5_Insync, &dev->flags)) { | 3099 | if (!test_bit(R5_Insync, &dev->flags)) { |
3094 | s.failed++; | 3100 | if (s->failed < 2) |
3095 | s.failed_num = i; | 3101 | s->failed_num[s->failed] = i; |
3102 | s->failed++; | ||
3096 | } | 3103 | } |
3097 | } | 3104 | } |
3105 | spin_unlock_irq(&conf->device_lock); | ||
3098 | rcu_read_unlock(); | 3106 | rcu_read_unlock(); |
3099 | |||
3100 | if (unlikely(blocked_rdev)) { | ||
3101 | if (s.syncing || s.expanding || s.expanded || | ||
3102 | s.to_write || s.written) { | ||
3103 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3104 | goto unlock; | ||
3105 | } | ||
3106 | /* There is nothing for the blocked_rdev to block */ | ||
3107 | rdev_dec_pending(blocked_rdev, conf->mddev); | ||
3108 | blocked_rdev = NULL; | ||
3109 | } | ||
3110 | |||
3111 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
3112 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
3113 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
3114 | } | ||
3115 | |||
3116 | pr_debug("locked=%d uptodate=%d to_read=%d" | ||
3117 | " to_write=%d failed=%d failed_num=%d\n", | ||
3118 | s.locked, s.uptodate, s.to_read, s.to_write, | ||
3119 | s.failed, s.failed_num); | ||
3120 | /* check if the array has lost two devices and, if so, some requests might | ||
3121 | * need to be failed | ||
3122 | */ | ||
3123 | if (s.failed > 1 && s.to_read+s.to_write+s.written) | ||
3124 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | ||
3125 | if (s.failed > 1 && s.syncing) { | ||
3126 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
3127 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
3128 | s.syncing = 0; | ||
3129 | } | ||
3130 | |||
3131 | /* might be able to return some write requests if the parity block | ||
3132 | * is safe, or on a failed drive | ||
3133 | */ | ||
3134 | dev = &sh->dev[sh->pd_idx]; | ||
3135 | if ( s.written && | ||
3136 | ((test_bit(R5_Insync, &dev->flags) && | ||
3137 | !test_bit(R5_LOCKED, &dev->flags) && | ||
3138 | test_bit(R5_UPTODATE, &dev->flags)) || | ||
3139 | (s.failed == 1 && s.failed_num == sh->pd_idx))) | ||
3140 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | ||
3141 | |||
3142 | /* Now we might consider reading some blocks, either to check/generate | ||
3143 | * parity, or to satisfy requests | ||
3144 | * or to load a block that is being partially written. | ||
3145 | */ | ||
3146 | if (s.to_read || s.non_overwrite || | ||
3147 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | ||
3148 | handle_stripe_fill5(sh, &s, disks); | ||
3149 | |||
3150 | /* Now we check to see if any write operations have recently | ||
3151 | * completed | ||
3152 | */ | ||
3153 | prexor = 0; | ||
3154 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | ||
3155 | prexor = 1; | ||
3156 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
3157 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
3158 | sh->reconstruct_state = reconstruct_state_idle; | ||
3159 | |||
3160 | /* All the 'written' buffers and the parity block are ready to | ||
3161 | * be written back to disk | ||
3162 | */ | ||
3163 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
3164 | for (i = disks; i--; ) { | ||
3165 | dev = &sh->dev[i]; | ||
3166 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
3167 | (i == sh->pd_idx || dev->written)) { | ||
3168 | pr_debug("Writing block %d\n", i); | ||
3169 | set_bit(R5_Wantwrite, &dev->flags); | ||
3170 | if (prexor) | ||
3171 | continue; | ||
3172 | if (!test_bit(R5_Insync, &dev->flags) || | ||
3173 | (i == sh->pd_idx && s.failed == 0)) | ||
3174 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3175 | } | ||
3176 | } | ||
3177 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
3178 | dec_preread_active = 1; | ||
3179 | } | ||
3180 | |||
3181 | /* Now to consider new write requests and what else, if anything | ||
3182 | * should be read. We do not handle new writes when: | ||
3183 | * 1/ A 'write' operation (copy+xor) is already in flight. | ||
3184 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
3185 | * block. | ||
3186 | */ | ||
3187 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
3188 | handle_stripe_dirtying5(conf, sh, &s, disks); | ||
3189 | |||
3190 | /* maybe we need to check and possibly fix the parity for this stripe | ||
3191 | * Any reads will already have been scheduled, so we just see if enough | ||
3192 | * data is available. The parity check is held off while parity | ||
3193 | * dependent operations are in flight. | ||
3194 | */ | ||
3195 | if (sh->check_state || | ||
3196 | (s.syncing && s.locked == 0 && | ||
3197 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
3198 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
3199 | handle_parity_checks5(conf, sh, &s, disks); | ||
3200 | |||
3201 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
3202 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
3203 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
3204 | } | ||
3205 | |||
3206 | /* If the failed drive is just a ReadError, then we might need to progress | ||
3207 | * the repair/check process | ||
3208 | */ | ||
3209 | if (s.failed == 1 && !conf->mddev->ro && | ||
3210 | test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) | ||
3211 | && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) | ||
3212 | && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) | ||
3213 | ) { | ||
3214 | dev = &sh->dev[s.failed_num]; | ||
3215 | if (!test_bit(R5_ReWrite, &dev->flags)) { | ||
3216 | set_bit(R5_Wantwrite, &dev->flags); | ||
3217 | set_bit(R5_ReWrite, &dev->flags); | ||
3218 | set_bit(R5_LOCKED, &dev->flags); | ||
3219 | s.locked++; | ||
3220 | } else { | ||
3221 | /* let's read it back */ | ||
3222 | set_bit(R5_Wantread, &dev->flags); | ||
3223 | set_bit(R5_LOCKED, &dev->flags); | ||
3224 | s.locked++; | ||
3225 | } | ||
3226 | } | ||
3227 | |||
3228 | /* Finish reconstruct operations initiated by the expansion process */ | ||
3229 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
3230 | struct stripe_head *sh2 | ||
3231 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
3232 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
3233 | /* sh cannot be written until sh2 has been read. | ||
3234 | * so arrange for sh to be delayed a little | ||
3235 | */ | ||
3236 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3237 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3238 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
3239 | &sh2->state)) | ||
3240 | atomic_inc(&conf->preread_active_stripes); | ||
3241 | release_stripe(sh2); | ||
3242 | goto unlock; | ||
3243 | } | ||
3244 | if (sh2) | ||
3245 | release_stripe(sh2); | ||
3246 | |||
3247 | sh->reconstruct_state = reconstruct_state_idle; | ||
3248 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3249 | for (i = conf->raid_disks; i--; ) { | ||
3250 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3251 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3252 | s.locked++; | ||
3253 | } | ||
3254 | } | ||
3255 | |||
3256 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
3257 | !sh->reconstruct_state) { | ||
3258 | /* Need to write out all blocks after computing parity */ | ||
3259 | sh->disks = conf->raid_disks; | ||
3260 | stripe_set_idx(sh->sector, conf, 0, sh); | ||
3261 | schedule_reconstruction(sh, &s, 1, 1); | ||
3262 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | ||
3263 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | ||
3264 | atomic_dec(&conf->reshape_stripes); | ||
3265 | wake_up(&conf->wait_for_overlap); | ||
3266 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | ||
3267 | } | ||
3268 | |||
3269 | if (s.expanding && s.locked == 0 && | ||
3270 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | ||
3271 | handle_stripe_expansion(conf, sh, NULL); | ||
3272 | |||
3273 | unlock: | ||
3274 | spin_unlock(&sh->lock); | ||
3275 | |||
3276 | /* wait for this device to become unblocked */ | ||
3277 | if (unlikely(blocked_rdev)) | ||
3278 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | ||
3279 | |||
3280 | if (s.ops_request) | ||
3281 | raid_run_ops(sh, s.ops_request); | ||
3282 | |||
3283 | ops_run_io(sh, &s); | ||
3284 | |||
3285 | if (dec_preread_active) { | ||
3286 | /* We delay this until after ops_run_io so that if make_request | ||
3287 | * is waiting on a flush, it won't continue until the writes | ||
3288 | * have actually been submitted. | ||
3289 | */ | ||
3290 | atomic_dec(&conf->preread_active_stripes); | ||
3291 | if (atomic_read(&conf->preread_active_stripes) < | ||
3292 | IO_THRESHOLD) | ||
3293 | md_wakeup_thread(conf->mddev->thread); | ||
3294 | } | ||
3295 | return_io(return_bi); | ||
3296 | } | 3107 | } |
3297 | 3108 | ||
3298 | static void handle_stripe6(struct stripe_head *sh) | 3109 | static void handle_stripe(struct stripe_head *sh) |
3299 | { | 3110 | { |
3111 | struct stripe_head_state s; | ||
3300 | raid5_conf_t *conf = sh->raid_conf; | 3112 | raid5_conf_t *conf = sh->raid_conf; |
3113 | int i; | ||
3114 | int prexor; | ||
3301 | int disks = sh->disks; | 3115 | int disks = sh->disks; |
3302 | struct bio *return_bi = NULL; | 3116 | struct r5dev *pdev, *qdev; |
3303 | int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; | 3117 | |
3304 | struct stripe_head_state s; | 3118 | clear_bit(STRIPE_HANDLE, &sh->state); |
3305 | struct r6_state r6s; | 3119 | if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { |
3306 | struct r5dev *dev, *pdev, *qdev; | 3120 | /* already being handled, ensure it gets handled |
3307 | mdk_rdev_t *blocked_rdev = NULL; | 3121 | * again when current action finishes */ |
3308 | int dec_preread_active = 0; | 3122 | set_bit(STRIPE_HANDLE, &sh->state); |
3123 | return; | ||
3124 | } | ||
3125 | |||
3126 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | ||
3127 | set_bit(STRIPE_SYNCING, &sh->state); | ||
3128 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
3129 | } | ||
3130 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
3309 | 3131 | ||
3310 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3132 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3311 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", | 3133 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
3312 | (unsigned long long)sh->sector, sh->state, | 3134 | (unsigned long long)sh->sector, sh->state, |
3313 | atomic_read(&sh->count), pd_idx, qd_idx, | 3135 | atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, |
3314 | sh->check_state, sh->reconstruct_state); | 3136 | sh->check_state, sh->reconstruct_state); |
3315 | memset(&s, 0, sizeof(s)); | ||
3316 | |||
3317 | spin_lock(&sh->lock); | ||
3318 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
3319 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
3320 | |||
3321 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
3322 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
3323 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
3324 | /* Now to look around and see what can be done */ | ||
3325 | |||
3326 | rcu_read_lock(); | ||
3327 | for (i=disks; i--; ) { | ||
3328 | mdk_rdev_t *rdev; | ||
3329 | dev = &sh->dev[i]; | ||
3330 | 3137 | ||
3331 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3138 | analyse_stripe(sh, &s); |
3332 | i, dev->flags, dev->toread, dev->towrite, dev->written); | ||
3333 | /* maybe we can reply to a read | ||
3334 | * | ||
3335 | * new wantfill requests are only permitted while | ||
3336 | * ops_complete_biofill is guaranteed to be inactive | ||
3337 | */ | ||
3338 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && | ||
3339 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) | ||
3340 | set_bit(R5_Wantfill, &dev->flags); | ||
3341 | 3139 | ||
3342 | /* now count some things */ | 3140 | if (s.handle_bad_blocks) { |
3343 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3141 | set_bit(STRIPE_HANDLE, &sh->state); |
3344 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3142 | goto finish; |
3345 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
3346 | s.compute++; | ||
3347 | BUG_ON(s.compute > 2); | ||
3348 | } | ||
3349 | |||
3350 | if (test_bit(R5_Wantfill, &dev->flags)) { | ||
3351 | s.to_fill++; | ||
3352 | } else if (dev->toread) | ||
3353 | s.to_read++; | ||
3354 | if (dev->towrite) { | ||
3355 | s.to_write++; | ||
3356 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
3357 | s.non_overwrite++; | ||
3358 | } | ||
3359 | if (dev->written) | ||
3360 | s.written++; | ||
3361 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
3362 | if (blocked_rdev == NULL && | ||
3363 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | ||
3364 | blocked_rdev = rdev; | ||
3365 | atomic_inc(&rdev->nr_pending); | ||
3366 | } | ||
3367 | clear_bit(R5_Insync, &dev->flags); | ||
3368 | if (!rdev) | ||
3369 | /* Not in-sync */; | ||
3370 | else if (test_bit(In_sync, &rdev->flags)) | ||
3371 | set_bit(R5_Insync, &dev->flags); | ||
3372 | else { | ||
3373 | /* in sync if before recovery_offset */ | ||
3374 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
3375 | set_bit(R5_Insync, &dev->flags); | ||
3376 | } | ||
3377 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3378 | /* The ReadError flag will just be confusing now */ | ||
3379 | clear_bit(R5_ReadError, &dev->flags); | ||
3380 | clear_bit(R5_ReWrite, &dev->flags); | ||
3381 | } | ||
3382 | if (test_bit(R5_ReadError, &dev->flags)) | ||
3383 | clear_bit(R5_Insync, &dev->flags); | ||
3384 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3385 | if (s.failed < 2) | ||
3386 | r6s.failed_num[s.failed] = i; | ||
3387 | s.failed++; | ||
3388 | } | ||
3389 | } | 3143 | } |
3390 | rcu_read_unlock(); | ||
3391 | 3144 | ||
3392 | if (unlikely(blocked_rdev)) { | 3145 | if (unlikely(s.blocked_rdev)) { |
3393 | if (s.syncing || s.expanding || s.expanded || | 3146 | if (s.syncing || s.expanding || s.expanded || |
3394 | s.to_write || s.written) { | 3147 | s.to_write || s.written) { |
3395 | set_bit(STRIPE_HANDLE, &sh->state); | 3148 | set_bit(STRIPE_HANDLE, &sh->state); |
3396 | goto unlock; | 3149 | goto finish; |
3397 | } | 3150 | } |
3398 | /* There is nothing for the blocked_rdev to block */ | 3151 | /* There is nothing for the blocked_rdev to block */ |
3399 | rdev_dec_pending(blocked_rdev, conf->mddev); | 3152 | rdev_dec_pending(s.blocked_rdev, conf->mddev); |
3400 | blocked_rdev = NULL; | 3153 | s.blocked_rdev = NULL; |
3401 | } | 3154 | } |
3402 | 3155 | ||
3403 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | 3156 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { |
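The prologue of the unified handle_stripe() above drops the old per-stripe spinlock: ownership is now the STRIPE_ACTIVE bit, and a caller that loses the test_and_set_bit() race re-arms STRIPE_HANDLE so the current owner processes the stripe again. A minimal user-space model of that re-arm pattern (the flag objects and function name are invented for illustration; the kernel uses its atomic bitops, not C11 atomics):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the STRIPE_ACTIVE/STRIPE_HANDLE bits. */
    static atomic_flag stripe_active = ATOMIC_FLAG_INIT;
    static atomic_int  stripe_handle;

    static void handle_stripe_model(void)
    {
        atomic_store(&stripe_handle, 0);            /* clear_bit(STRIPE_HANDLE) */
        if (atomic_flag_test_and_set(&stripe_active)) {
            /* Already being handled: make sure the owner runs the
             * stripe again when its current pass finishes. */
            atomic_store(&stripe_handle, 1);        /* set_bit(STRIPE_HANDLE) */
            return;
        }
        puts("owning the stripe: analyse, schedule ops, run io");
        atomic_flag_clear(&stripe_active);          /* clear_bit(STRIPE_ACTIVE) */
    }

    int main(void) { handle_stripe_model(); return 0; }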
@@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3408 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3161 | pr_debug("locked=%d uptodate=%d to_read=%d" |
3409 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3162 | " to_write=%d failed=%d failed_num=%d,%d\n", |
3410 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3163 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
3411 | r6s.failed_num[0], r6s.failed_num[1]); | 3164 | s.failed_num[0], s.failed_num[1]); |
3412 | /* check if the array has lost >2 devices and, if so, some requests | 3165 | /* check if the array has lost more than max_degraded devices and, |
3413 | * might need to be failed | 3166 | * if so, some requests might need to be failed. |
3414 | */ | 3167 | */ |
3415 | if (s.failed > 2 && s.to_read+s.to_write+s.written) | 3168 | if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written) |
3416 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | 3169 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); |
3417 | if (s.failed > 2 && s.syncing) { | 3170 | if (s.failed > conf->max_degraded && s.syncing) |
3418 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 3171 | handle_failed_sync(conf, sh, &s); |
3419 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
3420 | s.syncing = 0; | ||
3421 | } | ||
3422 | 3172 | ||
3423 | /* | 3173 | /* |
3424 | * might be able to return some write requests if the parity blocks | 3174 | * might be able to return some write requests if the parity blocks |
3425 | * are safe, or on a failed drive | 3175 | * are safe, or on a failed drive |
3426 | */ | 3176 | */ |
3427 | pdev = &sh->dev[pd_idx]; | 3177 | pdev = &sh->dev[sh->pd_idx]; |
3428 | r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) | 3178 | s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) |
3429 | || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); | 3179 | || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); |
3430 | qdev = &sh->dev[qd_idx]; | 3180 | qdev = &sh->dev[sh->qd_idx]; |
3431 | r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) | 3181 | s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) |
3432 | || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); | 3182 | || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) |
3433 | 3183 | || conf->level < 6; | |
3434 | if ( s.written && | 3184 | |
3435 | ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | 3185 | if (s.written && |
3186 | (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | ||
3436 | && !test_bit(R5_LOCKED, &pdev->flags) | 3187 | && !test_bit(R5_LOCKED, &pdev->flags) |
3437 | && test_bit(R5_UPTODATE, &pdev->flags)))) && | 3188 | && test_bit(R5_UPTODATE, &pdev->flags)))) && |
3438 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 3189 | (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
3439 | && !test_bit(R5_LOCKED, &qdev->flags) | 3190 | && !test_bit(R5_LOCKED, &qdev->flags) |
3440 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 3191 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
3441 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | 3192 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); |
3442 | 3193 | ||
3443 | /* Now we might consider reading some blocks, either to check/generate | 3194 | /* Now we might consider reading some blocks, either to check/generate |
3444 | * parity, or to satisfy requests | 3195 | * parity, or to satisfy requests |
3445 | * or to load a block that is being partially written. | 3196 | * or to load a block that is being partially written. |
3446 | */ | 3197 | */ |
3447 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3198 | if (s.to_read || s.non_overwrite |
3448 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | 3199 | || (conf->level == 6 && s.to_write && s.failed) |
3449 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3200 | || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
3201 | handle_stripe_fill(sh, &s, disks); | ||
3450 | 3202 | ||
3451 | /* Now we check to see if any write operations have recently | 3203 | /* Now we check to see if any write operations have recently |
3452 | * completed | 3204 | * completed |
3453 | */ | 3205 | */ |
3454 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | 3206 | prexor = 0; |
3455 | 3207 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | |
3208 | prexor = 1; | ||
3209 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
3210 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
3456 | sh->reconstruct_state = reconstruct_state_idle; | 3211 | sh->reconstruct_state = reconstruct_state_idle; |
3457 | /* All the 'written' buffers and the parity blocks are ready to | 3212 | |
3213 | /* All the 'written' buffers and the parity block are ready to | ||
3458 | * be written back to disk | 3214 | * be written back to disk |
3459 | */ | 3215 | */ |
3460 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | 3216 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); |
3461 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | 3217 | BUG_ON(sh->qd_idx >= 0 && |
3218 | !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); | ||
3462 | for (i = disks; i--; ) { | 3219 | for (i = disks; i--; ) { |
3463 | dev = &sh->dev[i]; | 3220 | struct r5dev *dev = &sh->dev[i]; |
3464 | if (test_bit(R5_LOCKED, &dev->flags) && | 3221 | if (test_bit(R5_LOCKED, &dev->flags) && |
3465 | (i == sh->pd_idx || i == qd_idx || | 3222 | (i == sh->pd_idx || i == sh->qd_idx || |
3466 | dev->written)) { | 3223 | dev->written)) { |
3467 | pr_debug("Writing block %d\n", i); | 3224 | pr_debug("Writing block %d\n", i); |
3468 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
3469 | set_bit(R5_Wantwrite, &dev->flags); | 3225 | set_bit(R5_Wantwrite, &dev->flags); |
3226 | if (prexor) | ||
3227 | continue; | ||
3470 | if (!test_bit(R5_Insync, &dev->flags) || | 3228 | if (!test_bit(R5_Insync, &dev->flags) || |
3471 | ((i == sh->pd_idx || i == qd_idx) && | 3229 | ((i == sh->pd_idx || i == sh->qd_idx) && |
3472 | s.failed == 0)) | 3230 | s.failed == 0)) |
3473 | set_bit(STRIPE_INSYNC, &sh->state); | 3231 | set_bit(STRIPE_INSYNC, &sh->state); |
3474 | } | 3232 | } |
3475 | } | 3233 | } |
3476 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 3234 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
3477 | dec_preread_active = 1; | 3235 | s.dec_preread_active = 1; |
3478 | } | 3236 | } |
3479 | 3237 | ||
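With r6_state folded into stripe_head_state above, one predicate now covers both levels: for RAID5 (conf->level < 6) q_failed is forced true, so the combined "may we return written requests" test degenerates to the RAID5 parity-only check. A hedged reduction of that test, with illustrative types:

    #include <stdbool.h>
    #include <stdio.h>

    /* Reduced stand-in for stripe_head_state; field names mirror the
     * diff but this is not the kernel structure. */
    struct state { int failed; int failed_num[2]; };

    static bool idx_failed(const struct state *s, int idx)
    {
        return (s->failed >= 1 && s->failed_num[0] == idx) ||
               (s->failed >= 2 && s->failed_num[1] == idx);
    }

    int main(void)
    {
        struct state s = { .failed = 1, .failed_num = { 3, -1 } };
        int pd_idx = 3, qd_idx = 4, level = 5;

        bool p_failed = idx_failed(&s, pd_idx);
        /* RAID5 has no Q block, so treat it as already "failed":
         * the shared test then only looks at P. */
        bool q_failed = idx_failed(&s, qd_idx) || level < 6;

        printf("p_failed=%d q_failed=%d\n", p_failed, q_failed);
        return 0;
    }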
3480 | /* Now to consider new write requests and what else, if anything | 3238 | /* Now to consider new write requests and what else, if anything |
3481 | * should be read. We do not handle new writes when: | 3239 | * should be read. We do not handle new writes when: |
3482 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | 3240 | * 1/ A 'write' operation (copy+xor) is already in flight. |
3483 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 3241 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
3484 | * block. | 3242 | * block. |
3485 | */ | 3243 | */ |
3486 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | 3244 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) |
3487 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3245 | handle_stripe_dirtying(conf, sh, &s, disks); |
3488 | 3246 | ||
3489 | /* maybe we need to check and possibly fix the parity for this stripe | 3247 | /* maybe we need to check and possibly fix the parity for this stripe |
3490 | * Any reads will already have been scheduled, so we just see if enough | 3248 | * Any reads will already have been scheduled, so we just see if enough |
@@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3494 | if (sh->check_state || | 3252 | if (sh->check_state || |
3495 | (s.syncing && s.locked == 0 && | 3253 | (s.syncing && s.locked == 0 && |
3496 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | 3254 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && |
3497 | !test_bit(STRIPE_INSYNC, &sh->state))) | 3255 | !test_bit(STRIPE_INSYNC, &sh->state))) { |
3498 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | 3256 | if (conf->level == 6) |
3257 | handle_parity_checks6(conf, sh, &s, disks); | ||
3258 | else | ||
3259 | handle_parity_checks5(conf, sh, &s, disks); | ||
3260 | } | ||
3499 | 3261 | ||
3500 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3262 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
3501 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3263 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
3502 | clear_bit(STRIPE_SYNCING, &sh->state); | 3264 | clear_bit(STRIPE_SYNCING, &sh->state); |
3503 | } | 3265 | } |
3504 | 3266 | ||
3505 | /* If the failed drives are just a ReadError, then we might need | 3267 | /* If the failed drives are just a ReadError, then we might need |
3506 | * to progress the repair/check process | 3268 | * to progress the repair/check process |
3507 | */ | 3269 | */ |
3508 | if (s.failed <= 2 && !conf->mddev->ro) | 3270 | if (s.failed <= conf->max_degraded && !conf->mddev->ro) |
3509 | for (i = 0; i < s.failed; i++) { | 3271 | for (i = 0; i < s.failed; i++) { |
3510 | dev = &sh->dev[r6s.failed_num[i]]; | 3272 | struct r5dev *dev = &sh->dev[s.failed_num[i]]; |
3511 | if (test_bit(R5_ReadError, &dev->flags) | 3273 | if (test_bit(R5_ReadError, &dev->flags) |
3512 | && !test_bit(R5_LOCKED, &dev->flags) | 3274 | && !test_bit(R5_LOCKED, &dev->flags) |
3513 | && test_bit(R5_UPTODATE, &dev->flags) | 3275 | && test_bit(R5_UPTODATE, &dev->flags) |
@@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3526 | } | 3288 | } |
3527 | } | 3289 | } |
3528 | 3290 | ||
3291 | |||
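The failed-device loop above advances read-error repair in two passes: the first sets R5_Wantwrite plus R5_ReWrite to over-write the bad block from reconstructed data, and the next pass re-reads the block to confirm the fix. A toy model of that two-step retry (the flag names echo the R5_* bits, but the struct is invented):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy per-device state; only the retry sequencing is modeled. */
    struct dev { bool read_error, rewrite, locked; };

    static void progress_repair(struct dev *d)
    {
        if (!d->read_error || d->locked)
            return;
        if (!d->rewrite) {
            d->rewrite = true;      /* step 1: over-write the bad block */
            d->locked = true;
            puts("scheduling rewrite");
        } else {
            d->locked = true;       /* step 2: read it back to verify */
            puts("scheduling read-back");
        }
    }

    int main(void)
    {
        struct dev d = { .read_error = true };
        progress_repair(&d);        /* first pass: rewrite */
        d.locked = false;           /* pretend the write completed */
        progress_repair(&d);        /* second pass: read-back */
        return 0;
    }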
3529 | /* Finish reconstruct operations initiated by the expansion process */ | 3292 | /* Finish reconstruct operations initiated by the expansion process */ |
3530 | if (sh->reconstruct_state == reconstruct_state_result) { | 3293 | if (sh->reconstruct_state == reconstruct_state_result) { |
3294 | struct stripe_head *sh_src | ||
3295 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
3296 | if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { | ||
3297 | /* sh cannot be written until sh_src has been read. | ||
3298 | * so arrange for sh to be delayed a little | ||
3299 | */ | ||
3300 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3301 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3302 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
3303 | &sh_src->state)) | ||
3304 | atomic_inc(&conf->preread_active_stripes); | ||
3305 | release_stripe(sh_src); | ||
3306 | goto finish; | ||
3307 | } | ||
3308 | if (sh_src) | ||
3309 | release_stripe(sh_src); | ||
3310 | |||
3531 | sh->reconstruct_state = reconstruct_state_idle; | 3311 | sh->reconstruct_state = reconstruct_state_idle; |
3532 | clear_bit(STRIPE_EXPANDING, &sh->state); | 3312 | clear_bit(STRIPE_EXPANDING, &sh->state); |
3533 | for (i = conf->raid_disks; i--; ) { | 3313 | for (i = conf->raid_disks; i--; ) { |
@@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3539 | 3319 | ||
3540 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | 3320 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && |
3541 | !sh->reconstruct_state) { | 3321 | !sh->reconstruct_state) { |
3542 | struct stripe_head *sh2 | 3322 | /* Need to write out all blocks after computing parity */ |
3543 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
3544 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
3545 | /* sh cannot be written until sh2 has been read. | ||
3546 | * so arrange for sh to be delayed a little | ||
3547 | */ | ||
3548 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3549 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3550 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
3551 | &sh2->state)) | ||
3552 | atomic_inc(&conf->preread_active_stripes); | ||
3553 | release_stripe(sh2); | ||
3554 | goto unlock; | ||
3555 | } | ||
3556 | if (sh2) | ||
3557 | release_stripe(sh2); | ||
3558 | |||
3559 | /* Need to write out all blocks after computing P&Q */ | ||
3560 | sh->disks = conf->raid_disks; | 3323 | sh->disks = conf->raid_disks; |
3561 | stripe_set_idx(sh->sector, conf, 0, sh); | 3324 | stripe_set_idx(sh->sector, conf, 0, sh); |
3562 | schedule_reconstruction(sh, &s, 1, 1); | 3325 | schedule_reconstruction(sh, &s, 1, 1); |
@@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3569 | 3332 | ||
3570 | if (s.expanding && s.locked == 0 && | 3333 | if (s.expanding && s.locked == 0 && |
3571 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | 3334 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
3572 | handle_stripe_expansion(conf, sh, &r6s); | 3335 | handle_stripe_expansion(conf, sh); |
3573 | |||
3574 | unlock: | ||
3575 | spin_unlock(&sh->lock); | ||
3576 | 3336 | ||
3337 | finish: | ||
3577 | /* wait for this device to become unblocked */ | 3338 | /* wait for this device to become unblocked */ |
3578 | if (unlikely(blocked_rdev)) | 3339 | if (unlikely(s.blocked_rdev)) |
3579 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3340 | md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); |
3341 | |||
3342 | if (s.handle_bad_blocks) | ||
3343 | for (i = disks; i--; ) { | ||
3344 | mdk_rdev_t *rdev; | ||
3345 | struct r5dev *dev = &sh->dev[i]; | ||
3346 | if (test_and_clear_bit(R5_WriteError, &dev->flags)) { | ||
3347 | /* We own a safe reference to the rdev */ | ||
3348 | rdev = conf->disks[i].rdev; | ||
3349 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
3350 | STRIPE_SECTORS, 0)) | ||
3351 | md_error(conf->mddev, rdev); | ||
3352 | rdev_dec_pending(rdev, conf->mddev); | ||
3353 | } | ||
3354 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { | ||
3355 | rdev = conf->disks[i].rdev; | ||
3356 | rdev_clear_badblocks(rdev, sh->sector, | ||
3357 | STRIPE_SECTORS); | ||
3358 | rdev_dec_pending(rdev, conf->mddev); | ||
3359 | } | ||
3360 | } | ||
3580 | 3361 | ||
3581 | if (s.ops_request) | 3362 | if (s.ops_request) |
3582 | raid_run_ops(sh, s.ops_request); | 3363 | raid_run_ops(sh, s.ops_request); |
3583 | 3364 | ||
3584 | ops_run_io(sh, &s); | 3365 | ops_run_io(sh, &s); |
3585 | 3366 | ||
3586 | 3367 | if (s.dec_preread_active) { | |
3587 | if (dec_preread_active) { | ||
3588 | /* We delay this until after ops_run_io so that if make_request | 3368 | /* We delay this until after ops_run_io so that if make_request |
3589 | * is waiting on a flush, it won't continue until the writes | 3369 | * is waiting on a flush, it won't continue until the writes |
3590 | * have actually been submitted. | 3370 | * have actually been submitted. |
@@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3595 | md_wakeup_thread(conf->mddev->thread); | 3375 | md_wakeup_thread(conf->mddev->thread); |
3596 | } | 3376 | } |
3597 | 3377 | ||
3598 | return_io(return_bi); | 3378 | return_io(s.return_bi); |
3599 | } | ||
3600 | 3379 | ||
3601 | static void handle_stripe(struct stripe_head *sh) | 3380 | clear_bit(STRIPE_ACTIVE, &sh->state); |
3602 | { | ||
3603 | if (sh->raid_conf->level == 6) | ||
3604 | handle_stripe6(sh); | ||
3605 | else | ||
3606 | handle_stripe5(sh); | ||
3607 | } | 3381 | } |
3608 | 3382 | ||
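The finish: path above turns per-device write errors into bad-block records: rdev_set_badblocks() is tried first, and only if the log cannot take the entry is md_error() called to fail the whole device; R5_MadeGood later clears the record once a write to the range succeeds. A toy set/clear log showing that bookkeeping (array-backed and invented for illustration; the kernel's log is a packed u64 array):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_BAD 4

    struct bb { unsigned long long sector[MAX_BAD]; int len[MAX_BAD]; int n; };

    static bool bb_set(struct bb *b, unsigned long long s, int len)
    {
        if (b->n == MAX_BAD)
            return false;               /* log full: caller fails the device */
        b->sector[b->n] = s;
        b->len[b->n] = len;
        b->n++;
        return true;
    }

    static void bb_clear(struct bb *b, unsigned long long s, int len)
    {
        for (int i = 0; i < b->n; i++)
            if (b->sector[i] == s && b->len[i] == len) {
                b->n--;
                b->sector[i] = b->sector[b->n];     /* swap-remove */
                b->len[i] = b->len[b->n];
                return;
            }
    }

    int main(void)
    {
        struct bb log = { .n = 0 };
        if (!bb_set(&log, 1024, 8))     /* like rdev_set_badblocks() */
            puts("no room: would md_error() the device");
        bb_clear(&log, 1024, 8);        /* like the R5_MadeGood path */
        printf("entries left: %d\n", log.n);
        return 0;
    }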
3609 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3383 | static void raid5_activate_delayed(raid5_conf_t *conf) |
@@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
3833 | rcu_read_lock(); | 3607 | rcu_read_lock(); |
3834 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 3608 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); |
3835 | if (rdev && test_bit(In_sync, &rdev->flags)) { | 3609 | if (rdev && test_bit(In_sync, &rdev->flags)) { |
3610 | sector_t first_bad; | ||
3611 | int bad_sectors; | ||
3612 | |||
3836 | atomic_inc(&rdev->nr_pending); | 3613 | atomic_inc(&rdev->nr_pending); |
3837 | rcu_read_unlock(); | 3614 | rcu_read_unlock(); |
3838 | raid_bio->bi_next = (void*)rdev; | 3615 | raid_bio->bi_next = (void*)rdev; |
@@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
3840 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3617 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
3841 | align_bi->bi_sector += rdev->data_offset; | 3618 | align_bi->bi_sector += rdev->data_offset; |
3842 | 3619 | ||
3843 | if (!bio_fits_rdev(align_bi)) { | 3620 | if (!bio_fits_rdev(align_bi) || |
3844 | /* too big in some way */ | 3621 | is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, |
3622 | &first_bad, &bad_sectors)) { | ||
3623 | /* too big in some way, or has a known bad block */ | ||
3845 | bio_put(align_bi); | 3624 | bio_put(align_bi); |
3846 | rdev_dec_pending(rdev, mddev); | 3625 | rdev_dec_pending(rdev, mddev); |
3847 | return 0; | 3626 | return 0; |
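chunk_aligned_read() above now also rejects the bypass read when the target range overlaps a known bad block, so such reads fall back to the normal stripe path where recovery is possible. A hedged sketch of the overlap query, modeled loosely on is_badblock()'s out-parameters but simplified to one hard-coded bad range:

    #include <stdbool.h>
    #include <stdio.h>

    /* Reports the first bad sector and its length if [start, start+len)
     * touches a known-bad range. Purely illustrative. */
    static bool range_is_bad(unsigned long long start, int len,
                             unsigned long long *first_bad, int *bad_sectors)
    {
        const unsigned long long bad_start = 2048;
        const int bad_len = 16;

        if (start < bad_start + bad_len && bad_start < start + len) {
            *first_bad = bad_start;
            *bad_sectors = bad_len;
            return true;
        }
        return false;
    }

    int main(void)
    {
        unsigned long long first_bad;
        int bad_sectors;

        if (range_is_bad(2040, 16, &first_bad, &bad_sectors))
            printf("fall back to stripe path: bad at %llu (+%d)\n",
                   first_bad, bad_sectors);
        return 0;
    }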
@@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4016 | } | 3795 | } |
4017 | } | 3796 | } |
4018 | 3797 | ||
4019 | if (bio_data_dir(bi) == WRITE && | 3798 | if (rw == WRITE && |
4020 | logical_sector >= mddev->suspend_lo && | 3799 | logical_sector >= mddev->suspend_lo && |
4021 | logical_sector < mddev->suspend_hi) { | 3800 | logical_sector < mddev->suspend_hi) { |
4022 | release_stripe(sh); | 3801 | release_stripe(sh); |
@@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4034 | } | 3813 | } |
4035 | 3814 | ||
4036 | if (test_bit(STRIPE_EXPANDING, &sh->state) || | 3815 | if (test_bit(STRIPE_EXPANDING, &sh->state) || |
4037 | !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | 3816 | !add_stripe_bio(sh, bi, dd_idx, rw)) { |
4038 | /* Stripe is busy expanding or | 3817 | /* Stripe is busy expanding or |
4039 | * add failed due to overlap. Flush everything | 3818 | * add failed due to overlap. Flush everything |
4040 | * and wait a while | 3819 | * and wait a while |
@@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4375 | 4154 | ||
4376 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); | 4155 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); |
4377 | 4156 | ||
4378 | spin_lock(&sh->lock); | 4157 | set_bit(STRIPE_SYNC_REQUESTED, &sh->state); |
4379 | set_bit(STRIPE_SYNCING, &sh->state); | ||
4380 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
4381 | spin_unlock(&sh->lock); | ||
4382 | 4158 | ||
4383 | handle_stripe(sh); | 4159 | handle_stripe(sh); |
4384 | release_stripe(sh); | 4160 | release_stripe(sh); |
@@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev) | |||
4509 | release_stripe(sh); | 4285 | release_stripe(sh); |
4510 | cond_resched(); | 4286 | cond_resched(); |
4511 | 4287 | ||
4288 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
4289 | md_check_recovery(mddev); | ||
4290 | |||
4512 | spin_lock_irq(&conf->device_lock); | 4291 | spin_lock_irq(&conf->device_lock); |
4513 | } | 4292 | } |
4514 | pr_debug("%d stripes handled\n", handled); | 4293 | pr_debug("%d stripes handled\n", handled); |
@@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
5313 | * isn't possible. | 5092 | * isn't possible. |
5314 | */ | 5093 | */ |
5315 | if (!test_bit(Faulty, &rdev->flags) && | 5094 | if (!test_bit(Faulty, &rdev->flags) && |
5095 | mddev->recovery_disabled != conf->recovery_disabled && | ||
5316 | !has_failed(conf) && | 5096 | !has_failed(conf) && |
5317 | number < conf->raid_disks) { | 5097 | number < conf->raid_disks) { |
5318 | err = -EBUSY; | 5098 | err = -EBUSY; |
@@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
5341 | int first = 0; | 5121 | int first = 0; |
5342 | int last = conf->raid_disks - 1; | 5122 | int last = conf->raid_disks - 1; |
5343 | 5123 | ||
5124 | if (mddev->recovery_disabled == conf->recovery_disabled) | ||
5125 | return -EBUSY; | ||
5126 | |||
5344 | if (has_failed(conf)) | 5127 | if (has_failed(conf)) |
5345 | /* no point adding a device */ | 5128 | /* no point adding a device */ |
5346 | return -EINVAL; | 5129 | return -EINVAL; |
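The -EBUSY checks above hook raid5 into the recovery_disabled scheme: mddev->recovery_disabled is bumped when a recovery attempt fails, and conf keeps the matching snapshot; while the two are equal, adding a spare is refused (it would only fail the same way), and removing the troublesome device becomes permitted. A minimal model of that generation handshake, with the structures reduced to just the two counters:

    #include <stdio.h>

    struct mddev { int recovery_disabled; };
    struct conf  { int recovery_disabled; };

    /* 0 on success, -1 (an EBUSY stand-in) while this array generation
     * has recovery disabled. */
    static int add_disk(struct mddev *m, struct conf *c)
    {
        if (m->recovery_disabled == c->recovery_disabled)
            return -1;          /* same generation already failed: refuse */
        return 0;
    }

    int main(void)
    {
        struct mddev m = { .recovery_disabled = 1 };
        struct conf  c = { .recovery_disabled = 1 };

        printf("add: %d\n", add_disk(&m, &c));  /* -1: recovery disabled */
        m.recovery_disabled++;                  /* new generation */
        printf("add: %d\n", add_disk(&m, &c));  /* 0: allowed again */
        return 0;
    }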
@@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5519 | if (rdev->raid_disk < 0 && | 5302 | if (rdev->raid_disk < 0 && |
5520 | !test_bit(Faulty, &rdev->flags)) { | 5303 | !test_bit(Faulty, &rdev->flags)) { |
5521 | if (raid5_add_disk(mddev, rdev) == 0) { | 5304 | if (raid5_add_disk(mddev, rdev) == 0) { |
5522 | char nm[20]; | ||
5523 | if (rdev->raid_disk | 5305 | if (rdev->raid_disk |
5524 | >= conf->previous_raid_disks) { | 5306 | >= conf->previous_raid_disks) { |
5525 | set_bit(In_sync, &rdev->flags); | 5307 | set_bit(In_sync, &rdev->flags); |
5526 | added_devices++; | 5308 | added_devices++; |
5527 | } else | 5309 | } else |
5528 | rdev->recovery_offset = 0; | 5310 | rdev->recovery_offset = 0; |
5529 | sprintf(nm, "rd%d", rdev->raid_disk); | 5311 | |
5530 | if (sysfs_create_link(&mddev->kobj, | 5312 | if (sysfs_link_rdev(mddev, rdev)) |
5531 | &rdev->kobj, nm)) | ||
5532 | /* Failure here is OK */; | 5313 | /* Failure here is OK */; |
5533 | } | 5314 | } |
5534 | } else if (rdev->raid_disk >= conf->previous_raid_disks | 5315 | } else if (rdev->raid_disk >= conf->previous_raid_disks |
@@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
5624 | d++) { | 5405 | d++) { |
5625 | mdk_rdev_t *rdev = conf->disks[d].rdev; | 5406 | mdk_rdev_t *rdev = conf->disks[d].rdev; |
5626 | if (rdev && raid5_remove_disk(mddev, d) == 0) { | 5407 | if (rdev && raid5_remove_disk(mddev, d) == 0) { |
5627 | char nm[20]; | 5408 | sysfs_unlink_rdev(mddev, rdev); |
5628 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
5629 | sysfs_remove_link(&mddev->kobj, nm); | ||
5630 | rdev->raid_disk = -1; | 5409 | rdev->raid_disk = -1; |
5631 | } | 5410 | } |
5632 | } | 5411 | } |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 3ca77a2613ba..11b9566184b2 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -6,11 +6,11 @@ | |||
6 | 6 | ||
7 | /* | 7 | /* |
8 | * | 8 | * |
9 | * Each stripe contains one buffer per disc. Each buffer can be in | 9 | * Each stripe contains one buffer per device. Each buffer can be in |
10 | * one of a number of states stored in "flags". Changes between | 10 | * one of a number of states stored in "flags". Changes between |
11 | * these states happen *almost* exclusively under a per-stripe | 11 | * these states happen *almost* exclusively under the protection of the |
12 | * spinlock. Some very specific changes can happen in bi_end_io, and | 12 | * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and |
13 | * these are not protected by the spin lock. | 13 | * these are not protected by STRIPE_ACTIVE. |
14 | * | 14 | * |
15 | * The flag bits that are used to represent these states are: | 15 | * The flag bits that are used to represent these states are: |
16 | * R5_UPTODATE and R5_LOCKED | 16 | * R5_UPTODATE and R5_LOCKED |
@@ -76,12 +76,10 @@ | |||
76 | * block and the cached buffer are successfully written, any buffer on | 76 | * block and the cached buffer are successfully written, any buffer on |
77 | * a written list can be returned with b_end_io. | 77 | * a written list can be returned with b_end_io. |
78 | * | 78 | * |
79 | * The write list and read list both act as fifos. The read list is | 79 | * The write list and read list both act as fifos. The read list, |
80 | * protected by the device_lock. The write and written lists are | 80 | * write list and written list are protected by the device_lock. |
81 | * protected by the stripe lock. The device_lock, which can be | 81 | * The device_lock is only for list manipulations and will only be |
82 | * claimed while the stripe lock is held, is only for list | 82 | held for a very short time. It can be claimed from interrupts. |
83 | * manipulations and will only be held for a very short time. It can | ||
84 | * be claimed from interrupts. | ||
85 | * | 83 | * |
86 | * | 84 | * |
87 | * Stripes in the stripe cache can be on one of two lists (or on | 85 | * Stripes in the stripe cache can be on one of two lists (or on |
@@ -96,7 +94,6 @@ | |||
96 | * | 94 | * |
97 | * The inactive_list, handle_list and hash bucket lists are all protected by the | 95 | * The inactive_list, handle_list and hash bucket lists are all protected by the |
98 | * device_lock. | 96 | * device_lock. |
99 | * - stripes on the inactive_list never have their stripe_lock held. | ||
100 | * - stripes have a reference counter. If count==0, they are on a list. | 97 | * - stripes have a reference counter. If count==0, they are on a list. |
101 | * - If a stripe might need handling, STRIPE_HANDLE is set. | 98 | * - If a stripe might need handling, STRIPE_HANDLE is set. |
102 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on | 99 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on |
@@ -116,10 +113,10 @@ | |||
116 | * attach a request to an active stripe (add_stripe_bh()) | 113 | * attach a request to an active stripe (add_stripe_bh()) |
117 | * lockdev attach-buffer unlockdev | 114 | * lockdev attach-buffer unlockdev |
118 | * handle a stripe (handle_stripe()) | 115 | * handle a stripe (handle_stripe()) |
119 | * lockstripe clrSTRIPE_HANDLE ... | 116 | * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... |
120 | * (lockdev check-buffers unlockdev) .. | 117 | * (lockdev check-buffers unlockdev) .. |
121 | * change-state .. | 118 | * change-state .. |
122 | * record io/ops needed unlockstripe schedule io/ops | 119 | * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops |
123 | * release an active stripe (release_stripe()) | 120 | * release an active stripe (release_stripe()) |
124 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev | 121 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev |
125 | * | 122 | * |
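The lifecycle sketch above ends with release_stripe(): under device_lock, dropping the last reference moves the stripe to handle_list when STRIPE_HANDLE is set, otherwise back to the inactive list. A toy version of that release step (list membership reduced to a string; no real locking):

    #include <stdbool.h>
    #include <stdio.h>

    struct stripe { int count; bool handle; const char *list; };

    static void release_stripe(struct stripe *sh)
    {
        /* kernel: device_lock held around this */
        if (--sh->count == 0)
            sh->list = sh->handle ? "handle_list" : "inactive_list";
    }

    int main(void)
    {
        struct stripe sh = { .count = 1, .handle = true, .list = "none" };
        release_stripe(&sh);
        printf("stripe now on: %s\n", sh.list);
        return 0;
    }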
@@ -128,8 +125,7 @@ | |||
128 | * on a cached buffer, and plus one if the stripe is undergoing stripe | 125 | * on a cached buffer, and plus one if the stripe is undergoing stripe |
129 | * operations. | 126 | * operations. |
130 | * | 127 | * |
131 | * Stripe operations are performed outside the stripe lock, | 128 | * The stripe operations are: |
132 | * the stripe operations are: | ||
133 | * -copying data between the stripe cache and user application buffers | 129 | * -copying data between the stripe cache and user application buffers |
134 | * -computing blocks to save a disk access, or to recover a missing block | 130 | * -computing blocks to save a disk access, or to recover a missing block |
135 | * -updating the parity on a write operation (reconstruct write and | 131 | * -updating the parity on a write operation (reconstruct write and |
@@ -159,7 +155,8 @@ | |||
159 | */ | 155 | */ |
160 | 156 | ||
161 | /* | 157 | /* |
162 | * Operations state - intermediate states that are visible outside of sh->lock | 158 | * Operations state - intermediate states that are visible outside of |
159 | * STRIPE_ACTIVE. | ||
163 | * In general _idle indicates nothing is running, _run indicates a data | 160 | * In general _idle indicates nothing is running, _run indicates a data |
164 | * processing operation is active, and _result means the data processing result | 161 | * processing operation is active, and _result means the data processing result |
165 | * is stable and can be acted upon. For simple operations like biofill and | 162 | * is stable and can be acted upon. For simple operations like biofill and |
@@ -209,7 +206,6 @@ struct stripe_head { | |||
209 | short ddf_layout;/* use DDF ordering to calculate Q */ | 206 | short ddf_layout;/* use DDF ordering to calculate Q */ |
210 | unsigned long state; /* state flags */ | 207 | unsigned long state; /* state flags */ |
211 | atomic_t count; /* nr of active thread/requests */ | 208 | atomic_t count; /* nr of active thread/requests */ |
212 | spinlock_t lock; | ||
213 | int bm_seq; /* sequence number for bitmap flushes */ | 209 | int bm_seq; /* sequence number for bitmap flushes */ |
214 | int disks; /* disks in stripe */ | 210 | int disks; /* disks in stripe */ |
215 | enum check_states check_state; | 211 | enum check_states check_state; |
@@ -240,19 +236,20 @@ struct stripe_head { | |||
240 | }; | 236 | }; |
241 | 237 | ||
242 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head | 238 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head |
243 | * for handle_stripe. It is only valid under spin_lock(sh->lock); | 239 | * for handle_stripe. |
244 | */ | 240 | */ |
245 | struct stripe_head_state { | 241 | struct stripe_head_state { |
246 | int syncing, expanding, expanded; | 242 | int syncing, expanding, expanded; |
247 | int locked, uptodate, to_read, to_write, failed, written; | 243 | int locked, uptodate, to_read, to_write, failed, written; |
248 | int to_fill, compute, req_compute, non_overwrite; | 244 | int to_fill, compute, req_compute, non_overwrite; |
249 | int failed_num; | 245 | int failed_num[2]; |
246 | int p_failed, q_failed; | ||
247 | int dec_preread_active; | ||
250 | unsigned long ops_request; | 248 | unsigned long ops_request; |
251 | }; | ||
252 | 249 | ||
253 | /* r6_state - extra state data only relevant to r6 */ | 250 | struct bio *return_bi; |
254 | struct r6_state { | 251 | mdk_rdev_t *blocked_rdev; |
255 | int p_failed, q_failed, failed_num[2]; | 252 | int handle_bad_blocks; |
256 | }; | 253 | }; |
257 | 254 | ||
258 | /* Flags */ | 255 | /* Flags */ |
@@ -268,14 +265,16 @@ struct r6_state { | |||
268 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ | 265 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ |
269 | 266 | ||
270 | #define R5_Expanded 10 /* This block now has post-expand data */ | 267 | #define R5_Expanded 10 /* This block now has post-expand data */ |
271 | #define R5_Wantcompute 11 /* compute_block in progress treat as | 268 | #define R5_Wantcompute 11 /* compute_block in progress treat as |
272 | * uptodate | 269 | * uptodate |
273 | */ | 270 | */ |
274 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | 271 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs |
275 | * filling | 272 | * filling |
276 | */ | 273 | */ |
277 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ | 274 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
278 | #define R5_WantFUA 14 /* Write should be FUA */ | 275 | #define R5_WantFUA 14 /* Write should be FUA */ |
276 | #define R5_WriteError 15 /* got a write error - need to record it */ | ||
277 | #define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ | ||
279 | /* | 278 | /* |
280 | * Write method | 279 | * Write method |
281 | */ | 280 | */ |
@@ -289,21 +288,25 @@ struct r6_state { | |||
289 | /* | 288 | /* |
290 | * Stripe state | 289 | * Stripe state |
291 | */ | 290 | */ |
292 | #define STRIPE_HANDLE 2 | 291 | enum { |
293 | #define STRIPE_SYNCING 3 | 292 | STRIPE_ACTIVE, |
294 | #define STRIPE_INSYNC 4 | 293 | STRIPE_HANDLE, |
295 | #define STRIPE_PREREAD_ACTIVE 5 | 294 | STRIPE_SYNC_REQUESTED, |
296 | #define STRIPE_DELAYED 6 | 295 | STRIPE_SYNCING, |
297 | #define STRIPE_DEGRADED 7 | 296 | STRIPE_INSYNC, |
298 | #define STRIPE_BIT_DELAY 8 | 297 | STRIPE_PREREAD_ACTIVE, |
299 | #define STRIPE_EXPANDING 9 | 298 | STRIPE_DELAYED, |
300 | #define STRIPE_EXPAND_SOURCE 10 | 299 | STRIPE_DEGRADED, |
301 | #define STRIPE_EXPAND_READY 11 | 300 | STRIPE_BIT_DELAY, |
302 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | 301 | STRIPE_EXPANDING, |
303 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | 302 | STRIPE_EXPAND_SOURCE, |
304 | #define STRIPE_BIOFILL_RUN 14 | 303 | STRIPE_EXPAND_READY, |
305 | #define STRIPE_COMPUTE_RUN 15 | 304 | STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ |
306 | #define STRIPE_OPS_REQ_PENDING 16 | 305 | STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ |
306 | STRIPE_BIOFILL_RUN, | ||
307 | STRIPE_COMPUTE_RUN, | ||
308 | STRIPE_OPS_REQ_PENDING, | ||
309 | }; | ||
307 | 310 | ||
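The flag constants above become an anonymous enum, but the values are still plain bit numbers, so set_bit()/test_bit() call sites need no change; only the debugging visibility of the names improves. A quick illustration of the equivalence, using ordinary C bit operations in place of the kernel's atomic helpers:

    #include <stdio.h>

    enum { STRIPE_ACTIVE, STRIPE_HANDLE, STRIPE_SYNC_REQUESTED };

    int main(void)
    {
        unsigned long state = 0;

        state |= 1UL << STRIPE_HANDLE;          /* like set_bit() */
        if (state & (1UL << STRIPE_HANDLE))     /* like test_bit() */
            puts("STRIPE_HANDLE is set");
        state &= ~(1UL << STRIPE_HANDLE);       /* like clear_bit() */
        return 0;
    }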
308 | /* | 311 | /* |
309 | * Operation request flags | 312 | * Operation request flags |
@@ -336,7 +339,7 @@ struct r6_state { | |||
336 | * PREREAD_ACTIVE. | 339 | * PREREAD_ACTIVE. |
337 | * In stripe_handle, if we find pre-reading is necessary, we do it if | 340 | * In stripe_handle, if we find pre-reading is necessary, we do it if |
338 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. | 341 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. |
339 | * HANDLE gets cleared if stripe_handle leave nothing locked. | 342 | * HANDLE gets cleared if stripe_handle leaves nothing locked. |
340 | */ | 343 | */ |
341 | 344 | ||
342 | 345 | ||
@@ -399,7 +402,7 @@ struct raid5_private_data { | |||
399 | * (fresh device added). | 402 | * (fresh device added). |
400 | * Cleared when a sync completes. | 403 | * Cleared when a sync completes. |
401 | */ | 404 | */ |
402 | 405 | int recovery_disabled; | |
403 | /* per cpu variables */ | 406 | /* per cpu variables */ |
404 | struct raid5_percpu { | 407 | struct raid5_percpu { |
405 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 408 | struct page *spare_page; /* Used when checking P/Q in raid6 */ |
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 75cbf4f62fe8..9e65d9e20662 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
@@ -245,10 +245,16 @@ struct mdp_superblock_1 { | |||
245 | __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ | 245 | __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ |
246 | __u8 devflags; /* per-device flags. Only one defined...*/ | 246 | __u8 devflags; /* per-device flags. Only one defined...*/ |
247 | #define WriteMostly1 1 /* mask for writemostly flag in above */ | 247 | #define WriteMostly1 1 /* mask for writemostly flag in above */ |
248 | __u8 pad2[64-57]; /* set to 0 when writing */ | 248 | /* Bad block log. If there are any bad blocks the feature flag is set. |
249 | * If offset and size are non-zero, that space is reserved and available | ||
250 | */ | ||
251 | __u8 bblog_shift; /* shift from sectors to block size */ | ||
252 | __le16 bblog_size; /* number of sectors reserved for list */ | ||
253 | __le32 bblog_offset; /* sector offset from superblock to bblog, | ||
254 | * signed - not unsigned */ | ||
249 | 255 | ||
250 | /* array state information - 64 bytes */ | 256 | /* array state information - 64 bytes */ |
251 | __le64 utime; /* 40 bits second, 24 btes microseconds */ | 257 | __le64 utime; /* 40 bits second, 24 bits microseconds */ |
252 | __le64 events; /* incremented when superblock updated */ | 258 | __le64 events; /* incremented when superblock updated */ |
253 | __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ | 259 | __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ |
254 | __le32 sb_csum; /* checksum up to devs[max_dev] */ | 260 | __le32 sb_csum; /* checksum up to devs[max_dev] */ |
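The three new superblock fields above fully locate the bad-block log: bblog_offset is a signed sector offset from the superblock, bblog_size the number of reserved sectors, and bblog_shift the sector-to-block conversion. A sketch of decoding them (host-endian fields and the superblock position are stand-ins; on disk these are little-endian __le types):

    #include <stdint.h>
    #include <stdio.h>

    /* Host-endian stand-ins for the __le superblock fields. */
    struct bblog_fields {
        uint8_t  bblog_shift;   /* shift from sectors to block size */
        uint16_t bblog_size;    /* sectors reserved for the list */
        int32_t  bblog_offset;  /* signed sector offset from the superblock */
    };

    int main(void)
    {
        struct bblog_fields f = { .bblog_shift = 3,   /* 512B -> 4KiB blocks */
                                  .bblog_size = 8,
                                  .bblog_offset = -16 };
        long long sb_sector = 8;    /* illustrative superblock location */

        if (f.bblog_size == 0 && f.bblog_offset == 0) {
            puts("no space reserved for a bad-block log");
            return 0;
        }
        printf("log at sector %lld, %u sectors, block size %u bytes\n",
               sb_sector + f.bblog_offset, (unsigned)f.bblog_size,
               512u << f.bblog_shift);
        return 0;
    }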
@@ -270,8 +276,8 @@ struct mdp_superblock_1 { | |||
270 | * must be honoured | 276 | * must be honoured |
271 | */ | 277 | */ |
272 | #define MD_FEATURE_RESHAPE_ACTIVE 4 | 278 | #define MD_FEATURE_RESHAPE_ACTIVE 4 |
279 | #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ | ||
273 | 280 | ||
274 | #define MD_FEATURE_ALL (1|2|4) | 281 | #define MD_FEATURE_ALL (1|2|4|8) |
275 | 282 | ||
276 | #endif | 283 | #endif |
277 | |||
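With bit 8 added, MD_FEATURE_ALL is the mask of feature bits this kernel understands, which lets a superblock loader refuse arrays that use anything newer. A small sketch of that check; treating unknown bits as fatal is the usual convention and is assumed here rather than quoted from md.c:

    #include <stdio.h>

    /* Bits 1 and 2 are the earlier features defined above in md_p.h. */
    #define MD_FEATURE_RESHAPE_ACTIVE 4
    #define MD_FEATURE_BAD_BLOCKS     8   /* badblock list is not empty */
    #define MD_FEATURE_ALL            (1|2|4|8)

    int main(void)
    {
        unsigned int feature_map = MD_FEATURE_BAD_BLOCKS | 16; /* 16: unknown */

        if (feature_map & ~MD_FEATURE_ALL)
            printf("unknown feature bits 0x%x: refuse to assemble\n",
                   feature_map & ~MD_FEATURE_ALL);
        return 0;
    }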