-rw-r--r--   Documentation/md.txt        |   29
-rw-r--r--   drivers/md/bitmap.c         |  137
-rw-r--r--   drivers/md/bitmap.h         |    5
-rw-r--r--   drivers/md/md.c             |  871
-rw-r--r--   drivers/md/md.h             |  110
-rw-r--r--   drivers/md/raid1.c          |  962
-rw-r--r--   drivers/md/raid1.h          |   26
-rw-r--r--   drivers/md/raid10.c         | 1183
-rw-r--r--   drivers/md/raid10.h         |   21
-rw-r--r--   drivers/md/raid5.c          | 1015
-rw-r--r--   drivers/md/raid5.h          |   99
-rw-r--r--   include/linux/raid/md_p.h   |   14
12 files changed, 3093 insertions, 1379 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index f0eee83ff78a..fc94770f44ab 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
| @@ -360,18 +360,20 @@ Each directory contains: | |||
| 360 | A file recording the current state of the device in the array | 360 | A file recording the current state of the device in the array |
| 361 | which can be a comma separated list of | 361 | which can be a comma separated list of |
| 362 | faulty - device has been kicked from active use due to | 362 | faulty - device has been kicked from active use due to |
| 363 | a detected fault | 363 | a detected fault or it has unacknowledged bad |
| 364 | blocks | ||
| 364 | in_sync - device is a fully in-sync member of the array | 365 | in_sync - device is a fully in-sync member of the array |
| 365 | writemostly - device will only be subject to read | 366 | writemostly - device will only be subject to read |
| 366 | requests if there are no other options. | 367 | requests if there are no other options. |
| 367 | This applies only to raid1 arrays. | 368 | This applies only to raid1 arrays. |
| 368 | blocked - device has failed, metadata is "external", | 369 | blocked - device has failed, and the failure hasn't been |
| 369 | and the failure hasn't been acknowledged yet. | 370 | acknowledged yet by the metadata handler. |
| 370 | Writes that would write to this device if | 371 | Writes that would write to this device if |
| 371 | it were not faulty are blocked. | 372 | it were not faulty are blocked. |
| 372 | spare - device is working, but not a full member. | 373 | spare - device is working, but not a full member. |
| 373 | This includes spares that are in the process | 374 | This includes spares that are in the process |
| 374 | of being recovered to | 375 | of being recovered to |
| 376 | write_error - device has ever seen a write error. | ||
| 375 | This list may grow in future. | 377 | This list may grow in future. |
| 376 | This can be written to. | 378 | This can be written to. |
| 377 | Writing "faulty" simulates a failure on the device. | 379 | Writing "faulty" simulates a failure on the device. |
| @@ -379,9 +381,11 @@ Each directory contains: | |||
| 379 | Writing "writemostly" sets the writemostly flag. | 381 | Writing "writemostly" sets the writemostly flag. |
| 380 | Writing "-writemostly" clears the writemostly flag. | 382 | Writing "-writemostly" clears the writemostly flag. |
| 381 | Writing "blocked" sets the "blocked" flag. | 383 | Writing "blocked" sets the "blocked" flag. |
| 382 | Writing "-blocked" clears the "blocked" flag and allows writes | 384 | Writing "-blocked" clears the "blocked" flags and allows writes |
| 383 | to complete. | 385 | to complete and possibly simulates an error. |
| 384 | Writing "in_sync" sets the in_sync flag. | 386 | Writing "in_sync" sets the in_sync flag. |
| 387 | Writing "write_error" sets writeerrorseen flag. | ||
| 388 | Writing "-write_error" clears writeerrorseen flag. | ||
| 385 | 389 | ||
| 386 | This file responds to select/poll. Any change to 'faulty' | 390 | This file responds to select/poll. Any change to 'faulty' |
| 387 | or 'blocked' causes an event. | 391 | or 'blocked' causes an event. |
| @@ -419,7 +423,6 @@ Each directory contains: | |||
| 419 | written, it will be rejected. | 423 | written, it will be rejected. |
| 420 | 424 | ||
| 421 | recovery_start | 425 | recovery_start |
| 422 | |||
| 423 | When the device is not 'in_sync', this records the number of | 426 | When the device is not 'in_sync', this records the number of |
| 424 | sectors from the start of the device which are known to be | 427 | sectors from the start of the device which are known to be |
| 425 | correct. This is normally zero, but during a recovery | 428 | correct. This is normally zero, but during a recovery |
| @@ -435,6 +438,20 @@ Each directory contains: | |||
| 435 | Setting this to 'none' is equivalent to setting 'in_sync'. | 438 | Setting this to 'none' is equivalent to setting 'in_sync'. |
| 436 | Setting to any other value also clears the 'in_sync' flag. | 439 | Setting to any other value also clears the 'in_sync' flag. |
| 437 | 440 | ||
| 441 | bad_blocks | ||
| 442 | This gives the list of all known bad blocks in the form of | ||
| 443 | start address and length (in sectors respectively). If output | ||
| 444 | is too big to fit in a page, it will be truncated. Writing | ||
| 445 | "sector length" to this file adds new acknowledged (i.e. | ||
| 446 | recorded to disk safely) bad blocks. | ||
| 447 | |||
| 448 | unacknowledged_bad_blocks | ||
| 449 | This gives the list of known-but-not-yet-saved-to-disk bad | ||
| 450 | blocks in the same form of 'bad_blocks'. If output is too big | ||
| 451 | to fit in a page, it will be truncated. Writing to this file | ||
| 452 | adds bad blocks without acknowledging them. This is largely | ||
| 453 | for testing. | ||
| 454 | |||
| 438 | 455 | ||
| 439 | 456 | ||
| 440 | An active md device will also contain and entry for each active device | 457 | An active md device will also contain and entry for each active device |
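The md.txt hunk above documents the new per-device "bad_blocks" and "unacknowledged_bad_blocks" files that appear under each dev-XXX directory. As a rough userspace sketch of the "sector length" format it describes (the array name, component device and sector values below are invented for illustration, not taken from this patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical component device of a hypothetical array */
	const char *path = "/sys/block/md0/md/dev-sdb1/bad_blocks";
	const char *entry = "123456 8\n";	/* 8 bad sectors starting at sector 123456 */
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open(path, O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* add an acknowledged bad range: "start length", both in sectors */
	if (write(fd, entry, strlen(entry)) < 0)
		perror("write");

	/* read the current list back: one "start length" pair per line */
	if (lseek(fd, 0, SEEK_SET) == 0 && (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}

	close(fd);
	return 0;
}

Writing the same pair to "unacknowledged_bad_blocks" instead would record the range without marking it as safely stored, which the documentation notes is mainly useful for testing.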
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd3..0dc6546b77a8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
| @@ -29,7 +29,6 @@ | |||
| 29 | #include "md.h" | 29 | #include "md.h" |
| 30 | #include "bitmap.h" | 30 | #include "bitmap.h" |
| 31 | 31 | ||
| 32 | #include <linux/dm-dirty-log.h> | ||
| 33 | /* debug macros */ | 32 | /* debug macros */ |
| 34 | 33 | ||
| 35 | #define DEBUG 0 | 34 | #define DEBUG 0 |
| @@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon | |||
| 775 | * 0 or page 1 | 774 | * 0 or page 1 |
| 776 | */ | 775 | */ |
| 777 | static inline struct page *filemap_get_page(struct bitmap *bitmap, | 776 | static inline struct page *filemap_get_page(struct bitmap *bitmap, |
| 778 | unsigned long chunk) | 777 | unsigned long chunk) |
| 779 | { | 778 | { |
| 780 | if (bitmap->filemap == NULL) | ||
| 781 | return NULL; | ||
| 782 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) | 779 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) |
| 783 | return NULL; | 780 | return NULL; |
| 784 | return bitmap->filemap[file_page_index(bitmap, chunk) | 781 | return bitmap->filemap[file_page_index(bitmap, chunk) |
| @@ -878,28 +875,19 @@ enum bitmap_page_attr { | |||
| 878 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, | 875 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, |
| 879 | enum bitmap_page_attr attr) | 876 | enum bitmap_page_attr attr) |
| 880 | { | 877 | { |
| 881 | if (page) | 878 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); |
| 882 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
| 883 | else | ||
| 884 | __set_bit(attr, &bitmap->logattrs); | ||
| 885 | } | 879 | } |
| 886 | 880 | ||
| 887 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, | 881 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, |
| 888 | enum bitmap_page_attr attr) | 882 | enum bitmap_page_attr attr) |
| 889 | { | 883 | { |
| 890 | if (page) | 884 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); |
| 891 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
| 892 | else | ||
| 893 | __clear_bit(attr, &bitmap->logattrs); | ||
| 894 | } | 885 | } |
| 895 | 886 | ||
| 896 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, | 887 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, |
| 897 | enum bitmap_page_attr attr) | 888 | enum bitmap_page_attr attr) |
| 898 | { | 889 | { |
| 899 | if (page) | 890 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); |
| 900 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
| 901 | else | ||
| 902 | return test_bit(attr, &bitmap->logattrs); | ||
| 903 | } | 891 | } |
| 904 | 892 | ||
| 905 | /* | 893 | /* |
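The helpers above reserve four attribute bit positions per bitmap file page, packed side by side in the flat filemap_attr array at bit (page->index << 2) + attr; with the dirty-log fallback removed they can index that array unconditionally. A tiny standalone model of that packing (attribute names shortened and values invented for illustration):

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

enum page_attr { ATTR_DIRTY, ATTR_CLEAN, ATTR_PENDING, ATTR_NEEDWRITE };	/* 4 slots per page */

static void set_attr(unsigned long *attrs, unsigned long page, int attr)
{
	unsigned long bit = (page << 2) + attr;

	attrs[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
}

static int test_attr(const unsigned long *attrs, unsigned long page, int attr)
{
	unsigned long bit = (page << 2) + attr;

	return !!(attrs[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)));
}

int main(void)
{
	unsigned long attrs[4] = { 0 };	/* enough bits for 64 pages on a 64-bit host */

	set_attr(attrs, 5, ATTR_NEEDWRITE);
	printf("page 5 needwrite=%d dirty=%d\n",
	       test_attr(attrs, 5, ATTR_NEEDWRITE),
	       test_attr(attrs, 5, ATTR_DIRTY));
	return 0;
}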
| @@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p | |||
| 912 | static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | 900 | static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) |
| 913 | { | 901 | { |
| 914 | unsigned long bit; | 902 | unsigned long bit; |
| 915 | struct page *page = NULL; | 903 | struct page *page; |
| 916 | void *kaddr; | 904 | void *kaddr; |
| 917 | unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); | 905 | unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); |
| 918 | 906 | ||
| 919 | if (!bitmap->filemap) { | 907 | if (!bitmap->filemap) |
| 920 | struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; | 908 | return; |
| 921 | if (log) | ||
| 922 | log->type->mark_region(log, chunk); | ||
| 923 | } else { | ||
| 924 | 909 | ||
| 925 | page = filemap_get_page(bitmap, chunk); | 910 | page = filemap_get_page(bitmap, chunk); |
| 926 | if (!page) | 911 | if (!page) |
| 927 | return; | 912 | return; |
| 928 | bit = file_page_offset(bitmap, chunk); | 913 | bit = file_page_offset(bitmap, chunk); |
| 929 | 914 | ||
| 930 | /* set the bit */ | 915 | /* set the bit */ |
| 931 | kaddr = kmap_atomic(page, KM_USER0); | 916 | kaddr = kmap_atomic(page, KM_USER0); |
| 932 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 917 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
| 933 | set_bit(bit, kaddr); | 918 | set_bit(bit, kaddr); |
| 934 | else | 919 | else |
| 935 | __test_and_set_bit_le(bit, kaddr); | 920 | __set_bit_le(bit, kaddr); |
| 936 | kunmap_atomic(kaddr, KM_USER0); | 921 | kunmap_atomic(kaddr, KM_USER0); |
| 937 | PRINTK("set file bit %lu page %lu\n", bit, page->index); | 922 | PRINTK("set file bit %lu page %lu\n", bit, page->index); |
| 938 | } | ||
| 939 | /* record page number so it gets flushed to disk when unplug occurs */ | 923 | /* record page number so it gets flushed to disk when unplug occurs */ |
| 940 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | 924 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); |
| 941 | } | 925 | } |
| @@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
| 952 | 936 | ||
| 953 | if (!bitmap) | 937 | if (!bitmap) |
| 954 | return; | 938 | return; |
| 955 | if (!bitmap->filemap) { | ||
| 956 | /* Must be using a dirty_log */ | ||
| 957 | struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; | ||
| 958 | dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); | ||
| 959 | need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); | ||
| 960 | if (dirty || need_write) | ||
| 961 | if (log->type->flush(log)) | ||
| 962 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
| 963 | goto out; | ||
| 964 | } | ||
| 965 | 939 | ||
| 966 | /* look at each page to see if there are any set bits that need to be | 940 | /* look at each page to see if there are any set bits that need to be |
| 967 | * flushed out to disk */ | 941 | * flushed out to disk */ |
| @@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
| 990 | else | 964 | else |
| 991 | md_super_wait(bitmap->mddev); | 965 | md_super_wait(bitmap->mddev); |
| 992 | } | 966 | } |
| 993 | out: | ||
| 994 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 967 | if (bitmap->flags & BITMAP_WRITE_ERROR) |
| 995 | bitmap_file_kick(bitmap); | 968 | bitmap_file_kick(bitmap); |
| 996 | } | 969 | } |
| @@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
| 1199 | struct page *page = NULL, *lastpage = NULL; | 1172 | struct page *page = NULL, *lastpage = NULL; |
| 1200 | sector_t blocks; | 1173 | sector_t blocks; |
| 1201 | void *paddr; | 1174 | void *paddr; |
| 1202 | struct dm_dirty_log *log = mddev->bitmap_info.log; | ||
| 1203 | 1175 | ||
| 1204 | /* Use a mutex to guard daemon_work against | 1176 | /* Use a mutex to guard daemon_work against |
| 1205 | * bitmap_destroy. | 1177 | * bitmap_destroy. |
| @@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
| 1224 | spin_lock_irqsave(&bitmap->lock, flags); | 1196 | spin_lock_irqsave(&bitmap->lock, flags); |
| 1225 | for (j = 0; j < bitmap->chunks; j++) { | 1197 | for (j = 0; j < bitmap->chunks; j++) { |
| 1226 | bitmap_counter_t *bmc; | 1198 | bitmap_counter_t *bmc; |
| 1227 | if (!bitmap->filemap) { | 1199 | if (!bitmap->filemap) |
| 1228 | if (!log) | 1200 | /* error or shutdown */ |
| 1229 | /* error or shutdown */ | 1201 | break; |
| 1230 | break; | 1202 | |
| 1231 | } else | 1203 | page = filemap_get_page(bitmap, j); |
| 1232 | page = filemap_get_page(bitmap, j); | ||
| 1233 | 1204 | ||
| 1234 | if (page != lastpage) { | 1205 | if (page != lastpage) { |
| 1235 | /* skip this page unless it's marked as needing cleaning */ | 1206 | /* skip this page unless it's marked as needing cleaning */ |
| @@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
| 1298 | -1); | 1269 | -1); |
| 1299 | 1270 | ||
| 1300 | /* clear the bit */ | 1271 | /* clear the bit */ |
| 1301 | if (page) { | 1272 | paddr = kmap_atomic(page, KM_USER0); |
| 1302 | paddr = kmap_atomic(page, KM_USER0); | 1273 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
| 1303 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1274 | clear_bit(file_page_offset(bitmap, j), |
| 1304 | clear_bit(file_page_offset(bitmap, j), | 1275 | paddr); |
| 1305 | paddr); | 1276 | else |
| 1306 | else | 1277 | __clear_bit_le( |
| 1307 | __test_and_clear_bit_le(file_page_offset(bitmap, j), | 1278 | file_page_offset(bitmap, |
| 1308 | paddr); | 1279 | j), |
| 1309 | kunmap_atomic(paddr, KM_USER0); | 1280 | paddr); |
| 1310 | } else | 1281 | kunmap_atomic(paddr, KM_USER0); |
| 1311 | log->type->clear_region(log, j); | ||
| 1312 | } | 1282 | } |
| 1313 | } else | 1283 | } else |
| 1314 | j |= PAGE_COUNTER_MASK; | 1284 | j |= PAGE_COUNTER_MASK; |
| @@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
| 1316 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1286 | spin_unlock_irqrestore(&bitmap->lock, flags); |
| 1317 | 1287 | ||
| 1318 | /* now sync the final page */ | 1288 | /* now sync the final page */ |
| 1319 | if (lastpage != NULL || log != NULL) { | 1289 | if (lastpage != NULL) { |
| 1320 | spin_lock_irqsave(&bitmap->lock, flags); | 1290 | spin_lock_irqsave(&bitmap->lock, flags); |
| 1321 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { | 1291 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { |
| 1322 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1292 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); |
| 1323 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1293 | spin_unlock_irqrestore(&bitmap->lock, flags); |
| 1324 | if (lastpage) | 1294 | write_page(bitmap, lastpage, 0); |
| 1325 | write_page(bitmap, lastpage, 0); | ||
| 1326 | else | ||
| 1327 | if (log->type->flush(log)) | ||
| 1328 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
| 1329 | } else { | 1295 | } else { |
| 1330 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1296 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); |
| 1331 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1297 | spin_unlock_irqrestore(&bitmap->lock, flags); |
| @@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev) | |||
| 1767 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1733 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
| 1768 | 1734 | ||
| 1769 | if (!file | 1735 | if (!file |
| 1770 | && !mddev->bitmap_info.offset | 1736 | && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ |
| 1771 | && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ | ||
| 1772 | return 0; | 1737 | return 0; |
| 1773 | 1738 | ||
| 1774 | BUG_ON(file && mddev->bitmap_info.offset); | 1739 | BUG_ON(file && mddev->bitmap_info.offset); |
| 1775 | BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); | ||
| 1776 | 1740 | ||
| 1777 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1741 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
| 1778 | if (!bitmap) | 1742 | if (!bitmap) |
| @@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev) | |||
| 1863 | int bitmap_load(mddev_t *mddev) | 1827 | int bitmap_load(mddev_t *mddev) |
| 1864 | { | 1828 | { |
| 1865 | int err = 0; | 1829 | int err = 0; |
| 1830 | sector_t start = 0; | ||
| 1866 | sector_t sector = 0; | 1831 | sector_t sector = 0; |
| 1867 | struct bitmap *bitmap = mddev->bitmap; | 1832 | struct bitmap *bitmap = mddev->bitmap; |
| 1868 | 1833 | ||
| @@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev) | |||
| 1881 | } | 1846 | } |
| 1882 | bitmap_close_sync(bitmap); | 1847 | bitmap_close_sync(bitmap); |
| 1883 | 1848 | ||
| 1884 | if (mddev->bitmap_info.log) { | 1849 | if (mddev->degraded == 0 |
| 1885 | unsigned long i; | 1850 | || bitmap->events_cleared == mddev->events) |
| 1886 | struct dm_dirty_log *log = mddev->bitmap_info.log; | 1851 | /* no need to keep dirty bits to optimise a |
| 1887 | for (i = 0; i < bitmap->chunks; i++) | 1852 | * re-add of a missing device */ |
| 1888 | if (!log->type->in_sync(log, i, 1)) | 1853 | start = mddev->recovery_cp; |
| 1889 | bitmap_set_memory_bits(bitmap, | 1854 | |
| 1890 | (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), | 1855 | err = bitmap_init_from_disk(bitmap, start); |
| 1891 | 1); | 1856 | |
| 1892 | } else { | ||
| 1893 | sector_t start = 0; | ||
| 1894 | if (mddev->degraded == 0 | ||
| 1895 | || bitmap->events_cleared == mddev->events) | ||
| 1896 | /* no need to keep dirty bits to optimise a | ||
| 1897 | * re-add of a missing device */ | ||
| 1898 | start = mddev->recovery_cp; | ||
| 1899 | |||
| 1900 | err = bitmap_init_from_disk(bitmap, start); | ||
| 1901 | } | ||
| 1902 | if (err) | 1857 | if (err) |
| 1903 | goto out; | 1858 | goto out; |
| 1904 | 1859 | ||
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891ac..a28f2e5588c6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
| @@ -212,10 +212,6 @@ struct bitmap { | |||
| 212 | unsigned long file_pages; /* number of pages in the file */ | 212 | unsigned long file_pages; /* number of pages in the file */ |
| 213 | int last_page_size; /* bytes in the last page */ | 213 | int last_page_size; /* bytes in the last page */ |
| 214 | 214 | ||
| 215 | unsigned long logattrs; /* used when filemap_attr doesn't exist | ||
| 216 | * because we are working with a dirty_log | ||
| 217 | */ | ||
| 218 | |||
| 219 | unsigned long flags; | 215 | unsigned long flags; |
| 220 | 216 | ||
| 221 | int allclean; | 217 | int allclean; |
| @@ -237,7 +233,6 @@ struct bitmap { | |||
| 237 | wait_queue_head_t behind_wait; | 233 | wait_queue_head_t behind_wait; |
| 238 | 234 | ||
| 239 | struct sysfs_dirent *sysfs_can_clear; | 235 | struct sysfs_dirent *sysfs_can_clear; |
| 240 | |||
| 241 | }; | 236 | }; |
| 242 | 237 | ||
| 243 | /* the bitmap API */ | 238 | /* the bitmap API */ |
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70b..8e221a20f5d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
| @@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
| 215 | } | 215 | } |
| 216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); | 216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); |
| 217 | 217 | ||
| 218 | void md_trim_bio(struct bio *bio, int offset, int size) | ||
| 219 | { | ||
| 220 | /* 'bio' is a cloned bio which we need to trim to match | ||
| 221 | * the given offset and size. | ||
| 222 | * This requires adjusting bi_sector, bi_size, and bi_io_vec | ||
| 223 | */ | ||
| 224 | int i; | ||
| 225 | struct bio_vec *bvec; | ||
| 226 | int sofar = 0; | ||
| 227 | |||
| 228 | size <<= 9; | ||
| 229 | if (offset == 0 && size == bio->bi_size) | ||
| 230 | return; | ||
| 231 | |||
| 232 | bio->bi_sector += offset; | ||
| 233 | bio->bi_size = size; | ||
| 234 | offset <<= 9; | ||
| 235 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
| 236 | |||
| 237 | while (bio->bi_idx < bio->bi_vcnt && | ||
| 238 | bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { | ||
| 239 | /* remove this whole bio_vec */ | ||
| 240 | offset -= bio->bi_io_vec[bio->bi_idx].bv_len; | ||
| 241 | bio->bi_idx++; | ||
| 242 | } | ||
| 243 | if (bio->bi_idx < bio->bi_vcnt) { | ||
| 244 | bio->bi_io_vec[bio->bi_idx].bv_offset += offset; | ||
| 245 | bio->bi_io_vec[bio->bi_idx].bv_len -= offset; | ||
| 246 | } | ||
| 247 | /* avoid any complications with bi_idx being non-zero*/ | ||
| 248 | if (bio->bi_idx) { | ||
| 249 | memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, | ||
| 250 | (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); | ||
| 251 | bio->bi_vcnt -= bio->bi_idx; | ||
| 252 | bio->bi_idx = 0; | ||
| 253 | } | ||
| 254 | /* Make sure vcnt and last bv are not too big */ | ||
| 255 | bio_for_each_segment(bvec, bio, i) { | ||
| 256 | if (sofar + bvec->bv_len > size) | ||
| 257 | bvec->bv_len = size - sofar; | ||
| 258 | if (bvec->bv_len == 0) { | ||
| 259 | bio->bi_vcnt = i; | ||
| 260 | break; | ||
| 261 | } | ||
| 262 | sofar += bvec->bv_len; | ||
| 263 | } | ||
| 264 | } | ||
| 265 | EXPORT_SYMBOL_GPL(md_trim_bio); | ||
| 266 | |||
| 218 | /* | 267 | /* |
| 219 | * We have a system wide 'event count' that is incremented | 268 | * We have a system wide 'event count' that is incremented |
| 220 | * on any 'interesting' event, and readers of /proc/mdstat | 269 | * on any 'interesting' event, and readers of /proc/mdstat |
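md_trim_bio(), added above, takes a sector offset into a cloned bio and a new size in sectors, and shrinks the clone to cover only that window. A hedged sketch of the kind of call site the raid personalities can now write (everything except bio_clone_mddev(), md_trim_bio() and generic_make_request() is a placeholder, not code from this patch):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include "md.h"

/*
 * Sketch only: resubmit just the part of a request that sits in front
 * of a known bad block.  'start' and 'good_sectors' stand in for
 * whatever the calling personality tracks.
 */
static void submit_good_prefix(mddev_t *mddev, struct bio *master_bio,
			       sector_t start, int good_sectors)
{
	struct bio *sub = bio_clone_mddev(master_bio, GFP_NOIO, mddev);

	/* offset into the clone and new length, both in sectors */
	md_trim_bio(sub, start - master_bio->bi_sector, good_sectors);
	generic_make_request(sub);
}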
| @@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev) | |||
| 757 | rdev->sb_start = 0; | 806 | rdev->sb_start = 0; |
| 758 | rdev->sectors = 0; | 807 | rdev->sectors = 0; |
| 759 | } | 808 | } |
| 809 | if (rdev->bb_page) { | ||
| 810 | put_page(rdev->bb_page); | ||
| 811 | rdev->bb_page = NULL; | ||
| 812 | } | ||
| 760 | } | 813 | } |
| 761 | 814 | ||
| 762 | 815 | ||
| @@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 1025 | ret = -EINVAL; | 1078 | ret = -EINVAL; |
| 1026 | 1079 | ||
| 1027 | bdevname(rdev->bdev, b); | 1080 | bdevname(rdev->bdev, b); |
| 1028 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1081 | sb = page_address(rdev->sb_page); |
| 1029 | 1082 | ||
| 1030 | if (sb->md_magic != MD_SB_MAGIC) { | 1083 | if (sb->md_magic != MD_SB_MAGIC) { |
| 1031 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | 1084 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", |
| @@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 1054 | rdev->preferred_minor = sb->md_minor; | 1107 | rdev->preferred_minor = sb->md_minor; |
| 1055 | rdev->data_offset = 0; | 1108 | rdev->data_offset = 0; |
| 1056 | rdev->sb_size = MD_SB_BYTES; | 1109 | rdev->sb_size = MD_SB_BYTES; |
| 1110 | rdev->badblocks.shift = -1; | ||
| 1057 | 1111 | ||
| 1058 | if (sb->level == LEVEL_MULTIPATH) | 1112 | if (sb->level == LEVEL_MULTIPATH) |
| 1059 | rdev->desc_nr = -1; | 1113 | rdev->desc_nr = -1; |
| @@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 1064 | ret = 1; | 1118 | ret = 1; |
| 1065 | } else { | 1119 | } else { |
| 1066 | __u64 ev1, ev2; | 1120 | __u64 ev1, ev2; |
| 1067 | mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); | 1121 | mdp_super_t *refsb = page_address(refdev->sb_page); |
| 1068 | if (!uuid_equal(refsb, sb)) { | 1122 | if (!uuid_equal(refsb, sb)) { |
| 1069 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | 1123 | printk(KERN_WARNING "md: %s has different UUID to %s\n", |
| 1070 | b, bdevname(refdev->bdev,b2)); | 1124 | b, bdevname(refdev->bdev,b2)); |
| @@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 1099 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1153 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1100 | { | 1154 | { |
| 1101 | mdp_disk_t *desc; | 1155 | mdp_disk_t *desc; |
| 1102 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); | 1156 | mdp_super_t *sb = page_address(rdev->sb_page); |
| 1103 | __u64 ev1 = md_event(sb); | 1157 | __u64 ev1 = md_event(sb); |
| 1104 | 1158 | ||
| 1105 | rdev->raid_disk = -1; | 1159 | rdev->raid_disk = -1; |
| @@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1230 | 1284 | ||
| 1231 | rdev->sb_size = MD_SB_BYTES; | 1285 | rdev->sb_size = MD_SB_BYTES; |
| 1232 | 1286 | ||
| 1233 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1287 | sb = page_address(rdev->sb_page); |
| 1234 | 1288 | ||
| 1235 | memset(sb, 0, sizeof(*sb)); | 1289 | memset(sb, 0, sizeof(*sb)); |
| 1236 | 1290 | ||
| @@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) | |||
| 1395 | return cpu_to_le32(csum); | 1449 | return cpu_to_le32(csum); |
| 1396 | } | 1450 | } |
| 1397 | 1451 | ||
| 1452 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
| 1453 | int acknowledged); | ||
| 1398 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | 1454 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) |
| 1399 | { | 1455 | { |
| 1400 | struct mdp_superblock_1 *sb; | 1456 | struct mdp_superblock_1 *sb; |
| @@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 1435 | if (ret) return ret; | 1491 | if (ret) return ret; |
| 1436 | 1492 | ||
| 1437 | 1493 | ||
| 1438 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1494 | sb = page_address(rdev->sb_page); |
| 1439 | 1495 | ||
| 1440 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || | 1496 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
| 1441 | sb->major_version != cpu_to_le32(1) || | 1497 | sb->major_version != cpu_to_le32(1) || |
| @@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 1473 | else | 1529 | else |
| 1474 | rdev->desc_nr = le32_to_cpu(sb->dev_number); | 1530 | rdev->desc_nr = le32_to_cpu(sb->dev_number); |
| 1475 | 1531 | ||
| 1532 | if (!rdev->bb_page) { | ||
| 1533 | rdev->bb_page = alloc_page(GFP_KERNEL); | ||
| 1534 | if (!rdev->bb_page) | ||
| 1535 | return -ENOMEM; | ||
| 1536 | } | ||
| 1537 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && | ||
| 1538 | rdev->badblocks.count == 0) { | ||
| 1539 | /* need to load the bad block list. | ||
| 1540 | * Currently we limit it to one page. | ||
| 1541 | */ | ||
| 1542 | s32 offset; | ||
| 1543 | sector_t bb_sector; | ||
| 1544 | u64 *bbp; | ||
| 1545 | int i; | ||
| 1546 | int sectors = le16_to_cpu(sb->bblog_size); | ||
| 1547 | if (sectors > (PAGE_SIZE / 512)) | ||
| 1548 | return -EINVAL; | ||
| 1549 | offset = le32_to_cpu(sb->bblog_offset); | ||
| 1550 | if (offset == 0) | ||
| 1551 | return -EINVAL; | ||
| 1552 | bb_sector = (long long)offset; | ||
| 1553 | if (!sync_page_io(rdev, bb_sector, sectors << 9, | ||
| 1554 | rdev->bb_page, READ, true)) | ||
| 1555 | return -EIO; | ||
| 1556 | bbp = (u64 *)page_address(rdev->bb_page); | ||
| 1557 | rdev->badblocks.shift = sb->bblog_shift; | ||
| 1558 | for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { | ||
| 1559 | u64 bb = le64_to_cpu(*bbp); | ||
| 1560 | int count = bb & (0x3ff); | ||
| 1561 | u64 sector = bb >> 10; | ||
| 1562 | sector <<= sb->bblog_shift; | ||
| 1563 | count <<= sb->bblog_shift; | ||
| 1564 | if (bb + 1 == 0) | ||
| 1565 | break; | ||
| 1566 | if (md_set_badblocks(&rdev->badblocks, | ||
| 1567 | sector, count, 1) == 0) | ||
| 1568 | return -EINVAL; | ||
| 1569 | } | ||
| 1570 | } else if (sb->bblog_offset == 0) | ||
| 1571 | rdev->badblocks.shift = -1; | ||
| 1572 | |||
| 1476 | if (!refdev) { | 1573 | if (!refdev) { |
| 1477 | ret = 1; | 1574 | ret = 1; |
| 1478 | } else { | 1575 | } else { |
| 1479 | __u64 ev1, ev2; | 1576 | __u64 ev1, ev2; |
| 1480 | struct mdp_superblock_1 *refsb = | 1577 | struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); |
| 1481 | (struct mdp_superblock_1*)page_address(refdev->sb_page); | ||
| 1482 | 1578 | ||
| 1483 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || | 1579 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || |
| 1484 | sb->level != refsb->level || | 1580 | sb->level != refsb->level || |
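The loader above unpacks each on-disk bad-block record from a single little-endian 64-bit word: the low 10 bits are the length, the remaining bits the start, both in units of (1 << bblog_shift) sectors, and an all-ones word marks an unused slot. A small worked example of that layout (values invented, a shift of 0 assumed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int bblog_shift = 0;	/* as recorded in sb->bblog_shift */
	uint64_t start = 123456;	/* first bad sector (made-up value) */
	uint64_t len = 8;		/* number of bad sectors (made-up value) */

	/* pack the way the sync path writes it out: (start << 10) | length */
	uint64_t record = ((start >> bblog_shift) << 10) | (len >> bblog_shift);

	/* unpack exactly as the loop above does */
	uint64_t sector = (record >> 10) << bblog_shift;
	unsigned int count = (unsigned int)(record & 0x3ff) << bblog_shift;

	printf("record 0x%016llx -> sector %llu, count %u\n",
	       (unsigned long long)record,
	       (unsigned long long)sector, count);

	/* a record where record + 1 == 0, i.e. all ones, marks an unused slot */
	return 0;
}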
| @@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 1513 | 1609 | ||
| 1514 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1610 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1515 | { | 1611 | { |
| 1516 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1612 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
| 1517 | __u64 ev1 = le64_to_cpu(sb->events); | 1613 | __u64 ev1 = le64_to_cpu(sb->events); |
| 1518 | 1614 | ||
| 1519 | rdev->raid_disk = -1; | 1615 | rdev->raid_disk = -1; |
| @@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1619 | int max_dev, i; | 1715 | int max_dev, i; |
| 1620 | /* make rdev->sb match mddev and rdev data. */ | 1716 | /* make rdev->sb match mddev and rdev data. */ |
| 1621 | 1717 | ||
| 1622 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1718 | sb = page_address(rdev->sb_page); |
| 1623 | 1719 | ||
| 1624 | sb->feature_map = 0; | 1720 | sb->feature_map = 0; |
| 1625 | sb->pad0 = 0; | 1721 | sb->pad0 = 0; |
| 1626 | sb->recovery_offset = cpu_to_le64(0); | 1722 | sb->recovery_offset = cpu_to_le64(0); |
| 1627 | memset(sb->pad1, 0, sizeof(sb->pad1)); | 1723 | memset(sb->pad1, 0, sizeof(sb->pad1)); |
| 1628 | memset(sb->pad2, 0, sizeof(sb->pad2)); | ||
| 1629 | memset(sb->pad3, 0, sizeof(sb->pad3)); | 1724 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
| 1630 | 1725 | ||
| 1631 | sb->utime = cpu_to_le64((__u64)mddev->utime); | 1726 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
| @@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1665 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); | 1760 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
| 1666 | } | 1761 | } |
| 1667 | 1762 | ||
| 1763 | if (rdev->badblocks.count == 0) | ||
| 1764 | /* Nothing to do for bad blocks*/ ; | ||
| 1765 | else if (sb->bblog_offset == 0) | ||
| 1766 | /* Cannot record bad blocks on this device */ | ||
| 1767 | md_error(mddev, rdev); | ||
| 1768 | else { | ||
| 1769 | struct badblocks *bb = &rdev->badblocks; | ||
| 1770 | u64 *bbp = (u64 *)page_address(rdev->bb_page); | ||
| 1771 | u64 *p = bb->page; | ||
| 1772 | sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); | ||
| 1773 | if (bb->changed) { | ||
| 1774 | unsigned seq; | ||
| 1775 | |||
| 1776 | retry: | ||
| 1777 | seq = read_seqbegin(&bb->lock); | ||
| 1778 | |||
| 1779 | memset(bbp, 0xff, PAGE_SIZE); | ||
| 1780 | |||
| 1781 | for (i = 0 ; i < bb->count ; i++) { | ||
| 1782 | u64 internal_bb = *p++; | ||
| 1783 | u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | ||
| 1784 | | BB_LEN(internal_bb)); | ||
| 1785 | *bbp++ = cpu_to_le64(store_bb); | ||
| 1786 | } | ||
| 1787 | if (read_seqretry(&bb->lock, seq)) | ||
| 1788 | goto retry; | ||
| 1789 | |||
| 1790 | bb->sector = (rdev->sb_start + | ||
| 1791 | (int)le32_to_cpu(sb->bblog_offset)); | ||
| 1792 | bb->size = le16_to_cpu(sb->bblog_size); | ||
| 1793 | bb->changed = 0; | ||
| 1794 | } | ||
| 1795 | } | ||
| 1796 | |||
| 1668 | max_dev = 0; | 1797 | max_dev = 0; |
| 1669 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 1798 | list_for_each_entry(rdev2, &mddev->disks, same_set) |
| 1670 | if (rdev2->desc_nr+1 > max_dev) | 1799 | if (rdev2->desc_nr+1 > max_dev) |
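When it serialises the table, super_1_sync() above copies bb->page under a read_seqbegin()/read_seqretry() loop, so a writer that races with it can never leave a half-updated list in the superblock buffer. The reader skeleton, reduced to its bare shape (only the seqlock calls are real kernel API; the table and copy arguments are placeholders):

#include <linux/seqlock.h>
#include <linux/types.h>

/* Sketch only: retry the copy whenever a writer raced with the reader. */
static void snapshot_table(seqlock_t *lock, const u64 *table, u64 *copy, int count)
{
	unsigned int seq;
	int i;

	do {
		seq = read_seqbegin(lock);
		for (i = 0; i < count; i++)
			copy[i] = table[i];
	} while (read_seqretry(lock, seq));
}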
| @@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
| 1724 | num_sectors = max_sectors; | 1853 | num_sectors = max_sectors; |
| 1725 | rdev->sb_start = sb_start; | 1854 | rdev->sb_start = sb_start; |
| 1726 | } | 1855 | } |
| 1727 | sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); | 1856 | sb = page_address(rdev->sb_page); |
| 1728 | sb->data_size = cpu_to_le64(num_sectors); | 1857 | sb->data_size = cpu_to_le64(num_sectors); |
| 1729 | sb->super_offset = rdev->sb_start; | 1858 | sb->super_offset = rdev->sb_start; |
| 1730 | sb->sb_csum = calc_sb_1_csum(sb); | 1859 | sb->sb_csum = calc_sb_1_csum(sb); |
| @@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
| 1922 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); | 2051 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
| 1923 | 2052 | ||
| 1924 | /* May as well allow recovery to be retried once */ | 2053 | /* May as well allow recovery to be retried once */ |
| 1925 | mddev->recovery_disabled = 0; | 2054 | mddev->recovery_disabled++; |
| 1926 | 2055 | ||
| 1927 | return 0; | 2056 | return 0; |
| 1928 | 2057 | ||
| @@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
| 1953 | sysfs_remove_link(&rdev->kobj, "block"); | 2082 | sysfs_remove_link(&rdev->kobj, "block"); |
| 1954 | sysfs_put(rdev->sysfs_state); | 2083 | sysfs_put(rdev->sysfs_state); |
| 1955 | rdev->sysfs_state = NULL; | 2084 | rdev->sysfs_state = NULL; |
| 2085 | kfree(rdev->badblocks.page); | ||
| 2086 | rdev->badblocks.count = 0; | ||
| 2087 | rdev->badblocks.page = NULL; | ||
| 1956 | /* We need to delay this, otherwise we can deadlock when | 2088 | /* We need to delay this, otherwise we can deadlock when |
| 1957 | * writing to 'remove' to "dev/state". We also need | 2089 | * writing to 'remove' to "dev/state". We also need |
| 1958 | * to delay it due to rcu usage. | 2090 | * to delay it due to rcu usage. |
| @@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) | |||
| 2127 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); | 2259 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); |
| 2128 | switch (major_version) { | 2260 | switch (major_version) { |
| 2129 | case 0: | 2261 | case 0: |
| 2130 | print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); | 2262 | print_sb_90(page_address(rdev->sb_page)); |
| 2131 | break; | 2263 | break; |
| 2132 | case 1: | 2264 | case 1: |
| 2133 | print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); | 2265 | print_sb_1(page_address(rdev->sb_page)); |
| 2134 | break; | 2266 | break; |
| 2135 | } | 2267 | } |
| 2136 | } else | 2268 | } else |
| @@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) | |||
| 2194 | mdk_rdev_t *rdev; | 2326 | mdk_rdev_t *rdev; |
| 2195 | int sync_req; | 2327 | int sync_req; |
| 2196 | int nospares = 0; | 2328 | int nospares = 0; |
| 2329 | int any_badblocks_changed = 0; | ||
| 2197 | 2330 | ||
| 2198 | repeat: | 2331 | repeat: |
| 2199 | /* First make sure individual recovery_offsets are correct */ | 2332 | /* First make sure individual recovery_offsets are correct */ |
| @@ -2208,8 +2341,18 @@ repeat: | |||
| 2208 | if (!mddev->persistent) { | 2341 | if (!mddev->persistent) { |
| 2209 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2342 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); |
| 2210 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2343 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 2211 | if (!mddev->external) | 2344 | if (!mddev->external) { |
| 2212 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2345 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); |
| 2346 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
| 2347 | if (rdev->badblocks.changed) { | ||
| 2348 | md_ack_all_badblocks(&rdev->badblocks); | ||
| 2349 | md_error(mddev, rdev); | ||
| 2350 | } | ||
| 2351 | clear_bit(Blocked, &rdev->flags); | ||
| 2352 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
| 2353 | wake_up(&rdev->blocked_wait); | ||
| 2354 | } | ||
| 2355 | } | ||
| 2213 | wake_up(&mddev->sb_wait); | 2356 | wake_up(&mddev->sb_wait); |
| 2214 | return; | 2357 | return; |
| 2215 | } | 2358 | } |
| @@ -2265,6 +2408,14 @@ repeat: | |||
| 2265 | MD_BUG(); | 2408 | MD_BUG(); |
| 2266 | mddev->events --; | 2409 | mddev->events --; |
| 2267 | } | 2410 | } |
| 2411 | |||
| 2412 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
| 2413 | if (rdev->badblocks.changed) | ||
| 2414 | any_badblocks_changed++; | ||
| 2415 | if (test_bit(Faulty, &rdev->flags)) | ||
| 2416 | set_bit(FaultRecorded, &rdev->flags); | ||
| 2417 | } | ||
| 2418 | |||
| 2268 | sync_sbs(mddev, nospares); | 2419 | sync_sbs(mddev, nospares); |
| 2269 | spin_unlock_irq(&mddev->write_lock); | 2420 | spin_unlock_irq(&mddev->write_lock); |
| 2270 | 2421 | ||
| @@ -2290,6 +2441,13 @@ repeat: | |||
| 2290 | bdevname(rdev->bdev,b), | 2441 | bdevname(rdev->bdev,b), |
| 2291 | (unsigned long long)rdev->sb_start); | 2442 | (unsigned long long)rdev->sb_start); |
| 2292 | rdev->sb_events = mddev->events; | 2443 | rdev->sb_events = mddev->events; |
| 2444 | if (rdev->badblocks.size) { | ||
| 2445 | md_super_write(mddev, rdev, | ||
| 2446 | rdev->badblocks.sector, | ||
| 2447 | rdev->badblocks.size << 9, | ||
| 2448 | rdev->bb_page); | ||
| 2449 | rdev->badblocks.size = 0; | ||
| 2450 | } | ||
| 2293 | 2451 | ||
| 2294 | } else | 2452 | } else |
| 2295 | dprintk(")\n"); | 2453 | dprintk(")\n"); |
| @@ -2313,6 +2471,15 @@ repeat: | |||
| 2313 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 2471 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
| 2314 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 2472 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
| 2315 | 2473 | ||
| 2474 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
| 2475 | if (test_and_clear_bit(FaultRecorded, &rdev->flags)) | ||
| 2476 | clear_bit(Blocked, &rdev->flags); | ||
| 2477 | |||
| 2478 | if (any_badblocks_changed) | ||
| 2479 | md_ack_all_badblocks(&rdev->badblocks); | ||
| 2480 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
| 2481 | wake_up(&rdev->blocked_wait); | ||
| 2482 | } | ||
| 2316 | } | 2483 | } |
| 2317 | 2484 | ||
| 2318 | /* words written to sysfs files may, or may not, be \n terminated. | 2485 | /* words written to sysfs files may, or may not, be \n terminated. |
| @@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
| 2347 | char *sep = ""; | 2514 | char *sep = ""; |
| 2348 | size_t len = 0; | 2515 | size_t len = 0; |
| 2349 | 2516 | ||
| 2350 | if (test_bit(Faulty, &rdev->flags)) { | 2517 | if (test_bit(Faulty, &rdev->flags) || |
| 2518 | rdev->badblocks.unacked_exist) { | ||
| 2351 | len+= sprintf(page+len, "%sfaulty",sep); | 2519 | len+= sprintf(page+len, "%sfaulty",sep); |
| 2352 | sep = ","; | 2520 | sep = ","; |
| 2353 | } | 2521 | } |
| @@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
| 2359 | len += sprintf(page+len, "%swrite_mostly",sep); | 2527 | len += sprintf(page+len, "%swrite_mostly",sep); |
| 2360 | sep = ","; | 2528 | sep = ","; |
| 2361 | } | 2529 | } |
| 2362 | if (test_bit(Blocked, &rdev->flags)) { | 2530 | if (test_bit(Blocked, &rdev->flags) || |
| 2531 | rdev->badblocks.unacked_exist) { | ||
| 2363 | len += sprintf(page+len, "%sblocked", sep); | 2532 | len += sprintf(page+len, "%sblocked", sep); |
| 2364 | sep = ","; | 2533 | sep = ","; |
| 2365 | } | 2534 | } |
| @@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
| 2368 | len += sprintf(page+len, "%sspare", sep); | 2537 | len += sprintf(page+len, "%sspare", sep); |
| 2369 | sep = ","; | 2538 | sep = ","; |
| 2370 | } | 2539 | } |
| 2540 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
| 2541 | len += sprintf(page+len, "%swrite_error", sep); | ||
| 2542 | sep = ","; | ||
| 2543 | } | ||
| 2371 | return len+sprintf(page+len, "\n"); | 2544 | return len+sprintf(page+len, "\n"); |
| 2372 | } | 2545 | } |
| 2373 | 2546 | ||
| @@ -2375,13 +2548,15 @@ static ssize_t | |||
| 2375 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | 2548 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) |
| 2376 | { | 2549 | { |
| 2377 | /* can write | 2550 | /* can write |
| 2378 | * faulty - simulates and error | 2551 | * faulty - simulates an error |
| 2379 | * remove - disconnects the device | 2552 | * remove - disconnects the device |
| 2380 | * writemostly - sets write_mostly | 2553 | * writemostly - sets write_mostly |
| 2381 | * -writemostly - clears write_mostly | 2554 | * -writemostly - clears write_mostly |
| 2382 | * blocked - sets the Blocked flag | 2555 | * blocked - sets the Blocked flags |
| 2383 | * -blocked - clears the Blocked flag | 2556 | * -blocked - clears the Blocked and possibly simulates an error |
| 2384 | * insync - sets Insync providing device isn't active | 2557 | * insync - sets Insync providing device isn't active |
| 2558 | * write_error - sets WriteErrorSeen | ||
| 2559 | * -write_error - clears WriteErrorSeen | ||
| 2385 | */ | 2560 | */ |
| 2386 | int err = -EINVAL; | 2561 | int err = -EINVAL; |
| 2387 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { | 2562 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
| @@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
| 2408 | set_bit(Blocked, &rdev->flags); | 2583 | set_bit(Blocked, &rdev->flags); |
| 2409 | err = 0; | 2584 | err = 0; |
| 2410 | } else if (cmd_match(buf, "-blocked")) { | 2585 | } else if (cmd_match(buf, "-blocked")) { |
| 2586 | if (!test_bit(Faulty, &rdev->flags) && | ||
| 2587 | test_bit(BlockedBadBlocks, &rdev->flags)) { | ||
| 2588 | /* metadata handler doesn't understand badblocks, | ||
| 2589 | * so we need to fail the device | ||
| 2590 | */ | ||
| 2591 | md_error(rdev->mddev, rdev); | ||
| 2592 | } | ||
| 2411 | clear_bit(Blocked, &rdev->flags); | 2593 | clear_bit(Blocked, &rdev->flags); |
| 2594 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
| 2412 | wake_up(&rdev->blocked_wait); | 2595 | wake_up(&rdev->blocked_wait); |
| 2413 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2596 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
| 2414 | md_wakeup_thread(rdev->mddev->thread); | 2597 | md_wakeup_thread(rdev->mddev->thread); |
| @@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
| 2417 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { | 2600 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
| 2418 | set_bit(In_sync, &rdev->flags); | 2601 | set_bit(In_sync, &rdev->flags); |
| 2419 | err = 0; | 2602 | err = 0; |
| 2603 | } else if (cmd_match(buf, "write_error")) { | ||
| 2604 | set_bit(WriteErrorSeen, &rdev->flags); | ||
| 2605 | err = 0; | ||
| 2606 | } else if (cmd_match(buf, "-write_error")) { | ||
| 2607 | clear_bit(WriteErrorSeen, &rdev->flags); | ||
| 2608 | err = 0; | ||
| 2420 | } | 2609 | } |
| 2421 | if (!err) | 2610 | if (!err) |
| 2422 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2611 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| @@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
| 2459 | { | 2648 | { |
| 2460 | char *e; | 2649 | char *e; |
| 2461 | int err; | 2650 | int err; |
| 2462 | char nm[20]; | ||
| 2463 | int slot = simple_strtoul(buf, &e, 10); | 2651 | int slot = simple_strtoul(buf, &e, 10); |
| 2464 | if (strncmp(buf, "none", 4)==0) | 2652 | if (strncmp(buf, "none", 4)==0) |
| 2465 | slot = -1; | 2653 | slot = -1; |
| @@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
| 2482 | hot_remove_disk(rdev->mddev, rdev->raid_disk); | 2670 | hot_remove_disk(rdev->mddev, rdev->raid_disk); |
| 2483 | if (err) | 2671 | if (err) |
| 2484 | return err; | 2672 | return err; |
| 2485 | sprintf(nm, "rd%d", rdev->raid_disk); | 2673 | sysfs_unlink_rdev(rdev->mddev, rdev); |
| 2486 | sysfs_remove_link(&rdev->mddev->kobj, nm); | ||
| 2487 | rdev->raid_disk = -1; | 2674 | rdev->raid_disk = -1; |
| 2488 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2675 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
| 2489 | md_wakeup_thread(rdev->mddev->thread); | 2676 | md_wakeup_thread(rdev->mddev->thread); |
| @@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
| 2522 | return err; | 2709 | return err; |
| 2523 | } else | 2710 | } else |
| 2524 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2711 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| 2525 | sprintf(nm, "rd%d", rdev->raid_disk); | 2712 | if (sysfs_link_rdev(rdev->mddev, rdev)) |
| 2526 | if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) | ||
| 2527 | /* failure here is OK */; | 2713 | /* failure here is OK */; |
| 2528 | /* don't wakeup anyone, leave that to userspace. */ | 2714 | /* don't wakeup anyone, leave that to userspace. */ |
| 2529 | } else { | 2715 | } else { |
| @@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le | |||
| 2712 | static struct rdev_sysfs_entry rdev_recovery_start = | 2898 | static struct rdev_sysfs_entry rdev_recovery_start = |
| 2713 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); | 2899 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); |
| 2714 | 2900 | ||
| 2901 | |||
| 2902 | static ssize_t | ||
| 2903 | badblocks_show(struct badblocks *bb, char *page, int unack); | ||
| 2904 | static ssize_t | ||
| 2905 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); | ||
| 2906 | |||
| 2907 | static ssize_t bb_show(mdk_rdev_t *rdev, char *page) | ||
| 2908 | { | ||
| 2909 | return badblocks_show(&rdev->badblocks, page, 0); | ||
| 2910 | } | ||
| 2911 | static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
| 2912 | { | ||
| 2913 | int rv = badblocks_store(&rdev->badblocks, page, len, 0); | ||
| 2914 | /* Maybe that ack was all we needed */ | ||
| 2915 | if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) | ||
| 2916 | wake_up(&rdev->blocked_wait); | ||
| 2917 | return rv; | ||
| 2918 | } | ||
| 2919 | static struct rdev_sysfs_entry rdev_bad_blocks = | ||
| 2920 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); | ||
| 2921 | |||
| 2922 | |||
| 2923 | static ssize_t ubb_show(mdk_rdev_t *rdev, char *page) | ||
| 2924 | { | ||
| 2925 | return badblocks_show(&rdev->badblocks, page, 1); | ||
| 2926 | } | ||
| 2927 | static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
| 2928 | { | ||
| 2929 | return badblocks_store(&rdev->badblocks, page, len, 1); | ||
| 2930 | } | ||
| 2931 | static struct rdev_sysfs_entry rdev_unack_bad_blocks = | ||
| 2932 | __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); | ||
| 2933 | |||
| 2715 | static struct attribute *rdev_default_attrs[] = { | 2934 | static struct attribute *rdev_default_attrs[] = { |
| 2716 | &rdev_state.attr, | 2935 | &rdev_state.attr, |
| 2717 | &rdev_errors.attr, | 2936 | &rdev_errors.attr, |
| @@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = { | |||
| 2719 | &rdev_offset.attr, | 2938 | &rdev_offset.attr, |
| 2720 | &rdev_size.attr, | 2939 | &rdev_size.attr, |
| 2721 | &rdev_recovery_start.attr, | 2940 | &rdev_recovery_start.attr, |
| 2941 | &rdev_bad_blocks.attr, | ||
| 2942 | &rdev_unack_bad_blocks.attr, | ||
| 2722 | NULL, | 2943 | NULL, |
| 2723 | }; | 2944 | }; |
| 2724 | static ssize_t | 2945 | static ssize_t |
| @@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = { | |||
| 2782 | .default_attrs = rdev_default_attrs, | 3003 | .default_attrs = rdev_default_attrs, |
| 2783 | }; | 3004 | }; |
| 2784 | 3005 | ||
| 2785 | void md_rdev_init(mdk_rdev_t *rdev) | 3006 | int md_rdev_init(mdk_rdev_t *rdev) |
| 2786 | { | 3007 | { |
| 2787 | rdev->desc_nr = -1; | 3008 | rdev->desc_nr = -1; |
| 2788 | rdev->saved_raid_disk = -1; | 3009 | rdev->saved_raid_disk = -1; |
| @@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev) | |||
| 2792 | rdev->sb_events = 0; | 3013 | rdev->sb_events = 0; |
| 2793 | rdev->last_read_error.tv_sec = 0; | 3014 | rdev->last_read_error.tv_sec = 0; |
| 2794 | rdev->last_read_error.tv_nsec = 0; | 3015 | rdev->last_read_error.tv_nsec = 0; |
| 3016 | rdev->sb_loaded = 0; | ||
| 3017 | rdev->bb_page = NULL; | ||
| 2795 | atomic_set(&rdev->nr_pending, 0); | 3018 | atomic_set(&rdev->nr_pending, 0); |
| 2796 | atomic_set(&rdev->read_errors, 0); | 3019 | atomic_set(&rdev->read_errors, 0); |
| 2797 | atomic_set(&rdev->corrected_errors, 0); | 3020 | atomic_set(&rdev->corrected_errors, 0); |
| 2798 | 3021 | ||
| 2799 | INIT_LIST_HEAD(&rdev->same_set); | 3022 | INIT_LIST_HEAD(&rdev->same_set); |
| 2800 | init_waitqueue_head(&rdev->blocked_wait); | 3023 | init_waitqueue_head(&rdev->blocked_wait); |
| 3024 | |||
| 3025 | /* Add space to store bad block list. | ||
| 3026 | * This reserves the space even on arrays where it cannot | ||
| 3027 | * be used - I wonder if that matters | ||
| 3028 | */ | ||
| 3029 | rdev->badblocks.count = 0; | ||
| 3030 | rdev->badblocks.shift = 0; | ||
| 3031 | rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 3032 | seqlock_init(&rdev->badblocks.lock); | ||
| 3033 | if (rdev->badblocks.page == NULL) | ||
| 3034 | return -ENOMEM; | ||
| 3035 | |||
| 3036 | return 0; | ||
| 2801 | } | 3037 | } |
| 2802 | EXPORT_SYMBOL_GPL(md_rdev_init); | 3038 | EXPORT_SYMBOL_GPL(md_rdev_init); |
| 2803 | /* | 3039 | /* |
| @@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
| 2823 | return ERR_PTR(-ENOMEM); | 3059 | return ERR_PTR(-ENOMEM); |
| 2824 | } | 3060 | } |
| 2825 | 3061 | ||
| 2826 | md_rdev_init(rdev); | 3062 | err = md_rdev_init(rdev); |
| 2827 | if ((err = alloc_disk_sb(rdev))) | 3063 | if (err) |
| 3064 | goto abort_free; | ||
| 3065 | err = alloc_disk_sb(rdev); | ||
| 3066 | if (err) | ||
| 2828 | goto abort_free; | 3067 | goto abort_free; |
| 2829 | 3068 | ||
| 2830 | err = lock_rdev(rdev, newdev, super_format == -2); | 3069 | err = lock_rdev(rdev, newdev, super_format == -2); |
| @@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
| 2860 | goto abort_free; | 3099 | goto abort_free; |
| 2861 | } | 3100 | } |
| 2862 | } | 3101 | } |
| 3102 | if (super_format == -1) | ||
| 3103 | /* hot-add for 0.90, or non-persistent: so no badblocks */ | ||
| 3104 | rdev->badblocks.shift = -1; | ||
| 2863 | 3105 | ||
| 2864 | return rdev; | 3106 | return rdev; |
| 2865 | 3107 | ||
| 2866 | abort_free: | 3108 | abort_free: |
| 2867 | if (rdev->sb_page) { | 3109 | if (rdev->bdev) |
| 2868 | if (rdev->bdev) | 3110 | unlock_rdev(rdev); |
| 2869 | unlock_rdev(rdev); | 3111 | free_disk_sb(rdev); |
| 2870 | free_disk_sb(rdev); | 3112 | kfree(rdev->badblocks.page); |
| 2871 | } | ||
| 2872 | kfree(rdev); | 3113 | kfree(rdev); |
| 2873 | return ERR_PTR(err); | 3114 | return ERR_PTR(err); |
| 2874 | } | 3115 | } |
| @@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3149 | } | 3390 | } |
| 3150 | 3391 | ||
| 3151 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3392 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 3152 | char nm[20]; | ||
| 3153 | if (rdev->raid_disk < 0) | 3393 | if (rdev->raid_disk < 0) |
| 3154 | continue; | 3394 | continue; |
| 3155 | if (rdev->new_raid_disk >= mddev->raid_disks) | 3395 | if (rdev->new_raid_disk >= mddev->raid_disks) |
| 3156 | rdev->new_raid_disk = -1; | 3396 | rdev->new_raid_disk = -1; |
| 3157 | if (rdev->new_raid_disk == rdev->raid_disk) | 3397 | if (rdev->new_raid_disk == rdev->raid_disk) |
| 3158 | continue; | 3398 | continue; |
| 3159 | sprintf(nm, "rd%d", rdev->raid_disk); | 3399 | sysfs_unlink_rdev(mddev, rdev); |
| 3160 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 3161 | } | 3400 | } |
| 3162 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3401 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 3163 | if (rdev->raid_disk < 0) | 3402 | if (rdev->raid_disk < 0) |
| @@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3168 | if (rdev->raid_disk < 0) | 3407 | if (rdev->raid_disk < 0) |
| 3169 | clear_bit(In_sync, &rdev->flags); | 3408 | clear_bit(In_sync, &rdev->flags); |
| 3170 | else { | 3409 | else { |
| 3171 | char nm[20]; | 3410 | if (sysfs_link_rdev(mddev, rdev)) |
| 3172 | sprintf(nm, "rd%d", rdev->raid_disk); | 3411 | printk(KERN_WARNING "md: cannot register rd%d" |
| 3173 | if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | 3412 | " for %s after level change\n", |
| 3174 | printk("md: cannot register %s for %s after level change\n", | 3413 | rdev->raid_disk, mdname(mddev)); |
| 3175 | nm, mdname(mddev)); | ||
| 3176 | } | 3414 | } |
| 3177 | } | 3415 | } |
| 3178 | 3416 | ||
| @@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev) | |||
| 4504 | } | 4742 | } |
| 4505 | 4743 | ||
| 4506 | if (mddev->bio_set == NULL) | 4744 | if (mddev->bio_set == NULL) |
| 4507 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); | 4745 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, |
| 4746 | sizeof(mddev_t *)); | ||
| 4508 | 4747 | ||
| 4509 | spin_lock(&pers_lock); | 4748 | spin_lock(&pers_lock); |
| 4510 | pers = find_pers(mddev->level, mddev->clevel); | 4749 | pers = find_pers(mddev->level, mddev->clevel); |
| @@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev) | |||
| 4621 | smp_wmb(); | 4860 | smp_wmb(); |
| 4622 | mddev->ready = 1; | 4861 | mddev->ready = 1; |
| 4623 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4862 | list_for_each_entry(rdev, &mddev->disks, same_set) |
| 4624 | if (rdev->raid_disk >= 0) { | 4863 | if (rdev->raid_disk >= 0) |
| 4625 | char nm[20]; | 4864 | if (sysfs_link_rdev(mddev, rdev)) |
| 4626 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 4627 | if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | ||
| 4628 | /* failure here is OK */; | 4865 | /* failure here is OK */; |
| 4629 | } | ||
| 4630 | 4866 | ||
| 4631 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4867 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 4632 | 4868 | ||
| @@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
| 4854 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 5090 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
| 4855 | 5091 | ||
| 4856 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5092 | list_for_each_entry(rdev, &mddev->disks, same_set) |
| 4857 | if (rdev->raid_disk >= 0) { | 5093 | if (rdev->raid_disk >= 0) |
| 4858 | char nm[20]; | 5094 | sysfs_unlink_rdev(mddev, rdev); |
| 4859 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 4860 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 4861 | } | ||
| 4862 | 5095 | ||
| 4863 | set_capacity(disk, 0); | 5096 | set_capacity(disk, 0); |
| 4864 | mutex_unlock(&mddev->open_mutex); | 5097 | mutex_unlock(&mddev->open_mutex); |
| @@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 6198 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 6431 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
| 6199 | return; | 6432 | return; |
| 6200 | 6433 | ||
| 6201 | if (mddev->external) | 6434 | if (!mddev->pers || !mddev->pers->error_handler) |
| 6202 | set_bit(Blocked, &rdev->flags); | ||
| 6203 | /* | ||
| 6204 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | ||
| 6205 | mdname(mddev), | ||
| 6206 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | ||
| 6207 | __builtin_return_address(0),__builtin_return_address(1), | ||
| 6208 | __builtin_return_address(2),__builtin_return_address(3)); | ||
| 6209 | */ | ||
| 6210 | if (!mddev->pers) | ||
| 6211 | return; | ||
| 6212 | if (!mddev->pers->error_handler) | ||
| 6213 | return; | 6435 | return; |
| 6214 | mddev->pers->error_handler(mddev,rdev); | 6436 | mddev->pers->error_handler(mddev,rdev); |
| 6215 | if (mddev->degraded) | 6437 | if (mddev->degraded) |
| @@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev) | |||
| 6933 | atomic_add(sectors, &mddev->recovery_active); | 7155 | atomic_add(sectors, &mddev->recovery_active); |
| 6934 | } | 7156 | } |
| 6935 | 7157 | ||
| 7158 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
| 7159 | break; | ||
| 7160 | |||
| 6936 | j += sectors; | 7161 | j += sectors; |
| 6937 | if (j>1) mddev->curr_resync = j; | 7162 | if (j>1) mddev->curr_resync = j; |
| 6938 | mddev->curr_mark_cnt = io_sectors; | 7163 | mddev->curr_mark_cnt = io_sectors; |
| 6939 | if (last_check == 0) | 7164 | if (last_check == 0) |
| 6940 | /* this is the earliers that rebuilt will be | 7165 | /* this is the earliest that rebuild will be |
| 6941 | * visible in /proc/mdstat | 7166 | * visible in /proc/mdstat |
| 6942 | */ | 7167 | */ |
| 6943 | md_new_event(mddev); | 7168 | md_new_event(mddev); |
| @@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev) | |||
| 6946 | continue; | 7171 | continue; |
| 6947 | 7172 | ||
| 6948 | last_check = io_sectors; | 7173 | last_check = io_sectors; |
| 6949 | |||
| 6950 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
| 6951 | break; | ||
| 6952 | |||
| 6953 | repeat: | 7174 | repeat: |
| 6954 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { | 7175 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { |
| 6955 | /* step marks */ | 7176 | /* step marks */ |
| @@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
| 7067 | atomic_read(&rdev->nr_pending)==0) { | 7288 | atomic_read(&rdev->nr_pending)==0) { |
| 7068 | if (mddev->pers->hot_remove_disk( | 7289 | if (mddev->pers->hot_remove_disk( |
| 7069 | mddev, rdev->raid_disk)==0) { | 7290 | mddev, rdev->raid_disk)==0) { |
| 7070 | char nm[20]; | 7291 | sysfs_unlink_rdev(mddev, rdev); |
| 7071 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
| 7072 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 7073 | rdev->raid_disk = -1; | 7292 | rdev->raid_disk = -1; |
| 7074 | } | 7293 | } |
| 7075 | } | 7294 | } |
| 7076 | 7295 | ||
| 7077 | if (mddev->degraded && !mddev->recovery_disabled) { | 7296 | if (mddev->degraded) { |
| 7078 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7297 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 7079 | if (rdev->raid_disk >= 0 && | 7298 | if (rdev->raid_disk >= 0 && |
| 7080 | !test_bit(In_sync, &rdev->flags) && | 7299 | !test_bit(In_sync, &rdev->flags) && |
| 7081 | !test_bit(Faulty, &rdev->flags) && | 7300 | !test_bit(Faulty, &rdev->flags)) |
| 7082 | !test_bit(Blocked, &rdev->flags)) | ||
| 7083 | spares++; | 7301 | spares++; |
| 7084 | if (rdev->raid_disk < 0 | 7302 | if (rdev->raid_disk < 0 |
| 7085 | && !test_bit(Faulty, &rdev->flags)) { | 7303 | && !test_bit(Faulty, &rdev->flags)) { |
| 7086 | rdev->recovery_offset = 0; | 7304 | rdev->recovery_offset = 0; |
| 7087 | if (mddev->pers-> | 7305 | if (mddev->pers-> |
| 7088 | hot_add_disk(mddev, rdev) == 0) { | 7306 | hot_add_disk(mddev, rdev) == 0) { |
| 7089 | char nm[20]; | 7307 | if (sysfs_link_rdev(mddev, rdev)) |
| 7090 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 7091 | if (sysfs_create_link(&mddev->kobj, | ||
| 7092 | &rdev->kobj, nm)) | ||
| 7093 | /* failure here is OK */; | 7308 | /* failure here is OK */; |
| 7094 | spares++; | 7309 | spares++; |
| 7095 | md_new_event(mddev); | 7310 | md_new_event(mddev); |
| @@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev) | |||
| 7138 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 7353 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 7139 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 7354 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
| 7140 | md_new_event(mddev); | 7355 | md_new_event(mddev); |
| 7356 | if (mddev->event_work.func) | ||
| 7357 | queue_work(md_misc_wq, &mddev->event_work); | ||
| 7141 | } | 7358 | } |
| 7142 | 7359 | ||
| 7143 | /* | 7360 | /* |
| @@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev) | |||
| 7170 | if (mddev->bitmap) | 7387 | if (mddev->bitmap) |
| 7171 | bitmap_daemon_work(mddev); | 7388 | bitmap_daemon_work(mddev); |
| 7172 | 7389 | ||
| 7173 | if (mddev->ro) | ||
| 7174 | return; | ||
| 7175 | |||
| 7176 | if (signal_pending(current)) { | 7390 | if (signal_pending(current)) { |
| 7177 | if (mddev->pers->sync_request && !mddev->external) { | 7391 | if (mddev->pers->sync_request && !mddev->external) { |
| 7178 | printk(KERN_INFO "md: %s in immediate safe mode\n", | 7392 | printk(KERN_INFO "md: %s in immediate safe mode\n", |
| @@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev) | |||
| 7209 | atomic_read(&rdev->nr_pending)==0) { | 7423 | atomic_read(&rdev->nr_pending)==0) { |
| 7210 | if (mddev->pers->hot_remove_disk( | 7424 | if (mddev->pers->hot_remove_disk( |
| 7211 | mddev, rdev->raid_disk)==0) { | 7425 | mddev, rdev->raid_disk)==0) { |
| 7212 | char nm[20]; | 7426 | sysfs_unlink_rdev(mddev, rdev); |
| 7213 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
| 7214 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 7215 | rdev->raid_disk = -1; | 7427 | rdev->raid_disk = -1; |
| 7216 | } | 7428 | } |
| 7217 | } | 7429 | } |
| @@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
| 7331 | { | 7543 | { |
| 7332 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 7544 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| 7333 | wait_event_timeout(rdev->blocked_wait, | 7545 | wait_event_timeout(rdev->blocked_wait, |
| 7334 | !test_bit(Blocked, &rdev->flags), | 7546 | !test_bit(Blocked, &rdev->flags) && |
| 7547 | !test_bit(BlockedBadBlocks, &rdev->flags), | ||
| 7335 | msecs_to_jiffies(5000)); | 7548 | msecs_to_jiffies(5000)); |
| 7336 | rdev_dec_pending(rdev, mddev); | 7549 | rdev_dec_pending(rdev, mddev); |
| 7337 | } | 7550 | } |
| 7338 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); | 7551 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
| 7339 | 7552 | ||
| 7553 | |||
| 7554 | /* Bad block management. | ||
| 7555 | * We can record which blocks on each device are 'bad' and so just | ||
| 7556 | * fail those blocks, or that stripe, rather than the whole device. | ||
| 7557 | * Entries in the bad-block table are 64bits wide. This comprises: | ||
| 7558 | * Length of bad-range, in sectors: 0-511 for lengths 1-512 | ||
| 7559 | * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) | ||
| 7560 | * A 'shift' can be set so that larger blocks are tracked and | ||
| 7561 | * consequently larger devices can be covered. | ||
| 7562 | * 'Acknowledged' flag - 1 bit. - the most significant bit. | ||
| 7563 | * | ||
| 7564 | * Locking of the bad-block table uses a seqlock so md_is_badblock | ||
| 7565 | * might need to retry if it is very unlucky. | ||
| 7566 | * We will sometimes want to check for bad blocks in a bi_end_io function, | ||
| 7567 | * so we use the write_seqlock_irq variant. | ||
| 7568 | * | ||
| 7569 | * When looking for a bad block we specify a range and want to | ||
| 7570 | * know if any block in the range is bad. So we binary-search | ||
| 7571 | * to the last range that starts at-or-before the given endpoint, | ||
| 7572 | * (or "before the sector after the target range") | ||
| 7573 | * then see if it ends after the given start. | ||
| 7574 | * We return | ||
| 7575 | * 0 if there are no known bad blocks in the range | ||
| 7576 | * 1 if there are known bad block which are all acknowledged | ||
| 7576 | * 1 if there are known bad blocks which are all acknowledged | ||
| 7578 | * plus the start/length of the first bad section we overlap. | ||
| 7579 | */ | ||
| 7580 | int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, | ||
| 7581 | sector_t *first_bad, int *bad_sectors) | ||
| 7582 | { | ||
| 7583 | int hi; | ||
| 7584 | int lo = 0; | ||
| 7585 | u64 *p = bb->page; | ||
| 7586 | int rv = 0; | ||
| 7587 | sector_t target = s + sectors; | ||
| 7588 | unsigned seq; | ||
| 7589 | |||
| 7590 | if (bb->shift > 0) { | ||
| 7591 | /* round the start down, and the end up */ | ||
| 7592 | s >>= bb->shift; | ||
| 7593 | target += (1<<bb->shift) - 1; | ||
| 7594 | target >>= bb->shift; | ||
| 7595 | sectors = target - s; | ||
| 7596 | } | ||
| 7597 | /* 'target' is now the first block after the bad range */ | ||
| 7598 | |||
| 7599 | retry: | ||
| 7600 | seq = read_seqbegin(&bb->lock); | ||
| 7601 | |||
| 7602 | hi = bb->count; | ||
| 7603 | |||
| 7604 | /* Binary search between lo and hi for 'target' | ||
| 7605 | * i.e. for the last range that starts before 'target' | ||
| 7606 | */ | ||
| 7607 | /* INVARIANT: ranges before 'lo' and at-or-after 'hi' | ||
| 7608 | * are known not to be the last range before target. | ||
| 7609 | * VARIANT: hi-lo is the number of possible | ||
| 7610 | * ranges, and decreases until it reaches 1 | ||
| 7611 | */ | ||
| 7612 | while (hi - lo > 1) { | ||
| 7613 | int mid = (lo + hi) / 2; | ||
| 7614 | sector_t a = BB_OFFSET(p[mid]); | ||
| 7615 | if (a < target) | ||
| 7616 | /* This could still be the one, earlier ranges | ||
| 7617 | * could not. */ | ||
| 7618 | lo = mid; | ||
| 7619 | else | ||
| 7620 | /* This and later ranges are definitely out. */ | ||
| 7621 | hi = mid; | ||
| 7622 | } | ||
| 7623 | /* 'lo' might be the last that started before target, but 'hi' isn't */ | ||
| 7624 | if (hi > lo) { | ||
| 7625 | /* need to check all ranges that end after 's' to see if | ||
| 7626 | * any are unacknowledged. | ||
| 7627 | */ | ||
| 7628 | while (lo >= 0 && | ||
| 7629 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
| 7630 | if (BB_OFFSET(p[lo]) < target) { | ||
| 7631 | /* starts before the end, and finishes after | ||
| 7632 | * the start, so they must overlap | ||
| 7633 | */ | ||
| 7634 | if (rv != -1 && BB_ACK(p[lo])) | ||
| 7635 | rv = 1; | ||
| 7636 | else | ||
| 7637 | rv = -1; | ||
| 7638 | *first_bad = BB_OFFSET(p[lo]); | ||
| 7639 | *bad_sectors = BB_LEN(p[lo]); | ||
| 7640 | } | ||
| 7641 | lo--; | ||
| 7642 | } | ||
| 7643 | } | ||
| 7644 | |||
| 7645 | if (read_seqretry(&bb->lock, seq)) | ||
| 7646 | goto retry; | ||
| 7647 | |||
| 7648 | return rv; | ||
| 7649 | } | ||
| 7650 | EXPORT_SYMBOL_GPL(md_is_badblock); | ||
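For orientation, a minimal sketch of how a consumer of md_is_badblock() might act on its three return values; the wrapper function and the decisions taken are assumptions for illustration, not part of the patch:

	/* Hedged sketch: decide whether a read of 'sectors' starting at 's'
	 * is safe on a device with bad-block table 'bb'.  Only
	 * md_is_badblock() comes from the patch; the rest is illustrative.
	 */
	static int can_read_range(struct badblocks *bb, sector_t s, int sectors)
	{
		sector_t first_bad;
		int bad_sectors;

		switch (md_is_badblock(bb, s, sectors, &first_bad, &bad_sectors)) {
		case 0:
			return 1;	/* no known bad blocks in the range */
		case 1:
			return 0;	/* overlaps acknowledged bad blocks; at most
					 * the sectors before 'first_bad' are usable */
		default:		/* -1: unacknowledged bad blocks */
			return 0;	/* also unsafe, and a metadata update is pending */
		}
	}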
| 7651 | |||
| 7652 | /* | ||
| 7653 | * Add a range of bad blocks to the table. | ||
| 7654 | * This might extend the table, or might contract it | ||
| 7655 | * if two adjacent ranges can be merged. | ||
| 7656 | * We binary-search to find the 'insertion' point, then | ||
| 7657 | * decide how best to handle it. | ||
| 7658 | */ | ||
| 7659 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
| 7660 | int acknowledged) | ||
| 7661 | { | ||
| 7662 | u64 *p; | ||
| 7663 | int lo, hi; | ||
| 7664 | int rv = 1; | ||
| 7665 | |||
| 7666 | if (bb->shift < 0) | ||
| 7667 | /* badblocks are disabled */ | ||
| 7668 | return 0; | ||
| 7669 | |||
| 7670 | if (bb->shift) { | ||
| 7671 | /* round the start down, and the end up */ | ||
| 7672 | sector_t next = s + sectors; | ||
| 7673 | s >>= bb->shift; | ||
| 7674 | next += (1<<bb->shift) - 1; | ||
| 7675 | next >>= bb->shift; | ||
| 7676 | sectors = next - s; | ||
| 7677 | } | ||
| 7678 | |||
| 7679 | write_seqlock_irq(&bb->lock); | ||
| 7680 | |||
| 7681 | p = bb->page; | ||
| 7682 | lo = 0; | ||
| 7683 | hi = bb->count; | ||
| 7684 | /* Find the last range that starts at-or-before 's' */ | ||
| 7685 | while (hi - lo > 1) { | ||
| 7686 | int mid = (lo + hi) / 2; | ||
| 7687 | sector_t a = BB_OFFSET(p[mid]); | ||
| 7688 | if (a <= s) | ||
| 7689 | lo = mid; | ||
| 7690 | else | ||
| 7691 | hi = mid; | ||
| 7692 | } | ||
| 7693 | if (hi > lo && BB_OFFSET(p[lo]) > s) | ||
| 7694 | hi = lo; | ||
| 7695 | |||
| 7696 | if (hi > lo) { | ||
| 7697 | /* we found a range that might merge with the start | ||
| 7698 | * of our new range | ||
| 7699 | */ | ||
| 7700 | sector_t a = BB_OFFSET(p[lo]); | ||
| 7701 | sector_t e = a + BB_LEN(p[lo]); | ||
| 7702 | int ack = BB_ACK(p[lo]); | ||
| 7703 | if (e >= s) { | ||
| 7704 | /* Yes, we can merge with a previous range */ | ||
| 7705 | if (s == a && s + sectors >= e) | ||
| 7706 | /* new range covers old */ | ||
| 7707 | ack = acknowledged; | ||
| 7708 | else | ||
| 7709 | ack = ack && acknowledged; | ||
| 7710 | |||
| 7711 | if (e < s + sectors) | ||
| 7712 | e = s + sectors; | ||
| 7713 | if (e - a <= BB_MAX_LEN) { | ||
| 7714 | p[lo] = BB_MAKE(a, e-a, ack); | ||
| 7715 | s = e; | ||
| 7716 | } else { | ||
| 7717 | /* does not all fit in one range, | ||
| 7718 | * make p[lo] maximal | ||
| 7719 | */ | ||
| 7720 | if (BB_LEN(p[lo]) != BB_MAX_LEN) | ||
| 7721 | p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
| 7722 | s = a + BB_MAX_LEN; | ||
| 7723 | } | ||
| 7724 | sectors = e - s; | ||
| 7725 | } | ||
| 7726 | } | ||
| 7727 | if (sectors && hi < bb->count) { | ||
| 7728 | /* 'hi' points to the first range that starts after 's'. | ||
| 7729 | * Maybe we can merge with the start of that range */ | ||
| 7730 | sector_t a = BB_OFFSET(p[hi]); | ||
| 7731 | sector_t e = a + BB_LEN(p[hi]); | ||
| 7732 | int ack = BB_ACK(p[hi]); | ||
| 7733 | if (a <= s + sectors) { | ||
| 7734 | /* merging is possible */ | ||
| 7735 | if (e <= s + sectors) { | ||
| 7736 | /* full overlap */ | ||
| 7737 | e = s + sectors; | ||
| 7738 | ack = acknowledged; | ||
| 7739 | } else | ||
| 7740 | ack = ack && acknowledged; | ||
| 7741 | |||
| 7742 | a = s; | ||
| 7743 | if (e - a <= BB_MAX_LEN) { | ||
| 7744 | p[hi] = BB_MAKE(a, e-a, ack); | ||
| 7745 | s = e; | ||
| 7746 | } else { | ||
| 7747 | p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
| 7748 | s = a + BB_MAX_LEN; | ||
| 7749 | } | ||
| 7750 | sectors = e - s; | ||
| 7751 | lo = hi; | ||
| 7752 | hi++; | ||
| 7753 | } | ||
| 7754 | } | ||
| 7755 | if (sectors == 0 && hi < bb->count) { | ||
| 7756 | /* we might be able to combine lo and hi */ | ||
| 7757 | /* Note: 's' is at the end of 'lo' */ | ||
| 7758 | sector_t a = BB_OFFSET(p[hi]); | ||
| 7759 | int lolen = BB_LEN(p[lo]); | ||
| 7760 | int hilen = BB_LEN(p[hi]); | ||
| 7761 | int newlen = lolen + hilen - (s - a); | ||
| 7762 | if (s >= a && newlen < BB_MAX_LEN) { | ||
| 7763 | /* yes, we can combine them */ | ||
| 7764 | int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); | ||
| 7765 | p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); | ||
| 7766 | memmove(p + hi, p + hi + 1, | ||
| 7767 | (bb->count - hi - 1) * 8); | ||
| 7768 | bb->count--; | ||
| 7769 | } | ||
| 7770 | } | ||
| 7771 | while (sectors) { | ||
| 7772 | /* didn't merge (it all). | ||
| 7773 | * Need to add a range just before 'hi' */ | ||
| 7774 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
| 7775 | /* No room for more */ | ||
| 7776 | rv = 0; | ||
| 7777 | break; | ||
| 7778 | } else { | ||
| 7779 | int this_sectors = sectors; | ||
| 7780 | memmove(p + hi + 1, p + hi, | ||
| 7781 | (bb->count - hi) * 8); | ||
| 7782 | bb->count++; | ||
| 7783 | |||
| 7784 | if (this_sectors > BB_MAX_LEN) | ||
| 7785 | this_sectors = BB_MAX_LEN; | ||
| 7786 | p[hi] = BB_MAKE(s, this_sectors, acknowledged); | ||
| 7787 | sectors -= this_sectors; | ||
| 7788 | s += this_sectors; | ||
| 7789 | } | ||
| 7790 | } | ||
| 7791 | |||
| 7792 | bb->changed = 1; | ||
| 7793 | if (!acknowledged) | ||
| 7794 | bb->unacked_exist = 1; | ||
| 7795 | write_sequnlock_irq(&bb->lock); | ||
| 7796 | |||
| 7797 | return rv; | ||
| 7798 | } | ||
| 7799 | |||
| 7800 | int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
| 7801 | int acknowledged) | ||
| 7802 | { | ||
| 7803 | int rv = md_set_badblocks(&rdev->badblocks, | ||
| 7804 | s + rdev->data_offset, sectors, acknowledged); | ||
| 7805 | if (rv) { | ||
| 7806 | /* Make sure they get written out promptly */ | ||
| 7807 | set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); | ||
| 7808 | md_wakeup_thread(rdev->mddev->thread); | ||
| 7809 | } | ||
| 7810 | return rv; | ||
| 7811 | } | ||
| 7812 | EXPORT_SYMBOL_GPL(rdev_set_badblocks); | ||
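A plausible call site, sketched here for context rather than taken from this hunk: a RAID personality that sees a write error tries to confine it to a bad-block record, and only fails the whole device when the table is full (rdev_set_badblocks() returning 0). The function name is invented:

	static void note_write_failure(mddev_t *mddev, mdk_rdev_t *rdev,
				       sector_t sector, int sectors)
	{
		/* record an unacknowledged bad range; the metadata handler
		 * acknowledges it later */
		if (!rdev_set_badblocks(rdev, sector, sectors, 0))
			md_error(mddev, rdev);	/* table full: fail the device */
	}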
| 7813 | |||
| 7814 | /* | ||
| 7815 | * Remove a range of bad blocks from the table. | ||
| 7816 | * This may involve extending the table if we split a region, | ||
| 7817 | * but it must not fail. So if the table becomes full, we just | ||
| 7818 | * drop the remove request. | ||
| 7819 | */ | ||
| 7820 | static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) | ||
| 7821 | { | ||
| 7822 | u64 *p; | ||
| 7823 | int lo, hi; | ||
| 7824 | sector_t target = s + sectors; | ||
| 7825 | int rv = 0; | ||
| 7826 | |||
| 7827 | if (bb->shift > 0) { | ||
| 7828 | /* When clearing we round the start up and the end down. | ||
| 7829 | * This should not matter as the shift should align with | ||
| 7830 | * the block size and no rounding should ever be needed. | ||
| 7831 | * However it is better to think a block is bad when it | ||
| 7832 | * isn't than to think a block is not bad when it is. | ||
| 7833 | */ | ||
| 7834 | s += (1<<bb->shift) - 1; | ||
| 7835 | s >>= bb->shift; | ||
| 7836 | target >>= bb->shift; | ||
| 7837 | sectors = target - s; | ||
| 7838 | } | ||
| 7839 | |||
| 7840 | write_seqlock_irq(&bb->lock); | ||
| 7841 | |||
| 7842 | p = bb->page; | ||
| 7843 | lo = 0; | ||
| 7844 | hi = bb->count; | ||
| 7845 | /* Find the last range that starts before 'target' */ | ||
| 7846 | while (hi - lo > 1) { | ||
| 7847 | int mid = (lo + hi) / 2; | ||
| 7848 | sector_t a = BB_OFFSET(p[mid]); | ||
| 7849 | if (a < target) | ||
| 7850 | lo = mid; | ||
| 7851 | else | ||
| 7852 | hi = mid; | ||
| 7853 | } | ||
| 7854 | if (hi > lo) { | ||
| 7855 | /* p[lo] is the last range that could overlap the | ||
| 7856 | * current range. Earlier ranges could also overlap, | ||
| 7857 | * but only this one can overlap the end of the range. | ||
| 7858 | */ | ||
| 7859 | if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { | ||
| 7860 | /* Partial overlap, leave the tail of this range */ | ||
| 7861 | int ack = BB_ACK(p[lo]); | ||
| 7862 | sector_t a = BB_OFFSET(p[lo]); | ||
| 7863 | sector_t end = a + BB_LEN(p[lo]); | ||
| 7864 | |||
| 7865 | if (a < s) { | ||
| 7866 | /* we need to split this range */ | ||
| 7867 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
| 7868 | rv = 0; | ||
| 7869 | goto out; | ||
| 7870 | } | ||
| 7871 | memmove(p+lo+1, p+lo, (bb->count - lo) * 8); | ||
| 7872 | bb->count++; | ||
| 7873 | p[lo] = BB_MAKE(a, s-a, ack); | ||
| 7874 | lo++; | ||
| 7875 | } | ||
| 7876 | p[lo] = BB_MAKE(target, end - target, ack); | ||
| 7877 | /* there is no longer an overlap */ | ||
| 7878 | hi = lo; | ||
| 7879 | lo--; | ||
| 7880 | } | ||
| 7881 | while (lo >= 0 && | ||
| 7882 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
| 7883 | /* This range does overlap */ | ||
| 7884 | if (BB_OFFSET(p[lo]) < s) { | ||
| 7885 | /* Keep the early parts of this range. */ | ||
| 7886 | int ack = BB_ACK(p[lo]); | ||
| 7887 | sector_t start = BB_OFFSET(p[lo]); | ||
| 7888 | p[lo] = BB_MAKE(start, s - start, ack); | ||
| 7889 | /* now lo doesn't overlap, so.. */ | ||
| 7890 | break; | ||
| 7891 | } | ||
| 7892 | lo--; | ||
| 7893 | } | ||
| 7894 | /* 'lo' is strictly before, 'hi' is strictly after, | ||
| 7895 | * anything between needs to be discarded | ||
| 7896 | */ | ||
| 7897 | if (hi - lo > 1) { | ||
| 7898 | memmove(p+lo+1, p+hi, (bb->count - hi) * 8); | ||
| 7899 | bb->count -= (hi - lo - 1); | ||
| 7900 | } | ||
| 7901 | } | ||
| 7902 | |||
| 7903 | bb->changed = 1; | ||
| 7904 | out: | ||
| 7905 | write_sequnlock_irq(&bb->lock); | ||
| 7906 | return rv; | ||
| 7907 | } | ||
| 7908 | |||
| 7909 | int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors) | ||
| 7910 | { | ||
| 7911 | return md_clear_badblocks(&rdev->badblocks, | ||
| 7912 | s + rdev->data_offset, | ||
| 7913 | sectors); | ||
| 7914 | } | ||
| 7915 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | ||
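As a quick illustration of why the clear path may need to grow the table: clearing the middle of a recorded range leaves two pieces behind. The sector numbers below are arbitrary and the snippet is a sketch, not code from the patch:

	/* after this, the table holds the single acknowledged range [1000, 1064) */
	rdev_set_badblocks(rdev, 1000, 64, 1);

	/* clearing the middle splits it: [1000, 1016) and [1032, 1064) remain */
	rdev_clear_badblocks(rdev, 1016, 16);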
| 7916 | |||
| 7917 | /* | ||
| 7918 | * Acknowledge all bad blocks in a list. | ||
| 7919 | * This only succeeds if ->changed is clear. It is used by | ||
| 7920 | * in-kernel metadata updates | ||
| 7921 | */ | ||
| 7922 | void md_ack_all_badblocks(struct badblocks *bb) | ||
| 7923 | { | ||
| 7924 | if (bb->page == NULL || bb->changed) | ||
| 7925 | /* no point even trying */ | ||
| 7926 | return; | ||
| 7927 | write_seqlock_irq(&bb->lock); | ||
| 7928 | |||
| 7929 | if (bb->changed == 0) { | ||
| 7930 | u64 *p = bb->page; | ||
| 7931 | int i; | ||
| 7932 | for (i = 0; i < bb->count ; i++) { | ||
| 7933 | if (!BB_ACK(p[i])) { | ||
| 7934 | sector_t start = BB_OFFSET(p[i]); | ||
| 7935 | int len = BB_LEN(p[i]); | ||
| 7936 | p[i] = BB_MAKE(start, len, 1); | ||
| 7937 | } | ||
| 7938 | } | ||
| 7939 | bb->unacked_exist = 0; | ||
| 7940 | } | ||
| 7941 | write_sequnlock_irq(&bb->lock); | ||
| 7942 | } | ||
| 7943 | EXPORT_SYMBOL_GPL(md_ack_all_badblocks); | ||
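The expected caller, sketched under the assumption that it runs once the superblock carrying the bad-block list has reached disk; the surrounding function is invented, only md_ack_all_badblocks() and sysfs_notify_dirent_safe() are real:

	static void bad_blocks_written(mdk_rdev_t *rdev)
	{
		/* everything in the on-disk list is now safe to acknowledge */
		md_ack_all_badblocks(&rdev->badblocks);
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	}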
| 7944 | |||
| 7945 | /* sysfs access to bad-blocks list. | ||
| 7946 | * We present two files. | ||
| 7947 | * 'bad-blocks' lists sector numbers and lengths of ranges that | ||
| 7948 | * are recorded as bad. The list is truncated to fit within | ||
| 7949 | * the one-page limit of sysfs. | ||
| 7950 | * Writing "sector length" to this file adds an acknowledged | ||
| 7951 | * bad block to the list. | ||
| 7952 | * 'unacknowledged-bad-blocks' lists bad blocks that have not yet | ||
| 7953 | * been acknowledged. Writing to this file adds bad blocks | ||
| 7954 | * without acknowledging them. This is largely for testing. | ||
| 7955 | */ | ||
| 7956 | |||
| 7957 | static ssize_t | ||
| 7958 | badblocks_show(struct badblocks *bb, char *page, int unack) | ||
| 7959 | { | ||
| 7960 | size_t len; | ||
| 7961 | int i; | ||
| 7962 | u64 *p = bb->page; | ||
| 7963 | unsigned seq; | ||
| 7964 | |||
| 7965 | if (bb->shift < 0) | ||
| 7966 | return 0; | ||
| 7967 | |||
| 7968 | retry: | ||
| 7969 | seq = read_seqbegin(&bb->lock); | ||
| 7970 | |||
| 7971 | len = 0; | ||
| 7972 | i = 0; | ||
| 7973 | |||
| 7974 | while (len < PAGE_SIZE && i < bb->count) { | ||
| 7975 | sector_t s = BB_OFFSET(p[i]); | ||
| 7976 | unsigned int length = BB_LEN(p[i]); | ||
| 7977 | int ack = BB_ACK(p[i]); | ||
| 7978 | i++; | ||
| 7979 | |||
| 7980 | if (unack && ack) | ||
| 7981 | continue; | ||
| 7982 | |||
| 7983 | len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", | ||
| 7984 | (unsigned long long)s << bb->shift, | ||
| 7985 | length << bb->shift); | ||
| 7986 | } | ||
| 7987 | if (unack && len == 0) | ||
| 7988 | bb->unacked_exist = 0; | ||
| 7989 | |||
| 7990 | if (read_seqretry(&bb->lock, seq)) | ||
| 7991 | goto retry; | ||
| 7992 | |||
| 7993 | return len; | ||
| 7994 | } | ||
| 7995 | |||
| 7996 | #define DO_DEBUG 1 | ||
| 7997 | |||
| 7998 | static ssize_t | ||
| 7999 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) | ||
| 8000 | { | ||
| 8001 | unsigned long long sector; | ||
| 8002 | int length; | ||
| 8003 | char newline; | ||
| 8004 | #ifdef DO_DEBUG | ||
| 8005 | /* Allow clearing via sysfs *only* for testing/debugging. | ||
| 8006 | * Normally only a successful write may clear a badblock | ||
| 8007 | */ | ||
| 8008 | int clear = 0; | ||
| 8009 | if (page[0] == '-') { | ||
| 8010 | clear = 1; | ||
| 8011 | page++; | ||
| 8012 | } | ||
| 8013 | #endif /* DO_DEBUG */ | ||
| 8014 | |||
| 8015 | switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { | ||
| 8016 | case 3: | ||
| 8017 | if (newline != '\n') | ||
| 8018 | return -EINVAL; | ||
| 8019 | case 2: | ||
| 8020 | if (length <= 0) | ||
| 8021 | return -EINVAL; | ||
| 8022 | break; | ||
| 8023 | default: | ||
| 8024 | return -EINVAL; | ||
| 8025 | } | ||
| 8026 | |||
| 8027 | #ifdef DO_DEBUG | ||
| 8028 | if (clear) { | ||
| 8029 | md_clear_badblocks(bb, sector, length); | ||
| 8030 | return len; | ||
| 8031 | } | ||
| 8032 | #endif /* DO_DEBUG */ | ||
| 8033 | if (md_set_badblocks(bb, sector, length, !unack)) | ||
| 8034 | return len; | ||
| 8035 | else | ||
| 8036 | return -ENOSPC; | ||
| 8037 | } | ||
| 8038 | |||
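For completeness, a small userspace sketch of feeding the "sector length" format that badblocks_store() parses. The sysfs path is an assumption (it depends on the array and member device); treat it as illustrative only:

	#include <stdio.h>

	int main(void)
	{
		/* assumed path: per-rdev attribute under the md directory */
		FILE *f = fopen("/sys/block/md0/md/dev-sdb1/bad_blocks", "w");

		if (!f)
			return 1;
		/* record an 8-sector acknowledged bad range at sector 2100 */
		fprintf(f, "%llu %d\n", 2100ULL, 8);
		return fclose(f) ? 1 : 0;
	}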
| 7340 | static int md_notify_reboot(struct notifier_block *this, | 8039 | static int md_notify_reboot(struct notifier_block *this, |
| 7341 | unsigned long code, void *x) | 8040 | unsigned long code, void *x) |
| 7342 | { | 8041 | { |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c26c7a08ae6..1e586bb4452e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
| @@ -29,6 +29,13 @@ | |||
| 29 | typedef struct mddev_s mddev_t; | 29 | typedef struct mddev_s mddev_t; |
| 30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
| 31 | 31 | ||
| 32 | /* Bad block numbers are stored sorted in a single page. | ||
| 33 | * 64bits is used for each block or extent. | ||
| 34 | * 54 bits are sector number, 9 bits are extent size, | ||
| 35 | * 1 bit is an 'acknowledged' flag. | ||
| 36 | */ | ||
| 37 | #define MD_MAX_BADBLOCKS (PAGE_SIZE/8) | ||
| 38 | |||
| 32 | /* | 39 | /* |
| 33 | * MD's 'extended' device | 40 | * MD's 'extended' device |
| 34 | */ | 41 | */ |
| @@ -48,7 +55,7 @@ struct mdk_rdev_s | |||
| 48 | struct block_device *meta_bdev; | 55 | struct block_device *meta_bdev; |
| 49 | struct block_device *bdev; /* block device handle */ | 56 | struct block_device *bdev; /* block device handle */ |
| 50 | 57 | ||
| 51 | struct page *sb_page; | 58 | struct page *sb_page, *bb_page; |
| 52 | int sb_loaded; | 59 | int sb_loaded; |
| 53 | __u64 sb_events; | 60 | __u64 sb_events; |
| 54 | sector_t data_offset; /* start of data in array */ | 61 | sector_t data_offset; /* start of data in array */ |
| @@ -74,9 +81,29 @@ struct mdk_rdev_s | |||
| 74 | #define In_sync 2 /* device is in_sync with rest of array */ | 81 | #define In_sync 2 /* device is in_sync with rest of array */ |
| 75 | #define WriteMostly 4 /* Avoid reading if at all possible */ | 82 | #define WriteMostly 4 /* Avoid reading if at all possible */ |
| 76 | #define AutoDetected 7 /* added by auto-detect */ | 83 | #define AutoDetected 7 /* added by auto-detect */ |
| 77 | #define Blocked 8 /* An error occurred on an externally | 84 | #define Blocked 8 /* An error occurred but has not yet |
| 78 | * managed array, don't allow writes | 85 | * been acknowledged by the metadata |
| 86 | * handler, so don't allow writes | ||
| 79 | * until it is cleared */ | 87 | * until it is cleared */ |
| 88 | #define WriteErrorSeen 9 /* A write error has been seen on this | ||
| 89 | * device | ||
| 90 | */ | ||
| 91 | #define FaultRecorded 10 /* Intermediate state for clearing | ||
| 92 | * Blocked. The Fault is/will-be | ||
| 93 | * recorded in the metadata, but that | ||
| 94 | * metadata hasn't been stored safely | ||
| 95 | * on disk yet. | ||
| 96 | */ | ||
| 97 | #define BlockedBadBlocks 11 /* A writer is blocked because they | ||
| 98 | * found an unacknowledged bad-block. | ||
| 99 | * This can safely be cleared at any | ||
| 100 | * time, and the writer will re-check. | ||
| 101 | * It may be set at any time, and at | ||
| 102 | * worst the writer will timeout and | ||
| 103 | * re-check. So setting it as | ||
| 104 | * accurately as possible is good, but | ||
| 105 | * not absolutely critical. | ||
| 106 | */ | ||
| 80 | wait_queue_head_t blocked_wait; | 107 | wait_queue_head_t blocked_wait; |
| 81 | 108 | ||
| 82 | int desc_nr; /* descriptor index in the superblock */ | 109 | int desc_nr; /* descriptor index in the superblock */ |
| @@ -111,8 +138,54 @@ struct mdk_rdev_s | |||
| 111 | 138 | ||
| 112 | struct sysfs_dirent *sysfs_state; /* handle for 'state' | 139 | struct sysfs_dirent *sysfs_state; /* handle for 'state' |
| 113 | * sysfs entry */ | 140 | * sysfs entry */ |
| 141 | |||
| 142 | struct badblocks { | ||
| 143 | int count; /* count of bad blocks */ | ||
| 144 | int unacked_exist; /* there probably are unacknowledged | ||
| 145 | * bad blocks. This is only cleared | ||
| 146 | * when a read discovers none | ||
| 147 | */ | ||
| 148 | int shift; /* shift from sectors to block size | ||
| 149 | * a -ve shift means badblocks are | ||
| 150 | * disabled.*/ | ||
| 151 | u64 *page; /* badblock list */ | ||
| 152 | int changed; | ||
| 153 | seqlock_t lock; | ||
| 154 | |||
| 155 | sector_t sector; | ||
| 156 | sector_t size; /* in sectors */ | ||
| 157 | } badblocks; | ||
| 114 | }; | 158 | }; |
| 115 | 159 | ||
| 160 | #define BB_LEN_MASK (0x00000000000001FFULL) | ||
| 161 | #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) | ||
| 162 | #define BB_ACK_MASK (0x8000000000000000ULL) | ||
| 163 | #define BB_MAX_LEN 512 | ||
| 164 | #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) | ||
| 165 | #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) | ||
| 166 | #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) | ||
| 167 | #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) | ||
| 168 | |||
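A quick self-check of the packing macros above, written as a hedged sketch (the function is invented; the constants just demonstrate that lengths are stored as len-1 and the acknowledge flag occupies the top bit):

	static void bb_macro_selftest(void)
	{
		u64 e = BB_MAKE(4096, 7, 1);	/* 7 bad sectors at 4096, acknowledged */

		BUG_ON(BB_OFFSET(e) != 4096);	/* offset lives in bits 9..62 */
		BUG_ON(BB_LEN(e) != 7);		/* stored as len-1, so 1..512 */
		BUG_ON(!BB_ACK(e));		/* MSB is the acknowledged flag */
	}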
| 169 | extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, | ||
| 170 | sector_t *first_bad, int *bad_sectors); | ||
| 171 | static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
| 172 | sector_t *first_bad, int *bad_sectors) | ||
| 173 | { | ||
| 174 | if (unlikely(rdev->badblocks.count)) { | ||
| 175 | int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, | ||
| 176 | sectors, | ||
| 177 | first_bad, bad_sectors); | ||
| 178 | if (rv) | ||
| 179 | *first_bad -= rdev->data_offset; | ||
| 180 | return rv; | ||
| 181 | } | ||
| 182 | return 0; | ||
| 183 | } | ||
| 184 | extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
| 185 | int acknowledged); | ||
| 186 | extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors); | ||
| 187 | extern void md_ack_all_badblocks(struct badblocks *bb); | ||
| 188 | |||
| 116 | struct mddev_s | 189 | struct mddev_s |
| 117 | { | 190 | { |
| 118 | void *private; | 191 | void *private; |
| @@ -239,9 +312,12 @@ struct mddev_s | |||
| 239 | #define MD_RECOVERY_FROZEN 9 | 312 | #define MD_RECOVERY_FROZEN 9 |
| 240 | 313 | ||
| 241 | unsigned long recovery; | 314 | unsigned long recovery; |
| 242 | int recovery_disabled; /* if we detect that recovery | 315 | /* If a RAID personality determines that recovery (of a particular |
| 243 | * will always fail, set this | 316 | * device) will fail due to a read error on the source device, it |
| 244 | * so we don't loop trying */ | 317 | * takes a copy of this number and does not attempt recovery again |
| 318 | * until this number changes. | ||
| 319 | */ | ||
| 320 | int recovery_disabled; | ||
| 245 | 321 | ||
| 246 | int in_sync; /* know to not need resync */ | 322 | int in_sync; /* know to not need resync */ |
| 247 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so | 323 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so |
| @@ -304,11 +380,6 @@ struct mddev_s | |||
| 304 | * hot-adding a bitmap. It should | 380 | * hot-adding a bitmap. It should |
| 305 | * eventually be settable by sysfs. | 381 | * eventually be settable by sysfs. |
| 306 | */ | 382 | */ |
| 307 | /* When md is serving under dm, it might use a | ||
| 308 | * dirty_log to store the bits. | ||
| 309 | */ | ||
| 310 | struct dm_dirty_log *log; | ||
| 311 | |||
| 312 | struct mutex mutex; | 383 | struct mutex mutex; |
| 313 | unsigned long chunksize; | 384 | unsigned long chunksize; |
| 314 | unsigned long daemon_sleep; /* how many jiffies between updates? */ | 385 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
| @@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev) | |||
| 413 | return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; | 484 | return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; |
| 414 | } | 485 | } |
| 415 | 486 | ||
| 487 | static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev) | ||
| 488 | { | ||
| 489 | char nm[20]; | ||
| 490 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 491 | return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | ||
| 492 | } | ||
| 493 | |||
| 494 | static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev) | ||
| 495 | { | ||
| 496 | char nm[20]; | ||
| 497 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 498 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 499 | } | ||
| 500 | |||
| 416 | /* | 501 | /* |
| 417 | * iterates through some rdev ringlist. It's safe to remove the | 502 | * iterates through some rdev ringlist. It's safe to remove the |
| 418 | * current 'rdev'. Dont touch 'tmp' though. | 503 | * current 'rdev'. Dont touch 'tmp' though. |
| @@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev); | |||
| 505 | extern int md_run(mddev_t *mddev); | 590 | extern int md_run(mddev_t *mddev); |
| 506 | extern void md_stop(mddev_t *mddev); | 591 | extern void md_stop(mddev_t *mddev); |
| 507 | extern void md_stop_writes(mddev_t *mddev); | 592 | extern void md_stop_writes(mddev_t *mddev); |
| 508 | extern void md_rdev_init(mdk_rdev_t *rdev); | 593 | extern int md_rdev_init(mdk_rdev_t *rdev); |
| 509 | 594 | ||
| 510 | extern void mddev_suspend(mddev_t *mddev); | 595 | extern void mddev_suspend(mddev_t *mddev); |
| 511 | extern void mddev_resume(mddev_t *mddev); | 596 | extern void mddev_resume(mddev_t *mddev); |
| @@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
| 514 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | 599 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
| 515 | mddev_t *mddev); | 600 | mddev_t *mddev); |
| 516 | extern int mddev_check_plugged(mddev_t *mddev); | 601 | extern int mddev_check_plugged(mddev_t *mddev); |
| 602 | extern void md_trim_bio(struct bio *bio, int offset, int size); | ||
| 517 | #endif /* _MD_MD_H */ | 603 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f7431b6d8447..32323f0afd89 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -35,16 +35,13 @@ | |||
| 35 | #include <linux/delay.h> | 35 | #include <linux/delay.h> |
| 36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
| 37 | #include <linux/seq_file.h> | 37 | #include <linux/seq_file.h> |
| 38 | #include <linux/ratelimit.h> | ||
| 38 | #include "md.h" | 39 | #include "md.h" |
| 39 | #include "raid1.h" | 40 | #include "raid1.h" |
| 40 | #include "bitmap.h" | 41 | #include "bitmap.h" |
| 41 | 42 | ||
| 42 | #define DEBUG 0 | 43 | #define DEBUG 0 |
| 43 | #if DEBUG | 44 | #define PRINTK(x...) do { if (DEBUG) printk(x); } while (0) |
| 44 | #define PRINTK(x...) printk(x) | ||
| 45 | #else | ||
| 46 | #define PRINTK(x...) | ||
| 47 | #endif | ||
| 48 | 45 | ||
| 49 | /* | 46 | /* |
| 50 | * Number of guaranteed r1bios in case of extreme VM load: | 47 | * Number of guaranteed r1bios in case of extreme VM load: |
| @@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
| 166 | 163 | ||
| 167 | for (i = 0; i < conf->raid_disks; i++) { | 164 | for (i = 0; i < conf->raid_disks; i++) { |
| 168 | struct bio **bio = r1_bio->bios + i; | 165 | struct bio **bio = r1_bio->bios + i; |
| 169 | if (*bio && *bio != IO_BLOCKED) | 166 | if (!BIO_SPECIAL(*bio)) |
| 170 | bio_put(*bio); | 167 | bio_put(*bio); |
| 171 | *bio = NULL; | 168 | *bio = NULL; |
| 172 | } | 169 | } |
| @@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio) | |||
| 176 | { | 173 | { |
| 177 | conf_t *conf = r1_bio->mddev->private; | 174 | conf_t *conf = r1_bio->mddev->private; |
| 178 | 175 | ||
| 179 | /* | ||
| 180 | * Wake up any possible resync thread that waits for the device | ||
| 181 | * to go idle. | ||
| 182 | */ | ||
| 183 | allow_barrier(conf); | ||
| 184 | |||
| 185 | put_all_bios(conf, r1_bio); | 176 | put_all_bios(conf, r1_bio); |
| 186 | mempool_free(r1_bio, conf->r1bio_pool); | 177 | mempool_free(r1_bio, conf->r1bio_pool); |
| 187 | } | 178 | } |
| @@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
| 222 | * operation and are ready to return a success/failure code to the buffer | 213 | * operation and are ready to return a success/failure code to the buffer |
| 223 | * cache layer. | 214 | * cache layer. |
| 224 | */ | 215 | */ |
| 216 | static void call_bio_endio(r1bio_t *r1_bio) | ||
| 217 | { | ||
| 218 | struct bio *bio = r1_bio->master_bio; | ||
| 219 | int done; | ||
| 220 | conf_t *conf = r1_bio->mddev->private; | ||
| 221 | |||
| 222 | if (bio->bi_phys_segments) { | ||
| 223 | unsigned long flags; | ||
| 224 | spin_lock_irqsave(&conf->device_lock, flags); | ||
| 225 | bio->bi_phys_segments--; | ||
| 226 | done = (bio->bi_phys_segments == 0); | ||
| 227 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
| 228 | } else | ||
| 229 | done = 1; | ||
| 230 | |||
| 231 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) | ||
| 232 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 233 | if (done) { | ||
| 234 | bio_endio(bio, 0); | ||
| 235 | /* | ||
| 236 | * Wake up any possible resync thread that waits for the device | ||
| 237 | * to go idle. | ||
| 238 | */ | ||
| 239 | allow_barrier(conf); | ||
| 240 | } | ||
| 241 | } | ||
| 242 | |||
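call_bio_endio() above is the completion half of the new reference counting on bi_phys_segments; the submit half, open-coded in make_request() further down, looks roughly like this when sketched as a standalone helper:

	static void note_extra_r1bio(conf_t *conf, struct bio *master)
	{
		spin_lock_irq(&conf->device_lock);
		if (master->bi_phys_segments == 0)
			master->bi_phys_segments = 2;	/* first split: original + one more */
		else
			master->bi_phys_segments++;	/* another outstanding piece */
		spin_unlock_irq(&conf->device_lock);
	}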
| 225 | static void raid_end_bio_io(r1bio_t *r1_bio) | 243 | static void raid_end_bio_io(r1bio_t *r1_bio) |
| 226 | { | 244 | { |
| 227 | struct bio *bio = r1_bio->master_bio; | 245 | struct bio *bio = r1_bio->master_bio; |
| @@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
| 234 | (unsigned long long) bio->bi_sector + | 252 | (unsigned long long) bio->bi_sector + |
| 235 | (bio->bi_size >> 9) - 1); | 253 | (bio->bi_size >> 9) - 1); |
| 236 | 254 | ||
| 237 | bio_endio(bio, | 255 | call_bio_endio(r1_bio); |
| 238 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
| 239 | } | 256 | } |
| 240 | free_r1bio(r1_bio); | 257 | free_r1bio(r1_bio); |
| 241 | } | 258 | } |
| @@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
| 287 | * oops, read error: | 304 | * oops, read error: |
| 288 | */ | 305 | */ |
| 289 | char b[BDEVNAME_SIZE]; | 306 | char b[BDEVNAME_SIZE]; |
| 290 | if (printk_ratelimit()) | 307 | printk_ratelimited( |
| 291 | printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", | 308 | KERN_ERR "md/raid1:%s: %s: " |
| 292 | mdname(conf->mddev), | 309 | "rescheduling sector %llu\n", |
| 293 | bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); | 310 | mdname(conf->mddev), |
| 311 | bdevname(conf->mirrors[mirror].rdev->bdev, | ||
| 312 | b), | ||
| 313 | (unsigned long long)r1_bio->sector); | ||
| 314 | set_bit(R1BIO_ReadError, &r1_bio->state); | ||
| 294 | reschedule_retry(r1_bio); | 315 | reschedule_retry(r1_bio); |
| 295 | } | 316 | } |
| 296 | 317 | ||
| 297 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 318 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); |
| 298 | } | 319 | } |
| 299 | 320 | ||
| 321 | static void close_write(r1bio_t *r1_bio) | ||
| 322 | { | ||
| 323 | /* it really is the end of this request */ | ||
| 324 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
| 325 | /* free extra copy of the data pages */ | ||
| 326 | int i = r1_bio->behind_page_count; | ||
| 327 | while (i--) | ||
| 328 | safe_put_page(r1_bio->behind_bvecs[i].bv_page); | ||
| 329 | kfree(r1_bio->behind_bvecs); | ||
| 330 | r1_bio->behind_bvecs = NULL; | ||
| 331 | } | ||
| 332 | /* clear the bitmap if all writes complete successfully */ | ||
| 333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
| 334 | r1_bio->sectors, | ||
| 335 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
| 336 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
| 337 | md_write_end(r1_bio->mddev); | ||
| 338 | } | ||
| 339 | |||
| 300 | static void r1_bio_write_done(r1bio_t *r1_bio) | 340 | static void r1_bio_write_done(r1bio_t *r1_bio) |
| 301 | { | 341 | { |
| 302 | if (atomic_dec_and_test(&r1_bio->remaining)) | 342 | if (!atomic_dec_and_test(&r1_bio->remaining)) |
| 303 | { | 343 | return; |
| 304 | /* it really is the end of this request */ | 344 | |
| 305 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | 345 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
| 306 | /* free extra copy of the data pages */ | 346 | reschedule_retry(r1_bio); |
| 307 | int i = r1_bio->behind_page_count; | 347 | else { |
| 308 | while (i--) | 348 | close_write(r1_bio); |
| 309 | safe_put_page(r1_bio->behind_pages[i]); | 349 | if (test_bit(R1BIO_MadeGood, &r1_bio->state)) |
| 310 | kfree(r1_bio->behind_pages); | 350 | reschedule_retry(r1_bio); |
| 311 | r1_bio->behind_pages = NULL; | 351 | else |
| 312 | } | 352 | raid_end_bio_io(r1_bio); |
| 313 | /* clear the bitmap if all writes complete successfully */ | ||
| 314 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
| 315 | r1_bio->sectors, | ||
| 316 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
| 317 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
| 318 | md_write_end(r1_bio->mddev); | ||
| 319 | raid_end_bio_io(r1_bio); | ||
| 320 | } | 353 | } |
| 321 | } | 354 | } |
| 322 | 355 | ||
| @@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
| 336 | /* | 369 | /* |
| 337 | * 'one mirror IO has finished' event handler: | 370 | * 'one mirror IO has finished' event handler: |
| 338 | */ | 371 | */ |
| 339 | r1_bio->bios[mirror] = NULL; | ||
| 340 | to_put = bio; | ||
| 341 | if (!uptodate) { | 372 | if (!uptodate) { |
| 342 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 373 | set_bit(WriteErrorSeen, |
| 343 | /* an I/O failed, we can't clear the bitmap */ | 374 | &conf->mirrors[mirror].rdev->flags); |
| 344 | set_bit(R1BIO_Degraded, &r1_bio->state); | 375 | set_bit(R1BIO_WriteError, &r1_bio->state); |
| 345 | } else | 376 | } else { |
| 346 | /* | 377 | /* |
| 347 | * Set R1BIO_Uptodate in our master bio, so that we | 378 | * Set R1BIO_Uptodate in our master bio, so that we |
| 348 | * will return a good error code to the higher | 379 | * will return a good error code to the higher |
| @@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
| 353 | * to user-side. So if something waits for IO, then it | 384 | * to user-side. So if something waits for IO, then it |
| 354 | * will wait for the 'master' bio. | 385 | * will wait for the 'master' bio. |
| 355 | */ | 386 | */ |
| 387 | sector_t first_bad; | ||
| 388 | int bad_sectors; | ||
| 389 | |||
| 390 | r1_bio->bios[mirror] = NULL; | ||
| 391 | to_put = bio; | ||
| 356 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 392 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
| 357 | 393 | ||
| 394 | /* Maybe we can clear some bad blocks. */ | ||
| 395 | if (is_badblock(conf->mirrors[mirror].rdev, | ||
| 396 | r1_bio->sector, r1_bio->sectors, | ||
| 397 | &first_bad, &bad_sectors)) { | ||
| 398 | r1_bio->bios[mirror] = IO_MADE_GOOD; | ||
| 399 | set_bit(R1BIO_MadeGood, &r1_bio->state); | ||
| 400 | } | ||
| 401 | } | ||
| 402 | |||
| 358 | update_head_pos(mirror, r1_bio); | 403 | update_head_pos(mirror, r1_bio); |
| 359 | 404 | ||
| 360 | if (behind) { | 405 | if (behind) { |
| @@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
| 377 | (unsigned long long) mbio->bi_sector, | 422 | (unsigned long long) mbio->bi_sector, |
| 378 | (unsigned long long) mbio->bi_sector + | 423 | (unsigned long long) mbio->bi_sector + |
| 379 | (mbio->bi_size >> 9) - 1); | 424 | (mbio->bi_size >> 9) - 1); |
| 380 | bio_endio(mbio, 0); | 425 | call_bio_endio(r1_bio); |
| 381 | } | 426 | } |
| 382 | } | 427 | } |
| 383 | } | 428 | } |
| 384 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 429 | if (r1_bio->bios[mirror] == NULL) |
| 430 | rdev_dec_pending(conf->mirrors[mirror].rdev, | ||
| 431 | conf->mddev); | ||
| 385 | 432 | ||
| 386 | /* | 433 | /* |
| 387 | * Let's see if all mirrored write operations have finished | 434 | * Let's see if all mirrored write operations have finished |
| @@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
| 408 | * | 455 | * |
| 409 | * The rdev for the device selected will have nr_pending incremented. | 456 | * The rdev for the device selected will have nr_pending incremented. |
| 410 | */ | 457 | */ |
| 411 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | 458 | static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) |
| 412 | { | 459 | { |
| 413 | const sector_t this_sector = r1_bio->sector; | 460 | const sector_t this_sector = r1_bio->sector; |
| 414 | const int sectors = r1_bio->sectors; | 461 | int sectors; |
| 462 | int best_good_sectors; | ||
| 415 | int start_disk; | 463 | int start_disk; |
| 416 | int best_disk; | 464 | int best_disk; |
| 417 | int i; | 465 | int i; |
| @@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 426 | * We take the first readable disk when above the resync window. | 474 | * We take the first readable disk when above the resync window. |
| 427 | */ | 475 | */ |
| 428 | retry: | 476 | retry: |
| 477 | sectors = r1_bio->sectors; | ||
| 429 | best_disk = -1; | 478 | best_disk = -1; |
| 430 | best_dist = MaxSector; | 479 | best_dist = MaxSector; |
| 480 | best_good_sectors = 0; | ||
| 481 | |||
| 431 | if (conf->mddev->recovery_cp < MaxSector && | 482 | if (conf->mddev->recovery_cp < MaxSector && |
| 432 | (this_sector + sectors >= conf->next_resync)) { | 483 | (this_sector + sectors >= conf->next_resync)) { |
| 433 | choose_first = 1; | 484 | choose_first = 1; |
| @@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 439 | 490 | ||
| 440 | for (i = 0 ; i < conf->raid_disks ; i++) { | 491 | for (i = 0 ; i < conf->raid_disks ; i++) { |
| 441 | sector_t dist; | 492 | sector_t dist; |
| 493 | sector_t first_bad; | ||
| 494 | int bad_sectors; | ||
| 495 | |||
| 442 | int disk = start_disk + i; | 496 | int disk = start_disk + i; |
| 443 | if (disk >= conf->raid_disks) | 497 | if (disk >= conf->raid_disks) |
| 444 | disk -= conf->raid_disks; | 498 | disk -= conf->raid_disks; |
| @@ -461,6 +515,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 461 | /* This is a reasonable device to use. It might | 515 | /* This is a reasonable device to use. It might |
| 462 | * even be best. | 516 | * even be best. |
| 463 | */ | 517 | */ |
| 518 | if (is_badblock(rdev, this_sector, sectors, | ||
| 519 | &first_bad, &bad_sectors)) { | ||
| 520 | if (best_dist < MaxSector) | ||
| 521 | /* already have a better device */ | ||
| 522 | continue; | ||
| 523 | if (first_bad <= this_sector) { | ||
| 524 | /* cannot read here. If this is the 'primary' | ||
| 525 | * device, then we must not read beyond | ||
| 526 | * bad_sectors from another device.. | ||
| 527 | */ | ||
| 528 | bad_sectors -= (this_sector - first_bad); | ||
| 529 | if (choose_first && sectors > bad_sectors) | ||
| 530 | sectors = bad_sectors; | ||
| 531 | if (best_good_sectors > sectors) | ||
| 532 | best_good_sectors = sectors; | ||
| 533 | |||
| 534 | } else { | ||
| 535 | sector_t good_sectors = first_bad - this_sector; | ||
| 536 | if (good_sectors > best_good_sectors) { | ||
| 537 | best_good_sectors = good_sectors; | ||
| 538 | best_disk = disk; | ||
| 539 | } | ||
| 540 | if (choose_first) | ||
| 541 | break; | ||
| 542 | } | ||
| 543 | continue; | ||
| 544 | } else | ||
| 545 | best_good_sectors = sectors; | ||
| 546 | |||
| 464 | dist = abs(this_sector - conf->mirrors[disk].head_position); | 547 | dist = abs(this_sector - conf->mirrors[disk].head_position); |
| 465 | if (choose_first | 548 | if (choose_first |
| 466 | /* Don't change to another disk for sequential reads */ | 549 | /* Don't change to another disk for sequential reads */ |
| @@ -489,10 +572,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 489 | rdev_dec_pending(rdev, conf->mddev); | 572 | rdev_dec_pending(rdev, conf->mddev); |
| 490 | goto retry; | 573 | goto retry; |
| 491 | } | 574 | } |
| 575 | sectors = best_good_sectors; | ||
| 492 | conf->next_seq_sect = this_sector + sectors; | 576 | conf->next_seq_sect = this_sector + sectors; |
| 493 | conf->last_used = best_disk; | 577 | conf->last_used = best_disk; |
| 494 | } | 578 | } |
| 495 | rcu_read_unlock(); | 579 | rcu_read_unlock(); |
| 580 | *max_sectors = sectors; | ||
| 496 | 581 | ||
| 497 | return best_disk; | 582 | return best_disk; |
| 498 | } | 583 | } |
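A worked example of the clamping read_balance() now performs (numbers invented): a 64-sector read at sector 1000 against a disk whose bad range starts at 1016 is limited to 16 sectors, after which make_request() allocates a second r1_bio for the remainder. Sketched as a helper:

	static int sectors_before_bad(sector_t this_sector, int sectors,
				      sector_t first_bad)
	{
		sector_t good = first_bad - this_sector;	/* 1016 - 1000 = 16 */

		return good < sectors ? (int)good : sectors;
	}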
| @@ -672,30 +757,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) | |||
| 672 | { | 757 | { |
| 673 | int i; | 758 | int i; |
| 674 | struct bio_vec *bvec; | 759 | struct bio_vec *bvec; |
| 675 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), | 760 | struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), |
| 676 | GFP_NOIO); | 761 | GFP_NOIO); |
| 677 | if (unlikely(!pages)) | 762 | if (unlikely(!bvecs)) |
| 678 | return; | 763 | return; |
| 679 | 764 | ||
| 680 | bio_for_each_segment(bvec, bio, i) { | 765 | bio_for_each_segment(bvec, bio, i) { |
| 681 | pages[i] = alloc_page(GFP_NOIO); | 766 | bvecs[i] = *bvec; |
| 682 | if (unlikely(!pages[i])) | 767 | bvecs[i].bv_page = alloc_page(GFP_NOIO); |
| 768 | if (unlikely(!bvecs[i].bv_page)) | ||
| 683 | goto do_sync_io; | 769 | goto do_sync_io; |
| 684 | memcpy(kmap(pages[i]) + bvec->bv_offset, | 770 | memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, |
| 685 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | 771 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); |
| 686 | kunmap(pages[i]); | 772 | kunmap(bvecs[i].bv_page); |
| 687 | kunmap(bvec->bv_page); | 773 | kunmap(bvec->bv_page); |
| 688 | } | 774 | } |
| 689 | r1_bio->behind_pages = pages; | 775 | r1_bio->behind_bvecs = bvecs; |
| 690 | r1_bio->behind_page_count = bio->bi_vcnt; | 776 | r1_bio->behind_page_count = bio->bi_vcnt; |
| 691 | set_bit(R1BIO_BehindIO, &r1_bio->state); | 777 | set_bit(R1BIO_BehindIO, &r1_bio->state); |
| 692 | return; | 778 | return; |
| 693 | 779 | ||
| 694 | do_sync_io: | 780 | do_sync_io: |
| 695 | for (i = 0; i < bio->bi_vcnt; i++) | 781 | for (i = 0; i < bio->bi_vcnt; i++) |
| 696 | if (pages[i]) | 782 | if (bvecs[i].bv_page) |
| 697 | put_page(pages[i]); | 783 | put_page(bvecs[i].bv_page); |
| 698 | kfree(pages); | 784 | kfree(bvecs); |
| 699 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 785 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
| 700 | } | 786 | } |
| 701 | 787 | ||
| @@ -705,7 +791,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 705 | mirror_info_t *mirror; | 791 | mirror_info_t *mirror; |
| 706 | r1bio_t *r1_bio; | 792 | r1bio_t *r1_bio; |
| 707 | struct bio *read_bio; | 793 | struct bio *read_bio; |
| 708 | int i, targets = 0, disks; | 794 | int i, disks; |
| 709 | struct bitmap *bitmap; | 795 | struct bitmap *bitmap; |
| 710 | unsigned long flags; | 796 | unsigned long flags; |
| 711 | const int rw = bio_data_dir(bio); | 797 | const int rw = bio_data_dir(bio); |
| @@ -713,6 +799,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 713 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); | 799 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
| 714 | mdk_rdev_t *blocked_rdev; | 800 | mdk_rdev_t *blocked_rdev; |
| 715 | int plugged; | 801 | int plugged; |
| 802 | int first_clone; | ||
| 803 | int sectors_handled; | ||
| 804 | int max_sectors; | ||
| 716 | 805 | ||
| 717 | /* | 806 | /* |
| 718 | * Register the new request and wait if the reconstruction | 807 | * Register the new request and wait if the reconstruction |
| @@ -759,11 +848,24 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 759 | r1_bio->mddev = mddev; | 848 | r1_bio->mddev = mddev; |
| 760 | r1_bio->sector = bio->bi_sector; | 849 | r1_bio->sector = bio->bi_sector; |
| 761 | 850 | ||
| 851 | /* We might need to issue multiple reads to different | ||
| 852 | * devices if there are bad blocks around, so we keep | ||
| 853 | * track of the number of reads in bio->bi_phys_segments. | ||
| 854 | * If this is 0, there is only one r1_bio and no locking | ||
| 855 | * will be needed when requests complete. If it is | ||
| 856 | * non-zero, then it is the number of not-completed requests. | ||
| 857 | */ | ||
| 858 | bio->bi_phys_segments = 0; | ||
| 859 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
| 860 | |||
| 762 | if (rw == READ) { | 861 | if (rw == READ) { |
| 763 | /* | 862 | /* |
| 764 | * read balancing logic: | 863 | * read balancing logic: |
| 765 | */ | 864 | */ |
| 766 | int rdisk = read_balance(conf, r1_bio); | 865 | int rdisk; |
| 866 | |||
| 867 | read_again: | ||
| 868 | rdisk = read_balance(conf, r1_bio, &max_sectors); | ||
| 767 | 869 | ||
| 768 | if (rdisk < 0) { | 870 | if (rdisk < 0) { |
| 769 | /* couldn't find anywhere to read from */ | 871 | /* couldn't find anywhere to read from */ |
| @@ -784,6 +886,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 784 | r1_bio->read_disk = rdisk; | 886 | r1_bio->read_disk = rdisk; |
| 785 | 887 | ||
| 786 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 888 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
| 889 | md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, | ||
| 890 | max_sectors); | ||
| 787 | 891 | ||
| 788 | r1_bio->bios[rdisk] = read_bio; | 892 | r1_bio->bios[rdisk] = read_bio; |
| 789 | 893 | ||
| @@ -793,16 +897,52 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 793 | read_bio->bi_rw = READ | do_sync; | 897 | read_bio->bi_rw = READ | do_sync; |
| 794 | read_bio->bi_private = r1_bio; | 898 | read_bio->bi_private = r1_bio; |
| 795 | 899 | ||
| 796 | generic_make_request(read_bio); | 900 | if (max_sectors < r1_bio->sectors) { |
| 901 | /* could not read all from this device, so we will | ||
| 902 | * need another r1_bio. | ||
| 903 | */ | ||
| 904 | |||
| 905 | sectors_handled = (r1_bio->sector + max_sectors | ||
| 906 | - bio->bi_sector); | ||
| 907 | r1_bio->sectors = max_sectors; | ||
| 908 | spin_lock_irq(&conf->device_lock); | ||
| 909 | if (bio->bi_phys_segments == 0) | ||
| 910 | bio->bi_phys_segments = 2; | ||
| 911 | else | ||
| 912 | bio->bi_phys_segments++; | ||
| 913 | spin_unlock_irq(&conf->device_lock); | ||
| 914 | /* Cannot call generic_make_request directly | ||
| 915 | * as that will be queued in __make_request | ||
| 916 | * and subsequent mempool_alloc might block waiting | ||
| 917 | * for it. So hand bio over to raid1d. | ||
| 918 | */ | ||
| 919 | reschedule_retry(r1_bio); | ||
| 920 | |||
| 921 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 922 | |||
| 923 | r1_bio->master_bio = bio; | ||
| 924 | r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
| 925 | r1_bio->state = 0; | ||
| 926 | r1_bio->mddev = mddev; | ||
| 927 | r1_bio->sector = bio->bi_sector + sectors_handled; | ||
| 928 | goto read_again; | ||
| 929 | } else | ||
| 930 | generic_make_request(read_bio); | ||
| 797 | return 0; | 931 | return 0; |
| 798 | } | 932 | } |
| 799 | 933 | ||
| 800 | /* | 934 | /* |
| 801 | * WRITE: | 935 | * WRITE: |
| 802 | */ | 936 | */ |
| 803 | /* first select target devices under spinlock and | 937 | /* first select target devices under rcu_lock and |
| 804 | * inc refcount on their rdev. Record them by setting | 938 | * inc refcount on their rdev. Record them by setting |
| 805 | * bios[x] to bio | 939 | * bios[x] to bio |
| 940 | * If there are known/acknowledged bad blocks on any device on | ||
| 941 | * which we have seen a write error, we want to avoid writing those | ||
| 942 | * blocks. | ||
| 943 | * This potentially requires several writes to write around | ||
| 944 | * the bad blocks. Each set of writes gets its own r1bio | ||
| 945 | * with a set of bios attached. | ||
| 806 | */ | 946 | */ |
| 807 | plugged = mddev_check_plugged(mddev); | 947 | plugged = mddev_check_plugged(mddev); |
| 808 | 948 | ||
| @@ -810,6 +950,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 810 | retry_write: | 950 | retry_write: |
| 811 | blocked_rdev = NULL; | 951 | blocked_rdev = NULL; |
| 812 | rcu_read_lock(); | 952 | rcu_read_lock(); |
| 953 | max_sectors = r1_bio->sectors; | ||
| 813 | for (i = 0; i < disks; i++) { | 954 | for (i = 0; i < disks; i++) { |
| 814 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 955 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
| 815 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 956 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { |
| @@ -817,17 +958,56 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 817 | blocked_rdev = rdev; | 958 | blocked_rdev = rdev; |
| 818 | break; | 959 | break; |
| 819 | } | 960 | } |
| 820 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 961 | r1_bio->bios[i] = NULL; |
| 821 | atomic_inc(&rdev->nr_pending); | 962 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
| 822 | if (test_bit(Faulty, &rdev->flags)) { | 963 | set_bit(R1BIO_Degraded, &r1_bio->state); |
| 964 | continue; | ||
| 965 | } | ||
| 966 | |||
| 967 | atomic_inc(&rdev->nr_pending); | ||
| 968 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
| 969 | sector_t first_bad; | ||
| 970 | int bad_sectors; | ||
| 971 | int is_bad; | ||
| 972 | |||
| 973 | is_bad = is_badblock(rdev, r1_bio->sector, | ||
| 974 | max_sectors, | ||
| 975 | &first_bad, &bad_sectors); | ||
| 976 | if (is_bad < 0) { | ||
| 977 | /* mustn't write here until the bad block is | ||
| 978 | * acknowledged*/ | ||
| 979 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
| 980 | blocked_rdev = rdev; | ||
| 981 | break; | ||
| 982 | } | ||
| 983 | if (is_bad && first_bad <= r1_bio->sector) { | ||
| 984 | /* Cannot write here at all */ | ||
| 985 | bad_sectors -= (r1_bio->sector - first_bad); | ||
| 986 | if (bad_sectors < max_sectors) | ||
| 987 | /* mustn't write more than bad_sectors | ||
| 988 | * to other devices yet | ||
| 989 | */ | ||
| 990 | max_sectors = bad_sectors; | ||
| 823 | rdev_dec_pending(rdev, mddev); | 991 | rdev_dec_pending(rdev, mddev); |
| 824 | r1_bio->bios[i] = NULL; | 992 | /* We don't set R1BIO_Degraded as that |
| 825 | } else { | 993 | * only applies if the disk is |
| 826 | r1_bio->bios[i] = bio; | 994 | * missing, so it might be re-added, |
| 827 | targets++; | 995 | * and we want to know to recover this |
| 996 | * chunk. | ||
| 997 | * In this case the device is here, | ||
| 998 | * and the fact that this chunk is not | ||
| 999 | * in-sync is recorded in the bad | ||
| 1000 | * block log | ||
| 1001 | */ | ||
| 1002 | continue; | ||
| 828 | } | 1003 | } |
| 829 | } else | 1004 | if (is_bad) { |
| 830 | r1_bio->bios[i] = NULL; | 1005 | int good_sectors = first_bad - r1_bio->sector; |
| 1006 | if (good_sectors < max_sectors) | ||
| 1007 | max_sectors = good_sectors; | ||
| 1008 | } | ||
| 1009 | } | ||
| 1010 | r1_bio->bios[i] = bio; | ||
| 831 | } | 1011 | } |
| 832 | rcu_read_unlock(); | 1012 | rcu_read_unlock(); |
| 833 | 1013 | ||
| @@ -838,51 +1018,57 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 838 | for (j = 0; j < i; j++) | 1018 | for (j = 0; j < i; j++) |
| 839 | if (r1_bio->bios[j]) | 1019 | if (r1_bio->bios[j]) |
| 840 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1020 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
| 841 | 1021 | r1_bio->state = 0; | |
| 842 | allow_barrier(conf); | 1022 | allow_barrier(conf); |
| 843 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1023 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
| 844 | wait_barrier(conf); | 1024 | wait_barrier(conf); |
| 845 | goto retry_write; | 1025 | goto retry_write; |
| 846 | } | 1026 | } |
| 847 | 1027 | ||
| 848 | BUG_ON(targets == 0); /* we never fail the last device */ | 1028 | if (max_sectors < r1_bio->sectors) { |
| 849 | 1029 | /* We are splitting this write into multiple parts, so | |
| 850 | if (targets < conf->raid_disks) { | 1030 | * we need to prepare for allocating another r1_bio. |
| 851 | /* array is degraded, we will not clear the bitmap | 1031 | */ |
| 852 | * on I/O completion (see raid1_end_write_request) */ | 1032 | r1_bio->sectors = max_sectors; |
| 853 | set_bit(R1BIO_Degraded, &r1_bio->state); | 1033 | spin_lock_irq(&conf->device_lock); |
| 1034 | if (bio->bi_phys_segments == 0) | ||
| 1035 | bio->bi_phys_segments = 2; | ||
| 1036 | else | ||
| 1037 | bio->bi_phys_segments++; | ||
| 1038 | spin_unlock_irq(&conf->device_lock); | ||
| 854 | } | 1039 | } |
| 855 | 1040 | sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; | |
| 856 | /* do behind I/O ? | ||
| 857 | * Not if there are too many, or cannot allocate memory, | ||
| 858 | * or a reader on WriteMostly is waiting for behind writes | ||
| 859 | * to flush */ | ||
| 860 | if (bitmap && | ||
| 861 | (atomic_read(&bitmap->behind_writes) | ||
| 862 | < mddev->bitmap_info.max_write_behind) && | ||
| 863 | !waitqueue_active(&bitmap->behind_wait)) | ||
| 864 | alloc_behind_pages(bio, r1_bio); | ||
| 865 | 1041 | ||
| 866 | atomic_set(&r1_bio->remaining, 1); | 1042 | atomic_set(&r1_bio->remaining, 1); |
| 867 | atomic_set(&r1_bio->behind_remaining, 0); | 1043 | atomic_set(&r1_bio->behind_remaining, 0); |
| 868 | 1044 | ||
| 869 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, | 1045 | first_clone = 1; |
| 870 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
| 871 | for (i = 0; i < disks; i++) { | 1046 | for (i = 0; i < disks; i++) { |
| 872 | struct bio *mbio; | 1047 | struct bio *mbio; |
| 873 | if (!r1_bio->bios[i]) | 1048 | if (!r1_bio->bios[i]) |
| 874 | continue; | 1049 | continue; |
| 875 | 1050 | ||
| 876 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1051 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
| 877 | r1_bio->bios[i] = mbio; | 1052 | md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); |
| 878 | 1053 | ||
| 879 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 1054 | if (first_clone) { |
| 880 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1055 | /* do behind I/O ? |
| 881 | mbio->bi_end_io = raid1_end_write_request; | 1056 | * Not if there are too many, or cannot |
| 882 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; | 1057 | * allocate memory, or a reader on WriteMostly |
| 883 | mbio->bi_private = r1_bio; | 1058 | * is waiting for behind writes to flush */ |
| 884 | 1059 | if (bitmap && | |
| 885 | if (r1_bio->behind_pages) { | 1060 | (atomic_read(&bitmap->behind_writes) |
| 1061 | < mddev->bitmap_info.max_write_behind) && | ||
| 1062 | !waitqueue_active(&bitmap->behind_wait)) | ||
| 1063 | alloc_behind_pages(mbio, r1_bio); | ||
| 1064 | |||
| 1065 | bitmap_startwrite(bitmap, r1_bio->sector, | ||
| 1066 | r1_bio->sectors, | ||
| 1067 | test_bit(R1BIO_BehindIO, | ||
| 1068 | &r1_bio->state)); | ||
| 1069 | first_clone = 0; | ||
| 1070 | } | ||
| 1071 | if (r1_bio->behind_bvecs) { | ||
| 886 | struct bio_vec *bvec; | 1072 | struct bio_vec *bvec; |
| 887 | int j; | 1073 | int j; |
| 888 | 1074 | ||
| @@ -894,11 +1080,20 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 894 | * them all | 1080 | * them all |
| 895 | */ | 1081 | */ |
| 896 | __bio_for_each_segment(bvec, mbio, j, 0) | 1082 | __bio_for_each_segment(bvec, mbio, j, 0) |
| 897 | bvec->bv_page = r1_bio->behind_pages[j]; | 1083 | bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; |
| 898 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | 1084 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) |
| 899 | atomic_inc(&r1_bio->behind_remaining); | 1085 | atomic_inc(&r1_bio->behind_remaining); |
| 900 | } | 1086 | } |
| 901 | 1087 | ||
| 1088 | r1_bio->bios[i] = mbio; | ||
| 1089 | |||
| 1090 | mbio->bi_sector = (r1_bio->sector + | ||
| 1091 | conf->mirrors[i].rdev->data_offset); | ||
| 1092 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
| 1093 | mbio->bi_end_io = raid1_end_write_request; | ||
| 1094 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; | ||
| 1095 | mbio->bi_private = r1_bio; | ||
| 1096 | |||
| 902 | atomic_inc(&r1_bio->remaining); | 1097 | atomic_inc(&r1_bio->remaining); |
| 903 | spin_lock_irqsave(&conf->device_lock, flags); | 1098 | spin_lock_irqsave(&conf->device_lock, flags); |
| 904 | bio_list_add(&conf->pending_bio_list, mbio); | 1099 | bio_list_add(&conf->pending_bio_list, mbio); |
| @@ -909,6 +1104,19 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 909 | /* In case raid1d snuck in to freeze_array */ | 1104 | /* In case raid1d snuck in to freeze_array */ |
| 910 | wake_up(&conf->wait_barrier); | 1105 | wake_up(&conf->wait_barrier); |
| 911 | 1106 | ||
| 1107 | if (sectors_handled < (bio->bi_size >> 9)) { | ||
| 1108 | /* We need another r1_bio. It has already been counted | ||
| 1109 | * in bio->bi_phys_segments | ||
| 1110 | */ | ||
| 1111 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1112 | r1_bio->master_bio = bio; | ||
| 1113 | r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
| 1114 | r1_bio->state = 0; | ||
| 1115 | r1_bio->mddev = mddev; | ||
| 1116 | r1_bio->sector = bio->bi_sector + sectors_handled; | ||
| 1117 | goto retry_write; | ||
| 1118 | } | ||
| 1119 | |||
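The split bookkeeping above leans on bio->bi_phys_segments as a plain chunk counter. A minimal standalone model of that counting (plain C, not kernel code; names invented), matching the 0 / 2 / ++ pattern used here and decremented again on completion:

#include <stdio.h>

/* bi_phys_segments is reused as "number of r1_bios still covering this
 * master bio".  0 means the bio was never split. */
struct master {
        int segs;
};

static void note_split(struct master *m)
{
        if (m->segs == 0)
                m->segs = 2;    /* first split: original chunk + new chunk */
        else
                m->segs++;      /* each further split adds one chunk */
}

static int chunk_done(struct master *m)
{
        if (m->segs == 0)
                return 1;       /* never split: complete immediately */
        return --m->segs == 0;  /* complete only when the last chunk ends */
}

int main(void)
{
        struct master m = { 0 };
        note_split(&m);         /* write split once: two chunks in flight */
        printf("%d %d\n", chunk_done(&m), chunk_done(&m));  /* prints 0 1 */
        return 0;
}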
| 912 | if (do_sync || !bitmap || !plugged) | 1120 | if (do_sync || !bitmap || !plugged) |
| 913 | md_wakeup_thread(mddev->thread); | 1121 | md_wakeup_thread(mddev->thread); |
| 914 | 1122 | ||
| @@ -952,9 +1160,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 952 | * However don't try a recovery from this drive as | 1160 | * However don't try a recovery from this drive as |
| 953 | * it is very likely to fail. | 1161 | * it is very likely to fail. |
| 954 | */ | 1162 | */ |
| 955 | mddev->recovery_disabled = 1; | 1163 | conf->recovery_disabled = mddev->recovery_disabled; |
| 956 | return; | 1164 | return; |
| 957 | } | 1165 | } |
| 1166 | set_bit(Blocked, &rdev->flags); | ||
| 958 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1167 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
| 959 | unsigned long flags; | 1168 | unsigned long flags; |
| 960 | spin_lock_irqsave(&conf->device_lock, flags); | 1169 | spin_lock_irqsave(&conf->device_lock, flags); |
| @@ -1027,7 +1236,7 @@ static int raid1_spare_active(mddev_t *mddev) | |||
| 1027 | && !test_bit(Faulty, &rdev->flags) | 1236 | && !test_bit(Faulty, &rdev->flags) |
| 1028 | && !test_and_set_bit(In_sync, &rdev->flags)) { | 1237 | && !test_and_set_bit(In_sync, &rdev->flags)) { |
| 1029 | count++; | 1238 | count++; |
| 1030 | sysfs_notify_dirent(rdev->sysfs_state); | 1239 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| 1031 | } | 1240 | } |
| 1032 | } | 1241 | } |
| 1033 | spin_lock_irqsave(&conf->device_lock, flags); | 1242 | spin_lock_irqsave(&conf->device_lock, flags); |
| @@ -1048,6 +1257,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1048 | int first = 0; | 1257 | int first = 0; |
| 1049 | int last = mddev->raid_disks - 1; | 1258 | int last = mddev->raid_disks - 1; |
| 1050 | 1259 | ||
| 1260 | if (mddev->recovery_disabled == conf->recovery_disabled) | ||
| 1261 | return -EBUSY; | ||
| 1262 | |||
| 1051 | if (rdev->raid_disk >= 0) | 1263 | if (rdev->raid_disk >= 0) |
| 1052 | first = last = rdev->raid_disk; | 1264 | first = last = rdev->raid_disk; |
| 1053 | 1265 | ||
| @@ -1103,7 +1315,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
| 1103 | * is not possible. | 1315 | * is not possible. |
| 1104 | */ | 1316 | */ |
| 1105 | if (!test_bit(Faulty, &rdev->flags) && | 1317 | if (!test_bit(Faulty, &rdev->flags) && |
| 1106 | !mddev->recovery_disabled && | 1318 | mddev->recovery_disabled != conf->recovery_disabled && |
| 1107 | mddev->degraded < conf->raid_disks) { | 1319 | mddev->degraded < conf->raid_disks) { |
| 1108 | err = -EBUSY; | 1320 | err = -EBUSY; |
| 1109 | goto abort; | 1321 | goto abort; |
| @@ -1155,6 +1367,8 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1155 | conf_t *conf = mddev->private; | 1367 | conf_t *conf = mddev->private; |
| 1156 | int i; | 1368 | int i; |
| 1157 | int mirror=0; | 1369 | int mirror=0; |
| 1370 | sector_t first_bad; | ||
| 1371 | int bad_sectors; | ||
| 1158 | 1372 | ||
| 1159 | for (i = 0; i < conf->raid_disks; i++) | 1373 | for (i = 0; i < conf->raid_disks; i++) |
| 1160 | if (r1_bio->bios[i] == bio) { | 1374 | if (r1_bio->bios[i] == bio) { |
| @@ -1172,18 +1386,48 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1172 | s += sync_blocks; | 1386 | s += sync_blocks; |
| 1173 | sectors_to_go -= sync_blocks; | 1387 | sectors_to_go -= sync_blocks; |
| 1174 | } while (sectors_to_go > 0); | 1388 | } while (sectors_to_go > 0); |
| 1175 | md_error(mddev, conf->mirrors[mirror].rdev); | 1389 | set_bit(WriteErrorSeen, |
| 1176 | } | 1390 | &conf->mirrors[mirror].rdev->flags); |
| 1391 | set_bit(R1BIO_WriteError, &r1_bio->state); | ||
| 1392 | } else if (is_badblock(conf->mirrors[mirror].rdev, | ||
| 1393 | r1_bio->sector, | ||
| 1394 | r1_bio->sectors, | ||
| 1395 | &first_bad, &bad_sectors) && | ||
| 1396 | !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, | ||
| 1397 | r1_bio->sector, | ||
| 1398 | r1_bio->sectors, | ||
| 1399 | &first_bad, &bad_sectors) | ||
| 1400 | ) | ||
| 1401 | set_bit(R1BIO_MadeGood, &r1_bio->state); | ||
| 1177 | 1402 | ||
| 1178 | update_head_pos(mirror, r1_bio); | 1403 | update_head_pos(mirror, r1_bio); |
| 1179 | 1404 | ||
| 1180 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 1405 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
| 1181 | sector_t s = r1_bio->sectors; | 1406 | int s = r1_bio->sectors; |
| 1182 | put_buf(r1_bio); | 1407 | if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
| 1183 | md_done_sync(mddev, s, uptodate); | 1408 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
| 1409 | reschedule_retry(r1_bio); | ||
| 1410 | else { | ||
| 1411 | put_buf(r1_bio); | ||
| 1412 | md_done_sync(mddev, s, uptodate); | ||
| 1413 | } | ||
| 1184 | } | 1414 | } |
| 1185 | } | 1415 | } |
| 1186 | 1416 | ||
| 1417 | static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
| 1418 | int sectors, struct page *page, int rw) | ||
| 1419 | { | ||
| 1420 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
| 1421 | /* success */ | ||
| 1422 | return 1; | ||
| 1423 | if (rw == WRITE) | ||
| 1424 | set_bit(WriteErrorSeen, &rdev->flags); | ||
| 1425 | /* need to record an error - either for the block or the device */ | ||
| 1426 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
| 1427 | md_error(rdev->mddev, rdev); | ||
| 1428 | return 0; | ||
| 1429 | } | ||
| 1430 | |||
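r1_sync_page_io encodes a small policy: on failure, prefer recording a narrow bad range, and only fail the whole device when even that bookkeeping is refused. A standalone sketch of the same policy (plain C, not kernel code; the bounded log is an invented stand-in for the rdev bad-block table):

#include <stdbool.h>
#include <stdio.h>

struct dev {
        int bad_entries;
        int bad_capacity;       /* models a bounded bad-block log */
        bool failed;
};

static bool record_bad_range(struct dev *d)
{
        if (d->bad_entries >= d->bad_capacity)
                return false;           /* log full, cannot remember range */
        d->bad_entries++;
        return true;
}

/* io_ok stands in for the result of the synchronous page I/O */
static int try_page_io(struct dev *d, bool io_ok)
{
        if (io_ok)
                return 1;
        if (!record_bad_range(d))
                d->failed = true;       /* last resort: fail the device */
        return 0;
}

int main(void)
{
        struct dev d = { 0, 1, false };
        try_page_io(&d, false);         /* first failure: logged as bad */
        try_page_io(&d, false);         /* log full: device is failed */
        printf("entries=%d failed=%d\n", d.bad_entries, d.failed);
        return 0;
}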
| 1187 | static int fix_sync_read_error(r1bio_t *r1_bio) | 1431 | static int fix_sync_read_error(r1bio_t *r1_bio) |
| 1188 | { | 1432 | { |
| 1189 | /* Try some synchronous reads of other devices to get | 1433 | /* Try some synchronous reads of other devices to get |
| @@ -1193,6 +1437,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
| 1193 | * We don't need to freeze the array, because being in an | 1437 | * We don't need to freeze the array, because being in an |
| 1194 | * active sync request, there is no normal IO, and | 1438 | * active sync request, there is no normal IO, and |
| 1195 | * no overlapping syncs. | 1439 | * no overlapping syncs. |
| 1440 | * We don't need to check is_badblock() again as we | ||
| 1441 | * made sure that anything with a bad block in range | ||
| 1442 | * will have bi_end_io clear. | ||
| 1196 | */ | 1443 | */ |
| 1197 | mddev_t *mddev = r1_bio->mddev; | 1444 | mddev_t *mddev = r1_bio->mddev; |
| 1198 | conf_t *conf = mddev->private; | 1445 | conf_t *conf = mddev->private; |
| @@ -1217,9 +1464,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
| 1217 | * active, and resync is currently active | 1464 | * active, and resync is currently active |
| 1218 | */ | 1465 | */ |
| 1219 | rdev = conf->mirrors[d].rdev; | 1466 | rdev = conf->mirrors[d].rdev; |
| 1220 | if (sync_page_io(rdev, | 1467 | if (sync_page_io(rdev, sect, s<<9, |
| 1221 | sect, | ||
| 1222 | s<<9, | ||
| 1223 | bio->bi_io_vec[idx].bv_page, | 1468 | bio->bi_io_vec[idx].bv_page, |
| 1224 | READ, false)) { | 1469 | READ, false)) { |
| 1225 | success = 1; | 1470 | success = 1; |
| @@ -1233,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
| 1233 | 1478 | ||
| 1234 | if (!success) { | 1479 | if (!success) { |
| 1235 | char b[BDEVNAME_SIZE]; | 1480 | char b[BDEVNAME_SIZE]; |
| 1236 | /* Cannot read from anywhere, array is toast */ | 1481 | int abort = 0; |
| 1237 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | 1482 | /* Cannot read from anywhere, this block is lost. |
| 1483 | * Record a bad block on each device. If that doesn't | ||
| 1484 | * work just disable and interrupt the recovery. | ||
| 1485 | * Don't fail devices as that won't really help. | ||
| 1486 | */ | ||
| 1238 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | 1487 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" |
| 1239 | " for block %llu\n", | 1488 | " for block %llu\n", |
| 1240 | mdname(mddev), | 1489 | mdname(mddev), |
| 1241 | bdevname(bio->bi_bdev, b), | 1490 | bdevname(bio->bi_bdev, b), |
| 1242 | (unsigned long long)r1_bio->sector); | 1491 | (unsigned long long)r1_bio->sector); |
| 1243 | md_done_sync(mddev, r1_bio->sectors, 0); | 1492 | for (d = 0; d < conf->raid_disks; d++) { |
| 1244 | put_buf(r1_bio); | 1493 | rdev = conf->mirrors[d].rdev; |
| 1245 | return 0; | 1494 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
| 1495 | continue; | ||
| 1496 | if (!rdev_set_badblocks(rdev, sect, s, 0)) | ||
| 1497 | abort = 1; | ||
| 1498 | } | ||
| 1499 | if (abort) { | ||
| 1500 | mddev->recovery_disabled = 1; | ||
| 1501 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
| 1502 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
| 1503 | put_buf(r1_bio); | ||
| 1504 | return 0; | ||
| 1505 | } | ||
| 1506 | /* Try next page */ | ||
| 1507 | sectors -= s; | ||
| 1508 | sect += s; | ||
| 1509 | idx++; | ||
| 1510 | continue; | ||
| 1246 | } | 1511 | } |
| 1247 | 1512 | ||
| 1248 | start = d; | 1513 | start = d; |
| @@ -1254,16 +1519,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
| 1254 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1519 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
| 1255 | continue; | 1520 | continue; |
| 1256 | rdev = conf->mirrors[d].rdev; | 1521 | rdev = conf->mirrors[d].rdev; |
| 1257 | if (sync_page_io(rdev, | 1522 | if (r1_sync_page_io(rdev, sect, s, |
| 1258 | sect, | 1523 | bio->bi_io_vec[idx].bv_page, |
| 1259 | s<<9, | 1524 | WRITE) == 0) { |
| 1260 | bio->bi_io_vec[idx].bv_page, | ||
| 1261 | WRITE, false) == 0) { | ||
| 1262 | r1_bio->bios[d]->bi_end_io = NULL; | 1525 | r1_bio->bios[d]->bi_end_io = NULL; |
| 1263 | rdev_dec_pending(rdev, mddev); | 1526 | rdev_dec_pending(rdev, mddev); |
| 1264 | md_error(mddev, rdev); | 1527 | } |
| 1265 | } else | ||
| 1266 | atomic_add(s, &rdev->corrected_errors); | ||
| 1267 | } | 1528 | } |
| 1268 | d = start; | 1529 | d = start; |
| 1269 | while (d != r1_bio->read_disk) { | 1530 | while (d != r1_bio->read_disk) { |
| @@ -1273,12 +1534,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
| 1273 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1534 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
| 1274 | continue; | 1535 | continue; |
| 1275 | rdev = conf->mirrors[d].rdev; | 1536 | rdev = conf->mirrors[d].rdev; |
| 1276 | if (sync_page_io(rdev, | 1537 | if (r1_sync_page_io(rdev, sect, s, |
| 1277 | sect, | 1538 | bio->bi_io_vec[idx].bv_page, |
| 1278 | s<<9, | 1539 | READ) != 0) |
| 1279 | bio->bi_io_vec[idx].bv_page, | 1540 | atomic_add(s, &rdev->corrected_errors); |
| 1280 | READ, false) == 0) | ||
| 1281 | md_error(mddev, rdev); | ||
| 1282 | } | 1541 | } |
| 1283 | sectors -= s; | 1542 | sectors -= s; |
| 1284 | sect += s; | 1543 | sect += s; |
| @@ -1420,7 +1679,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | |||
| 1420 | * | 1679 | * |
| 1421 | * 1. Retries failed read operations on working mirrors. | 1680 | * 1. Retries failed read operations on working mirrors. |
| 1422 | * 2. Updates the raid superblock when problems are encountered. | 1681 | * 2. Updates the raid superblock when problems are encountered. |
| 1423 | * 3. Performs writes following reads for array syncronising. | 1682 | * 3. Performs writes following reads for array synchronising. |
| 1424 | */ | 1683 | */ |
| 1425 | 1684 | ||
| 1426 | static void fix_read_error(conf_t *conf, int read_disk, | 1685 | static void fix_read_error(conf_t *conf, int read_disk, |
| @@ -1443,9 +1702,14 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
| 1443 | * which is the thread that might remove | 1702 | * which is the thread that might remove |
| 1444 | * a device. If raid1d ever becomes multi-threaded.... | 1703 | * a device. If raid1d ever becomes multi-threaded.... |
| 1445 | */ | 1704 | */ |
| 1705 | sector_t first_bad; | ||
| 1706 | int bad_sectors; | ||
| 1707 | |||
| 1446 | rdev = conf->mirrors[d].rdev; | 1708 | rdev = conf->mirrors[d].rdev; |
| 1447 | if (rdev && | 1709 | if (rdev && |
| 1448 | test_bit(In_sync, &rdev->flags) && | 1710 | test_bit(In_sync, &rdev->flags) && |
| 1711 | is_badblock(rdev, sect, s, | ||
| 1712 | &first_bad, &bad_sectors) == 0 && | ||
| 1449 | sync_page_io(rdev, sect, s<<9, | 1713 | sync_page_io(rdev, sect, s<<9, |
| 1450 | conf->tmppage, READ, false)) | 1714 | conf->tmppage, READ, false)) |
| 1451 | success = 1; | 1715 | success = 1; |
| @@ -1457,8 +1721,10 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
| 1457 | } while (!success && d != read_disk); | 1721 | } while (!success && d != read_disk); |
| 1458 | 1722 | ||
| 1459 | if (!success) { | 1723 | if (!success) { |
| 1460 | /* Cannot read from anywhere -- bye bye array */ | 1724 | /* Cannot read from anywhere - mark it bad */ |
| 1461 | md_error(mddev, conf->mirrors[read_disk].rdev); | 1725 | mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev; |
| 1726 | if (!rdev_set_badblocks(rdev, sect, s, 0)) | ||
| 1727 | md_error(mddev, rdev); | ||
| 1462 | break; | 1728 | break; |
| 1463 | } | 1729 | } |
| 1464 | /* write it back and re-read */ | 1730 | /* write it back and re-read */ |
| @@ -1469,13 +1735,9 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
| 1469 | d--; | 1735 | d--; |
| 1470 | rdev = conf->mirrors[d].rdev; | 1736 | rdev = conf->mirrors[d].rdev; |
| 1471 | if (rdev && | 1737 | if (rdev && |
| 1472 | test_bit(In_sync, &rdev->flags)) { | 1738 | test_bit(In_sync, &rdev->flags)) |
| 1473 | if (sync_page_io(rdev, sect, s<<9, | 1739 | r1_sync_page_io(rdev, sect, s, |
| 1474 | conf->tmppage, WRITE, false) | 1740 | conf->tmppage, WRITE); |
| 1475 | == 0) | ||
| 1476 | /* Well, this device is dead */ | ||
| 1477 | md_error(mddev, rdev); | ||
| 1478 | } | ||
| 1479 | } | 1741 | } |
| 1480 | d = start; | 1742 | d = start; |
| 1481 | while (d != read_disk) { | 1743 | while (d != read_disk) { |
| @@ -1486,12 +1748,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
| 1486 | rdev = conf->mirrors[d].rdev; | 1748 | rdev = conf->mirrors[d].rdev; |
| 1487 | if (rdev && | 1749 | if (rdev && |
| 1488 | test_bit(In_sync, &rdev->flags)) { | 1750 | test_bit(In_sync, &rdev->flags)) { |
| 1489 | if (sync_page_io(rdev, sect, s<<9, | 1751 | if (r1_sync_page_io(rdev, sect, s, |
| 1490 | conf->tmppage, READ, false) | 1752 | conf->tmppage, READ)) { |
| 1491 | == 0) | ||
| 1492 | /* Well, this device is dead */ | ||
| 1493 | md_error(mddev, rdev); | ||
| 1494 | else { | ||
| 1495 | atomic_add(s, &rdev->corrected_errors); | 1753 | atomic_add(s, &rdev->corrected_errors); |
| 1496 | printk(KERN_INFO | 1754 | printk(KERN_INFO |
| 1497 | "md/raid1:%s: read error corrected " | 1755 | "md/raid1:%s: read error corrected " |
| @@ -1508,21 +1766,255 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
| 1508 | } | 1766 | } |
| 1509 | } | 1767 | } |
| 1510 | 1768 | ||
| 1769 | static void bi_complete(struct bio *bio, int error) | ||
| 1770 | { | ||
| 1771 | complete((struct completion *)bio->bi_private); | ||
| 1772 | } | ||
| 1773 | |||
| 1774 | static int submit_bio_wait(int rw, struct bio *bio) | ||
| 1775 | { | ||
| 1776 | struct completion event; | ||
| 1777 | rw |= REQ_SYNC; | ||
| 1778 | |||
| 1779 | init_completion(&event); | ||
| 1780 | bio->bi_private = &event; | ||
| 1781 | bio->bi_end_io = bi_complete; | ||
| 1782 | submit_bio(rw, bio); | ||
| 1783 | wait_for_completion(&event); | ||
| 1784 | |||
| 1785 | return test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 1786 | } | ||
| 1787 | |||
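submit_bio_wait turns an asynchronous bio submission into a synchronous call by parking the caller on a completion that the end_io callback signals. A userspace analogue of that pattern (pthreads, not kernel code; purely illustrative):

#include <pthread.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
};

static void complete(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

static void *worker(void *arg)
{
        complete(arg);          /* plays the role of bi_complete() */
        return NULL;
}

int main(void)
{
        struct completion event = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
        };
        pthread_t t;

        pthread_create(&t, NULL, worker, &event);
        wait_for_completion(&event);    /* as submit_bio_wait() does */
        pthread_join(&t, NULL);
        printf("I/O finished\n");
        return 0;
}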
| 1788 | static int narrow_write_error(r1bio_t *r1_bio, int i) | ||
| 1789 | { | ||
| 1790 | mddev_t *mddev = r1_bio->mddev; | ||
| 1791 | conf_t *conf = mddev->private; | ||
| 1792 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
| 1793 | int vcnt, idx; | ||
| 1794 | struct bio_vec *vec; | ||
| 1795 | |||
| 1796 | /* bio has the data to be written to device 'i' where | ||
| 1797 | * we just recently had a write error. | ||
| 1798 | * We repeatedly clone the bio and trim down to one block, | ||
| 1799 | * then try the write. Where the write fails we record | ||
| 1800 | * a bad block. | ||
| 1801 | * It is conceivable that the bio doesn't exactly align with | ||
| 1802 | * blocks. We must handle this somehow. | ||
| 1803 | * | ||
| 1804 | * We currently own a reference on the rdev. | ||
| 1805 | */ | ||
| 1806 | |||
| 1807 | int block_sectors; | ||
| 1808 | sector_t sector; | ||
| 1809 | int sectors; | ||
| 1810 | int sect_to_write = r1_bio->sectors; | ||
| 1811 | int ok = 1; | ||
| 1812 | |||
| 1813 | if (rdev->badblocks.shift < 0) | ||
| 1814 | return 0; | ||
| 1815 | |||
| 1816 | block_sectors = 1 << rdev->badblocks.shift; | ||
| 1817 | sector = r1_bio->sector; | ||
| 1818 | sectors = ((sector + block_sectors) | ||
| 1819 | & ~(sector_t)(block_sectors - 1)) | ||
| 1820 | - sector; | ||
| 1821 | |||
| 1822 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
| 1823 | vcnt = r1_bio->behind_page_count; | ||
| 1824 | vec = r1_bio->behind_bvecs; | ||
| 1825 | idx = 0; | ||
| 1826 | while (vec[idx].bv_page == NULL) | ||
| 1827 | idx++; | ||
| 1828 | } else { | ||
| 1829 | vcnt = r1_bio->master_bio->bi_vcnt; | ||
| 1830 | vec = r1_bio->master_bio->bi_io_vec; | ||
| 1831 | idx = r1_bio->master_bio->bi_idx; | ||
| 1832 | } | ||
| 1833 | while (sect_to_write) { | ||
| 1834 | struct bio *wbio; | ||
| 1835 | if (sectors > sect_to_write) | ||
| 1836 | sectors = sect_to_write; | ||
| 1837 | /* Write at 'sector' for 'sectors' */ | ||
| 1838 | |||
| 1839 | wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); | ||
| 1840 | memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); | ||
| 1841 | wbio->bi_sector = r1_bio->sector; | ||
| 1842 | wbio->bi_rw = WRITE; | ||
| 1843 | wbio->bi_vcnt = vcnt; | ||
| 1844 | wbio->bi_size = r1_bio->sectors << 9; | ||
| 1845 | wbio->bi_idx = idx; | ||
| 1846 | |||
| 1847 | md_trim_bio(wbio, sector - r1_bio->sector, sectors); | ||
| 1848 | wbio->bi_sector += rdev->data_offset; | ||
| 1849 | wbio->bi_bdev = rdev->bdev; | ||
| 1850 | if (submit_bio_wait(WRITE, wbio) == 0) | ||
| 1851 | /* failure! */ | ||
| 1852 | ok = rdev_set_badblocks(rdev, sector, | ||
| 1853 | sectors, 0) | ||
| 1854 | && ok; | ||
| 1855 | |||
| 1856 | bio_put(wbio); | ||
| 1857 | sect_to_write -= sectors; | ||
| 1858 | sector += sectors; | ||
| 1859 | sectors = block_sectors; | ||
| 1860 | } | ||
| 1861 | return ok; | ||
| 1862 | } | ||
| 1863 | |||
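The alignment step at the top of narrow_write_error sizes the first sub-write so it ends on a bad-block-granularity boundary; every later sub-write is a whole block. A standalone sketch of that arithmetic with a worked number (plain C, not kernel code):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Round 'sector' up to the next multiple of block_sectors and return the
 * distance to it: the length of the first, possibly partial, sub-write. */
static sector_t first_chunk_len(sector_t sector, int block_sectors)
{
        return ((sector + block_sectors) & ~(sector_t)(block_sectors - 1))
                - sector;
}

int main(void)
{
        /* with 8-sector granularity, a write starting at sector 21 issues
         * 3 sectors first (21..23), then whole 8-sector blocks */
        printf("%llu\n",
               (unsigned long long)first_chunk_len(21, 8));    /* prints 3 */
        return 0;
}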
| 1864 | static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio) | ||
| 1865 | { | ||
| 1866 | int m; | ||
| 1867 | int s = r1_bio->sectors; | ||
| 1868 | for (m = 0; m < conf->raid_disks ; m++) { | ||
| 1869 | mdk_rdev_t *rdev = conf->mirrors[m].rdev; | ||
| 1870 | struct bio *bio = r1_bio->bios[m]; | ||
| 1871 | if (bio->bi_end_io == NULL) | ||
| 1872 | continue; | ||
| 1873 | if (test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
| 1874 | test_bit(R1BIO_MadeGood, &r1_bio->state)) { | ||
| 1875 | rdev_clear_badblocks(rdev, r1_bio->sector, s); | ||
| 1876 | } | ||
| 1877 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
| 1878 | test_bit(R1BIO_WriteError, &r1_bio->state)) { | ||
| 1879 | if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) | ||
| 1880 | md_error(conf->mddev, rdev); | ||
| 1881 | } | ||
| 1882 | } | ||
| 1883 | put_buf(r1_bio); | ||
| 1884 | md_done_sync(conf->mddev, s, 1); | ||
| 1885 | } | ||
| 1886 | |||
| 1887 | static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio) | ||
| 1888 | { | ||
| 1889 | int m; | ||
| 1890 | for (m = 0; m < conf->raid_disks ; m++) | ||
| 1891 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | ||
| 1892 | mdk_rdev_t *rdev = conf->mirrors[m].rdev; | ||
| 1893 | rdev_clear_badblocks(rdev, | ||
| 1894 | r1_bio->sector, | ||
| 1895 | r1_bio->sectors); | ||
| 1896 | rdev_dec_pending(rdev, conf->mddev); | ||
| 1897 | } else if (r1_bio->bios[m] != NULL) { | ||
| 1898 | /* This drive got a write error. We need to | ||
| 1899 | * narrow down and record precise write | ||
| 1900 | * errors. | ||
| 1901 | */ | ||
| 1902 | if (!narrow_write_error(r1_bio, m)) { | ||
| 1903 | md_error(conf->mddev, | ||
| 1904 | conf->mirrors[m].rdev); | ||
| 1905 | /* an I/O failed, we can't clear the bitmap */ | ||
| 1906 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
| 1907 | } | ||
| 1908 | rdev_dec_pending(conf->mirrors[m].rdev, | ||
| 1909 | conf->mddev); | ||
| 1910 | } | ||
| 1911 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | ||
| 1912 | close_write(r1_bio); | ||
| 1913 | raid_end_bio_io(r1_bio); | ||
| 1914 | } | ||
| 1915 | |||
| 1916 | static void handle_read_error(conf_t *conf, r1bio_t *r1_bio) | ||
| 1917 | { | ||
| 1918 | int disk; | ||
| 1919 | int max_sectors; | ||
| 1920 | mddev_t *mddev = conf->mddev; | ||
| 1921 | struct bio *bio; | ||
| 1922 | char b[BDEVNAME_SIZE]; | ||
| 1923 | mdk_rdev_t *rdev; | ||
| 1924 | |||
| 1925 | clear_bit(R1BIO_ReadError, &r1_bio->state); | ||
| 1926 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
| 1927 | * the block and we can fix it. | ||
| 1928 | * We freeze all other IO, and try reading the block from | ||
| 1929 | * other devices. When we find one, we re-write | ||
| 1930 | * and check if that fixes the read error. | ||
| 1931 | * This is all done synchronously while the array is | ||
| 1932 | * frozen | ||
| 1933 | */ | ||
| 1934 | if (mddev->ro == 0) { | ||
| 1935 | freeze_array(conf); | ||
| 1936 | fix_read_error(conf, r1_bio->read_disk, | ||
| 1937 | r1_bio->sector, r1_bio->sectors); | ||
| 1938 | unfreeze_array(conf); | ||
| 1939 | } else | ||
| 1940 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
| 1941 | |||
| 1942 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
| 1943 | bdevname(bio->bi_bdev, b); | ||
| 1944 | read_more: | ||
| 1945 | disk = read_balance(conf, r1_bio, &max_sectors); | ||
| 1946 | if (disk == -1) { | ||
| 1947 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | ||
| 1948 | " read error for block %llu\n", | ||
| 1949 | mdname(mddev), b, (unsigned long long)r1_bio->sector); | ||
| 1950 | raid_end_bio_io(r1_bio); | ||
| 1951 | } else { | ||
| 1952 | const unsigned long do_sync | ||
| 1953 | = r1_bio->master_bio->bi_rw & REQ_SYNC; | ||
| 1954 | if (bio) { | ||
| 1955 | r1_bio->bios[r1_bio->read_disk] = | ||
| 1956 | mddev->ro ? IO_BLOCKED : NULL; | ||
| 1957 | bio_put(bio); | ||
| 1958 | } | ||
| 1959 | r1_bio->read_disk = disk; | ||
| 1960 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | ||
| 1961 | md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); | ||
| 1962 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
| 1963 | rdev = conf->mirrors[disk].rdev; | ||
| 1964 | printk_ratelimited(KERN_ERR | ||
| 1965 | "md/raid1:%s: redirecting sector %llu" | ||
| 1966 | " to other mirror: %s\n", | ||
| 1967 | mdname(mddev), | ||
| 1968 | (unsigned long long)r1_bio->sector, | ||
| 1969 | bdevname(rdev->bdev, b)); | ||
| 1970 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
| 1971 | bio->bi_bdev = rdev->bdev; | ||
| 1972 | bio->bi_end_io = raid1_end_read_request; | ||
| 1973 | bio->bi_rw = READ | do_sync; | ||
| 1974 | bio->bi_private = r1_bio; | ||
| 1975 | if (max_sectors < r1_bio->sectors) { | ||
| 1976 | /* Drat - have to split this up more */ | ||
| 1977 | struct bio *mbio = r1_bio->master_bio; | ||
| 1978 | int sectors_handled = (r1_bio->sector + max_sectors | ||
| 1979 | - mbio->bi_sector); | ||
| 1980 | r1_bio->sectors = max_sectors; | ||
| 1981 | spin_lock_irq(&conf->device_lock); | ||
| 1982 | if (mbio->bi_phys_segments == 0) | ||
| 1983 | mbio->bi_phys_segments = 2; | ||
| 1984 | else | ||
| 1985 | mbio->bi_phys_segments++; | ||
| 1986 | spin_unlock_irq(&conf->device_lock); | ||
| 1987 | generic_make_request(bio); | ||
| 1988 | bio = NULL; | ||
| 1989 | |||
| 1990 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1991 | |||
| 1992 | r1_bio->master_bio = mbio; | ||
| 1993 | r1_bio->sectors = (mbio->bi_size >> 9) | ||
| 1994 | - sectors_handled; | ||
| 1995 | r1_bio->state = 0; | ||
| 1996 | set_bit(R1BIO_ReadError, &r1_bio->state); | ||
| 1997 | r1_bio->mddev = mddev; | ||
| 1998 | r1_bio->sector = mbio->bi_sector + sectors_handled; | ||
| 1999 | |||
| 2000 | goto read_more; | ||
| 2001 | } else | ||
| 2002 | generic_make_request(bio); | ||
| 2003 | } | ||
| 2004 | } | ||
| 2005 | |||
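handle_read_error as a whole implements "redirect the read to whichever mirror can serve the next prefix, splitting as needed". A much-simplified standalone model of that loop (plain C, not kernel code; the two-mirror setup and the bad_start field are invented for illustration):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

struct mirror {
        int in_sync;
        sector_t bad_start;     /* first bad sector, or (sector_t)-1 if none */
};

/* pick a mirror that can read 'sector', and say how far it can go */
static int pick_mirror(struct mirror *m, int n, sector_t sector,
                       sector_t want, sector_t *can_do)
{
        for (int i = 0; i < n; i++) {
                if (!m[i].in_sync || m[i].bad_start <= sector)
                        continue;
                *can_do = m[i].bad_start - sector;
                if (*can_do > want)
                        *can_do = want;
                return i;
        }
        return -1;
}

int main(void)
{
        struct mirror m[2] = {
                { 1, 100 },             /* mirror 0 goes bad at sector 100 */
                { 1, (sector_t)-1 },    /* mirror 1 is clean */
        };
        sector_t sector = 96, left = 16, chunk;

        while (left) {
                int d = pick_mirror(m, 2, sector, left, &chunk);
                if (d < 0)
                        break;          /* unrecoverable read error */
                printf("read %llu sectors at %llu from mirror %d\n",
                       (unsigned long long)chunk,
                       (unsigned long long)sector, d);
                sector += chunk;
                left -= chunk;
        }
        return 0;
}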
| 1511 | static void raid1d(mddev_t *mddev) | 2006 | static void raid1d(mddev_t *mddev) |
| 1512 | { | 2007 | { |
| 1513 | r1bio_t *r1_bio; | 2008 | r1bio_t *r1_bio; |
| 1514 | struct bio *bio; | ||
| 1515 | unsigned long flags; | 2009 | unsigned long flags; |
| 1516 | conf_t *conf = mddev->private; | 2010 | conf_t *conf = mddev->private; |
| 1517 | struct list_head *head = &conf->retry_list; | 2011 | struct list_head *head = &conf->retry_list; |
| 1518 | mdk_rdev_t *rdev; | ||
| 1519 | struct blk_plug plug; | 2012 | struct blk_plug plug; |
| 1520 | 2013 | ||
| 1521 | md_check_recovery(mddev); | 2014 | md_check_recovery(mddev); |
| 1522 | 2015 | ||
| 1523 | blk_start_plug(&plug); | 2016 | blk_start_plug(&plug); |
| 1524 | for (;;) { | 2017 | for (;;) { |
| 1525 | char b[BDEVNAME_SIZE]; | ||
| 1526 | 2018 | ||
| 1527 | if (atomic_read(&mddev->plug_cnt) == 0) | 2019 | if (atomic_read(&mddev->plug_cnt) == 0) |
| 1528 | flush_pending_writes(conf); | 2020 | flush_pending_writes(conf); |
| @@ -1539,62 +2031,26 @@ static void raid1d(mddev_t *mddev) | |||
| 1539 | 2031 | ||
| 1540 | mddev = r1_bio->mddev; | 2032 | mddev = r1_bio->mddev; |
| 1541 | conf = mddev->private; | 2033 | conf = mddev->private; |
| 1542 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) | 2034 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { |
| 1543 | sync_request_write(mddev, r1_bio); | 2035 | if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
| 1544 | else { | 2036 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
| 1545 | int disk; | 2037 | handle_sync_write_finished(conf, r1_bio); |
| 1546 | 2038 | else | |
| 1547 | /* we got a read error. Maybe the drive is bad. Maybe just | 2039 | sync_request_write(mddev, r1_bio); |
| 1548 | * the block and we can fix it. | 2040 | } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
| 1549 | * We freeze all other IO, and try reading the block from | 2041 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
| 1550 | * other devices. When we find one, we re-write | 2042 | handle_write_finished(conf, r1_bio); |
| 1551 | * and check it that fixes the read error. | 2043 | else if (test_bit(R1BIO_ReadError, &r1_bio->state)) |
| 1552 | * This is all done synchronously while the array is | 2044 | handle_read_error(conf, r1_bio); |
| 1553 | * frozen | 2045 | else |
| 2046 | /* just a partial read to be scheduled from separate | ||
| 2047 | * context | ||
| 1554 | */ | 2048 | */ |
| 1555 | if (mddev->ro == 0) { | 2049 | generic_make_request(r1_bio->bios[r1_bio->read_disk]); |
| 1556 | freeze_array(conf); | 2050 | |
| 1557 | fix_read_error(conf, r1_bio->read_disk, | ||
| 1558 | r1_bio->sector, | ||
| 1559 | r1_bio->sectors); | ||
| 1560 | unfreeze_array(conf); | ||
| 1561 | } else | ||
| 1562 | md_error(mddev, | ||
| 1563 | conf->mirrors[r1_bio->read_disk].rdev); | ||
| 1564 | |||
| 1565 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
| 1566 | if ((disk=read_balance(conf, r1_bio)) == -1) { | ||
| 1567 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | ||
| 1568 | " read error for block %llu\n", | ||
| 1569 | mdname(mddev), | ||
| 1570 | bdevname(bio->bi_bdev,b), | ||
| 1571 | (unsigned long long)r1_bio->sector); | ||
| 1572 | raid_end_bio_io(r1_bio); | ||
| 1573 | } else { | ||
| 1574 | const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; | ||
| 1575 | r1_bio->bios[r1_bio->read_disk] = | ||
| 1576 | mddev->ro ? IO_BLOCKED : NULL; | ||
| 1577 | r1_bio->read_disk = disk; | ||
| 1578 | bio_put(bio); | ||
| 1579 | bio = bio_clone_mddev(r1_bio->master_bio, | ||
| 1580 | GFP_NOIO, mddev); | ||
| 1581 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
| 1582 | rdev = conf->mirrors[disk].rdev; | ||
| 1583 | if (printk_ratelimit()) | ||
| 1584 | printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" | ||
| 1585 | " other mirror: %s\n", | ||
| 1586 | mdname(mddev), | ||
| 1587 | (unsigned long long)r1_bio->sector, | ||
| 1588 | bdevname(rdev->bdev,b)); | ||
| 1589 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
| 1590 | bio->bi_bdev = rdev->bdev; | ||
| 1591 | bio->bi_end_io = raid1_end_read_request; | ||
| 1592 | bio->bi_rw = READ | do_sync; | ||
| 1593 | bio->bi_private = r1_bio; | ||
| 1594 | generic_make_request(bio); | ||
| 1595 | } | ||
| 1596 | } | ||
| 1597 | cond_resched(); | 2051 | cond_resched(); |
| 2052 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
| 2053 | md_check_recovery(mddev); | ||
| 1598 | } | 2054 | } |
| 1599 | blk_finish_plug(&plug); | 2055 | blk_finish_plug(&plug); |
| 1600 | } | 2056 | } |
| @@ -1636,6 +2092,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1636 | int write_targets = 0, read_targets = 0; | 2092 | int write_targets = 0, read_targets = 0; |
| 1637 | sector_t sync_blocks; | 2093 | sector_t sync_blocks; |
| 1638 | int still_degraded = 0; | 2094 | int still_degraded = 0; |
| 2095 | int good_sectors = RESYNC_SECTORS; | ||
| 2096 | int min_bad = 0; /* number of sectors that are bad in all devices */ | ||
| 1639 | 2097 | ||
| 1640 | if (!conf->r1buf_pool) | 2098 | if (!conf->r1buf_pool) |
| 1641 | if (init_resync(conf)) | 2099 | if (init_resync(conf)) |
| @@ -1723,36 +2181,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1723 | 2181 | ||
| 1724 | rdev = rcu_dereference(conf->mirrors[i].rdev); | 2182 | rdev = rcu_dereference(conf->mirrors[i].rdev); |
| 1725 | if (rdev == NULL || | 2183 | if (rdev == NULL || |
| 1726 | test_bit(Faulty, &rdev->flags)) { | 2184 | test_bit(Faulty, &rdev->flags)) { |
| 1727 | still_degraded = 1; | 2185 | still_degraded = 1; |
| 1728 | continue; | ||
| 1729 | } else if (!test_bit(In_sync, &rdev->flags)) { | 2186 | } else if (!test_bit(In_sync, &rdev->flags)) { |
| 1730 | bio->bi_rw = WRITE; | 2187 | bio->bi_rw = WRITE; |
| 1731 | bio->bi_end_io = end_sync_write; | 2188 | bio->bi_end_io = end_sync_write; |
| 1732 | write_targets ++; | 2189 | write_targets ++; |
| 1733 | } else { | 2190 | } else { |
| 1734 | /* may need to read from here */ | 2191 | /* may need to read from here */ |
| 1735 | bio->bi_rw = READ; | 2192 | sector_t first_bad = MaxSector; |
| 1736 | bio->bi_end_io = end_sync_read; | 2193 | int bad_sectors; |
| 1737 | if (test_bit(WriteMostly, &rdev->flags)) { | 2194 | |
| 1738 | if (wonly < 0) | 2195 | if (is_badblock(rdev, sector_nr, good_sectors, |
| 1739 | wonly = i; | 2196 | &first_bad, &bad_sectors)) { |
| 1740 | } else { | 2197 | if (first_bad > sector_nr) |
| 1741 | if (disk < 0) | 2198 | good_sectors = first_bad - sector_nr; |
| 1742 | disk = i; | 2199 | else { |
| 2200 | bad_sectors -= (sector_nr - first_bad); | ||
| 2201 | if (min_bad == 0 || | ||
| 2202 | min_bad > bad_sectors) | ||
| 2203 | min_bad = bad_sectors; | ||
| 2204 | } | ||
| 2205 | } | ||
| 2206 | if (sector_nr < first_bad) { | ||
| 2207 | if (test_bit(WriteMostly, &rdev->flags)) { | ||
| 2208 | if (wonly < 0) | ||
| 2209 | wonly = i; | ||
| 2210 | } else { | ||
| 2211 | if (disk < 0) | ||
| 2212 | disk = i; | ||
| 2213 | } | ||
| 2214 | bio->bi_rw = READ; | ||
| 2215 | bio->bi_end_io = end_sync_read; | ||
| 2216 | read_targets++; | ||
| 1743 | } | 2217 | } |
| 1744 | read_targets++; | ||
| 1745 | } | 2218 | } |
| 1746 | atomic_inc(&rdev->nr_pending); | 2219 | if (bio->bi_end_io) { |
| 1747 | bio->bi_sector = sector_nr + rdev->data_offset; | 2220 | atomic_inc(&rdev->nr_pending); |
| 1748 | bio->bi_bdev = rdev->bdev; | 2221 | bio->bi_sector = sector_nr + rdev->data_offset; |
| 1749 | bio->bi_private = r1_bio; | 2222 | bio->bi_bdev = rdev->bdev; |
| 2223 | bio->bi_private = r1_bio; | ||
| 2224 | } | ||
| 1750 | } | 2225 | } |
| 1751 | rcu_read_unlock(); | 2226 | rcu_read_unlock(); |
| 1752 | if (disk < 0) | 2227 | if (disk < 0) |
| 1753 | disk = wonly; | 2228 | disk = wonly; |
| 1754 | r1_bio->read_disk = disk; | 2229 | r1_bio->read_disk = disk; |
| 1755 | 2230 | ||
| 2231 | if (read_targets == 0 && min_bad > 0) { | ||
| 2232 | /* These sectors are bad on all InSync devices, so we | ||
| 2233 | * need to mark them bad on all write targets | ||
| 2234 | */ | ||
| 2235 | int ok = 1; | ||
| 2236 | for (i = 0 ; i < conf->raid_disks ; i++) | ||
| 2237 | if (r1_bio->bios[i]->bi_end_io == end_sync_write) { | ||
| 2238 | mdk_rdev_t *rdev = | ||
| 2239 | rcu_dereference(conf->mirrors[i].rdev); | ||
| 2240 | ok = rdev_set_badblocks(rdev, sector_nr, | ||
| 2241 | min_bad, 0 | ||
| 2242 | ) && ok; | ||
| 2243 | } | ||
| 2244 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
| 2245 | *skipped = 1; | ||
| 2246 | put_buf(r1_bio); | ||
| 2247 | |||
| 2248 | if (!ok) { | ||
| 2249 | /* Cannot record the badblocks, so need to | ||
| 2250 | * abort the resync. | ||
| 2251 | * If there are multiple read targets, could just | ||
| 2252 | * fail the really bad ones ??? | ||
| 2253 | */ | ||
| 2254 | conf->recovery_disabled = mddev->recovery_disabled; | ||
| 2255 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
| 2256 | return 0; | ||
| 2257 | } else | ||
| 2258 | return min_bad; | ||
| 2259 | |||
| 2260 | } | ||
| 2261 | if (min_bad > 0 && min_bad < good_sectors) { | ||
| 2262 | /* only resync enough to reach the next bad->good | ||
| 2263 | * transition */ | ||
| 2264 | good_sectors = min_bad; | ||
| 2265 | } | ||
| 2266 | |||
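The resync window bookkeeping above reduces to two quantities: good_sectors, how far resync may proceed before hitting a bad range on some readable device, and min_bad, how much is bad on every read candidate. A simplified standalone model (plain C, not kernel code; it takes the minimum across devices, which is the intended effect of the per-device loop):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

struct window {
        sector_t good_sectors;  /* how far we may resync from here */
        sector_t min_bad;       /* 0, or sectors bad on all read candidates */
};

/* Account one device's bad range against the resync window starting at
 * sector_nr.  first_bad/bad_sectors come from the bad-block lookup. */
static void account_bad_range(struct window *w, sector_t sector_nr,
                              sector_t first_bad, sector_t bad_sectors)
{
        if (first_bad > sector_nr) {
                sector_t good = first_bad - sector_nr;
                if (good < w->good_sectors)
                        w->good_sectors = good;
        } else {
                sector_t remaining = bad_sectors - (sector_nr - first_bad);
                if (w->min_bad == 0 || remaining < w->min_bad)
                        w->min_bad = remaining;
        }
}

int main(void)
{
        struct window w = { 128, 0 };           /* 128-sector window */
        account_bad_range(&w, 1000, 1040, 16);  /* bad range starts later */
        account_bad_range(&w, 1000, 992, 24);   /* window start is bad */
        printf("good=%llu min_bad=%llu\n",
               (unsigned long long)w.good_sectors,
               (unsigned long long)w.min_bad);  /* good=40 min_bad=16 */
        return 0;
}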
| 1756 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) | 2267 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) |
| 1757 | /* extra read targets are also write targets */ | 2268 | /* extra read targets are also write targets */ |
| 1758 | write_targets += read_targets-1; | 2269 | write_targets += read_targets-1; |
| @@ -1769,6 +2280,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1769 | 2280 | ||
| 1770 | if (max_sector > mddev->resync_max) | 2281 | if (max_sector > mddev->resync_max) |
| 1771 | max_sector = mddev->resync_max; /* Don't do IO beyond here */ | 2282 | max_sector = mddev->resync_max; /* Don't do IO beyond here */ |
| 2283 | if (max_sector > sector_nr + good_sectors) | ||
| 2284 | max_sector = sector_nr + good_sectors; | ||
| 1772 | nr_sectors = 0; | 2285 | nr_sectors = 0; |
| 1773 | sync_blocks = 0; | 2286 | sync_blocks = 0; |
| 1774 | do { | 2287 | do { |
| @@ -2154,18 +2667,13 @@ static int raid1_reshape(mddev_t *mddev) | |||
| 2154 | for (d = d2 = 0; d < conf->raid_disks; d++) { | 2667 | for (d = d2 = 0; d < conf->raid_disks; d++) { |
| 2155 | mdk_rdev_t *rdev = conf->mirrors[d].rdev; | 2668 | mdk_rdev_t *rdev = conf->mirrors[d].rdev; |
| 2156 | if (rdev && rdev->raid_disk != d2) { | 2669 | if (rdev && rdev->raid_disk != d2) { |
| 2157 | char nm[20]; | 2670 | sysfs_unlink_rdev(mddev, rdev); |
| 2158 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 2159 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 2160 | rdev->raid_disk = d2; | 2671 | rdev->raid_disk = d2; |
| 2161 | sprintf(nm, "rd%d", rdev->raid_disk); | 2672 | sysfs_unlink_rdev(mddev, rdev); |
| 2162 | sysfs_remove_link(&mddev->kobj, nm); | 2673 | if (sysfs_link_rdev(mddev, rdev)) |
| 2163 | if (sysfs_create_link(&mddev->kobj, | ||
| 2164 | &rdev->kobj, nm)) | ||
| 2165 | printk(KERN_WARNING | 2674 | printk(KERN_WARNING |
| 2166 | "md/raid1:%s: cannot register " | 2675 | "md/raid1:%s: cannot register rd%d\n", |
| 2167 | "%s\n", | 2676 | mdname(mddev), rdev->raid_disk); |
| 2168 | mdname(mddev), nm); | ||
| 2169 | } | 2677 | } |
| 2170 | if (rdev) | 2678 | if (rdev) |
| 2171 | newmirrors[d2++].rdev = rdev; | 2679 | newmirrors[d2++].rdev = rdev; |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e743a64fac4f..e0d676b48974 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
| @@ -48,6 +48,12 @@ struct r1_private_data_s { | |||
| 48 | * (fresh device added). | 48 | * (fresh device added). |
| 49 | * Cleared when a sync completes. | 49 | * Cleared when a sync completes. |
| 50 | */ | 50 | */ |
| 51 | int recovery_disabled; /* when the same as | ||
| 52 | * mddev->recovery_disabled | ||
| 53 | * we don't allow recovery | ||
| 54 | * to be attempted as we | ||
| 55 | * expect a read error | ||
| 56 | */ | ||
| 51 | 57 | ||
| 52 | wait_queue_head_t wait_barrier; | 58 | wait_queue_head_t wait_barrier; |
| 53 | 59 | ||
| @@ -95,7 +101,7 @@ struct r1bio_s { | |||
| 95 | 101 | ||
| 96 | struct list_head retry_list; | 102 | struct list_head retry_list; |
| 97 | /* Next two are only valid when R1BIO_BehindIO is set */ | 103 | /* Next two are only valid when R1BIO_BehindIO is set */ |
| 98 | struct page **behind_pages; | 104 | struct bio_vec *behind_bvecs; |
| 99 | int behind_page_count; | 105 | int behind_page_count; |
| 100 | /* | 106 | /* |
| 101 | * if the IO is in WRITE direction, then multiple bios are used. | 107 | * if the IO is in WRITE direction, then multiple bios are used. |
| @@ -110,13 +116,24 @@ struct r1bio_s { | |||
| 110 | * correct the read error. To keep track of bad blocks on a per-bio | 116 | * correct the read error. To keep track of bad blocks on a per-bio |
| 111 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | 117 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer |
| 112 | */ | 118 | */ |
| 113 | #define IO_BLOCKED ((struct bio*)1) | 119 | #define IO_BLOCKED ((struct bio *)1) |
| 120 | /* When we successfully write to a known bad-block, we need to remove the | ||
| 121 | * bad-block marking which must be done from process context. So we record | ||
| 122 | * the success by setting bios[n] to IO_MADE_GOOD | ||
| 123 | */ | ||
| 124 | #define IO_MADE_GOOD ((struct bio *)2) | ||
| 125 | |||
| 126 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
| 114 | 127 | ||
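IO_BLOCKED and IO_MADE_GOOD are sentinel pointer values, not real bios, which is why callers must test BIO_SPECIAL() before treating an entry as something they may bio_put(). A standalone illustration of the same trick (plain C, not kernel code):

#include <stdio.h>

struct bio;     /* opaque for the purpose of this sketch */

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static const char *classify(struct bio *b)
{
        if (!BIO_SPECIAL(b))
                return "real bio pointer";
        if (b == NULL)
                return "no I/O issued to this device";
        if (b == IO_BLOCKED)
                return "failed; do not retry here";
        return "write over a known bad block succeeded (IO_MADE_GOOD)";
}

int main(void)
{
        printf("%s\n", classify(IO_MADE_GOOD));
        return 0;
}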
| 115 | /* bits for r1bio.state */ | 128 | /* bits for r1bio.state */ |
| 116 | #define R1BIO_Uptodate 0 | 129 | #define R1BIO_Uptodate 0 |
| 117 | #define R1BIO_IsSync 1 | 130 | #define R1BIO_IsSync 1 |
| 118 | #define R1BIO_Degraded 2 | 131 | #define R1BIO_Degraded 2 |
| 119 | #define R1BIO_BehindIO 3 | 132 | #define R1BIO_BehindIO 3 |
| 133 | /* Set ReadError on bios that experience a read error so that | ||
| 134 | * raid1d knows what to do with them. | ||
| 135 | */ | ||
| 136 | #define R1BIO_ReadError 4 | ||
| 120 | /* For write-behind requests, we call bi_end_io when | 137 | /* For write-behind requests, we call bi_end_io when |
| 121 | * the last non-write-behind device completes, providing | 138 | * the last non-write-behind device completes, providing |
| 122 | * any write was successful. Otherwise we call when | 139 | * any write was successful. Otherwise we call when |
| @@ -125,6 +142,11 @@ struct r1bio_s { | |||
| 125 | * Record that bi_end_io was called with this flag... | 142 | * Record that bi_end_io was called with this flag... |
| 126 | */ | 143 | */ |
| 127 | #define R1BIO_Returned 6 | 144 | #define R1BIO_Returned 6 |
| 145 | /* If a write for this request means we can clear some | ||
| 146 | * known-bad-block records, we set this flag | ||
| 147 | */ | ||
| 148 | #define R1BIO_MadeGood 7 | ||
| 149 | #define R1BIO_WriteError 8 | ||
| 128 | 150 | ||
| 129 | extern int md_raid1_congested(mddev_t *mddev, int bits); | 151 | extern int md_raid1_congested(mddev_t *mddev, int bits); |
| 130 | 152 | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e846688962f..8b29cd4f01c8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
| 23 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
| 24 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
| 25 | #include <linux/ratelimit.h> | ||
| 25 | #include "md.h" | 26 | #include "md.h" |
| 26 | #include "raid10.h" | 27 | #include "raid10.h" |
| 27 | #include "raid0.h" | 28 | #include "raid0.h" |
| @@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
| 123 | for (j = 0 ; j < nalloc; j++) { | 124 | for (j = 0 ; j < nalloc; j++) { |
| 124 | bio = r10_bio->devs[j].bio; | 125 | bio = r10_bio->devs[j].bio; |
| 125 | for (i = 0; i < RESYNC_PAGES; i++) { | 126 | for (i = 0; i < RESYNC_PAGES; i++) { |
| 126 | page = alloc_page(gfp_flags); | 127 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, |
| 128 | &conf->mddev->recovery)) { | ||
| 129 | /* we can share bv_page's during recovery */ | ||
| 130 | struct bio *rbio = r10_bio->devs[0].bio; | ||
| 131 | page = rbio->bi_io_vec[i].bv_page; | ||
| 132 | get_page(page); | ||
| 133 | } else | ||
| 134 | page = alloc_page(gfp_flags); | ||
| 127 | if (unlikely(!page)) | 135 | if (unlikely(!page)) |
| 128 | goto out_free_pages; | 136 | goto out_free_pages; |
| 129 | 137 | ||
| @@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | |||
| 173 | 181 | ||
| 174 | for (i = 0; i < conf->copies; i++) { | 182 | for (i = 0; i < conf->copies; i++) { |
| 175 | struct bio **bio = & r10_bio->devs[i].bio; | 183 | struct bio **bio = & r10_bio->devs[i].bio; |
| 176 | if (*bio && *bio != IO_BLOCKED) | 184 | if (!BIO_SPECIAL(*bio)) |
| 177 | bio_put(*bio); | 185 | bio_put(*bio); |
| 178 | *bio = NULL; | 186 | *bio = NULL; |
| 179 | } | 187 | } |
| @@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
| 183 | { | 191 | { |
| 184 | conf_t *conf = r10_bio->mddev->private; | 192 | conf_t *conf = r10_bio->mddev->private; |
| 185 | 193 | ||
| 186 | /* | ||
| 187 | * Wake up any possible resync thread that waits for the device | ||
| 188 | * to go idle. | ||
| 189 | */ | ||
| 190 | allow_barrier(conf); | ||
| 191 | |||
| 192 | put_all_bios(conf, r10_bio); | 194 | put_all_bios(conf, r10_bio); |
| 193 | mempool_free(r10_bio, conf->r10bio_pool); | 195 | mempool_free(r10_bio, conf->r10bio_pool); |
| 194 | } | 196 | } |
| @@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
| 227 | static void raid_end_bio_io(r10bio_t *r10_bio) | 229 | static void raid_end_bio_io(r10bio_t *r10_bio) |
| 228 | { | 230 | { |
| 229 | struct bio *bio = r10_bio->master_bio; | 231 | struct bio *bio = r10_bio->master_bio; |
| 232 | int done; | ||
| 233 | conf_t *conf = r10_bio->mddev->private; | ||
| 230 | 234 | ||
| 231 | bio_endio(bio, | 235 | if (bio->bi_phys_segments) { |
| 232 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | 236 | unsigned long flags; |
| 237 | spin_lock_irqsave(&conf->device_lock, flags); | ||
| 238 | bio->bi_phys_segments--; | ||
| 239 | done = (bio->bi_phys_segments == 0); | ||
| 240 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
| 241 | } else | ||
| 242 | done = 1; | ||
| 243 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
| 244 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 245 | if (done) { | ||
| 246 | bio_endio(bio, 0); | ||
| 247 | /* | ||
| 248 | * Wake up any possible resync thread that waits for the device | ||
| 249 | * to go idle. | ||
| 250 | */ | ||
| 251 | allow_barrier(conf); | ||
| 252 | } | ||
| 233 | free_r10bio(r10_bio); | 253 | free_r10bio(r10_bio); |
| 234 | } | 254 | } |
| 235 | 255 | ||
| @@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) | |||
| 244 | r10_bio->devs[slot].addr + (r10_bio->sectors); | 264 | r10_bio->devs[slot].addr + (r10_bio->sectors); |
| 245 | } | 265 | } |
| 246 | 266 | ||
| 267 | /* | ||
| 268 | * Find the disk number which triggered given bio | ||
| 269 | */ | ||
| 270 | static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, | ||
| 271 | struct bio *bio, int *slotp) | ||
| 272 | { | ||
| 273 | int slot; | ||
| 274 | |||
| 275 | for (slot = 0; slot < conf->copies; slot++) | ||
| 276 | if (r10_bio->devs[slot].bio == bio) | ||
| 277 | break; | ||
| 278 | |||
| 279 | BUG_ON(slot == conf->copies); | ||
| 280 | update_head_pos(slot, r10_bio); | ||
| 281 | |||
| 282 | if (slotp) | ||
| 283 | *slotp = slot; | ||
| 284 | return r10_bio->devs[slot].devnum; | ||
| 285 | } | ||
| 286 | |||
| 247 | static void raid10_end_read_request(struct bio *bio, int error) | 287 | static void raid10_end_read_request(struct bio *bio, int error) |
| 248 | { | 288 | { |
| 249 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 289 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| @@ -277,34 +317,45 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
| 277 | * oops, read error - keep the refcount on the rdev | 317 | * oops, read error - keep the refcount on the rdev |
| 278 | */ | 318 | */ |
| 279 | char b[BDEVNAME_SIZE]; | 319 | char b[BDEVNAME_SIZE]; |
| 280 | if (printk_ratelimit()) | 320 | printk_ratelimited(KERN_ERR |
| 281 | printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", | 321 | "md/raid10:%s: %s: rescheduling sector %llu\n", |
| 282 | mdname(conf->mddev), | 322 | mdname(conf->mddev), |
| 283 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | 323 | bdevname(conf->mirrors[dev].rdev->bdev, b), |
| 324 | (unsigned long long)r10_bio->sector); | ||
| 325 | set_bit(R10BIO_ReadError, &r10_bio->state); | ||
| 284 | reschedule_retry(r10_bio); | 326 | reschedule_retry(r10_bio); |
| 285 | } | 327 | } |
| 286 | } | 328 | } |
| 287 | 329 | ||
| 330 | static void close_write(r10bio_t *r10_bio) | ||
| 331 | { | ||
| 332 | /* clear the bitmap if all writes complete successfully */ | ||
| 333 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | ||
| 334 | r10_bio->sectors, | ||
| 335 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
| 336 | 0); | ||
| 337 | md_write_end(r10_bio->mddev); | ||
| 338 | } | ||
| 339 | |||
| 288 | static void raid10_end_write_request(struct bio *bio, int error) | 340 | static void raid10_end_write_request(struct bio *bio, int error) |
| 289 | { | 341 | { |
| 290 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 342 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 291 | r10bio_t *r10_bio = bio->bi_private; | 343 | r10bio_t *r10_bio = bio->bi_private; |
| 292 | int slot, dev; | 344 | int dev; |
| 345 | int dec_rdev = 1; | ||
| 293 | conf_t *conf = r10_bio->mddev->private; | 346 | conf_t *conf = r10_bio->mddev->private; |
| 347 | int slot; | ||
| 294 | 348 | ||
| 295 | for (slot = 0; slot < conf->copies; slot++) | 349 | dev = find_bio_disk(conf, r10_bio, bio, &slot); |
| 296 | if (r10_bio->devs[slot].bio == bio) | ||
| 297 | break; | ||
| 298 | dev = r10_bio->devs[slot].devnum; | ||
| 299 | 350 | ||
| 300 | /* | 351 | /* |
| 301 | * this branch is our 'one mirror IO has finished' event handler: | 352 | * this branch is our 'one mirror IO has finished' event handler: |
| 302 | */ | 353 | */ |
| 303 | if (!uptodate) { | 354 | if (!uptodate) { |
| 304 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | 355 | set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); |
| 305 | /* an I/O failed, we can't clear the bitmap */ | 356 | set_bit(R10BIO_WriteError, &r10_bio->state); |
| 306 | set_bit(R10BIO_Degraded, &r10_bio->state); | 357 | dec_rdev = 0; |
| 307 | } else | 358 | } else { |
| 308 | /* | 359 | /* |
| 309 | * Set R10BIO_Uptodate in our master bio, so that | 360 | * Set R10BIO_Uptodate in our master bio, so that |
| 310 | * we will return a good error code to the higher | 361 | * we will return a good error code to the higher |
| @@ -314,9 +365,22 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
| 314 | * user-side. So if something waits for IO, then it will | 365 | * user-side. So if something waits for IO, then it will |
| 315 | * wait for the 'master' bio. | 366 | * wait for the 'master' bio. |
| 316 | */ | 367 | */ |
| 368 | sector_t first_bad; | ||
| 369 | int bad_sectors; | ||
| 370 | |||
| 317 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 371 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
| 318 | 372 | ||
| 319 | update_head_pos(slot, r10_bio); | 373 | /* Maybe we can clear some bad blocks. */ |
| 374 | if (is_badblock(conf->mirrors[dev].rdev, | ||
| 375 | r10_bio->devs[slot].addr, | ||
| 376 | r10_bio->sectors, | ||
| 377 | &first_bad, &bad_sectors)) { | ||
| 378 | bio_put(bio); | ||
| 379 | r10_bio->devs[slot].bio = IO_MADE_GOOD; | ||
| 380 | dec_rdev = 0; | ||
| 381 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
| 382 | } | ||
| 383 | } | ||
| 320 | 384 | ||
| 321 | /* | 385 | /* |
| 322 | * | 386 | * |
| @@ -324,16 +388,18 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
| 324 | * already. | 388 | * already. |
| 325 | */ | 389 | */ |
| 326 | if (atomic_dec_and_test(&r10_bio->remaining)) { | 390 | if (atomic_dec_and_test(&r10_bio->remaining)) { |
| 327 | /* clear the bitmap if all writes complete successfully */ | 391 | if (test_bit(R10BIO_WriteError, &r10_bio->state)) |
| 328 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | 392 | reschedule_retry(r10_bio); |
| 329 | r10_bio->sectors, | 393 | else { |
| 330 | !test_bit(R10BIO_Degraded, &r10_bio->state), | 394 | close_write(r10_bio); |
| 331 | 0); | 395 | if (test_bit(R10BIO_MadeGood, &r10_bio->state)) |
| 332 | md_write_end(r10_bio->mddev); | 396 | reschedule_retry(r10_bio); |
| 333 | raid_end_bio_io(r10_bio); | 397 | else |
| 398 | raid_end_bio_io(r10_bio); | ||
| 399 | } | ||
| 334 | } | 400 | } |
| 335 | 401 | if (dec_rdev) | |
| 336 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | 402 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); |
| 337 | } | 403 | } |
| 338 | 404 | ||
| 339 | 405 | ||
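The rewritten raid10_end_write_request() above no longer calls md_error() straight from the completion path: a failed copy records WriteErrorSeen on the rdev and R10BIO_WriteError on the r10_bio, a successful copy that lands on a range already in the bad-block log is marked IO_MADE_GOOD, and in both cases the rdev reference is kept so raid10d can update the log later. A minimal userspace sketch of that decision flow (the flag and helper names below are simplified stand-ins, not the kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    enum outcome { END_IO, RETRY_IN_RAID10D };

    /* Assumed stand-ins for the R10BIO_WriteError / R10BIO_MadeGood bits. */
    struct wstate { bool write_error; bool made_good; };

    /* Per-copy completion: returns 1 if the rdev reference can be dropped
     * now, 0 if it must stay pinned for the retry path. */
    static int on_copy_done(struct wstate *st, bool uptodate, bool hit_known_bad)
    {
        if (!uptodate) {
            st->write_error = true;   /* remember the error for raid10d */
            return 0;
        }
        if (hit_known_bad) {
            st->made_good = true;     /* write succeeded over a logged bad block */
            return 0;                 /* raid10d will clear the log entry */
        }
        return 1;                     /* plain success: release the rdev here */
    }

    /* Last writer: decide what happens when ->remaining reaches zero. */
    static enum outcome on_last_copy(const struct wstate *st)
    {
        if (st->write_error || st->made_good)
            return RETRY_IN_RAID10D;  /* reschedule_retry(); the MadeGood-only
                                       * case runs close_write() first */
        return END_IO;                /* close_write() + raid_end_bio_io() */
    }

    int main(void)
    {
        struct wstate st = { false, false };
        int dec = on_copy_done(&st, false, false);
        printf("dec_rdev=%d final=%d\n", dec, on_last_copy(&st));
        return 0;
    }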
| @@ -484,11 +550,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
| 484 | * FIXME: possibly should rethink readbalancing and do it differently | 550 | * FIXME: possibly should rethink readbalancing and do it differently |
| 485 | * depending on near_copies / far_copies geometry. | 551 | * depending on near_copies / far_copies geometry. |
| 486 | */ | 552 | */ |
| 487 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 553 | static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) |
| 488 | { | 554 | { |
| 489 | const sector_t this_sector = r10_bio->sector; | 555 | const sector_t this_sector = r10_bio->sector; |
| 490 | int disk, slot; | 556 | int disk, slot; |
| 491 | const int sectors = r10_bio->sectors; | 557 | int sectors = r10_bio->sectors; |
| 558 | int best_good_sectors; | ||
| 492 | sector_t new_distance, best_dist; | 559 | sector_t new_distance, best_dist; |
| 493 | mdk_rdev_t *rdev; | 560 | mdk_rdev_t *rdev; |
| 494 | int do_balance; | 561 | int do_balance; |
| @@ -497,8 +564,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
| 497 | raid10_find_phys(conf, r10_bio); | 564 | raid10_find_phys(conf, r10_bio); |
| 498 | rcu_read_lock(); | 565 | rcu_read_lock(); |
| 499 | retry: | 566 | retry: |
| 567 | sectors = r10_bio->sectors; | ||
| 500 | best_slot = -1; | 568 | best_slot = -1; |
| 501 | best_dist = MaxSector; | 569 | best_dist = MaxSector; |
| 570 | best_good_sectors = 0; | ||
| 502 | do_balance = 1; | 571 | do_balance = 1; |
| 503 | /* | 572 | /* |
| 504 | * Check if we can balance. We can balance on the whole | 573 | * Check if we can balance. We can balance on the whole |
| @@ -511,6 +580,10 @@ retry: | |||
| 511 | do_balance = 0; | 580 | do_balance = 0; |
| 512 | 581 | ||
| 513 | for (slot = 0; slot < conf->copies ; slot++) { | 582 | for (slot = 0; slot < conf->copies ; slot++) { |
| 583 | sector_t first_bad; | ||
| 584 | int bad_sectors; | ||
| 585 | sector_t dev_sector; | ||
| 586 | |||
| 514 | if (r10_bio->devs[slot].bio == IO_BLOCKED) | 587 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
| 515 | continue; | 588 | continue; |
| 516 | disk = r10_bio->devs[slot].devnum; | 589 | disk = r10_bio->devs[slot].devnum; |
| @@ -520,6 +593,37 @@ retry: | |||
| 520 | if (!test_bit(In_sync, &rdev->flags)) | 593 | if (!test_bit(In_sync, &rdev->flags)) |
| 521 | continue; | 594 | continue; |
| 522 | 595 | ||
| 596 | dev_sector = r10_bio->devs[slot].addr; | ||
| 597 | if (is_badblock(rdev, dev_sector, sectors, | ||
| 598 | &first_bad, &bad_sectors)) { | ||
| 599 | if (best_dist < MaxSector) | ||
| 600 | /* Already have a better slot */ | ||
| 601 | continue; | ||
| 602 | if (first_bad <= dev_sector) { | ||
| 603 | /* Cannot read here. If this is the | ||
| 604 | * 'primary' device, then we must not read | ||
| 605 | * beyond 'bad_sectors' from another device. | ||
| 606 | */ | ||
| 607 | bad_sectors -= (dev_sector - first_bad); | ||
| 608 | if (!do_balance && sectors > bad_sectors) | ||
| 609 | sectors = bad_sectors; | ||
| 610 | if (best_good_sectors > sectors) | ||
| 611 | best_good_sectors = sectors; | ||
| 612 | } else { | ||
| 613 | sector_t good_sectors = | ||
| 614 | first_bad - dev_sector; | ||
| 615 | if (good_sectors > best_good_sectors) { | ||
| 616 | best_good_sectors = good_sectors; | ||
| 617 | best_slot = slot; | ||
| 618 | } | ||
| 619 | if (!do_balance) | ||
| 620 | /* Must read from here */ | ||
| 621 | break; | ||
| 622 | } | ||
| 623 | continue; | ||
| 624 | } else | ||
| 625 | best_good_sectors = sectors; | ||
| 626 | |||
| 523 | if (!do_balance) | 627 | if (!do_balance) |
| 524 | break; | 628 | break; |
| 525 | 629 | ||
| @@ -561,6 +665,7 @@ retry: | |||
| 561 | } else | 665 | } else |
| 562 | disk = -1; | 666 | disk = -1; |
| 563 | rcu_read_unlock(); | 667 | rcu_read_unlock(); |
| 668 | *max_sectors = best_good_sectors; | ||
| 564 | 669 | ||
| 565 | return disk; | 670 | return disk; |
| 566 | } | 671 | } |
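read_balance() now reports, through *max_sectors, how many sectors can safely be read from the chosen slot before a logged bad block is reached, so the caller can split the request instead of giving up on the mirror. The clipping arithmetic can be modelled on its own (illustrative values, not the kernel API):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    /* Given a request [dev_sector, dev_sector+sectors) and a bad range
     * [first_bad, first_bad+bad_sectors), how many leading sectors are good?
     * Returns 0 when the request starts inside the bad range. */
    static sector_t good_prefix(sector_t dev_sector, sector_t sectors,
                                sector_t first_bad, sector_t bad_sectors)
    {
        if (first_bad <= dev_sector) {
            /* Request begins on a bad block: nothing readable here. */
            (void)bad_sectors;
            return 0;
        }
        sector_t good = first_bad - dev_sector;
        return good < sectors ? good : sectors;
    }

    int main(void)
    {
        /* 128-sector read, bad range starts 40 sectors in: only 40 usable. */
        printf("%llu\n", (unsigned long long)good_prefix(1000, 128, 1040, 16));
        /* Bad range covers the start: 0 usable, pick another slot. */
        printf("%llu\n", (unsigned long long)good_prefix(1000, 128, 992, 64));
        return 0;
    }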
| @@ -734,6 +839,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 734 | unsigned long flags; | 839 | unsigned long flags; |
| 735 | mdk_rdev_t *blocked_rdev; | 840 | mdk_rdev_t *blocked_rdev; |
| 736 | int plugged; | 841 | int plugged; |
| 842 | int sectors_handled; | ||
| 843 | int max_sectors; | ||
| 737 | 844 | ||
| 738 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 845 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
| 739 | md_flush_request(mddev, bio); | 846 | md_flush_request(mddev, bio); |
| @@ -808,12 +915,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 808 | r10_bio->sector = bio->bi_sector; | 915 | r10_bio->sector = bio->bi_sector; |
| 809 | r10_bio->state = 0; | 916 | r10_bio->state = 0; |
| 810 | 917 | ||
| 918 | /* We might need to issue multiple reads to different | ||
| 919 | * devices if there are bad blocks around, so we keep | ||
| 920 | * track of the number of reads in bio->bi_phys_segments. | ||
| 921 | * If this is 0, there is only one r10_bio and no locking | ||
| 922 | * will be needed when the request completes. If it is | ||
| 923 | * non-zero, then it is the number of not-completed requests. | ||
| 924 | */ | ||
| 925 | bio->bi_phys_segments = 0; | ||
| 926 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
| 927 | |||
| 811 | if (rw == READ) { | 928 | if (rw == READ) { |
| 812 | /* | 929 | /* |
| 813 | * read balancing logic: | 930 | * read balancing logic: |
| 814 | */ | 931 | */ |
| 815 | int disk = read_balance(conf, r10_bio); | 932 | int disk; |
| 816 | int slot = r10_bio->read_slot; | 933 | int slot; |
| 934 | |||
| 935 | read_again: | ||
| 936 | disk = read_balance(conf, r10_bio, &max_sectors); | ||
| 937 | slot = r10_bio->read_slot; | ||
| 817 | if (disk < 0) { | 938 | if (disk < 0) { |
| 818 | raid_end_bio_io(r10_bio); | 939 | raid_end_bio_io(r10_bio); |
| 819 | return 0; | 940 | return 0; |
| @@ -821,6 +942,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 821 | mirror = conf->mirrors + disk; | 942 | mirror = conf->mirrors + disk; |
| 822 | 943 | ||
| 823 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 944 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
| 945 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, | ||
| 946 | max_sectors); | ||
| 824 | 947 | ||
| 825 | r10_bio->devs[slot].bio = read_bio; | 948 | r10_bio->devs[slot].bio = read_bio; |
| 826 | 949 | ||
| @@ -831,7 +954,37 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 831 | read_bio->bi_rw = READ | do_sync; | 954 | read_bio->bi_rw = READ | do_sync; |
| 832 | read_bio->bi_private = r10_bio; | 955 | read_bio->bi_private = r10_bio; |
| 833 | 956 | ||
| 834 | generic_make_request(read_bio); | 957 | if (max_sectors < r10_bio->sectors) { |
| 958 | /* Could not read all from this device, so we will | ||
| 959 | * need another r10_bio. | ||
| 960 | */ | ||
| 961 | sectors_handled = (r10_bio->sectors + max_sectors | ||
| 962 | - bio->bi_sector); | ||
| 963 | r10_bio->sectors = max_sectors; | ||
| 964 | spin_lock_irq(&conf->device_lock); | ||
| 965 | if (bio->bi_phys_segments == 0) | ||
| 966 | bio->bi_phys_segments = 2; | ||
| 967 | else | ||
| 968 | bio->bi_phys_segments++; | ||
| 969 | spin_unlock(&conf->device_lock); | ||
| 970 | /* Cannot call generic_make_request directly | ||
| 971 | * as that will be queued in __generic_make_request | ||
| 972 | * and subsequent mempool_alloc might block | ||
| 973 | * waiting for it. so hand bio over to raid10d. | ||
| 974 | */ | ||
| 975 | reschedule_retry(r10_bio); | ||
| 976 | |||
| 977 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
| 978 | |||
| 979 | r10_bio->master_bio = bio; | ||
| 980 | r10_bio->sectors = ((bio->bi_size >> 9) | ||
| 981 | - sectors_handled); | ||
| 982 | r10_bio->state = 0; | ||
| 983 | r10_bio->mddev = mddev; | ||
| 984 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
| 985 | goto read_again; | ||
| 986 | } else | ||
| 987 | generic_make_request(read_bio); | ||
| 835 | return 0; | 988 | return 0; |
| 836 | } | 989 | } |
| 837 | 990 | ||
| @@ -841,13 +994,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 841 | /* first select target devices under rcu_lock and | 994 | /* first select target devices under rcu_lock and |
| 842 | * inc refcount on their rdev. Record them by setting | 995 | * inc refcount on their rdev. Record them by setting |
| 843 | * bios[x] to bio | 996 | * bios[x] to bio |
| 997 | * If there are known/acknowledged bad blocks on any device | ||
| 998 | * on which we have seen a write error, we want to avoid | ||
| 999 | * writing to those blocks. This potentially requires several | ||
| 1000 | * writes to write around the bad blocks. Each set of writes | ||
| 1001 | * gets its own r10_bio with a set of bios attached. The number | ||
| 1002 | * of r10_bios is recorded in bio->bi_phys_segments just as with | ||

| 1003 | * the read case. | ||
| 844 | */ | 1004 | */ |
| 845 | plugged = mddev_check_plugged(mddev); | 1005 | plugged = mddev_check_plugged(mddev); |
| 846 | 1006 | ||
| 847 | raid10_find_phys(conf, r10_bio); | 1007 | raid10_find_phys(conf, r10_bio); |
| 848 | retry_write: | 1008 | retry_write: |
| 849 | blocked_rdev = NULL; | 1009 | blocked_rdev = NULL; |
| 850 | rcu_read_lock(); | 1010 | rcu_read_lock(); |
| 1011 | max_sectors = r10_bio->sectors; | ||
| 1012 | |||
| 851 | for (i = 0; i < conf->copies; i++) { | 1013 | for (i = 0; i < conf->copies; i++) { |
| 852 | int d = r10_bio->devs[i].devnum; | 1014 | int d = r10_bio->devs[i].devnum; |
| 853 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); | 1015 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); |
| @@ -856,13 +1018,55 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 856 | blocked_rdev = rdev; | 1018 | blocked_rdev = rdev; |
| 857 | break; | 1019 | break; |
| 858 | } | 1020 | } |
| 859 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 1021 | r10_bio->devs[i].bio = NULL; |
| 860 | atomic_inc(&rdev->nr_pending); | 1022 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
| 861 | r10_bio->devs[i].bio = bio; | ||
| 862 | } else { | ||
| 863 | r10_bio->devs[i].bio = NULL; | ||
| 864 | set_bit(R10BIO_Degraded, &r10_bio->state); | 1023 | set_bit(R10BIO_Degraded, &r10_bio->state); |
| 1024 | continue; | ||
| 865 | } | 1025 | } |
| 1026 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
| 1027 | sector_t first_bad; | ||
| 1028 | sector_t dev_sector = r10_bio->devs[i].addr; | ||
| 1029 | int bad_sectors; | ||
| 1030 | int is_bad; | ||
| 1031 | |||
| 1032 | is_bad = is_badblock(rdev, dev_sector, | ||
| 1033 | max_sectors, | ||
| 1034 | &first_bad, &bad_sectors); | ||
| 1035 | if (is_bad < 0) { | ||
| 1036 | /* Mustn't write here until the bad block | ||
| 1037 | * is acknowledged | ||
| 1038 | */ | ||
| 1039 | atomic_inc(&rdev->nr_pending); | ||
| 1040 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
| 1041 | blocked_rdev = rdev; | ||
| 1042 | break; | ||
| 1043 | } | ||
| 1044 | if (is_bad && first_bad <= dev_sector) { | ||
| 1045 | /* Cannot write here at all */ | ||
| 1046 | bad_sectors -= (dev_sector - first_bad); | ||
| 1047 | if (bad_sectors < max_sectors) | ||
| 1048 | /* Mustn't write more than bad_sectors | ||
| 1049 | * to other devices yet | ||
| 1050 | */ | ||
| 1051 | max_sectors = bad_sectors; | ||
| 1052 | /* We don't set R10BIO_Degraded as that | ||
| 1053 | * only applies if the disk is missing, | ||
| 1054 | * so it might be re-added, and we want to | ||
| 1055 | * know to recover this chunk. | ||
| 1056 | * In this case the device is here, and the | ||
| 1057 | * fact that this chunk is not in-sync is | ||
| 1058 | * recorded in the bad block log. | ||
| 1059 | */ | ||
| 1060 | continue; | ||
| 1061 | } | ||
| 1062 | if (is_bad) { | ||
| 1063 | int good_sectors = first_bad - dev_sector; | ||
| 1064 | if (good_sectors < max_sectors) | ||
| 1065 | max_sectors = good_sectors; | ||
| 1066 | } | ||
| 1067 | } | ||
| 1068 | r10_bio->devs[i].bio = bio; | ||
| 1069 | atomic_inc(&rdev->nr_pending); | ||
| 866 | } | 1070 | } |
| 867 | rcu_read_unlock(); | 1071 | rcu_read_unlock(); |
| 868 | 1072 | ||
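On the write path each copy is checked against the bad-block log before any bio is built: an unacknowledged bad block turns that rdev into a blocked one, a bad range covering the start of the write excludes the copy and caps how far the other copies may write, and a bad range further in simply shortens max_sectors for everyone. A compact model of how max_sectors shrinks across copies (hypothetical helper, not the kernel function):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    struct badrange { int present; sector_t first_bad; sector_t bad_sectors; };

    static sector_t clip_for_copy(sector_t dev_sector, sector_t max_sectors,
                                  struct badrange bb, int *skip_copy)
    {
        *skip_copy = 0;
        if (!bb.present)
            return max_sectors;
        if (bb.first_bad <= dev_sector) {
            /* Cannot write this copy at all; other copies must not run
             * past the end of the bad range either. */
            sector_t usable = bb.bad_sectors - (dev_sector - bb.first_bad);
            *skip_copy = 1;
            return usable < max_sectors ? usable : max_sectors;
        }
        sector_t good = bb.first_bad - dev_sector;
        return good < max_sectors ? good : max_sectors;
    }

    int main(void)
    {
        sector_t max_sectors = 256;
        int skip;

        struct badrange none = { 0, 0, 0 };
        struct badrange mid  = { 1, 2100, 8 };   /* bad range 100 sectors in */

        max_sectors = clip_for_copy(2000, max_sectors, none, &skip);
        max_sectors = clip_for_copy(2000, max_sectors, mid, &skip);
        printf("write first %llu sectors this round\n",
               (unsigned long long)max_sectors);  /* 100 */
        return 0;
    }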
| @@ -882,8 +1086,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 882 | goto retry_write; | 1086 | goto retry_write; |
| 883 | } | 1087 | } |
| 884 | 1088 | ||
| 1089 | if (max_sectors < r10_bio->sectors) { | ||
| 1090 | /* We are splitting this into multiple parts, so | ||
| 1091 | * we need to prepare for allocating another r10_bio. | ||
| 1092 | */ | ||
| 1093 | r10_bio->sectors = max_sectors; | ||
| 1094 | spin_lock_irq(&conf->device_lock); | ||
| 1095 | if (bio->bi_phys_segments == 0) | ||
| 1096 | bio->bi_phys_segments = 2; | ||
| 1097 | else | ||
| 1098 | bio->bi_phys_segments++; | ||
| 1099 | spin_unlock_irq(&conf->device_lock); | ||
| 1100 | } | ||
| 1101 | sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; | ||
| 1102 | |||
| 885 | atomic_set(&r10_bio->remaining, 1); | 1103 | atomic_set(&r10_bio->remaining, 1); |
| 886 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | 1104 | bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); |
| 887 | 1105 | ||
| 888 | for (i = 0; i < conf->copies; i++) { | 1106 | for (i = 0; i < conf->copies; i++) { |
| 889 | struct bio *mbio; | 1107 | struct bio *mbio; |
| @@ -892,10 +1110,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 892 | continue; | 1110 | continue; |
| 893 | 1111 | ||
| 894 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1112 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
| 1113 | md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, | ||
| 1114 | max_sectors); | ||
| 895 | r10_bio->devs[i].bio = mbio; | 1115 | r10_bio->devs[i].bio = mbio; |
| 896 | 1116 | ||
| 897 | mbio->bi_sector = r10_bio->devs[i].addr+ | 1117 | mbio->bi_sector = (r10_bio->devs[i].addr+ |
| 898 | conf->mirrors[d].rdev->data_offset; | 1118 | conf->mirrors[d].rdev->data_offset); |
| 899 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1119 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
| 900 | mbio->bi_end_io = raid10_end_write_request; | 1120 | mbio->bi_end_io = raid10_end_write_request; |
| 901 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1121 | mbio->bi_rw = WRITE | do_sync | do_fua; |
| @@ -920,6 +1140,21 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
| 920 | /* In case raid10d snuck in to freeze_array */ | 1140 | /* In case raid10d snuck in to freeze_array */ |
| 921 | wake_up(&conf->wait_barrier); | 1141 | wake_up(&conf->wait_barrier); |
| 922 | 1142 | ||
| 1143 | if (sectors_handled < (bio->bi_size >> 9)) { | ||
| 1144 | /* We need another r10_bio. It has already been counted | ||
| 1145 | * in bio->bi_phys_segments. | ||
| 1146 | */ | ||
| 1147 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
| 1148 | |||
| 1149 | r10_bio->master_bio = bio; | ||
| 1150 | r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
| 1151 | |||
| 1152 | r10_bio->mddev = mddev; | ||
| 1153 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
| 1154 | r10_bio->state = 0; | ||
| 1155 | goto retry_write; | ||
| 1156 | } | ||
| 1157 | |||
| 923 | if (do_sync || !mddev->bitmap || !plugged) | 1158 | if (do_sync || !mddev->bitmap || !plugged) |
| 924 | md_wakeup_thread(mddev->thread); | 1159 | md_wakeup_thread(mddev->thread); |
| 925 | return 0; | 1160 | return 0; |
| @@ -949,6 +1184,30 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 949 | seq_printf(seq, "]"); | 1184 | seq_printf(seq, "]"); |
| 950 | } | 1185 | } |
| 951 | 1186 | ||
| 1187 | /* check if there are enough drives for | ||
| 1188 | * every block to appear on at least one. | ||
| 1189 | * Don't consider the device numbered 'ignore' | ||
| 1190 | * as we might be about to remove it. | ||
| 1191 | */ | ||
| 1192 | static int enough(conf_t *conf, int ignore) | ||
| 1193 | { | ||
| 1194 | int first = 0; | ||
| 1195 | |||
| 1196 | do { | ||
| 1197 | int n = conf->copies; | ||
| 1198 | int cnt = 0; | ||
| 1199 | while (n--) { | ||
| 1200 | if (conf->mirrors[first].rdev && | ||
| 1201 | first != ignore) | ||
| 1202 | cnt++; | ||
| 1203 | first = (first+1) % conf->raid_disks; | ||
| 1204 | } | ||
| 1205 | if (cnt == 0) | ||
| 1206 | return 0; | ||
| 1207 | } while (first != 0); | ||
| 1208 | return 1; | ||
| 1209 | } | ||
| 1210 | |||
| 952 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1211 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
| 953 | { | 1212 | { |
| 954 | char b[BDEVNAME_SIZE]; | 1213 | char b[BDEVNAME_SIZE]; |
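The relocated enough() now takes an 'ignore' argument so error() can ask whether failing a particular disk would leave some block with no surviving copy, instead of the old "only one working disk left" test. Its walk over the mirror table, lifted into a standalone C model (assumes the near-copies layout the kernel function also assumes):

    #include <stdio.h>

    /* Model of enough(): 'present[i]' says whether mirror i has a working
     * rdev. Every window of 'copies' consecutive slots (wrapping) must
     * contain at least one working drive other than 'ignore'. */
    static int enough(const int *present, int raid_disks, int copies, int ignore)
    {
        int first = 0;
        do {
            int n = copies, cnt = 0;
            while (n--) {
                if (present[first] && first != ignore)
                    cnt++;
                first = (first + 1) % raid_disks;
            }
            if (cnt == 0)
                return 0;
        } while (first != 0);
        return 1;
    }

    int main(void)
    {
        int disks[4] = { 1, 1, 0, 1 };      /* 4 drives, 2 copies, one failed */
        printf("%d\n", enough(disks, 4, 2, -1));   /* 1: every block still readable */
        printf("%d\n", enough(disks, 4, 2, 3));    /* 0: failing disk 3 loses data */
        return 0;
    }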
| @@ -961,13 +1220,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 961 | * else mark the drive as failed | 1220 | * else mark the drive as failed |
| 962 | */ | 1221 | */ |
| 963 | if (test_bit(In_sync, &rdev->flags) | 1222 | if (test_bit(In_sync, &rdev->flags) |
| 964 | && conf->raid_disks-mddev->degraded == 1) | 1223 | && !enough(conf, rdev->raid_disk)) |
| 965 | /* | 1224 | /* |
| 966 | * Don't fail the drive, just return an IO error. | 1225 | * Don't fail the drive, just return an IO error. |
| 967 | * The test should really be more sophisticated than | ||
| 968 | * "working_disks == 1", but it isn't critical, and | ||
| 969 | * can wait until we do more sophisticated "is the drive | ||
| 970 | * really dead" tests... | ||
| 971 | */ | 1226 | */ |
| 972 | return; | 1227 | return; |
| 973 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1228 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
| @@ -980,6 +1235,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 980 | */ | 1235 | */ |
| 981 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1236 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 982 | } | 1237 | } |
| 1238 | set_bit(Blocked, &rdev->flags); | ||
| 983 | set_bit(Faulty, &rdev->flags); | 1239 | set_bit(Faulty, &rdev->flags); |
| 984 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1240 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 985 | printk(KERN_ALERT | 1241 | printk(KERN_ALERT |
| @@ -1022,27 +1278,6 @@ static void close_sync(conf_t *conf) | |||
| 1022 | conf->r10buf_pool = NULL; | 1278 | conf->r10buf_pool = NULL; |
| 1023 | } | 1279 | } |
| 1024 | 1280 | ||
| 1025 | /* check if there are enough drives for | ||
| 1026 | * every block to appear on atleast one | ||
| 1027 | */ | ||
| 1028 | static int enough(conf_t *conf) | ||
| 1029 | { | ||
| 1030 | int first = 0; | ||
| 1031 | |||
| 1032 | do { | ||
| 1033 | int n = conf->copies; | ||
| 1034 | int cnt = 0; | ||
| 1035 | while (n--) { | ||
| 1036 | if (conf->mirrors[first].rdev) | ||
| 1037 | cnt++; | ||
| 1038 | first = (first+1) % conf->raid_disks; | ||
| 1039 | } | ||
| 1040 | if (cnt == 0) | ||
| 1041 | return 0; | ||
| 1042 | } while (first != 0); | ||
| 1043 | return 1; | ||
| 1044 | } | ||
| 1045 | |||
| 1046 | static int raid10_spare_active(mddev_t *mddev) | 1281 | static int raid10_spare_active(mddev_t *mddev) |
| 1047 | { | 1282 | { |
| 1048 | int i; | 1283 | int i; |
| @@ -1078,7 +1313,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1078 | conf_t *conf = mddev->private; | 1313 | conf_t *conf = mddev->private; |
| 1079 | int err = -EEXIST; | 1314 | int err = -EEXIST; |
| 1080 | int mirror; | 1315 | int mirror; |
| 1081 | mirror_info_t *p; | ||
| 1082 | int first = 0; | 1316 | int first = 0; |
| 1083 | int last = conf->raid_disks - 1; | 1317 | int last = conf->raid_disks - 1; |
| 1084 | 1318 | ||
| @@ -1087,44 +1321,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1087 | * very different from resync | 1321 | * very different from resync |
| 1088 | */ | 1322 | */ |
| 1089 | return -EBUSY; | 1323 | return -EBUSY; |
| 1090 | if (!enough(conf)) | 1324 | if (!enough(conf, -1)) |
| 1091 | return -EINVAL; | 1325 | return -EINVAL; |
| 1092 | 1326 | ||
| 1093 | if (rdev->raid_disk >= 0) | 1327 | if (rdev->raid_disk >= 0) |
| 1094 | first = last = rdev->raid_disk; | 1328 | first = last = rdev->raid_disk; |
| 1095 | 1329 | ||
| 1096 | if (rdev->saved_raid_disk >= 0 && | 1330 | if (rdev->saved_raid_disk >= first && |
| 1097 | rdev->saved_raid_disk >= first && | ||
| 1098 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1331 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
| 1099 | mirror = rdev->saved_raid_disk; | 1332 | mirror = rdev->saved_raid_disk; |
| 1100 | else | 1333 | else |
| 1101 | mirror = first; | 1334 | mirror = first; |
| 1102 | for ( ; mirror <= last ; mirror++) | 1335 | for ( ; mirror <= last ; mirror++) { |
| 1103 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1336 | mirror_info_t *p = &conf->mirrors[mirror]; |
| 1104 | 1337 | if (p->recovery_disabled == mddev->recovery_disabled) | |
| 1105 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1338 | continue; |
| 1106 | rdev->data_offset << 9); | 1339 | if (!p->rdev) |
| 1107 | /* as we don't honour merge_bvec_fn, we must | 1340 | continue; |
| 1108 | * never risk violating it, so limit | ||
| 1109 | * ->max_segments to one lying with a single | ||
| 1110 | * page, as a one page request is never in | ||
| 1111 | * violation. | ||
| 1112 | */ | ||
| 1113 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
| 1114 | blk_queue_max_segments(mddev->queue, 1); | ||
| 1115 | blk_queue_segment_boundary(mddev->queue, | ||
| 1116 | PAGE_CACHE_SIZE - 1); | ||
| 1117 | } | ||
| 1118 | 1341 | ||
| 1119 | p->head_position = 0; | 1342 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
| 1120 | rdev->raid_disk = mirror; | 1343 | rdev->data_offset << 9); |
| 1121 | err = 0; | 1344 | /* as we don't honour merge_bvec_fn, we must |
| 1122 | if (rdev->saved_raid_disk != mirror) | 1345 | * never risk violating it, so limit |
| 1123 | conf->fullsync = 1; | 1346 | * ->max_segments to one lying with a single |
| 1124 | rcu_assign_pointer(p->rdev, rdev); | 1347 | * page, as a one page request is never in |
| 1125 | break; | 1348 | * violation. |
| 1349 | */ | ||
| 1350 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
| 1351 | blk_queue_max_segments(mddev->queue, 1); | ||
| 1352 | blk_queue_segment_boundary(mddev->queue, | ||
| 1353 | PAGE_CACHE_SIZE - 1); | ||
| 1126 | } | 1354 | } |
| 1127 | 1355 | ||
| 1356 | p->head_position = 0; | ||
| 1357 | rdev->raid_disk = mirror; | ||
| 1358 | err = 0; | ||
| 1359 | if (rdev->saved_raid_disk != mirror) | ||
| 1360 | conf->fullsync = 1; | ||
| 1361 | rcu_assign_pointer(p->rdev, rdev); | ||
| 1362 | break; | ||
| 1363 | } | ||
| 1364 | |||
| 1128 | md_integrity_add_rdev(rdev, mddev); | 1365 | md_integrity_add_rdev(rdev, mddev); |
| 1129 | print_conf(conf); | 1366 | print_conf(conf); |
| 1130 | return err; | 1367 | return err; |
| @@ -1149,7 +1386,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number) | |||
| 1149 | * is not possible. | 1386 | * is not possible. |
| 1150 | */ | 1387 | */ |
| 1151 | if (!test_bit(Faulty, &rdev->flags) && | 1388 | if (!test_bit(Faulty, &rdev->flags) && |
| 1152 | enough(conf)) { | 1389 | mddev->recovery_disabled != p->recovery_disabled && |
| 1390 | enough(conf, -1)) { | ||
| 1153 | err = -EBUSY; | 1391 | err = -EBUSY; |
| 1154 | goto abort; | 1392 | goto abort; |
| 1155 | } | 1393 | } |
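Hot-add and hot-remove both consult the new per-mirror recovery_disabled value: a slot whose rebuild was aborted because the source could not be read records the current mddev->recovery_disabled, which keeps the same spare from being re-added to an immediately-failing rebuild while still allowing that device to be removed. A minimal model of the intended add-side check (the structs are stand-ins for mirror_info_t and mddev_t, not the kernel types):

    #include <stdio.h>

    struct mirror { int has_rdev; int recovery_disabled; };
    struct array  { int recovery_disabled; };

    /* Can a spare be placed into this slot? */
    static int may_add(const struct array *mddev, const struct mirror *p)
    {
        if (p->recovery_disabled == mddev->recovery_disabled)
            return 0;            /* recovery to this slot just failed: skip it */
        return !p->has_rdev;     /* otherwise only an empty slot qualifies */
    }

    int main(void)
    {
        struct array md = { .recovery_disabled = 7 };
        struct mirror bad_slot  = { .has_rdev = 0, .recovery_disabled = 7 };
        struct mirror free_slot = { .has_rdev = 0, .recovery_disabled = 3 };
        printf("%d %d\n", may_add(&md, &bad_slot), may_add(&md, &free_slot));
        return 0;
    }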
| @@ -1174,24 +1412,18 @@ static void end_sync_read(struct bio *bio, int error) | |||
| 1174 | { | 1412 | { |
| 1175 | r10bio_t *r10_bio = bio->bi_private; | 1413 | r10bio_t *r10_bio = bio->bi_private; |
| 1176 | conf_t *conf = r10_bio->mddev->private; | 1414 | conf_t *conf = r10_bio->mddev->private; |
| 1177 | int i,d; | 1415 | int d; |
| 1178 | 1416 | ||
| 1179 | for (i=0; i<conf->copies; i++) | 1417 | d = find_bio_disk(conf, r10_bio, bio, NULL); |
| 1180 | if (r10_bio->devs[i].bio == bio) | ||
| 1181 | break; | ||
| 1182 | BUG_ON(i == conf->copies); | ||
| 1183 | update_head_pos(i, r10_bio); | ||
| 1184 | d = r10_bio->devs[i].devnum; | ||
| 1185 | 1418 | ||
| 1186 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1419 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
| 1187 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1420 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
| 1188 | else { | 1421 | else |
| 1422 | /* The write handler will notice the lack of | ||
| 1423 | * R10BIO_Uptodate and record any errors etc | ||
| 1424 | */ | ||
| 1189 | atomic_add(r10_bio->sectors, | 1425 | atomic_add(r10_bio->sectors, |
| 1190 | &conf->mirrors[d].rdev->corrected_errors); | 1426 | &conf->mirrors[d].rdev->corrected_errors); |
| 1191 | if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | ||
| 1192 | md_error(r10_bio->mddev, | ||
| 1193 | conf->mirrors[d].rdev); | ||
| 1194 | } | ||
| 1195 | 1427 | ||
| 1196 | /* for reconstruct, we always reschedule after a read. | 1428 | /* for reconstruct, we always reschedule after a read. |
| 1197 | * for resync, only after all reads | 1429 | * for resync, only after all reads |
| @@ -1206,40 +1438,60 @@ static void end_sync_read(struct bio *bio, int error) | |||
| 1206 | } | 1438 | } |
| 1207 | } | 1439 | } |
| 1208 | 1440 | ||
| 1209 | static void end_sync_write(struct bio *bio, int error) | 1441 | static void end_sync_request(r10bio_t *r10_bio) |
| 1210 | { | 1442 | { |
| 1211 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 1212 | r10bio_t *r10_bio = bio->bi_private; | ||
| 1213 | mddev_t *mddev = r10_bio->mddev; | 1443 | mddev_t *mddev = r10_bio->mddev; |
| 1214 | conf_t *conf = mddev->private; | ||
| 1215 | int i,d; | ||
| 1216 | |||
| 1217 | for (i = 0; i < conf->copies; i++) | ||
| 1218 | if (r10_bio->devs[i].bio == bio) | ||
| 1219 | break; | ||
| 1220 | d = r10_bio->devs[i].devnum; | ||
| 1221 | 1444 | ||
| 1222 | if (!uptodate) | ||
| 1223 | md_error(mddev, conf->mirrors[d].rdev); | ||
| 1224 | |||
| 1225 | update_head_pos(i, r10_bio); | ||
| 1226 | |||
| 1227 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
| 1228 | while (atomic_dec_and_test(&r10_bio->remaining)) { | 1445 | while (atomic_dec_and_test(&r10_bio->remaining)) { |
| 1229 | if (r10_bio->master_bio == NULL) { | 1446 | if (r10_bio->master_bio == NULL) { |
| 1230 | /* the primary of several recovery bios */ | 1447 | /* the primary of several recovery bios */ |
| 1231 | sector_t s = r10_bio->sectors; | 1448 | sector_t s = r10_bio->sectors; |
| 1232 | put_buf(r10_bio); | 1449 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
| 1450 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
| 1451 | reschedule_retry(r10_bio); | ||
| 1452 | else | ||
| 1453 | put_buf(r10_bio); | ||
| 1233 | md_done_sync(mddev, s, 1); | 1454 | md_done_sync(mddev, s, 1); |
| 1234 | break; | 1455 | break; |
| 1235 | } else { | 1456 | } else { |
| 1236 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; | 1457 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; |
| 1237 | put_buf(r10_bio); | 1458 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
| 1459 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
| 1460 | reschedule_retry(r10_bio); | ||
| 1461 | else | ||
| 1462 | put_buf(r10_bio); | ||
| 1238 | r10_bio = r10_bio2; | 1463 | r10_bio = r10_bio2; |
| 1239 | } | 1464 | } |
| 1240 | } | 1465 | } |
| 1241 | } | 1466 | } |
| 1242 | 1467 | ||
| 1468 | static void end_sync_write(struct bio *bio, int error) | ||
| 1469 | { | ||
| 1470 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 1471 | r10bio_t *r10_bio = bio->bi_private; | ||
| 1472 | mddev_t *mddev = r10_bio->mddev; | ||
| 1473 | conf_t *conf = mddev->private; | ||
| 1474 | int d; | ||
| 1475 | sector_t first_bad; | ||
| 1476 | int bad_sectors; | ||
| 1477 | int slot; | ||
| 1478 | |||
| 1479 | d = find_bio_disk(conf, r10_bio, bio, &slot); | ||
| 1480 | |||
| 1481 | if (!uptodate) { | ||
| 1482 | set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); | ||
| 1483 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
| 1484 | } else if (is_badblock(conf->mirrors[d].rdev, | ||
| 1485 | r10_bio->devs[slot].addr, | ||
| 1486 | r10_bio->sectors, | ||
| 1487 | &first_bad, &bad_sectors)) | ||
| 1488 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
| 1489 | |||
| 1490 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
| 1491 | |||
| 1492 | end_sync_request(r10_bio); | ||
| 1493 | } | ||
| 1494 | |||
| 1243 | /* | 1495 | /* |
| 1244 | * Note: sync and recover and handled very differently for raid10 | 1496 | * Note: sync and recover and handled very differently for raid10 |
| 1245 | * This code is for resync. | 1497 | * This code is for resync. |
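Sync and recovery writes now follow the same pattern as regular writes: a failure sets WriteErrorSeen and R10BIO_WriteError, a success over a logged bad range sets R10BIO_MadeGood, and end_sync_request() only frees the resync buffer when neither flag requires raid10d to revisit the bad-block log first. The decision in isolation (a sketch, not the kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    enum action { FREE_BUFFER, HAND_TO_RAID10D };

    /* Decide what to do with a finished sync/recovery r10_bio. */
    static enum action finish_sync_r10bio(bool made_good, bool write_error)
    {
        /* Either flag means the bad-block log must be updated (cleared for
         * MadeGood, extended for WriteError), which cannot be done from the
         * bio completion path. */
        if (made_good || write_error)
            return HAND_TO_RAID10D;
        return FREE_BUFFER;
    }

    int main(void)
    {
        printf("%d %d\n", finish_sync_r10bio(false, false),
                          finish_sync_r10bio(true, false));
        return 0;
    }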
| @@ -1299,11 +1551,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1299 | if (j == vcnt) | 1551 | if (j == vcnt) |
| 1300 | continue; | 1552 | continue; |
| 1301 | mddev->resync_mismatches += r10_bio->sectors; | 1553 | mddev->resync_mismatches += r10_bio->sectors; |
| 1554 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | ||
| 1555 | /* Don't fix anything. */ | ||
| 1556 | continue; | ||
| 1302 | } | 1557 | } |
| 1303 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | 1558 | /* Ok, we need to write this bio, either to correct an |
| 1304 | /* Don't fix anything. */ | 1559 | * inconsistency or to correct an unreadable block. |
| 1305 | continue; | ||
| 1306 | /* Ok, we need to write this bio | ||
| 1307 | * First we need to fixup bv_offset, bv_len and | 1560 | * First we need to fixup bv_offset, bv_len and |
| 1308 | * bi_vecs, as the read request might have corrupted these | 1561 | * bi_vecs, as the read request might have corrupted these |
| 1309 | */ | 1562 | */ |
| @@ -1355,32 +1608,107 @@ done: | |||
| 1355 | * The second for writing. | 1608 | * The second for writing. |
| 1356 | * | 1609 | * |
| 1357 | */ | 1610 | */ |
| 1611 | static void fix_recovery_read_error(r10bio_t *r10_bio) | ||
| 1612 | { | ||
| 1613 | /* We got a read error during recovery. | ||
| 1614 | * We repeat the read in smaller page-sized sections. | ||
| 1615 | * If a read succeeds, write it to the new device or record | ||
| 1616 | * a bad block if we cannot. | ||
| 1617 | * If a read fails, record a bad block on both old and | ||
| 1618 | * new devices. | ||
| 1619 | */ | ||
| 1620 | mddev_t *mddev = r10_bio->mddev; | ||
| 1621 | conf_t *conf = mddev->private; | ||
| 1622 | struct bio *bio = r10_bio->devs[0].bio; | ||
| 1623 | sector_t sect = 0; | ||
| 1624 | int sectors = r10_bio->sectors; | ||
| 1625 | int idx = 0; | ||
| 1626 | int dr = r10_bio->devs[0].devnum; | ||
| 1627 | int dw = r10_bio->devs[1].devnum; | ||
| 1628 | |||
| 1629 | while (sectors) { | ||
| 1630 | int s = sectors; | ||
| 1631 | mdk_rdev_t *rdev; | ||
| 1632 | sector_t addr; | ||
| 1633 | int ok; | ||
| 1634 | |||
| 1635 | if (s > (PAGE_SIZE>>9)) | ||
| 1636 | s = PAGE_SIZE >> 9; | ||
| 1637 | |||
| 1638 | rdev = conf->mirrors[dr].rdev; | ||
| 1639 | addr = r10_bio->devs[0].addr + sect, | ||
| 1640 | ok = sync_page_io(rdev, | ||
| 1641 | addr, | ||
| 1642 | s << 9, | ||
| 1643 | bio->bi_io_vec[idx].bv_page, | ||
| 1644 | READ, false); | ||
| 1645 | if (ok) { | ||
| 1646 | rdev = conf->mirrors[dw].rdev; | ||
| 1647 | addr = r10_bio->devs[1].addr + sect; | ||
| 1648 | ok = sync_page_io(rdev, | ||
| 1649 | addr, | ||
| 1650 | s << 9, | ||
| 1651 | bio->bi_io_vec[idx].bv_page, | ||
| 1652 | WRITE, false); | ||
| 1653 | if (!ok) | ||
| 1654 | set_bit(WriteErrorSeen, &rdev->flags); | ||
| 1655 | } | ||
| 1656 | if (!ok) { | ||
| 1657 | /* We don't worry if we cannot set a bad block - | ||
| 1658 | * it really is bad so there is no loss in not | ||
| 1659 | * recording it yet | ||
| 1660 | */ | ||
| 1661 | rdev_set_badblocks(rdev, addr, s, 0); | ||
| 1662 | |||
| 1663 | if (rdev != conf->mirrors[dw].rdev) { | ||
| 1664 | /* need bad block on destination too */ | ||
| 1665 | mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev; | ||
| 1666 | addr = r10_bio->devs[1].addr + sect; | ||
| 1667 | ok = rdev_set_badblocks(rdev2, addr, s, 0); | ||
| 1668 | if (!ok) { | ||
| 1669 | /* just abort the recovery */ | ||
| 1670 | printk(KERN_NOTICE | ||
| 1671 | "md/raid10:%s: recovery aborted" | ||
| 1672 | " due to read error\n", | ||
| 1673 | mdname(mddev)); | ||
| 1674 | |||
| 1675 | conf->mirrors[dw].recovery_disabled | ||
| 1676 | = mddev->recovery_disabled; | ||
| 1677 | set_bit(MD_RECOVERY_INTR, | ||
| 1678 | &mddev->recovery); | ||
| 1679 | break; | ||
| 1680 | } | ||
| 1681 | } | ||
| 1682 | } | ||
| 1683 | |||
| 1684 | sectors -= s; | ||
| 1685 | sect += s; | ||
| 1686 | idx++; | ||
| 1687 | } | ||
| 1688 | } | ||
| 1358 | 1689 | ||
| 1359 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1690 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
| 1360 | { | 1691 | { |
| 1361 | conf_t *conf = mddev->private; | 1692 | conf_t *conf = mddev->private; |
| 1362 | int i, d; | 1693 | int d; |
| 1363 | struct bio *bio, *wbio; | 1694 | struct bio *wbio; |
| 1364 | 1695 | ||
| 1696 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { | ||
| 1697 | fix_recovery_read_error(r10_bio); | ||
| 1698 | end_sync_request(r10_bio); | ||
| 1699 | return; | ||
| 1700 | } | ||
| 1365 | 1701 | ||
| 1366 | /* move the pages across to the second bio | 1702 | /* |
| 1703 | * share the pages with the first bio | ||
| 1367 | * and submit the write request | 1704 | * and submit the write request |
| 1368 | */ | 1705 | */ |
| 1369 | bio = r10_bio->devs[0].bio; | ||
| 1370 | wbio = r10_bio->devs[1].bio; | 1706 | wbio = r10_bio->devs[1].bio; |
| 1371 | for (i=0; i < wbio->bi_vcnt; i++) { | ||
| 1372 | struct page *p = bio->bi_io_vec[i].bv_page; | ||
| 1373 | bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; | ||
| 1374 | wbio->bi_io_vec[i].bv_page = p; | ||
| 1375 | } | ||
| 1376 | d = r10_bio->devs[1].devnum; | 1707 | d = r10_bio->devs[1].devnum; |
| 1377 | 1708 | ||
| 1378 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 1709 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
| 1379 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | 1710 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); |
| 1380 | if (test_bit(R10BIO_Uptodate, &r10_bio->state)) | 1711 | generic_make_request(wbio); |
| 1381 | generic_make_request(wbio); | ||
| 1382 | else | ||
| 1383 | bio_endio(wbio, -EIO); | ||
| 1384 | } | 1712 | } |
| 1385 | 1713 | ||
| 1386 | 1714 | ||
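fix_recovery_read_error() above retries the failed recovery source one page at a time: each page that reads back is written to the new disk, a failed write-out sets WriteErrorSeen there, and a page that cannot be read at all is logged as bad on both source and destination (aborting recovery for that mirror only if even the destination entry cannot be recorded). The page-sized chunking on its own, as a runnable sketch:

    #include <stdio.h>

    #define PAGE_SECTORS (4096 >> 9)   /* 8 sectors per 4K page */

    int main(void)
    {
        int sectors = 37;              /* recovery range in sectors */
        int sect = 0, idx = 0;

        while (sectors) {
            int s = sectors;
            if (s > PAGE_SECTORS)
                s = PAGE_SECTORS;
            /* read page idx from the source at offset 'sect';
             * on success write it to the destination at the same offset */
            printf("chunk %d: %d sectors at offset %d\n", idx, s, sect);
            sectors -= s;
            sect += s;
            idx++;
        }
        return 0;
    }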
| @@ -1421,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1421 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | 1749 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); |
| 1422 | } | 1750 | } |
| 1423 | 1751 | ||
| 1752 | static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
| 1753 | int sectors, struct page *page, int rw) | ||
| 1754 | { | ||
| 1755 | sector_t first_bad; | ||
| 1756 | int bad_sectors; | ||
| 1757 | |||
| 1758 | if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) | ||
| 1759 | && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) | ||
| 1760 | return -1; | ||
| 1761 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
| 1762 | /* success */ | ||
| 1763 | return 1; | ||
| 1764 | if (rw == WRITE) | ||
| 1765 | set_bit(WriteErrorSeen, &rdev->flags); | ||
| 1766 | /* need to record an error - either for the block or the device */ | ||
| 1767 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
| 1768 | md_error(rdev->mddev, rdev); | ||
| 1769 | return 0; | ||
| 1770 | } | ||
| 1771 | |||
| 1424 | /* | 1772 | /* |
| 1425 | * This is a kernel thread which: | 1773 | * This is a kernel thread which: |
| 1426 | * | 1774 | * |
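r10_sync_page_io() wraps sync_page_io() with the new bad-block policy: refuse outright (-1) when the range is already logged bad and the access is a read, or a write to a device that has seen write errors; report success (1) when the I/O works; and on failure (0) set WriteErrorSeen for writes and log a bad block, escalating to md_error() only when the log cannot take the entry. A truth-table style model (assumed names, not kernel calls):

    #include <stdbool.h>
    #include <stdio.h>

    enum rw { RD, WR };

    /* -1: refused (range already bad), 1: I/O succeeded, 0: I/O failed and
     * the failure has been recorded (bad block logged or device failed). */
    static int sync_io_policy(bool range_is_bad, bool write_error_seen,
                              bool io_ok, enum rw rw)
    {
        if (range_is_bad && (rw == RD || write_error_seen))
            return -1;
        if (io_ok)
            return 1;
        /* on failure: set WriteErrorSeen for writes, then log a bad block,
         * and only fail the whole device if even that is impossible */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", sync_io_policy(true, false, true, RD));   /* -1 */
        printf("%d\n", sync_io_policy(true, false, true, WR));   /*  1 */
        printf("%d\n", sync_io_policy(false, false, false, WR)); /*  0 */
        return 0;
    }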
| @@ -1476,10 +1824,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1476 | 1824 | ||
| 1477 | rcu_read_lock(); | 1825 | rcu_read_lock(); |
| 1478 | do { | 1826 | do { |
| 1827 | sector_t first_bad; | ||
| 1828 | int bad_sectors; | ||
| 1829 | |||
| 1479 | d = r10_bio->devs[sl].devnum; | 1830 | d = r10_bio->devs[sl].devnum; |
| 1480 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1831 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
| 1481 | if (rdev && | 1832 | if (rdev && |
| 1482 | test_bit(In_sync, &rdev->flags)) { | 1833 | test_bit(In_sync, &rdev->flags) && |
| 1834 | is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, | ||
| 1835 | &first_bad, &bad_sectors) == 0) { | ||
| 1483 | atomic_inc(&rdev->nr_pending); | 1836 | atomic_inc(&rdev->nr_pending); |
| 1484 | rcu_read_unlock(); | 1837 | rcu_read_unlock(); |
| 1485 | success = sync_page_io(rdev, | 1838 | success = sync_page_io(rdev, |
| @@ -1499,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1499 | rcu_read_unlock(); | 1852 | rcu_read_unlock(); |
| 1500 | 1853 | ||
| 1501 | if (!success) { | 1854 | if (!success) { |
| 1502 | /* Cannot read from anywhere -- bye bye array */ | 1855 | /* Cannot read from anywhere, just mark the block |
| 1856 | * as bad on the first device to discourage future | ||
| 1857 | * reads. | ||
| 1858 | */ | ||
| 1503 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; | 1859 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; |
| 1504 | md_error(mddev, conf->mirrors[dn].rdev); | 1860 | rdev = conf->mirrors[dn].rdev; |
| 1861 | |||
| 1862 | if (!rdev_set_badblocks( | ||
| 1863 | rdev, | ||
| 1864 | r10_bio->devs[r10_bio->read_slot].addr | ||
| 1865 | + sect, | ||
| 1866 | s, 0)) | ||
| 1867 | md_error(mddev, rdev); | ||
| 1505 | break; | 1868 | break; |
| 1506 | } | 1869 | } |
| 1507 | 1870 | ||
| @@ -1516,80 +1879,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1516 | sl--; | 1879 | sl--; |
| 1517 | d = r10_bio->devs[sl].devnum; | 1880 | d = r10_bio->devs[sl].devnum; |
| 1518 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1881 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
| 1519 | if (rdev && | 1882 | if (!rdev || |
| 1520 | test_bit(In_sync, &rdev->flags)) { | 1883 | !test_bit(In_sync, &rdev->flags)) |
| 1521 | atomic_inc(&rdev->nr_pending); | 1884 | continue; |
| 1522 | rcu_read_unlock(); | 1885 | |
| 1523 | atomic_add(s, &rdev->corrected_errors); | 1886 | atomic_inc(&rdev->nr_pending); |
| 1524 | if (sync_page_io(rdev, | 1887 | rcu_read_unlock(); |
| 1525 | r10_bio->devs[sl].addr + | 1888 | if (r10_sync_page_io(rdev, |
| 1526 | sect, | 1889 | r10_bio->devs[sl].addr + |
| 1527 | s<<9, conf->tmppage, WRITE, false) | 1890 | sect, |
| 1528 | == 0) { | 1891 | s<<9, conf->tmppage, WRITE) |
| 1529 | /* Well, this device is dead */ | 1892 | == 0) { |
| 1530 | printk(KERN_NOTICE | 1893 | /* Well, this device is dead */ |
| 1531 | "md/raid10:%s: read correction " | 1894 | printk(KERN_NOTICE |
| 1532 | "write failed" | 1895 | "md/raid10:%s: read correction " |
| 1533 | " (%d sectors at %llu on %s)\n", | 1896 | "write failed" |
| 1534 | mdname(mddev), s, | 1897 | " (%d sectors at %llu on %s)\n", |
| 1535 | (unsigned long long)( | 1898 | mdname(mddev), s, |
| 1536 | sect + rdev->data_offset), | 1899 | (unsigned long long)( |
| 1537 | bdevname(rdev->bdev, b)); | 1900 | sect + rdev->data_offset), |
| 1538 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 1901 | bdevname(rdev->bdev, b)); |
| 1539 | "drive\n", | 1902 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
| 1540 | mdname(mddev), | 1903 | "drive\n", |
| 1541 | bdevname(rdev->bdev, b)); | 1904 | mdname(mddev), |
| 1542 | md_error(mddev, rdev); | 1905 | bdevname(rdev->bdev, b)); |
| 1543 | } | ||
| 1544 | rdev_dec_pending(rdev, mddev); | ||
| 1545 | rcu_read_lock(); | ||
| 1546 | } | 1906 | } |
| 1907 | rdev_dec_pending(rdev, mddev); | ||
| 1908 | rcu_read_lock(); | ||
| 1547 | } | 1909 | } |
| 1548 | sl = start; | 1910 | sl = start; |
| 1549 | while (sl != r10_bio->read_slot) { | 1911 | while (sl != r10_bio->read_slot) { |
| 1912 | char b[BDEVNAME_SIZE]; | ||
| 1550 | 1913 | ||
| 1551 | if (sl==0) | 1914 | if (sl==0) |
| 1552 | sl = conf->copies; | 1915 | sl = conf->copies; |
| 1553 | sl--; | 1916 | sl--; |
| 1554 | d = r10_bio->devs[sl].devnum; | 1917 | d = r10_bio->devs[sl].devnum; |
| 1555 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1918 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
| 1556 | if (rdev && | 1919 | if (!rdev || |
| 1557 | test_bit(In_sync, &rdev->flags)) { | 1920 | !test_bit(In_sync, &rdev->flags)) |
| 1558 | char b[BDEVNAME_SIZE]; | 1921 | continue; |
| 1559 | atomic_inc(&rdev->nr_pending); | ||
| 1560 | rcu_read_unlock(); | ||
| 1561 | if (sync_page_io(rdev, | ||
| 1562 | r10_bio->devs[sl].addr + | ||
| 1563 | sect, | ||
| 1564 | s<<9, conf->tmppage, | ||
| 1565 | READ, false) == 0) { | ||
| 1566 | /* Well, this device is dead */ | ||
| 1567 | printk(KERN_NOTICE | ||
| 1568 | "md/raid10:%s: unable to read back " | ||
| 1569 | "corrected sectors" | ||
| 1570 | " (%d sectors at %llu on %s)\n", | ||
| 1571 | mdname(mddev), s, | ||
| 1572 | (unsigned long long)( | ||
| 1573 | sect + rdev->data_offset), | ||
| 1574 | bdevname(rdev->bdev, b)); | ||
| 1575 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", | ||
| 1576 | mdname(mddev), | ||
| 1577 | bdevname(rdev->bdev, b)); | ||
| 1578 | |||
| 1579 | md_error(mddev, rdev); | ||
| 1580 | } else { | ||
| 1581 | printk(KERN_INFO | ||
| 1582 | "md/raid10:%s: read error corrected" | ||
| 1583 | " (%d sectors at %llu on %s)\n", | ||
| 1584 | mdname(mddev), s, | ||
| 1585 | (unsigned long long)( | ||
| 1586 | sect + rdev->data_offset), | ||
| 1587 | bdevname(rdev->bdev, b)); | ||
| 1588 | } | ||
| 1589 | 1922 | ||
| 1590 | rdev_dec_pending(rdev, mddev); | 1923 | atomic_inc(&rdev->nr_pending); |
| 1591 | rcu_read_lock(); | 1924 | rcu_read_unlock(); |
| 1925 | switch (r10_sync_page_io(rdev, | ||
| 1926 | r10_bio->devs[sl].addr + | ||
| 1927 | sect, | ||
| 1928 | s<<9, conf->tmppage, | ||
| 1929 | READ)) { | ||
| 1930 | case 0: | ||
| 1931 | /* Well, this device is dead */ | ||
| 1932 | printk(KERN_NOTICE | ||
| 1933 | "md/raid10:%s: unable to read back " | ||
| 1934 | "corrected sectors" | ||
| 1935 | " (%d sectors at %llu on %s)\n", | ||
| 1936 | mdname(mddev), s, | ||
| 1937 | (unsigned long long)( | ||
| 1938 | sect + rdev->data_offset), | ||
| 1939 | bdevname(rdev->bdev, b)); | ||
| 1940 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | ||
| 1941 | "drive\n", | ||
| 1942 | mdname(mddev), | ||
| 1943 | bdevname(rdev->bdev, b)); | ||
| 1944 | break; | ||
| 1945 | case 1: | ||
| 1946 | printk(KERN_INFO | ||
| 1947 | "md/raid10:%s: read error corrected" | ||
| 1948 | " (%d sectors at %llu on %s)\n", | ||
| 1949 | mdname(mddev), s, | ||
| 1950 | (unsigned long long)( | ||
| 1951 | sect + rdev->data_offset), | ||
| 1952 | bdevname(rdev->bdev, b)); | ||
| 1953 | atomic_add(s, &rdev->corrected_errors); | ||
| 1592 | } | 1954 | } |
| 1955 | |||
| 1956 | rdev_dec_pending(rdev, mddev); | ||
| 1957 | rcu_read_lock(); | ||
| 1593 | } | 1958 | } |
| 1594 | rcu_read_unlock(); | 1959 | rcu_read_unlock(); |
| 1595 | 1960 | ||
| @@ -1598,21 +1963,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1598 | } | 1963 | } |
| 1599 | } | 1964 | } |
| 1600 | 1965 | ||
| 1966 | static void bi_complete(struct bio *bio, int error) | ||
| 1967 | { | ||
| 1968 | complete((struct completion *)bio->bi_private); | ||
| 1969 | } | ||
| 1970 | |||
| 1971 | static int submit_bio_wait(int rw, struct bio *bio) | ||
| 1972 | { | ||
| 1973 | struct completion event; | ||
| 1974 | rw |= REQ_SYNC; | ||
| 1975 | |||
| 1976 | init_completion(&event); | ||
| 1977 | bio->bi_private = &event; | ||
| 1978 | bio->bi_end_io = bi_complete; | ||
| 1979 | submit_bio(rw, bio); | ||
| 1980 | wait_for_completion(&event); | ||
| 1981 | |||
| 1982 | return test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 1983 | } | ||
| 1984 | |||
| 1985 | static int narrow_write_error(r10bio_t *r10_bio, int i) | ||
| 1986 | { | ||
| 1987 | struct bio *bio = r10_bio->master_bio; | ||
| 1988 | mddev_t *mddev = r10_bio->mddev; | ||
| 1989 | conf_t *conf = mddev->private; | ||
| 1990 | mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; | ||
| 1991 | /* bio has the data to be written to slot 'i' where | ||
| 1992 | * we just recently had a write error. | ||
| 1993 | * We repeatedly clone the bio and trim down to one block, | ||
| 1994 | * then try the write. Where the write fails we record | ||
| 1995 | * a bad block. | ||
| 1996 | * It is conceivable that the bio doesn't exactly align with | ||
| 1997 | * blocks. We must handle this. | ||
| 1998 | * | ||
| 1999 | * We currently own a reference to the rdev. | ||
| 2000 | */ | ||
| 2001 | |||
| 2002 | int block_sectors; | ||
| 2003 | sector_t sector; | ||
| 2004 | int sectors; | ||
| 2005 | int sect_to_write = r10_bio->sectors; | ||
| 2006 | int ok = 1; | ||
| 2007 | |||
| 2008 | if (rdev->badblocks.shift < 0) | ||
| 2009 | return 0; | ||
| 2010 | |||
| 2011 | block_sectors = 1 << rdev->badblocks.shift; | ||
| 2012 | sector = r10_bio->sector; | ||
| 2013 | sectors = ((r10_bio->sector + block_sectors) | ||
| 2014 | & ~(sector_t)(block_sectors - 1)) | ||
| 2015 | - sector; | ||
| 2016 | |||
| 2017 | while (sect_to_write) { | ||
| 2018 | struct bio *wbio; | ||
| 2019 | if (sectors > sect_to_write) | ||
| 2020 | sectors = sect_to_write; | ||
| 2021 | /* Write at 'sector' for 'sectors' */ | ||
| 2022 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | ||
| 2023 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); | ||
| 2024 | wbio->bi_sector = (r10_bio->devs[i].addr+ | ||
| 2025 | rdev->data_offset+ | ||
| 2026 | (sector - r10_bio->sector)); | ||
| 2027 | wbio->bi_bdev = rdev->bdev; | ||
| 2028 | if (submit_bio_wait(WRITE, wbio) == 0) | ||
| 2029 | /* Failure! */ | ||
| 2030 | ok = rdev_set_badblocks(rdev, sector, | ||
| 2031 | sectors, 0) | ||
| 2032 | && ok; | ||
| 2033 | |||
| 2034 | bio_put(wbio); | ||
| 2035 | sect_to_write -= sectors; | ||
| 2036 | sector += sectors; | ||
| 2037 | sectors = block_sectors; | ||
| 2038 | } | ||
| 2039 | return ok; | ||
| 2040 | } | ||
| 2041 | |||
| 2042 | static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | ||
| 2043 | { | ||
| 2044 | int slot = r10_bio->read_slot; | ||
| 2045 | int mirror = r10_bio->devs[slot].devnum; | ||
| 2046 | struct bio *bio; | ||
| 2047 | conf_t *conf = mddev->private; | ||
| 2048 | mdk_rdev_t *rdev; | ||
| 2049 | char b[BDEVNAME_SIZE]; | ||
| 2050 | unsigned long do_sync; | ||
| 2051 | int max_sectors; | ||
| 2052 | |||
| 2053 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
| 2054 | * the block and we can fix it. | ||
| 2055 | * We freeze all other IO, and try reading the block from | ||
| 2056 | * other devices. When we find one, we re-write | ||
| 2057 | * and check it that fixes the read error. | ||
| 2058 | * This is all done synchronously while the array is | ||
| 2059 | * frozen. | ||
| 2060 | */ | ||
| 2061 | if (mddev->ro == 0) { | ||
| 2062 | freeze_array(conf); | ||
| 2063 | fix_read_error(conf, mddev, r10_bio); | ||
| 2064 | unfreeze_array(conf); | ||
| 2065 | } | ||
| 2066 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
| 2067 | |||
| 2068 | bio = r10_bio->devs[slot].bio; | ||
| 2069 | bdevname(bio->bi_bdev, b); | ||
| 2070 | r10_bio->devs[slot].bio = | ||
| 2071 | mddev->ro ? IO_BLOCKED : NULL; | ||
| 2072 | read_more: | ||
| 2073 | mirror = read_balance(conf, r10_bio, &max_sectors); | ||
| 2074 | if (mirror == -1) { | ||
| 2075 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
| 2076 | " read error for block %llu\n", | ||
| 2077 | mdname(mddev), b, | ||
| 2078 | (unsigned long long)r10_bio->sector); | ||
| 2079 | raid_end_bio_io(r10_bio); | ||
| 2080 | bio_put(bio); | ||
| 2081 | return; | ||
| 2082 | } | ||
| 2083 | |||
| 2084 | do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
| 2085 | if (bio) | ||
| 2086 | bio_put(bio); | ||
| 2087 | slot = r10_bio->read_slot; | ||
| 2088 | rdev = conf->mirrors[mirror].rdev; | ||
| 2089 | printk_ratelimited( | ||
| 2090 | KERN_ERR | ||
| 2091 | "md/raid10:%s: %s: redirecting" | ||
| 2092 | "sector %llu to another mirror\n", | ||
| 2093 | mdname(mddev), | ||
| 2094 | bdevname(rdev->bdev, b), | ||
| 2095 | (unsigned long long)r10_bio->sector); | ||
| 2096 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
| 2097 | GFP_NOIO, mddev); | ||
| 2098 | md_trim_bio(bio, | ||
| 2099 | r10_bio->sector - bio->bi_sector, | ||
| 2100 | max_sectors); | ||
| 2101 | r10_bio->devs[slot].bio = bio; | ||
| 2102 | bio->bi_sector = r10_bio->devs[slot].addr | ||
| 2103 | + rdev->data_offset; | ||
| 2104 | bio->bi_bdev = rdev->bdev; | ||
| 2105 | bio->bi_rw = READ | do_sync; | ||
| 2106 | bio->bi_private = r10_bio; | ||
| 2107 | bio->bi_end_io = raid10_end_read_request; | ||
| 2108 | if (max_sectors < r10_bio->sectors) { | ||
| 2109 | /* Drat - have to split this up more */ | ||
| 2110 | struct bio *mbio = r10_bio->master_bio; | ||
| 2111 | int sectors_handled = | ||
| 2112 | r10_bio->sector + max_sectors | ||
| 2113 | - mbio->bi_sector; | ||
| 2114 | r10_bio->sectors = max_sectors; | ||
| 2115 | spin_lock_irq(&conf->device_lock); | ||
| 2116 | if (mbio->bi_phys_segments == 0) | ||
| 2117 | mbio->bi_phys_segments = 2; | ||
| 2118 | else | ||
| 2119 | mbio->bi_phys_segments++; | ||
| 2120 | spin_unlock_irq(&conf->device_lock); | ||
| 2121 | generic_make_request(bio); | ||
| 2122 | bio = NULL; | ||
| 2123 | |||
| 2124 | r10_bio = mempool_alloc(conf->r10bio_pool, | ||
| 2125 | GFP_NOIO); | ||
| 2126 | r10_bio->master_bio = mbio; | ||
| 2127 | r10_bio->sectors = (mbio->bi_size >> 9) | ||
| 2128 | - sectors_handled; | ||
| 2129 | r10_bio->state = 0; | ||
| 2130 | set_bit(R10BIO_ReadError, | ||
| 2131 | &r10_bio->state); | ||
| 2132 | r10_bio->mddev = mddev; | ||
| 2133 | r10_bio->sector = mbio->bi_sector | ||
| 2134 | + sectors_handled; | ||
| 2135 | |||
| 2136 | goto read_more; | ||
| 2137 | } else | ||
| 2138 | generic_make_request(bio); | ||
| 2139 | } | ||
| 2140 | |||
| 2141 | static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) | ||
| 2142 | { | ||
| 2143 | /* Some sort of write request has finished and it | ||
| 2144 | * succeeded in writing where we thought there was a | ||
| 2145 | * bad block. So forget the bad block. | ||
| 2146 | * Or possibly it failed and we need to record | ||
| 2147 | * a bad block. | ||
| 2148 | */ | ||
| 2149 | int m; | ||
| 2150 | mdk_rdev_t *rdev; | ||
| 2151 | |||
| 2152 | if (test_bit(R10BIO_IsSync, &r10_bio->state) || | ||
| 2153 | test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
| 2154 | for (m = 0; m < conf->copies; m++) { | ||
| 2155 | int dev = r10_bio->devs[m].devnum; | ||
| 2156 | rdev = conf->mirrors[dev].rdev; | ||
| 2157 | if (r10_bio->devs[m].bio == NULL) | ||
| 2158 | continue; | ||
| 2159 | if (test_bit(BIO_UPTODATE, | ||
| 2160 | &r10_bio->devs[m].bio->bi_flags)) { | ||
| 2161 | rdev_clear_badblocks( | ||
| 2162 | rdev, | ||
| 2163 | r10_bio->devs[m].addr, | ||
| 2164 | r10_bio->sectors); | ||
| 2165 | } else { | ||
| 2166 | if (!rdev_set_badblocks( | ||
| 2167 | rdev, | ||
| 2168 | r10_bio->devs[m].addr, | ||
| 2169 | r10_bio->sectors, 0)) | ||
| 2170 | md_error(conf->mddev, rdev); | ||
| 2171 | } | ||
| 2172 | } | ||
| 2173 | put_buf(r10_bio); | ||
| 2174 | } else { | ||
| 2175 | for (m = 0; m < conf->copies; m++) { | ||
| 2176 | int dev = r10_bio->devs[m].devnum; | ||
| 2177 | struct bio *bio = r10_bio->devs[m].bio; | ||
| 2178 | rdev = conf->mirrors[dev].rdev; | ||
| 2179 | if (bio == IO_MADE_GOOD) { | ||
| 2180 | rdev_clear_badblocks( | ||
| 2181 | rdev, | ||
| 2182 | r10_bio->devs[m].addr, | ||
| 2183 | r10_bio->sectors); | ||
| 2184 | rdev_dec_pending(rdev, conf->mddev); | ||
| 2185 | } else if (bio != NULL && | ||
| 2186 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { | ||
| 2187 | if (!narrow_write_error(r10_bio, m)) { | ||
| 2188 | md_error(conf->mddev, rdev); | ||
| 2189 | set_bit(R10BIO_Degraded, | ||
| 2190 | &r10_bio->state); | ||
| 2191 | } | ||
| 2192 | rdev_dec_pending(rdev, conf->mddev); | ||
| 2193 | } | ||
| 2194 | } | ||
| 2195 | if (test_bit(R10BIO_WriteError, | ||
| 2196 | &r10_bio->state)) | ||
| 2197 | close_write(r10_bio); | ||
| 2198 | raid_end_bio_io(r10_bio); | ||
| 2199 | } | ||
| 2200 | } | ||
| 2201 | |||
| 1601 | static void raid10d(mddev_t *mddev) | 2202 | static void raid10d(mddev_t *mddev) |
| 1602 | { | 2203 | { |
| 1603 | r10bio_t *r10_bio; | 2204 | r10bio_t *r10_bio; |
| 1604 | struct bio *bio; | ||
| 1605 | unsigned long flags; | 2205 | unsigned long flags; |
| 1606 | conf_t *conf = mddev->private; | 2206 | conf_t *conf = mddev->private; |
| 1607 | struct list_head *head = &conf->retry_list; | 2207 | struct list_head *head = &conf->retry_list; |
| 1608 | mdk_rdev_t *rdev; | ||
| 1609 | struct blk_plug plug; | 2208 | struct blk_plug plug; |
| 1610 | 2209 | ||
| 1611 | md_check_recovery(mddev); | 2210 | md_check_recovery(mddev); |
| 1612 | 2211 | ||
| 1613 | blk_start_plug(&plug); | 2212 | blk_start_plug(&plug); |
| 1614 | for (;;) { | 2213 | for (;;) { |
| 1615 | char b[BDEVNAME_SIZE]; | ||
| 1616 | 2214 | ||
| 1617 | flush_pending_writes(conf); | 2215 | flush_pending_writes(conf); |
| 1618 | 2216 | ||
| @@ -1628,64 +2226,26 @@ static void raid10d(mddev_t *mddev) | |||
| 1628 | 2226 | ||
| 1629 | mddev = r10_bio->mddev; | 2227 | mddev = r10_bio->mddev; |
| 1630 | conf = mddev->private; | 2228 | conf = mddev->private; |
| 1631 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) | 2229 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
| 2230 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
| 2231 | handle_write_completed(conf, r10_bio); | ||
| 2232 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) | ||
| 1632 | sync_request_write(mddev, r10_bio); | 2233 | sync_request_write(mddev, r10_bio); |
| 1633 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 2234 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
| 1634 | recovery_request_write(mddev, r10_bio); | 2235 | recovery_request_write(mddev, r10_bio); |
| 2236 | else if (test_bit(R10BIO_ReadError, &r10_bio->state)) | ||
| 2237 | handle_read_error(mddev, r10_bio); | ||
| 1635 | else { | 2238 | else { |
| 1636 | int slot = r10_bio->read_slot; | 2239 | /* just a partial read to be scheduled from a |
| 1637 | int mirror = r10_bio->devs[slot].devnum; | 2240 | * separate context |
| 1638 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
| 1639 | * the block and we can fix it. | ||
| 1640 | * We freeze all other IO, and try reading the block from | ||
| 1641 | * other devices. When we find one, we re-write | ||
| 1642 | * and check it that fixes the read error. | ||
| 1643 | * This is all done synchronously while the array is | ||
| 1644 | * frozen. | ||
| 1645 | */ | 2241 | */ |
| 1646 | if (mddev->ro == 0) { | 2242 | int slot = r10_bio->read_slot; |
| 1647 | freeze_array(conf); | 2243 | generic_make_request(r10_bio->devs[slot].bio); |
| 1648 | fix_read_error(conf, mddev, r10_bio); | ||
| 1649 | unfreeze_array(conf); | ||
| 1650 | } | ||
| 1651 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
| 1652 | |||
| 1653 | bio = r10_bio->devs[slot].bio; | ||
| 1654 | r10_bio->devs[slot].bio = | ||
| 1655 | mddev->ro ? IO_BLOCKED : NULL; | ||
| 1656 | mirror = read_balance(conf, r10_bio); | ||
| 1657 | if (mirror == -1) { | ||
| 1658 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
| 1659 | " read error for block %llu\n", | ||
| 1660 | mdname(mddev), | ||
| 1661 | bdevname(bio->bi_bdev,b), | ||
| 1662 | (unsigned long long)r10_bio->sector); | ||
| 1663 | raid_end_bio_io(r10_bio); | ||
| 1664 | bio_put(bio); | ||
| 1665 | } else { | ||
| 1666 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
| 1667 | bio_put(bio); | ||
| 1668 | slot = r10_bio->read_slot; | ||
| 1669 | rdev = conf->mirrors[mirror].rdev; | ||
| 1670 | if (printk_ratelimit()) | ||
| 1671 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" | ||
| 1672 | " another mirror\n", | ||
| 1673 | mdname(mddev), | ||
| 1674 | bdevname(rdev->bdev,b), | ||
| 1675 | (unsigned long long)r10_bio->sector); | ||
| 1676 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
| 1677 | GFP_NOIO, mddev); | ||
| 1678 | r10_bio->devs[slot].bio = bio; | ||
| 1679 | bio->bi_sector = r10_bio->devs[slot].addr | ||
| 1680 | + rdev->data_offset; | ||
| 1681 | bio->bi_bdev = rdev->bdev; | ||
| 1682 | bio->bi_rw = READ | do_sync; | ||
| 1683 | bio->bi_private = r10_bio; | ||
| 1684 | bio->bi_end_io = raid10_end_read_request; | ||
| 1685 | generic_make_request(bio); | ||
| 1686 | } | ||
| 1687 | } | 2244 | } |
| 2245 | |||
| 1688 | cond_resched(); | 2246 | cond_resched(); |
| 2247 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
| 2248 | md_check_recovery(mddev); | ||
| 1689 | } | 2249 | } |
| 1690 | blk_finish_plug(&plug); | 2250 | blk_finish_plug(&plug); |
| 1691 | } | 2251 | } |
| @@ -1746,7 +2306,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 1746 | int i; | 2306 | int i; |
| 1747 | int max_sync; | 2307 | int max_sync; |
| 1748 | sector_t sync_blocks; | 2308 | sector_t sync_blocks; |
| 1749 | |||
| 1750 | sector_t sectors_skipped = 0; | 2309 | sector_t sectors_skipped = 0; |
| 1751 | int chunks_skipped = 0; | 2310 | int chunks_skipped = 0; |
| 1752 | 2311 | ||
| @@ -1828,7 +2387,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 1828 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); | 2387 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); |
| 1829 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 2388 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
| 1830 | /* recovery... the complicated one */ | 2389 | /* recovery... the complicated one */ |
| 1831 | int j, k; | 2390 | int j; |
| 1832 | r10_bio = NULL; | 2391 | r10_bio = NULL; |
| 1833 | 2392 | ||
| 1834 | for (i=0 ; i<conf->raid_disks; i++) { | 2393 | for (i=0 ; i<conf->raid_disks; i++) { |
| @@ -1836,6 +2395,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 1836 | r10bio_t *rb2; | 2395 | r10bio_t *rb2; |
| 1837 | sector_t sect; | 2396 | sector_t sect; |
| 1838 | int must_sync; | 2397 | int must_sync; |
| 2398 | int any_working; | ||
| 1839 | 2399 | ||
| 1840 | if (conf->mirrors[i].rdev == NULL || | 2400 | if (conf->mirrors[i].rdev == NULL || |
| 1841 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) | 2401 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) |
| @@ -1887,19 +2447,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 1887 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | 2447 | must_sync = bitmap_start_sync(mddev->bitmap, sect, |
| 1888 | &sync_blocks, still_degraded); | 2448 | &sync_blocks, still_degraded); |
| 1889 | 2449 | ||
| 2450 | any_working = 0; | ||
| 1890 | for (j=0; j<conf->copies;j++) { | 2451 | for (j=0; j<conf->copies;j++) { |
| 2452 | int k; | ||
| 1891 | int d = r10_bio->devs[j].devnum; | 2453 | int d = r10_bio->devs[j].devnum; |
| 2454 | sector_t from_addr, to_addr; | ||
| 2455 | mdk_rdev_t *rdev; | ||
| 2456 | sector_t sector, first_bad; | ||
| 2457 | int bad_sectors; | ||
| 1892 | if (!conf->mirrors[d].rdev || | 2458 | if (!conf->mirrors[d].rdev || |
| 1893 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) | 2459 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) |
| 1894 | continue; | 2460 | continue; |
| 1895 | /* This is where we read from */ | 2461 | /* This is where we read from */ |
| 2462 | any_working = 1; | ||
| 2463 | rdev = conf->mirrors[d].rdev; | ||
| 2464 | sector = r10_bio->devs[j].addr; | ||
| 2465 | |||
| 2466 | if (is_badblock(rdev, sector, max_sync, | ||
| 2467 | &first_bad, &bad_sectors)) { | ||
| 2468 | if (first_bad > sector) | ||
| 2469 | max_sync = first_bad - sector; | ||
| 2470 | else { | ||
| 2471 | bad_sectors -= (sector | ||
| 2472 | - first_bad); | ||
| 2473 | if (max_sync > bad_sectors) | ||
| 2474 | max_sync = bad_sectors; | ||
| 2475 | continue; | ||
| 2476 | } | ||
| 2477 | } | ||
| 1896 | bio = r10_bio->devs[0].bio; | 2478 | bio = r10_bio->devs[0].bio; |
| 1897 | bio->bi_next = biolist; | 2479 | bio->bi_next = biolist; |
| 1898 | biolist = bio; | 2480 | biolist = bio; |
| 1899 | bio->bi_private = r10_bio; | 2481 | bio->bi_private = r10_bio; |
| 1900 | bio->bi_end_io = end_sync_read; | 2482 | bio->bi_end_io = end_sync_read; |
| 1901 | bio->bi_rw = READ; | 2483 | bio->bi_rw = READ; |
| 1902 | bio->bi_sector = r10_bio->devs[j].addr + | 2484 | from_addr = r10_bio->devs[j].addr; |
| 2485 | bio->bi_sector = from_addr + | ||
| 1903 | conf->mirrors[d].rdev->data_offset; | 2486 | conf->mirrors[d].rdev->data_offset; |
| 1904 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2487 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
| 1905 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2488 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
| @@ -1916,26 +2499,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 1916 | bio->bi_private = r10_bio; | 2499 | bio->bi_private = r10_bio; |
| 1917 | bio->bi_end_io = end_sync_write; | 2500 | bio->bi_end_io = end_sync_write; |
| 1918 | bio->bi_rw = WRITE; | 2501 | bio->bi_rw = WRITE; |
| 1919 | bio->bi_sector = r10_bio->devs[k].addr + | 2502 | to_addr = r10_bio->devs[k].addr; |
| 2503 | bio->bi_sector = to_addr + | ||
| 1920 | conf->mirrors[i].rdev->data_offset; | 2504 | conf->mirrors[i].rdev->data_offset; |
| 1921 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | 2505 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; |
| 1922 | 2506 | ||
| 1923 | r10_bio->devs[0].devnum = d; | 2507 | r10_bio->devs[0].devnum = d; |
| 2508 | r10_bio->devs[0].addr = from_addr; | ||
| 1924 | r10_bio->devs[1].devnum = i; | 2509 | r10_bio->devs[1].devnum = i; |
| 2510 | r10_bio->devs[1].addr = to_addr; | ||
| 1925 | 2511 | ||
| 1926 | break; | 2512 | break; |
| 1927 | } | 2513 | } |
| 1928 | if (j == conf->copies) { | 2514 | if (j == conf->copies) { |
| 1929 | /* Cannot recover, so abort the recovery */ | 2515 | /* Cannot recover, so abort the recovery or |
| 2516 | * record a bad block */ | ||
| 1930 | put_buf(r10_bio); | 2517 | put_buf(r10_bio); |
| 1931 | if (rb2) | 2518 | if (rb2) |
| 1932 | atomic_dec(&rb2->remaining); | 2519 | atomic_dec(&rb2->remaining); |
| 1933 | r10_bio = rb2; | 2520 | r10_bio = rb2; |
| 1934 | if (!test_and_set_bit(MD_RECOVERY_INTR, | 2521 | if (any_working) { |
| 1935 | &mddev->recovery)) | 2522 | /* problem is that there are bad blocks |
| 1936 | printk(KERN_INFO "md/raid10:%s: insufficient " | 2523 | * on other device(s) |
| 1937 | "working devices for recovery.\n", | 2524 | */ |
| 1938 | mdname(mddev)); | 2525 | int k; |
| 2526 | for (k = 0; k < conf->copies; k++) | ||
| 2527 | if (r10_bio->devs[k].devnum == i) | ||
| 2528 | break; | ||
| 2529 | if (!rdev_set_badblocks( | ||
| 2530 | conf->mirrors[i].rdev, | ||
| 2531 | r10_bio->devs[k].addr, | ||
| 2532 | max_sync, 0)) | ||
| 2533 | any_working = 0; | ||
| 2534 | } | ||
| 2535 | if (!any_working) { | ||
| 2536 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
| 2537 | &mddev->recovery)) | ||
| 2538 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
| 2539 | "working devices for recovery.\n", | ||
| 2540 | mdname(mddev)); | ||
| 2541 | conf->mirrors[i].recovery_disabled | ||
| 2542 | = mddev->recovery_disabled; | ||
| 2543 | } | ||
| 1939 | break; | 2544 | break; |
| 1940 | } | 2545 | } |
| 1941 | } | 2546 | } |
| @@ -1979,12 +2584,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 1979 | 2584 | ||
| 1980 | for (i=0; i<conf->copies; i++) { | 2585 | for (i=0; i<conf->copies; i++) { |
| 1981 | int d = r10_bio->devs[i].devnum; | 2586 | int d = r10_bio->devs[i].devnum; |
| 2587 | sector_t first_bad, sector; | ||
| 2588 | int bad_sectors; | ||
| 2589 | |||
| 1982 | bio = r10_bio->devs[i].bio; | 2590 | bio = r10_bio->devs[i].bio; |
| 1983 | bio->bi_end_io = NULL; | 2591 | bio->bi_end_io = NULL; |
| 1984 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 2592 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
| 1985 | if (conf->mirrors[d].rdev == NULL || | 2593 | if (conf->mirrors[d].rdev == NULL || |
| 1986 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) | 2594 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) |
| 1987 | continue; | 2595 | continue; |
| 2596 | sector = r10_bio->devs[i].addr; | ||
| 2597 | if (is_badblock(conf->mirrors[d].rdev, | ||
| 2598 | sector, max_sync, | ||
| 2599 | &first_bad, &bad_sectors)) { | ||
| 2600 | if (first_bad > sector) | ||
| 2601 | max_sync = first_bad - sector; | ||
| 2602 | else { | ||
| 2603 | bad_sectors -= (sector - first_bad); | ||
| 2604 | if (max_sync > bad_sectors) | ||
| 2605 | max_sync = bad_sectors; | ||
| 2606 | continue; | ||
| 2607 | } | ||
| 2608 | } | ||
| 1988 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2609 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
| 1989 | atomic_inc(&r10_bio->remaining); | 2610 | atomic_inc(&r10_bio->remaining); |
| 1990 | bio->bi_next = biolist; | 2611 | bio->bi_next = biolist; |
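Both the recovery path and the resync path in sync_request() now clip the sync window with is_badblock() so a read never starts inside, or runs into, a known bad range. A rough standalone sketch of that clipping rule, with a stubbed bad-block table standing in for the real is_badblock(), might look like this:

```c
#include <stdio.h>

typedef unsigned long long sector_t;

/* stand-in for one acknowledged bad range on a device; the real code
 * asks is_badblock(), which consults the per-rdev bad block list */
struct bad_range { sector_t start; int len; };

static int is_badblock_stub(const struct bad_range *bb, sector_t sector,
			    int sectors, sector_t *first_bad, int *bad_sectors)
{
	if (bb->start >= sector + sectors || bb->start + bb->len <= sector)
		return 0;	/* no overlap with [sector, sector+sectors) */
	*first_bad = bb->start;
	*bad_sectors = bb->len;
	return 1;
}

/* Clip max_sync the way the hunk above does: if the bad range begins
 * after 'sector', only sync up to it; if 'sector' is already inside the
 * bad range, skip this device (return 0) and shrink the window so the
 * caller does not try to sync past the end of the bad range either.
 */
static int clip_window(const struct bad_range *bb, sector_t sector, int *max_sync)
{
	sector_t first_bad;
	int bad_sectors;

	if (!is_badblock_stub(bb, sector, *max_sync, &first_bad, &bad_sectors))
		return 1;			/* whole window is clean */
	if (first_bad > sector) {
		*max_sync = first_bad - sector;	/* stop just before the bad range */
		return 1;
	}
	bad_sectors -= (sector - first_bad);	/* bad sectors still ahead of us */
	if (*max_sync > bad_sectors)
		*max_sync = bad_sectors;
	return 0;				/* don't read from this device */
}

int main(void)
{
	struct bad_range bb = { .start = 120, .len = 16 };
	int max_sync = 64;

	printf("usable=%d max_sync=%d\n", clip_window(&bb, 100, &max_sync), max_sync);
	max_sync = 64;
	printf("usable=%d max_sync=%d\n", clip_window(&bb, 124, &max_sync), max_sync);
	return 0;
}
```

With the window clipped this way, each pass either stops short of the bad range or skips the affected device entirely for that stretch, which is exactly why the giveup path below can now be reached with all drives merely having a bad block rather than being failed.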
| @@ -1992,7 +2613,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 1992 | bio->bi_private = r10_bio; | 2613 | bio->bi_private = r10_bio; |
| 1993 | bio->bi_end_io = end_sync_read; | 2614 | bio->bi_end_io = end_sync_read; |
| 1994 | bio->bi_rw = READ; | 2615 | bio->bi_rw = READ; |
| 1995 | bio->bi_sector = r10_bio->devs[i].addr + | 2616 | bio->bi_sector = sector + |
| 1996 | conf->mirrors[d].rdev->data_offset; | 2617 | conf->mirrors[d].rdev->data_offset; |
| 1997 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2618 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
| 1998 | count++; | 2619 | count++; |
| @@ -2079,7 +2700,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
| 2079 | return sectors_skipped + nr_sectors; | 2700 | return sectors_skipped + nr_sectors; |
| 2080 | giveup: | 2701 | giveup: |
| 2081 | /* There is nowhere to write, so all non-sync | 2702 | /* There is nowhere to write, so all non-sync |
| 2082 | * drives must be failed, so try the next chunk... | 2703 | * drives must be failed or in resync, or all |
| 2704 | * drives have a bad block, so try the next chunk... | ||
| 2083 | */ | 2705 | */ |
| 2084 | if (sector_nr + max_sync < max_sector) | 2706 | if (sector_nr + max_sync < max_sector) |
| 2085 | max_sector = sector_nr + max_sync; | 2707 | max_sector = sector_nr + max_sync; |
| @@ -2249,6 +2871,7 @@ static int run(mddev_t *mddev) | |||
| 2249 | (conf->raid_disks / conf->near_copies)); | 2871 | (conf->raid_disks / conf->near_copies)); |
| 2250 | 2872 | ||
| 2251 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2873 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 2874 | |||
| 2252 | disk_idx = rdev->raid_disk; | 2875 | disk_idx = rdev->raid_disk; |
| 2253 | if (disk_idx >= conf->raid_disks | 2876 | if (disk_idx >= conf->raid_disks |
| 2254 | || disk_idx < 0) | 2877 | || disk_idx < 0) |
| @@ -2271,7 +2894,7 @@ static int run(mddev_t *mddev) | |||
| 2271 | disk->head_position = 0; | 2894 | disk->head_position = 0; |
| 2272 | } | 2895 | } |
| 2273 | /* need to check that every block has at least one working mirror */ | 2896 | /* need to check that every block has at least one working mirror */ |
| 2274 | if (!enough(conf)) { | 2897 | if (!enough(conf, -1)) { |
| 2275 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 2898 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
| 2276 | mdname(mddev)); | 2899 | mdname(mddev)); |
| 2277 | goto out_free_conf; | 2900 | goto out_free_conf; |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 944b1104d3b4..79cb52a0d4a2 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
| @@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t; | |||
| 6 | struct mirror_info { | 6 | struct mirror_info { |
| 7 | mdk_rdev_t *rdev; | 7 | mdk_rdev_t *rdev; |
| 8 | sector_t head_position; | 8 | sector_t head_position; |
| 9 | int recovery_disabled; /* matches | ||
| 10 | * mddev->recovery_disabled | ||
| 11 | * when we shouldn't try | ||
| 12 | * recovering this device. | ||
| 13 | */ | ||
| 9 | }; | 14 | }; |
| 10 | 15 | ||
| 11 | typedef struct r10bio_s r10bio_t; | 16 | typedef struct r10bio_s r10bio_t; |
| @@ -113,10 +118,26 @@ struct r10bio_s { | |||
| 113 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | 118 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer |
| 114 | */ | 119 | */ |
| 115 | #define IO_BLOCKED ((struct bio*)1) | 120 | #define IO_BLOCKED ((struct bio*)1) |
| 121 | /* When we successfully write to a known bad-block, we need to remove the | ||
| 122 | * bad-block marking which must be done from process context. So we record | ||
| 123 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
| 124 | */ | ||
| 125 | #define IO_MADE_GOOD ((struct bio *)2) | ||
| 126 | |||
| 127 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
| 116 | 128 | ||
| 117 | /* bits for r10bio.state */ | 129 | /* bits for r10bio.state */ |
| 118 | #define R10BIO_Uptodate 0 | 130 | #define R10BIO_Uptodate 0 |
| 119 | #define R10BIO_IsSync 1 | 131 | #define R10BIO_IsSync 1 |
| 120 | #define R10BIO_IsRecover 2 | 132 | #define R10BIO_IsRecover 2 |
| 121 | #define R10BIO_Degraded 3 | 133 | #define R10BIO_Degraded 3 |
| 134 | /* Set ReadError on bios that experience a read error | ||
| 135 | * so that raid10d knows what to do with them. | ||
| 136 | */ | ||
| 137 | #define R10BIO_ReadError 4 | ||
| 138 | /* If a write for this request means we can clear some | ||
| 139 | * known-bad-block records, we set this flag. | ||
| 140 | */ | ||
| 141 | #define R10BIO_MadeGood 5 | ||
| 142 | #define R10BIO_WriteError 6 | ||
| 122 | #endif | 143 | #endif |
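IO_BLOCKED and IO_MADE_GOOD above are not real bio pointers; they are tiny integer sentinels stored in the pointer slot so devs[n].bio can carry status as well as a request, and BIO_SPECIAL() is the guard used before treating the value as a dereferenceable bio. A toy illustration of the same trick (standalone types, not the kernel's):

```c
#include <stdio.h>

struct bio { int dummy; };

/* small sentinel values stored in place of a real pointer, the same
 * pattern as IO_BLOCKED / IO_MADE_GOOD / BIO_SPECIAL() above */
#define IO_BLOCKED	((struct bio *)1)
#define IO_MADE_GOOD	((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void classify(struct bio *bio)
{
	if (!BIO_SPECIAL(bio)) {
		printf("ordinary bio, safe to dereference\n");
		return;
	}
	if (bio == NULL)
		printf("no request outstanding\n");
	else if (bio == IO_BLOCKED)
		printf("device temporarily excluded for this range\n");
	else /* IO_MADE_GOOD */
		printf("write over a known bad block succeeded - clear the record\n");
}

int main(void)
{
	struct bio real;

	classify(NULL);
	classify(IO_BLOCKED);
	classify(IO_MADE_GOOD);
	classify(&real);	/* real pointers are never <= 2 */
	return 0;
}
```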
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b72edf35ec54..dbae459fb02d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
| 52 | #include <linux/cpu.h> | 52 | #include <linux/cpu.h> |
| 53 | #include <linux/slab.h> | 53 | #include <linux/slab.h> |
| 54 | #include <linux/ratelimit.h> | ||
| 54 | #include "md.h" | 55 | #include "md.h" |
| 55 | #include "raid5.h" | 56 | #include "raid5.h" |
| 56 | #include "raid0.h" | 57 | #include "raid0.h" |
| @@ -96,8 +97,6 @@ | |||
| 96 | #define __inline__ | 97 | #define __inline__ |
| 97 | #endif | 98 | #endif |
| 98 | 99 | ||
| 99 | #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) | ||
| 100 | |||
| 101 | /* | 100 | /* |
| 102 | * We maintain a biased count of active stripes in the bottom 16 bits of | 101 | * We maintain a biased count of active stripes in the bottom 16 bits of |
| 103 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | 102 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits |
| @@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
| 341 | (unsigned long long)sh->sector, i, dev->toread, | 340 | (unsigned long long)sh->sector, i, dev->toread, |
| 342 | dev->read, dev->towrite, dev->written, | 341 | dev->read, dev->towrite, dev->written, |
| 343 | test_bit(R5_LOCKED, &dev->flags)); | 342 | test_bit(R5_LOCKED, &dev->flags)); |
| 344 | BUG(); | 343 | WARN_ON(1); |
| 345 | } | 344 | } |
| 346 | dev->flags = 0; | 345 | dev->flags = 0; |
| 347 | raid5_build_block(sh, i, previous); | 346 | raid5_build_block(sh, i, previous); |
| @@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 527 | atomic_inc(&rdev->nr_pending); | 526 | atomic_inc(&rdev->nr_pending); |
| 528 | rcu_read_unlock(); | 527 | rcu_read_unlock(); |
| 529 | 528 | ||
| 529 | /* We have already checked bad blocks for reads. Now | ||
| 530 | * need to check for writes. | ||
| 531 | */ | ||
| 532 | while ((rw & WRITE) && rdev && | ||
| 533 | test_bit(WriteErrorSeen, &rdev->flags)) { | ||
| 534 | sector_t first_bad; | ||
| 535 | int bad_sectors; | ||
| 536 | int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, | ||
| 537 | &first_bad, &bad_sectors); | ||
| 538 | if (!bad) | ||
| 539 | break; | ||
| 540 | |||
| 541 | if (bad < 0) { | ||
| 542 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
| 543 | if (!conf->mddev->external && | ||
| 544 | conf->mddev->flags) { | ||
| 545 | /* It is very unlikely, but we might | ||
| 546 | * still need to write out the | ||
| 547 | * bad block log - better give it | ||
| 548 | * a chance */ | ||
| 549 | md_check_recovery(conf->mddev); | ||
| 550 | } | ||
| 551 | md_wait_for_blocked_rdev(rdev, conf->mddev); | ||
| 552 | } else { | ||
| 553 | /* Acknowledged bad block - skip the write */ | ||
| 554 | rdev_dec_pending(rdev, conf->mddev); | ||
| 555 | rdev = NULL; | ||
| 556 | } | ||
| 557 | } | ||
| 558 | |||
| 530 | if (rdev) { | 559 | if (rdev) { |
| 531 | if (s->syncing || s->expanding || s->expanded) | 560 | if (s->syncing || s->expanding || s->expanded) |
| 532 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 561 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
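The new loop in ops_run_io() above keys off the three possible answers is_badblock() can give for a write: zero (range is clean, issue the write), negative (an unacknowledged bad block, so flag the rdev as blocked and wait for the bad block log to be written), or positive (an acknowledged bad block, so quietly drop the write). A stand-alone sketch of that three-way decision, with the md helpers reduced to return values, could read:

```c
#include <stdio.h>

/* result codes shaped like is_badblock(): 0 = clean, >0 = acknowledged
 * bad sectors in range, <0 = bad sectors present but not yet recorded
 * in the on-disk bad block log (unacknowledged) */
enum bb_state { BB_CLEAN = 0, BB_ACKED = 1, BB_UNACKED = -1 };

enum write_action { WRITE_ISSUE, WRITE_WAIT_THEN_RETRY, WRITE_SKIP };

static enum write_action gate_write(enum bb_state bad)
{
	if (bad == BB_CLEAN)
		return WRITE_ISSUE;		/* nothing known bad here */
	if (bad < 0)
		return WRITE_WAIT_THEN_RETRY;	/* block until the bad block
						 * log has been written, then
						 * look again */
	return WRITE_SKIP;			/* acknowledged bad block: the
						 * data is already known lost,
						 * don't touch the device */
}

int main(void)
{
	static const char *names[] = {
		[WRITE_ISSUE] = "issue write",
		[WRITE_WAIT_THEN_RETRY] = "wait for metadata, then retry",
		[WRITE_SKIP] = "skip write",
	};

	printf("%s\n", names[gate_write(BB_CLEAN)]);
	printf("%s\n", names[gate_write(BB_UNACKED)]);
	printf("%s\n", names[gate_write(BB_ACKED)]);
	return 0;
}
```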
| @@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 548 | bi->bi_io_vec[0].bv_offset = 0; | 577 | bi->bi_io_vec[0].bv_offset = 0; |
| 549 | bi->bi_size = STRIPE_SIZE; | 578 | bi->bi_size = STRIPE_SIZE; |
| 550 | bi->bi_next = NULL; | 579 | bi->bi_next = NULL; |
| 551 | if ((rw & WRITE) && | ||
| 552 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
| 553 | atomic_add(STRIPE_SECTORS, | ||
| 554 | &rdev->corrected_errors); | ||
| 555 | generic_make_request(bi); | 580 | generic_make_request(bi); |
| 556 | } else { | 581 | } else { |
| 557 | if (rw & WRITE) | 582 | if (rw & WRITE) |
| @@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 1020 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1045 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
| 1021 | struct bio *wbi; | 1046 | struct bio *wbi; |
| 1022 | 1047 | ||
| 1023 | spin_lock(&sh->lock); | 1048 | spin_lock_irq(&sh->raid_conf->device_lock); |
| 1024 | chosen = dev->towrite; | 1049 | chosen = dev->towrite; |
| 1025 | dev->towrite = NULL; | 1050 | dev->towrite = NULL; |
| 1026 | BUG_ON(dev->written); | 1051 | BUG_ON(dev->written); |
| 1027 | wbi = dev->written = chosen; | 1052 | wbi = dev->written = chosen; |
| 1028 | spin_unlock(&sh->lock); | 1053 | spin_unlock_irq(&sh->raid_conf->device_lock); |
| 1029 | 1054 | ||
| 1030 | while (wbi && wbi->bi_sector < | 1055 | while (wbi && wbi->bi_sector < |
| 1031 | dev->sector + STRIPE_SECTORS) { | 1056 | dev->sector + STRIPE_SECTORS) { |
| @@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1315 | static int grow_one_stripe(raid5_conf_t *conf) | 1340 | static int grow_one_stripe(raid5_conf_t *conf) |
| 1316 | { | 1341 | { |
| 1317 | struct stripe_head *sh; | 1342 | struct stripe_head *sh; |
| 1318 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1343 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); |
| 1319 | if (!sh) | 1344 | if (!sh) |
| 1320 | return 0; | 1345 | return 0; |
| 1321 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); | 1346 | |
| 1322 | sh->raid_conf = conf; | 1347 | sh->raid_conf = conf; |
| 1323 | spin_lock_init(&sh->lock); | ||
| 1324 | #ifdef CONFIG_MULTICORE_RAID456 | 1348 | #ifdef CONFIG_MULTICORE_RAID456 |
| 1325 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1349 | init_waitqueue_head(&sh->ops.wait_for_ops); |
| 1326 | #endif | 1350 | #endif |
| @@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
| 1435 | return -ENOMEM; | 1459 | return -ENOMEM; |
| 1436 | 1460 | ||
| 1437 | for (i = conf->max_nr_stripes; i; i--) { | 1461 | for (i = conf->max_nr_stripes; i; i--) { |
| 1438 | nsh = kmem_cache_alloc(sc, GFP_KERNEL); | 1462 | nsh = kmem_cache_zalloc(sc, GFP_KERNEL); |
| 1439 | if (!nsh) | 1463 | if (!nsh) |
| 1440 | break; | 1464 | break; |
| 1441 | 1465 | ||
| 1442 | memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); | ||
| 1443 | |||
| 1444 | nsh->raid_conf = conf; | 1466 | nsh->raid_conf = conf; |
| 1445 | spin_lock_init(&nsh->lock); | ||
| 1446 | #ifdef CONFIG_MULTICORE_RAID456 | 1467 | #ifdef CONFIG_MULTICORE_RAID456 |
| 1447 | init_waitqueue_head(&nsh->ops.wait_for_ops); | 1468 | init_waitqueue_head(&nsh->ops.wait_for_ops); |
| 1448 | #endif | 1469 | #endif |
| @@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1587 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1608 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
| 1588 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1609 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
| 1589 | rdev = conf->disks[i].rdev; | 1610 | rdev = conf->disks[i].rdev; |
| 1590 | printk_rl(KERN_INFO "md/raid:%s: read error corrected" | 1611 | printk_ratelimited( |
| 1591 | " (%lu sectors at %llu on %s)\n", | 1612 | KERN_INFO |
| 1592 | mdname(conf->mddev), STRIPE_SECTORS, | 1613 | "md/raid:%s: read error corrected" |
| 1593 | (unsigned long long)(sh->sector | 1614 | " (%lu sectors at %llu on %s)\n", |
| 1594 | + rdev->data_offset), | 1615 | mdname(conf->mddev), STRIPE_SECTORS, |
| 1595 | bdevname(rdev->bdev, b)); | 1616 | (unsigned long long)(sh->sector |
| 1617 | + rdev->data_offset), | ||
| 1618 | bdevname(rdev->bdev, b)); | ||
| 1619 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
| 1596 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1620 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
| 1597 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1621 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
| 1598 | } | 1622 | } |
| @@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1606 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1630 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
| 1607 | atomic_inc(&rdev->read_errors); | 1631 | atomic_inc(&rdev->read_errors); |
| 1608 | if (conf->mddev->degraded >= conf->max_degraded) | 1632 | if (conf->mddev->degraded >= conf->max_degraded) |
| 1609 | printk_rl(KERN_WARNING | 1633 | printk_ratelimited( |
| 1610 | "md/raid:%s: read error not correctable " | 1634 | KERN_WARNING |
| 1611 | "(sector %llu on %s).\n", | 1635 | "md/raid:%s: read error not correctable " |
| 1612 | mdname(conf->mddev), | 1636 | "(sector %llu on %s).\n", |
| 1613 | (unsigned long long)(sh->sector | 1637 | mdname(conf->mddev), |
| 1614 | + rdev->data_offset), | 1638 | (unsigned long long)(sh->sector |
| 1615 | bdn); | 1639 | + rdev->data_offset), |
| 1640 | bdn); | ||
| 1616 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1641 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
| 1617 | /* Oh, no!!! */ | 1642 | /* Oh, no!!! */ |
| 1618 | printk_rl(KERN_WARNING | 1643 | printk_ratelimited( |
| 1619 | "md/raid:%s: read error NOT corrected!! " | 1644 | KERN_WARNING |
| 1620 | "(sector %llu on %s).\n", | 1645 | "md/raid:%s: read error NOT corrected!! " |
| 1621 | mdname(conf->mddev), | 1646 | "(sector %llu on %s).\n", |
| 1622 | (unsigned long long)(sh->sector | 1647 | mdname(conf->mddev), |
| 1623 | + rdev->data_offset), | 1648 | (unsigned long long)(sh->sector |
| 1624 | bdn); | 1649 | + rdev->data_offset), |
| 1650 | bdn); | ||
| 1625 | else if (atomic_read(&rdev->read_errors) | 1651 | else if (atomic_read(&rdev->read_errors) |
| 1626 | > conf->max_nr_stripes) | 1652 | > conf->max_nr_stripes) |
| 1627 | printk(KERN_WARNING | 1653 | printk(KERN_WARNING |
| @@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
| 1649 | raid5_conf_t *conf = sh->raid_conf; | 1675 | raid5_conf_t *conf = sh->raid_conf; |
| 1650 | int disks = sh->disks, i; | 1676 | int disks = sh->disks, i; |
| 1651 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1677 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
| 1678 | sector_t first_bad; | ||
| 1679 | int bad_sectors; | ||
| 1652 | 1680 | ||
| 1653 | for (i=0 ; i<disks; i++) | 1681 | for (i=0 ; i<disks; i++) |
| 1654 | if (bi == &sh->dev[i].req) | 1682 | if (bi == &sh->dev[i].req) |
| @@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
| 1662 | return; | 1690 | return; |
| 1663 | } | 1691 | } |
| 1664 | 1692 | ||
| 1665 | if (!uptodate) | 1693 | if (!uptodate) { |
| 1666 | md_error(conf->mddev, conf->disks[i].rdev); | 1694 | set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); |
| 1695 | set_bit(R5_WriteError, &sh->dev[i].flags); | ||
| 1696 | } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, | ||
| 1697 | &first_bad, &bad_sectors)) | ||
| 1698 | set_bit(R5_MadeGood, &sh->dev[i].flags); | ||
| 1667 | 1699 | ||
| 1668 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1700 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); |
| 1669 | 1701 | ||
| @@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1710 | */ | 1742 | */ |
| 1711 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1743 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 1712 | } | 1744 | } |
| 1745 | set_bit(Blocked, &rdev->flags); | ||
| 1713 | set_bit(Faulty, &rdev->flags); | 1746 | set_bit(Faulty, &rdev->flags); |
| 1714 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1747 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 1715 | printk(KERN_ALERT | 1748 | printk(KERN_ALERT |
| @@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
| 1760 | /* | 1793 | /* |
| 1761 | * Select the parity disk based on the user selected algorithm. | 1794 | * Select the parity disk based on the user selected algorithm. |
| 1762 | */ | 1795 | */ |
| 1763 | pd_idx = qd_idx = ~0; | 1796 | pd_idx = qd_idx = -1; |
| 1764 | switch(conf->level) { | 1797 | switch(conf->level) { |
| 1765 | case 4: | 1798 | case 4: |
| 1766 | pd_idx = data_disks; | 1799 | pd_idx = data_disks; |
| @@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2143 | raid5_conf_t *conf = sh->raid_conf; | 2176 | raid5_conf_t *conf = sh->raid_conf; |
| 2144 | int firstwrite=0; | 2177 | int firstwrite=0; |
| 2145 | 2178 | ||
| 2146 | pr_debug("adding bh b#%llu to stripe s#%llu\n", | 2179 | pr_debug("adding bi b#%llu to stripe s#%llu\n", |
| 2147 | (unsigned long long)bi->bi_sector, | 2180 | (unsigned long long)bi->bi_sector, |
| 2148 | (unsigned long long)sh->sector); | 2181 | (unsigned long long)sh->sector); |
| 2149 | 2182 | ||
| 2150 | 2183 | ||
| 2151 | spin_lock(&sh->lock); | ||
| 2152 | spin_lock_irq(&conf->device_lock); | 2184 | spin_lock_irq(&conf->device_lock); |
| 2153 | if (forwrite) { | 2185 | if (forwrite) { |
| 2154 | bip = &sh->dev[dd_idx].towrite; | 2186 | bip = &sh->dev[dd_idx].towrite; |
| @@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2169 | bi->bi_next = *bip; | 2201 | bi->bi_next = *bip; |
| 2170 | *bip = bi; | 2202 | *bip = bi; |
| 2171 | bi->bi_phys_segments++; | 2203 | bi->bi_phys_segments++; |
| 2172 | spin_unlock_irq(&conf->device_lock); | ||
| 2173 | spin_unlock(&sh->lock); | ||
| 2174 | |||
| 2175 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
| 2176 | (unsigned long long)bi->bi_sector, | ||
| 2177 | (unsigned long long)sh->sector, dd_idx); | ||
| 2178 | |||
| 2179 | if (conf->mddev->bitmap && firstwrite) { | ||
| 2180 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
| 2181 | STRIPE_SECTORS, 0); | ||
| 2182 | sh->bm_seq = conf->seq_flush+1; | ||
| 2183 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 2184 | } | ||
| 2185 | 2204 | ||
| 2186 | if (forwrite) { | 2205 | if (forwrite) { |
| 2187 | /* check if page is covered */ | 2206 | /* check if page is covered */ |
| @@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2196 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2215 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
| 2197 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2216 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
| 2198 | } | 2217 | } |
| 2218 | spin_unlock_irq(&conf->device_lock); | ||
| 2219 | |||
| 2220 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
| 2221 | (unsigned long long)(*bip)->bi_sector, | ||
| 2222 | (unsigned long long)sh->sector, dd_idx); | ||
| 2223 | |||
| 2224 | if (conf->mddev->bitmap && firstwrite) { | ||
| 2225 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
| 2226 | STRIPE_SECTORS, 0); | ||
| 2227 | sh->bm_seq = conf->seq_flush+1; | ||
| 2228 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 2229 | } | ||
| 2199 | return 1; | 2230 | return 1; |
| 2200 | 2231 | ||
| 2201 | overlap: | 2232 | overlap: |
| 2202 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | 2233 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); |
| 2203 | spin_unlock_irq(&conf->device_lock); | 2234 | spin_unlock_irq(&conf->device_lock); |
| 2204 | spin_unlock(&sh->lock); | ||
| 2205 | return 0; | 2235 | return 0; |
| 2206 | } | 2236 | } |
| 2207 | 2237 | ||
| @@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2238 | rcu_read_lock(); | 2268 | rcu_read_lock(); |
| 2239 | rdev = rcu_dereference(conf->disks[i].rdev); | 2269 | rdev = rcu_dereference(conf->disks[i].rdev); |
| 2240 | if (rdev && test_bit(In_sync, &rdev->flags)) | 2270 | if (rdev && test_bit(In_sync, &rdev->flags)) |
| 2241 | /* multiple read failures in one stripe */ | 2271 | atomic_inc(&rdev->nr_pending); |
| 2242 | md_error(conf->mddev, rdev); | 2272 | else |
| 2273 | rdev = NULL; | ||
| 2243 | rcu_read_unlock(); | 2274 | rcu_read_unlock(); |
| 2275 | if (rdev) { | ||
| 2276 | if (!rdev_set_badblocks( | ||
| 2277 | rdev, | ||
| 2278 | sh->sector, | ||
| 2279 | STRIPE_SECTORS, 0)) | ||
| 2280 | md_error(conf->mddev, rdev); | ||
| 2281 | rdev_dec_pending(rdev, conf->mddev); | ||
| 2282 | } | ||
| 2244 | } | 2283 | } |
| 2245 | spin_lock_irq(&conf->device_lock); | 2284 | spin_lock_irq(&conf->device_lock); |
| 2246 | /* fail all writes first */ | 2285 | /* fail all writes first */ |
| @@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2308 | if (bitmap_end) | 2347 | if (bitmap_end) |
| 2309 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | 2348 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
| 2310 | STRIPE_SECTORS, 0, 0); | 2349 | STRIPE_SECTORS, 0, 0); |
| 2350 | /* If we were in the middle of a write the parity block might | ||
| 2351 | * still be locked - so just clear all R5_LOCKED flags | ||
| 2352 | */ | ||
| 2353 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
| 2311 | } | 2354 | } |
| 2312 | 2355 | ||
| 2313 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | 2356 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) |
| @@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2315 | md_wakeup_thread(conf->mddev->thread); | 2358 | md_wakeup_thread(conf->mddev->thread); |
| 2316 | } | 2359 | } |
| 2317 | 2360 | ||
| 2318 | /* fetch_block5 - checks the given member device to see if its data needs | 2361 | static void |
| 2319 | * to be read or computed to satisfy a request. | 2362 | handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh, |
| 2320 | * | 2363 | struct stripe_head_state *s) |
| 2321 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
| 2322 | * 0 to tell the loop in handle_stripe_fill5 to continue | ||
| 2323 | */ | ||
| 2324 | static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | ||
| 2325 | int disk_idx, int disks) | ||
| 2326 | { | ||
| 2327 | struct r5dev *dev = &sh->dev[disk_idx]; | ||
| 2328 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; | ||
| 2329 | |||
| 2330 | /* is the data in this block needed, and can we get it? */ | ||
| 2331 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
| 2332 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
| 2333 | (dev->toread || | ||
| 2334 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
| 2335 | s->syncing || s->expanding || | ||
| 2336 | (s->failed && | ||
| 2337 | (failed_dev->toread || | ||
| 2338 | (failed_dev->towrite && | ||
| 2339 | !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { | ||
| 2340 | /* We would like to get this block, possibly by computing it, | ||
| 2341 | * otherwise read it if the backing disk is insync | ||
| 2342 | */ | ||
| 2343 | if ((s->uptodate == disks - 1) && | ||
| 2344 | (s->failed && disk_idx == s->failed_num)) { | ||
| 2345 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
| 2346 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
| 2347 | set_bit(R5_Wantcompute, &dev->flags); | ||
| 2348 | sh->ops.target = disk_idx; | ||
| 2349 | sh->ops.target2 = -1; | ||
| 2350 | s->req_compute = 1; | ||
| 2351 | /* Careful: from this point on 'uptodate' is in the eye | ||
| 2352 | * of raid_run_ops which services 'compute' operations | ||
| 2353 | * before writes. R5_Wantcompute flags a block that will | ||
| 2354 | * be R5_UPTODATE by the time it is needed for a | ||
| 2355 | * subsequent operation. | ||
| 2356 | */ | ||
| 2357 | s->uptodate++; | ||
| 2358 | return 1; /* uptodate + compute == disks */ | ||
| 2359 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
| 2360 | set_bit(R5_LOCKED, &dev->flags); | ||
| 2361 | set_bit(R5_Wantread, &dev->flags); | ||
| 2362 | s->locked++; | ||
| 2363 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, | ||
| 2364 | s->syncing); | ||
| 2365 | } | ||
| 2366 | } | ||
| 2367 | |||
| 2368 | return 0; | ||
| 2369 | } | ||
| 2370 | |||
| 2371 | /** | ||
| 2372 | * handle_stripe_fill5 - read or compute data to satisfy pending requests. | ||
| 2373 | */ | ||
| 2374 | static void handle_stripe_fill5(struct stripe_head *sh, | ||
| 2375 | struct stripe_head_state *s, int disks) | ||
| 2376 | { | 2364 | { |
| 2365 | int abort = 0; | ||
| 2377 | int i; | 2366 | int i; |
| 2378 | 2367 | ||
| 2379 | /* look for blocks to read/compute, skip this if a compute | 2368 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); |
| 2380 | * is already in flight, or if the stripe contents are in the | 2369 | clear_bit(STRIPE_SYNCING, &sh->state); |
| 2381 | * midst of changing due to a write | 2370 | s->syncing = 0; |
| 2371 | /* There is nothing more to do for sync/check/repair. | ||
| 2372 | * For recover we need to record a bad block on all | ||
| 2373 | * non-sync devices, or abort the recovery | ||
| 2382 | */ | 2374 | */ |
| 2383 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2375 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) |
| 2384 | !sh->reconstruct_state) | 2376 | return; |
| 2385 | for (i = disks; i--; ) | 2377 | /* During recovery devices cannot be removed, so locking and |
| 2386 | if (fetch_block5(sh, s, i, disks)) | 2378 | * refcounting of rdevs is not needed |
| 2387 | break; | 2379 | */ |
| 2388 | set_bit(STRIPE_HANDLE, &sh->state); | 2380 | for (i = 0; i < conf->raid_disks; i++) { |
| 2381 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
| 2382 | if (!rdev | ||
| 2383 | || test_bit(Faulty, &rdev->flags) | ||
| 2384 | || test_bit(In_sync, &rdev->flags)) | ||
| 2385 | continue; | ||
| 2386 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
| 2387 | STRIPE_SECTORS, 0)) | ||
| 2388 | abort = 1; | ||
| 2389 | } | ||
| 2390 | if (abort) { | ||
| 2391 | conf->recovery_disabled = conf->mddev->recovery_disabled; | ||
| 2392 | set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); | ||
| 2393 | } | ||
| 2389 | } | 2394 | } |
| 2390 | 2395 | ||
| 2391 | /* fetch_block6 - checks the given member device to see if its data needs | 2396 | /* fetch_block - checks the given member device to see if its data needs |
| 2392 | * to be read or computed to satisfy a request. | 2397 | * to be read or computed to satisfy a request. |
| 2393 | * | 2398 | * |
| 2394 | * Returns 1 when no more member devices need to be checked, otherwise returns | 2399 | * Returns 1 when no more member devices need to be checked, otherwise returns |
| 2395 | * 0 to tell the loop in handle_stripe_fill6 to continue | 2400 | * 0 to tell the loop in handle_stripe_fill to continue |
| 2396 | */ | 2401 | */ |
| 2397 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | 2402 | static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, |
| 2398 | struct r6_state *r6s, int disk_idx, int disks) | 2403 | int disk_idx, int disks) |
| 2399 | { | 2404 | { |
| 2400 | struct r5dev *dev = &sh->dev[disk_idx]; | 2405 | struct r5dev *dev = &sh->dev[disk_idx]; |
| 2401 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], | 2406 | struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], |
| 2402 | &sh->dev[r6s->failed_num[1]] }; | 2407 | &sh->dev[s->failed_num[1]] }; |
| 2403 | 2408 | ||
| 2409 | /* is the data in this block needed, and can we get it? */ | ||
| 2404 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2410 | if (!test_bit(R5_LOCKED, &dev->flags) && |
| 2405 | !test_bit(R5_UPTODATE, &dev->flags) && | 2411 | !test_bit(R5_UPTODATE, &dev->flags) && |
| 2406 | (dev->toread || | 2412 | (dev->toread || |
| 2407 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 2413 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
| 2408 | s->syncing || s->expanding || | 2414 | s->syncing || s->expanding || |
| 2409 | (s->failed >= 1 && | 2415 | (s->failed >= 1 && fdev[0]->toread) || |
| 2410 | (fdev[0]->toread || s->to_write)) || | 2416 | (s->failed >= 2 && fdev[1]->toread) || |
| 2411 | (s->failed >= 2 && | 2417 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && |
| 2412 | (fdev[1]->toread || s->to_write)))) { | 2418 | !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || |
| 2419 | (sh->raid_conf->level == 6 && s->failed && s->to_write))) { | ||
| 2413 | /* we would like to get this block, possibly by computing it, | 2420 | /* we would like to get this block, possibly by computing it, |
| 2414 | * otherwise read it if the backing disk is insync | 2421 | * otherwise read it if the backing disk is insync |
| 2415 | */ | 2422 | */ |
| 2416 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | 2423 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); |
| 2417 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | 2424 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); |
| 2418 | if ((s->uptodate == disks - 1) && | 2425 | if ((s->uptodate == disks - 1) && |
| 2419 | (s->failed && (disk_idx == r6s->failed_num[0] || | 2426 | (s->failed && (disk_idx == s->failed_num[0] || |
| 2420 | disk_idx == r6s->failed_num[1]))) { | 2427 | disk_idx == s->failed_num[1]))) { |
| 2421 | /* have disk failed, and we're requested to fetch it; | 2428 | /* have disk failed, and we're requested to fetch it; |
| 2422 | * do compute it | 2429 | * do compute it |
| 2423 | */ | 2430 | */ |
| @@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2429 | sh->ops.target = disk_idx; | 2436 | sh->ops.target = disk_idx; |
| 2430 | sh->ops.target2 = -1; /* no 2nd target */ | 2437 | sh->ops.target2 = -1; /* no 2nd target */ |
| 2431 | s->req_compute = 1; | 2438 | s->req_compute = 1; |
| 2439 | /* Careful: from this point on 'uptodate' is in the eye | ||
| 2440 | * of raid_run_ops which services 'compute' operations | ||
| 2441 | * before writes. R5_Wantcompute flags a block that will | ||
| 2442 | * be R5_UPTODATE by the time it is needed for a | ||
| 2443 | * subsequent operation. | ||
| 2444 | */ | ||
| 2432 | s->uptodate++; | 2445 | s->uptodate++; |
| 2433 | return 1; | 2446 | return 1; |
| 2434 | } else if (s->uptodate == disks-2 && s->failed >= 2) { | 2447 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
| @@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2469 | } | 2482 | } |
| 2470 | 2483 | ||
| 2471 | /** | 2484 | /** |
| 2472 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | 2485 | * handle_stripe_fill - read or compute data to satisfy pending requests. |
| 2473 | */ | 2486 | */ |
| 2474 | static void handle_stripe_fill6(struct stripe_head *sh, | 2487 | static void handle_stripe_fill(struct stripe_head *sh, |
| 2475 | struct stripe_head_state *s, struct r6_state *r6s, | 2488 | struct stripe_head_state *s, |
| 2476 | int disks) | 2489 | int disks) |
| 2477 | { | 2490 | { |
| 2478 | int i; | 2491 | int i; |
| 2479 | 2492 | ||
| @@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh, | |||
| 2484 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2497 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
| 2485 | !sh->reconstruct_state) | 2498 | !sh->reconstruct_state) |
| 2486 | for (i = disks; i--; ) | 2499 | for (i = disks; i--; ) |
| 2487 | if (fetch_block6(sh, s, r6s, i, disks)) | 2500 | if (fetch_block(sh, s, i, disks)) |
| 2488 | break; | 2501 | break; |
| 2489 | set_bit(STRIPE_HANDLE, &sh->state); | 2502 | set_bit(STRIPE_HANDLE, &sh->state); |
| 2490 | } | 2503 | } |
| @@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf, | |||
| 2540 | md_wakeup_thread(conf->mddev->thread); | 2553 | md_wakeup_thread(conf->mddev->thread); |
| 2541 | } | 2554 | } |
| 2542 | 2555 | ||
| 2543 | static void handle_stripe_dirtying5(raid5_conf_t *conf, | 2556 | static void handle_stripe_dirtying(raid5_conf_t *conf, |
| 2544 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | 2557 | struct stripe_head *sh, |
| 2558 | struct stripe_head_state *s, | ||
| 2559 | int disks) | ||
| 2545 | { | 2560 | { |
| 2546 | int rmw = 0, rcw = 0, i; | 2561 | int rmw = 0, rcw = 0, i; |
| 2547 | for (i = disks; i--; ) { | 2562 | if (conf->max_degraded == 2) { |
| 2563 | /* RAID6 requires 'rcw' in current implementation | ||
| 2564 | * Calculate the real rcw later - for now fake it | ||
| 2565 | * to look like rcw is cheaper | ||
| 2566 | */ | ||
| 2567 | rcw = 1; rmw = 2; | ||
| 2568 | } else for (i = disks; i--; ) { | ||
| 2548 | /* would I have to read this buffer for read_modify_write */ | 2569 | /* would I have to read this buffer for read_modify_write */ |
| 2549 | struct r5dev *dev = &sh->dev[i]; | 2570 | struct r5dev *dev = &sh->dev[i]; |
| 2550 | if ((dev->towrite || i == sh->pd_idx) && | 2571 | if ((dev->towrite || i == sh->pd_idx) && |
| @@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
| 2591 | } | 2612 | } |
| 2592 | } | 2613 | } |
| 2593 | } | 2614 | } |
| 2594 | if (rcw <= rmw && rcw > 0) | 2615 | if (rcw <= rmw && rcw > 0) { |
| 2595 | /* want reconstruct write, but need to get some data */ | 2616 | /* want reconstruct write, but need to get some data */ |
| 2617 | rcw = 0; | ||
| 2596 | for (i = disks; i--; ) { | 2618 | for (i = disks; i--; ) { |
| 2597 | struct r5dev *dev = &sh->dev[i]; | 2619 | struct r5dev *dev = &sh->dev[i]; |
| 2598 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | 2620 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
| 2599 | i != sh->pd_idx && | 2621 | i != sh->pd_idx && i != sh->qd_idx && |
| 2600 | !test_bit(R5_LOCKED, &dev->flags) && | 2622 | !test_bit(R5_LOCKED, &dev->flags) && |
| 2601 | !(test_bit(R5_UPTODATE, &dev->flags) || | 2623 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 2602 | test_bit(R5_Wantcompute, &dev->flags)) && | 2624 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 2603 | test_bit(R5_Insync, &dev->flags)) { | 2625 | rcw++; |
| 2626 | if (!test_bit(R5_Insync, &dev->flags)) | ||
| 2627 | continue; /* it's a failed drive */ | ||
| 2604 | if ( | 2628 | if ( |
| 2605 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 2629 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
| 2606 | pr_debug("Read_old block " | 2630 | pr_debug("Read_old block " |
| @@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
| 2614 | } | 2638 | } |
| 2615 | } | 2639 | } |
| 2616 | } | 2640 | } |
| 2641 | } | ||
| 2617 | /* now if nothing is locked, and if we have enough data, | 2642 | /* now if nothing is locked, and if we have enough data, |
| 2618 | * we can start a write request | 2643 | * we can start a write request |
| 2619 | */ | 2644 | */ |
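The merged handle_stripe_dirtying() above still picks between read-modify-write and reconstruct-write by counting how many stripe members each strategy would have to read first; RAID6 is simply forced onto the reconstruct path by faking rmw=2, rcw=1. A rough standalone model of that counting for the single-parity case (toy_dev and count_reads are hypothetical names):

```c
#include <stdio.h>

#define NDISKS 5	/* 4 data + 1 parity, illustrative only */

struct toy_dev {
	int towrite;	/* a write is queued against this block */
	int overwrite;	/* the queued write covers the whole block */
	int uptodate;	/* block already in the stripe cache */
};

/* Count how many blocks each strategy would need to read, mirroring the
 * rmw/rcw accounting in handle_stripe_dirtying() (single parity only).
 */
static void count_reads(const struct toy_dev *dev, int pd_idx,
			int *rmw, int *rcw)
{
	int i;

	*rmw = *rcw = 0;
	for (i = 0; i < NDISKS; i++) {
		/* read-modify-write needs the old contents of every block
		 * being changed, plus the old parity */
		if ((dev[i].towrite || i == pd_idx) && !dev[i].uptodate)
			(*rmw)++;
		/* reconstruct-write needs every block that is not going to
		 * be completely overwritten, except the parity itself */
		if (!dev[i].overwrite && i != pd_idx && !dev[i].uptodate)
			(*rcw)++;
	}
}

int main(void)
{
	/* one small write to block 0 of a 4+1 stripe, nothing cached */
	struct toy_dev dev[NDISKS] = {
		{ .towrite = 1, .overwrite = 0 },
	};
	int pd_idx = 4, rmw, rcw;

	count_reads(dev, pd_idx, &rmw, &rcw);
	printf("rmw=%d rcw=%d -> %s\n", rmw, rcw,
	       rmw < rcw ? "read-modify-write" : "reconstruct-write");
	return 0;
}
```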
| @@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
| 2630 | schedule_reconstruction(sh, s, rcw == 0, 0); | 2655 | schedule_reconstruction(sh, s, rcw == 0, 0); |
| 2631 | } | 2656 | } |
| 2632 | 2657 | ||
| 2633 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | ||
| 2634 | struct stripe_head *sh, struct stripe_head_state *s, | ||
| 2635 | struct r6_state *r6s, int disks) | ||
| 2636 | { | ||
| 2637 | int rcw = 0, pd_idx = sh->pd_idx, i; | ||
| 2638 | int qd_idx = sh->qd_idx; | ||
| 2639 | |||
| 2640 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 2641 | for (i = disks; i--; ) { | ||
| 2642 | struct r5dev *dev = &sh->dev[i]; | ||
| 2643 | /* check if we haven't enough data */ | ||
| 2644 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | ||
| 2645 | i != pd_idx && i != qd_idx && | ||
| 2646 | !test_bit(R5_LOCKED, &dev->flags) && | ||
| 2647 | !(test_bit(R5_UPTODATE, &dev->flags) || | ||
| 2648 | test_bit(R5_Wantcompute, &dev->flags))) { | ||
| 2649 | rcw++; | ||
| 2650 | if (!test_bit(R5_Insync, &dev->flags)) | ||
| 2651 | continue; /* it's a failed drive */ | ||
| 2652 | |||
| 2653 | if ( | ||
| 2654 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
| 2655 | pr_debug("Read_old stripe %llu " | ||
| 2656 | "block %d for Reconstruct\n", | ||
| 2657 | (unsigned long long)sh->sector, i); | ||
| 2658 | set_bit(R5_LOCKED, &dev->flags); | ||
| 2659 | set_bit(R5_Wantread, &dev->flags); | ||
| 2660 | s->locked++; | ||
| 2661 | } else { | ||
| 2662 | pr_debug("Request delayed stripe %llu " | ||
| 2663 | "block %d for Reconstruct\n", | ||
| 2664 | (unsigned long long)sh->sector, i); | ||
| 2665 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 2666 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 2667 | } | ||
| 2668 | } | ||
| 2669 | } | ||
| 2670 | /* now if nothing is locked, and if we have enough data, we can start a | ||
| 2671 | * write request | ||
| 2672 | */ | ||
| 2673 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | ||
| 2674 | s->locked == 0 && rcw == 0 && | ||
| 2675 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
| 2676 | schedule_reconstruction(sh, s, 1, 0); | ||
| 2677 | } | ||
| 2678 | } | ||
| 2679 | |||
| 2680 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2658 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
| 2681 | struct stripe_head_state *s, int disks) | 2659 | struct stripe_head_state *s, int disks) |
| 2682 | { | 2660 | { |
| @@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2695 | s->uptodate--; | 2673 | s->uptodate--; |
| 2696 | break; | 2674 | break; |
| 2697 | } | 2675 | } |
| 2698 | dev = &sh->dev[s->failed_num]; | 2676 | dev = &sh->dev[s->failed_num[0]]; |
| 2699 | /* fall through */ | 2677 | /* fall through */ |
| 2700 | case check_state_compute_result: | 2678 | case check_state_compute_result: |
| 2701 | sh->check_state = check_state_idle; | 2679 | sh->check_state = check_state_idle; |
| @@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2767 | 2745 | ||
| 2768 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2746 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
| 2769 | struct stripe_head_state *s, | 2747 | struct stripe_head_state *s, |
| 2770 | struct r6_state *r6s, int disks) | 2748 | int disks) |
| 2771 | { | 2749 | { |
| 2772 | int pd_idx = sh->pd_idx; | 2750 | int pd_idx = sh->pd_idx; |
| 2773 | int qd_idx = sh->qd_idx; | 2751 | int qd_idx = sh->qd_idx; |
| @@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2786 | switch (sh->check_state) { | 2764 | switch (sh->check_state) { |
| 2787 | case check_state_idle: | 2765 | case check_state_idle: |
| 2788 | /* start a new check operation if there are < 2 failures */ | 2766 | /* start a new check operation if there are < 2 failures */ |
| 2789 | if (s->failed == r6s->q_failed) { | 2767 | if (s->failed == s->q_failed) { |
| 2790 | /* The only possible failed device holds Q, so it | 2768 | /* The only possible failed device holds Q, so it |
| 2791 | * makes sense to check P (If anything else were failed, | 2769 | * makes sense to check P (If anything else were failed, |
| 2792 | * we would have used P to recreate it). | 2770 | * we would have used P to recreate it). |
| 2793 | */ | 2771 | */ |
| 2794 | sh->check_state = check_state_run; | 2772 | sh->check_state = check_state_run; |
| 2795 | } | 2773 | } |
| 2796 | if (!r6s->q_failed && s->failed < 2) { | 2774 | if (!s->q_failed && s->failed < 2) { |
| 2797 | /* Q is not failed, and we didn't use it to generate | 2775 | /* Q is not failed, and we didn't use it to generate |
| 2798 | * anything, so it makes sense to check it | 2776 | * anything, so it makes sense to check it |
| 2799 | */ | 2777 | */ |
| @@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2835 | */ | 2813 | */ |
| 2836 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | 2814 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ |
| 2837 | if (s->failed == 2) { | 2815 | if (s->failed == 2) { |
| 2838 | dev = &sh->dev[r6s->failed_num[1]]; | 2816 | dev = &sh->dev[s->failed_num[1]]; |
| 2839 | s->locked++; | 2817 | s->locked++; |
| 2840 | set_bit(R5_LOCKED, &dev->flags); | 2818 | set_bit(R5_LOCKED, &dev->flags); |
| 2841 | set_bit(R5_Wantwrite, &dev->flags); | 2819 | set_bit(R5_Wantwrite, &dev->flags); |
| 2842 | } | 2820 | } |
| 2843 | if (s->failed >= 1) { | 2821 | if (s->failed >= 1) { |
| 2844 | dev = &sh->dev[r6s->failed_num[0]]; | 2822 | dev = &sh->dev[s->failed_num[0]]; |
| 2845 | s->locked++; | 2823 | s->locked++; |
| 2846 | set_bit(R5_LOCKED, &dev->flags); | 2824 | set_bit(R5_LOCKED, &dev->flags); |
| 2847 | set_bit(R5_Wantwrite, &dev->flags); | 2825 | set_bit(R5_Wantwrite, &dev->flags); |
| @@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2928 | } | 2906 | } |
| 2929 | } | 2907 | } |
| 2930 | 2908 | ||
| 2931 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | 2909 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) |
| 2932 | struct r6_state *r6s) | ||
| 2933 | { | 2910 | { |
| 2934 | int i; | 2911 | int i; |
| 2935 | 2912 | ||
| @@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2971 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2948 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
| 2972 | for (j = 0; j < conf->raid_disks; j++) | 2949 | for (j = 0; j < conf->raid_disks; j++) |
| 2973 | if (j != sh2->pd_idx && | 2950 | if (j != sh2->pd_idx && |
| 2974 | (!r6s || j != sh2->qd_idx) && | 2951 | j != sh2->qd_idx && |
| 2975 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | 2952 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) |
| 2976 | break; | 2953 | break; |
| 2977 | if (j == conf->raid_disks) { | 2954 | if (j == conf->raid_disks) { |
| @@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 3006 | * | 2983 | * |
| 3007 | */ | 2984 | */ |
| 3008 | 2985 | ||
| 3009 | static void handle_stripe5(struct stripe_head *sh) | 2986 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) |
| 3010 | { | 2987 | { |
| 3011 | raid5_conf_t *conf = sh->raid_conf; | 2988 | raid5_conf_t *conf = sh->raid_conf; |
| 3012 | int disks = sh->disks, i; | 2989 | int disks = sh->disks; |
| 3013 | struct bio *return_bi = NULL; | ||
| 3014 | struct stripe_head_state s; | ||
| 3015 | struct r5dev *dev; | 2990 | struct r5dev *dev; |
| 3016 | mdk_rdev_t *blocked_rdev = NULL; | 2991 | int i; |
| 3017 | int prexor; | ||
| 3018 | int dec_preread_active = 0; | ||
| 3019 | 2992 | ||
| 3020 | memset(&s, 0, sizeof(s)); | 2993 | memset(s, 0, sizeof(*s)); |
| 3021 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " | ||
| 3022 | "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, | ||
| 3023 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, | ||
| 3024 | sh->reconstruct_state); | ||
| 3025 | 2994 | ||
| 3026 | spin_lock(&sh->lock); | 2995 | s->syncing = test_bit(STRIPE_SYNCING, &sh->state); |
| 3027 | clear_bit(STRIPE_HANDLE, &sh->state); | 2996 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
| 3028 | clear_bit(STRIPE_DELAYED, &sh->state); | 2997 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
| 3029 | 2998 | s->failed_num[0] = -1; | |
| 3030 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2999 | s->failed_num[1] = -1; |
| 3031 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
| 3032 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
| 3033 | 3000 | ||
| 3034 | /* Now to look around and see what can be done */ | 3001 | /* Now to look around and see what can be done */ |
| 3035 | rcu_read_lock(); | 3002 | rcu_read_lock(); |
| 3003 | spin_lock_irq(&conf->device_lock); | ||
| 3036 | for (i=disks; i--; ) { | 3004 | for (i=disks; i--; ) { |
| 3037 | mdk_rdev_t *rdev; | 3005 | mdk_rdev_t *rdev; |
| 3006 | sector_t first_bad; | ||
| 3007 | int bad_sectors; | ||
| 3008 | int is_bad = 0; | ||
| 3038 | 3009 | ||
| 3039 | dev = &sh->dev[i]; | 3010 | dev = &sh->dev[i]; |
| 3040 | 3011 | ||
| 3041 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3012 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
| 3042 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3013 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
| 3043 | dev->towrite, dev->written); | 3014 | /* maybe we can reply to a read |
| 3044 | |||
| 3045 | /* maybe we can request a biofill operation | ||
| 3046 | * | 3015 | * |
| 3047 | * new wantfill requests are only permitted while | 3016 | * new wantfill requests are only permitted while |
| 3048 | * ops_complete_biofill is guaranteed to be inactive | 3017 | * ops_complete_biofill is guaranteed to be inactive |
| @@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 3052 | set_bit(R5_Wantfill, &dev->flags); | 3021 | set_bit(R5_Wantfill, &dev->flags); |
| 3053 | 3022 | ||
| 3054 | /* now count some things */ | 3023 | /* now count some things */ |
| 3055 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3024 | if (test_bit(R5_LOCKED, &dev->flags)) |
| 3056 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3025 | s->locked++; |
| 3057 | if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; | 3026 | if (test_bit(R5_UPTODATE, &dev->flags)) |
| 3027 | s->uptodate++; | ||
| 3028 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
| 3029 | s->compute++; | ||
| 3030 | BUG_ON(s->compute > 2); | ||
| 3031 | } | ||
| 3058 | 3032 | ||
| 3059 | if (test_bit(R5_Wantfill, &dev->flags)) | 3033 | if (test_bit(R5_Wantfill, &dev->flags)) |
| 3060 | s.to_fill++; | 3034 | s->to_fill++; |
| 3061 | else if (dev->toread) | 3035 | else if (dev->toread) |
| 3062 | s.to_read++; | 3036 | s->to_read++; |
| 3063 | if (dev->towrite) { | 3037 | if (dev->towrite) { |
| 3064 | s.to_write++; | 3038 | s->to_write++; |
| 3065 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | 3039 | if (!test_bit(R5_OVERWRITE, &dev->flags)) |
| 3066 | s.non_overwrite++; | 3040 | s->non_overwrite++; |
| 3067 | } | 3041 | } |
| 3068 | if (dev->written) | 3042 | if (dev->written) |
| 3069 | s.written++; | 3043 | s->written++; |
| 3070 | rdev = rcu_dereference(conf->disks[i].rdev); | 3044 | rdev = rcu_dereference(conf->disks[i].rdev); |
| 3071 | if (blocked_rdev == NULL && | 3045 | if (rdev) { |
| 3072 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 3046 | is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, |
| 3073 | blocked_rdev = rdev; | 3047 | &first_bad, &bad_sectors); |
| 3074 | atomic_inc(&rdev->nr_pending); | 3048 | if (s->blocked_rdev == NULL |
| 3049 | && (test_bit(Blocked, &rdev->flags) | ||
| 3050 | || is_bad < 0)) { | ||
| 3051 | if (is_bad < 0) | ||
| 3052 | set_bit(BlockedBadBlocks, | ||
| 3053 | &rdev->flags); | ||
| 3054 | s->blocked_rdev = rdev; | ||
| 3055 | atomic_inc(&rdev->nr_pending); | ||
| 3056 | } | ||
| 3075 | } | 3057 | } |
| 3076 | clear_bit(R5_Insync, &dev->flags); | 3058 | clear_bit(R5_Insync, &dev->flags); |
| 3077 | if (!rdev) | 3059 | if (!rdev) |
| 3078 | /* Not in-sync */; | 3060 | /* Not in-sync */; |
| 3079 | else if (test_bit(In_sync, &rdev->flags)) | 3061 | else if (is_bad) { |
| 3062 | /* also not in-sync */ | ||
| 3063 | if (!test_bit(WriteErrorSeen, &rdev->flags)) { | ||
| 3064 | /* treat as in-sync, but with a read error | ||
| 3065 | * which we can now try to correct | ||
| 3066 | */ | ||
| 3067 | set_bit(R5_Insync, &dev->flags); | ||
| 3068 | set_bit(R5_ReadError, &dev->flags); | ||
| 3069 | } | ||
| 3070 | } else if (test_bit(In_sync, &rdev->flags)) | ||
| 3080 | set_bit(R5_Insync, &dev->flags); | 3071 | set_bit(R5_Insync, &dev->flags); |
| 3081 | else { | 3072 | else { |
| 3082 | /* could be in-sync depending on recovery/reshape status */ | 3073 | /* in sync if before recovery_offset */ |
| 3083 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | 3074 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) |
| 3084 | set_bit(R5_Insync, &dev->flags); | 3075 | set_bit(R5_Insync, &dev->flags); |
| 3085 | } | 3076 | } |
| 3077 | if (test_bit(R5_WriteError, &dev->flags)) { | ||
| 3078 | clear_bit(R5_Insync, &dev->flags); | ||
| 3079 | if (!test_bit(Faulty, &rdev->flags)) { | ||
| 3080 | s->handle_bad_blocks = 1; | ||
| 3081 | atomic_inc(&rdev->nr_pending); | ||
| 3082 | } else | ||
| 3083 | clear_bit(R5_WriteError, &dev->flags); | ||
| 3084 | } | ||
| 3085 | if (test_bit(R5_MadeGood, &dev->flags)) { | ||
| 3086 | if (!test_bit(Faulty, &rdev->flags)) { | ||
| 3087 | s->handle_bad_blocks = 1; | ||
| 3088 | atomic_inc(&rdev->nr_pending); | ||
| 3089 | } else | ||
| 3090 | clear_bit(R5_MadeGood, &dev->flags); | ||
| 3091 | } | ||
| 3086 | if (!test_bit(R5_Insync, &dev->flags)) { | 3092 | if (!test_bit(R5_Insync, &dev->flags)) { |
| 3087 | /* The ReadError flag will just be confusing now */ | 3093 | /* The ReadError flag will just be confusing now */ |
| 3088 | clear_bit(R5_ReadError, &dev->flags); | 3094 | clear_bit(R5_ReadError, &dev->flags); |
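
The per-device scan above consults the rdev's bad block list for the stripe's sectors before deciding how to treat the device. From the way is_bad is tested in the hunk, the convention appears to be: zero means no bad blocks overlap the range, a negative value means at least one overlapping bad block has not yet been acknowledged by the metadata handler (so the stripe must block on the rdev via BlockedBadBlocks), and a positive value means the overlap is only with recorded, acknowledged bad blocks (the device is then treated as readable but suspect unless a write error has already been seen). A small userspace model of that tri-state lookup, with made-up names and a flat table standing in for the kernel's packed list:

    /* Hypothetical model of the tri-state bad-block lookup used above.
     * Return values mirror how is_bad is tested in the hunk:
     *    0  no bad blocks overlap [s, s + sectors)
     *    1  overlapping bad blocks exist and are all acknowledged
     *   -1  at least one overlapping bad block is unacknowledged
     * The flat table and the names are illustrative, not kernel code. */
    #include <stdio.h>

    struct bb { unsigned long long sector; int len; int ack; };

    static int model_is_badblock(const struct bb *tab, int n,
                                 unsigned long long s, int sectors,
                                 unsigned long long *first_bad, int *bad_sectors)
    {
            int rv = 0;
            for (int i = 0; i < n; i++) {
                    if (tab[i].sector < s + sectors &&
                        tab[i].sector + tab[i].len > s) {
                            *first_bad = tab[i].sector;
                            *bad_sectors = tab[i].len;
                            rv = tab[i].ack ? (rv ? rv : 1) : -1;
                    }
            }
            return rv;
    }

    int main(void)
    {
            struct bb tab[] = { { 1024, 8, 1 }, { 2048, 16, 0 } };
            unsigned long long first_bad;
            int bad_sectors;

            /* A stripe overlapping the unacknowledged entry: the caller above
             * would set BlockedBadBlocks and park the stripe. */
            int is_bad = model_is_badblock(tab, 2, 2040, 16,
                                           &first_bad, &bad_sectors);
            printf("is_bad=%d first_bad=%llu len=%d\n",
                   is_bad, first_bad, bad_sectors);
            return 0;
    }

This prints is_bad=-1 for a range touching the unacknowledged entry, which is exactly the case that sets BlockedBadBlocks in the hunk above.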
| @@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 3091 | if (test_bit(R5_ReadError, &dev->flags)) | 3097 | if (test_bit(R5_ReadError, &dev->flags)) |
| 3092 | clear_bit(R5_Insync, &dev->flags); | 3098 | clear_bit(R5_Insync, &dev->flags); |
| 3093 | if (!test_bit(R5_Insync, &dev->flags)) { | 3099 | if (!test_bit(R5_Insync, &dev->flags)) { |
| 3094 | s.failed++; | 3100 | if (s->failed < 2) |
| 3095 | s.failed_num = i; | 3101 | s->failed_num[s->failed] = i; |
| 3102 | s->failed++; | ||
| 3096 | } | 3103 | } |
| 3097 | } | 3104 | } |
| 3105 | spin_unlock_irq(&conf->device_lock); | ||
| 3098 | rcu_read_unlock(); | 3106 | rcu_read_unlock(); |
| 3099 | |||
| 3100 | if (unlikely(blocked_rdev)) { | ||
| 3101 | if (s.syncing || s.expanding || s.expanded || | ||
| 3102 | s.to_write || s.written) { | ||
| 3103 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3104 | goto unlock; | ||
| 3105 | } | ||
| 3106 | /* There is nothing for the blocked_rdev to block */ | ||
| 3107 | rdev_dec_pending(blocked_rdev, conf->mddev); | ||
| 3108 | blocked_rdev = NULL; | ||
| 3109 | } | ||
| 3110 | |||
| 3111 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
| 3112 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
| 3113 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
| 3114 | } | ||
| 3115 | |||
| 3116 | pr_debug("locked=%d uptodate=%d to_read=%d" | ||
| 3117 | " to_write=%d failed=%d failed_num=%d\n", | ||
| 3118 | s.locked, s.uptodate, s.to_read, s.to_write, | ||
| 3119 | s.failed, s.failed_num); | ||
| 3120 | /* check if the array has lost two devices and, if so, some requests might | ||
| 3121 | * need to be failed | ||
| 3122 | */ | ||
| 3123 | if (s.failed > 1 && s.to_read+s.to_write+s.written) | ||
| 3124 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | ||
| 3125 | if (s.failed > 1 && s.syncing) { | ||
| 3126 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
| 3127 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
| 3128 | s.syncing = 0; | ||
| 3129 | } | ||
| 3130 | |||
| 3131 | /* might be able to return some write requests if the parity block | ||
| 3132 | * is safe, or on a failed drive | ||
| 3133 | */ | ||
| 3134 | dev = &sh->dev[sh->pd_idx]; | ||
| 3135 | if ( s.written && | ||
| 3136 | ((test_bit(R5_Insync, &dev->flags) && | ||
| 3137 | !test_bit(R5_LOCKED, &dev->flags) && | ||
| 3138 | test_bit(R5_UPTODATE, &dev->flags)) || | ||
| 3139 | (s.failed == 1 && s.failed_num == sh->pd_idx))) | ||
| 3140 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | ||
| 3141 | |||
| 3142 | /* Now we might consider reading some blocks, either to check/generate | ||
| 3143 | * parity, or to satisfy requests | ||
| 3144 | * or to load a block that is being partially written. | ||
| 3145 | */ | ||
| 3146 | if (s.to_read || s.non_overwrite || | ||
| 3147 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | ||
| 3148 | handle_stripe_fill5(sh, &s, disks); | ||
| 3149 | |||
| 3150 | /* Now we check to see if any write operations have recently | ||
| 3151 | * completed | ||
| 3152 | */ | ||
| 3153 | prexor = 0; | ||
| 3154 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | ||
| 3155 | prexor = 1; | ||
| 3156 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
| 3157 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
| 3158 | sh->reconstruct_state = reconstruct_state_idle; | ||
| 3159 | |||
| 3160 | /* All the 'written' buffers and the parity block are ready to | ||
| 3161 | * be written back to disk | ||
| 3162 | */ | ||
| 3163 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
| 3164 | for (i = disks; i--; ) { | ||
| 3165 | dev = &sh->dev[i]; | ||
| 3166 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
| 3167 | (i == sh->pd_idx || dev->written)) { | ||
| 3168 | pr_debug("Writing block %d\n", i); | ||
| 3169 | set_bit(R5_Wantwrite, &dev->flags); | ||
| 3170 | if (prexor) | ||
| 3171 | continue; | ||
| 3172 | if (!test_bit(R5_Insync, &dev->flags) || | ||
| 3173 | (i == sh->pd_idx && s.failed == 0)) | ||
| 3174 | set_bit(STRIPE_INSYNC, &sh->state); | ||
| 3175 | } | ||
| 3176 | } | ||
| 3177 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
| 3178 | dec_preread_active = 1; | ||
| 3179 | } | ||
| 3180 | |||
| 3181 | /* Now to consider new write requests and what else, if anything | ||
| 3182 | * should be read. We do not handle new writes when: | ||
| 3183 | * 1/ A 'write' operation (copy+xor) is already in flight. | ||
| 3184 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
| 3185 | * block. | ||
| 3186 | */ | ||
| 3187 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
| 3188 | handle_stripe_dirtying5(conf, sh, &s, disks); | ||
| 3189 | |||
| 3190 | /* maybe we need to check and possibly fix the parity for this stripe | ||
| 3191 | * Any reads will already have been scheduled, so we just see if enough | ||
| 3192 | * data is available. The parity check is held off while parity | ||
| 3193 | * dependent operations are in flight. | ||
| 3194 | */ | ||
| 3195 | if (sh->check_state || | ||
| 3196 | (s.syncing && s.locked == 0 && | ||
| 3197 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
| 3198 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
| 3199 | handle_parity_checks5(conf, sh, &s, disks); | ||
| 3200 | |||
| 3201 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
| 3202 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
| 3203 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
| 3204 | } | ||
| 3205 | |||
| 3206 | /* If the failed drive is just a ReadError, then we might need to progress | ||
| 3207 | * the repair/check process | ||
| 3208 | */ | ||
| 3209 | if (s.failed == 1 && !conf->mddev->ro && | ||
| 3210 | test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) | ||
| 3211 | && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) | ||
| 3212 | && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) | ||
| 3213 | ) { | ||
| 3214 | dev = &sh->dev[s.failed_num]; | ||
| 3215 | if (!test_bit(R5_ReWrite, &dev->flags)) { | ||
| 3216 | set_bit(R5_Wantwrite, &dev->flags); | ||
| 3217 | set_bit(R5_ReWrite, &dev->flags); | ||
| 3218 | set_bit(R5_LOCKED, &dev->flags); | ||
| 3219 | s.locked++; | ||
| 3220 | } else { | ||
| 3221 | /* let's read it back */ | ||
| 3222 | set_bit(R5_Wantread, &dev->flags); | ||
| 3223 | set_bit(R5_LOCKED, &dev->flags); | ||
| 3224 | s.locked++; | ||
| 3225 | } | ||
| 3226 | } | ||
| 3227 | |||
| 3228 | /* Finish reconstruct operations initiated by the expansion process */ | ||
| 3229 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
| 3230 | struct stripe_head *sh2 | ||
| 3231 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
| 3232 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
| 3233 | /* sh cannot be written until sh2 has been read. | ||
| 3234 | * so arrange for sh to be delayed a little | ||
| 3235 | */ | ||
| 3236 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 3237 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3238 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
| 3239 | &sh2->state)) | ||
| 3240 | atomic_inc(&conf->preread_active_stripes); | ||
| 3241 | release_stripe(sh2); | ||
| 3242 | goto unlock; | ||
| 3243 | } | ||
| 3244 | if (sh2) | ||
| 3245 | release_stripe(sh2); | ||
| 3246 | |||
| 3247 | sh->reconstruct_state = reconstruct_state_idle; | ||
| 3248 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
| 3249 | for (i = conf->raid_disks; i--; ) { | ||
| 3250 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
| 3251 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
| 3252 | s.locked++; | ||
| 3253 | } | ||
| 3254 | } | ||
| 3255 | |||
| 3256 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
| 3257 | !sh->reconstruct_state) { | ||
| 3258 | /* Need to write out all blocks after computing parity */ | ||
| 3259 | sh->disks = conf->raid_disks; | ||
| 3260 | stripe_set_idx(sh->sector, conf, 0, sh); | ||
| 3261 | schedule_reconstruction(sh, &s, 1, 1); | ||
| 3262 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | ||
| 3263 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | ||
| 3264 | atomic_dec(&conf->reshape_stripes); | ||
| 3265 | wake_up(&conf->wait_for_overlap); | ||
| 3266 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | ||
| 3267 | } | ||
| 3268 | |||
| 3269 | if (s.expanding && s.locked == 0 && | ||
| 3270 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | ||
| 3271 | handle_stripe_expansion(conf, sh, NULL); | ||
| 3272 | |||
| 3273 | unlock: | ||
| 3274 | spin_unlock(&sh->lock); | ||
| 3275 | |||
| 3276 | /* wait for this device to become unblocked */ | ||
| 3277 | if (unlikely(blocked_rdev)) | ||
| 3278 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | ||
| 3279 | |||
| 3280 | if (s.ops_request) | ||
| 3281 | raid_run_ops(sh, s.ops_request); | ||
| 3282 | |||
| 3283 | ops_run_io(sh, &s); | ||
| 3284 | |||
| 3285 | if (dec_preread_active) { | ||
| 3286 | /* We delay this until after ops_run_io so that if make_request | ||
| 3287 | * is waiting on a flush, it won't continue until the writes | ||
| 3288 | * have actually been submitted. | ||
| 3289 | */ | ||
| 3290 | atomic_dec(&conf->preread_active_stripes); | ||
| 3291 | if (atomic_read(&conf->preread_active_stripes) < | ||
| 3292 | IO_THRESHOLD) | ||
| 3293 | md_wakeup_thread(conf->mddev->thread); | ||
| 3294 | } | ||
| 3295 | return_io(return_bi); | ||
| 3296 | } | 3107 | } |
| 3297 | 3108 | ||
| 3298 | static void handle_stripe6(struct stripe_head *sh) | 3109 | static void handle_stripe(struct stripe_head *sh) |
| 3299 | { | 3110 | { |
| 3111 | struct stripe_head_state s; | ||
| 3300 | raid5_conf_t *conf = sh->raid_conf; | 3112 | raid5_conf_t *conf = sh->raid_conf; |
| 3113 | int i; | ||
| 3114 | int prexor; | ||
| 3301 | int disks = sh->disks; | 3115 | int disks = sh->disks; |
| 3302 | struct bio *return_bi = NULL; | 3116 | struct r5dev *pdev, *qdev; |
| 3303 | int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; | 3117 | |
| 3304 | struct stripe_head_state s; | 3118 | clear_bit(STRIPE_HANDLE, &sh->state); |
| 3305 | struct r6_state r6s; | 3119 | if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { |
| 3306 | struct r5dev *dev, *pdev, *qdev; | 3120 | /* already being handled, ensure it gets handled |
| 3307 | mdk_rdev_t *blocked_rdev = NULL; | 3121 | * again when current action finishes */ |
| 3308 | int dec_preread_active = 0; | 3122 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3123 | return; | ||
| 3124 | } | ||
| 3125 | |||
| 3126 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | ||
| 3127 | set_bit(STRIPE_SYNCING, &sh->state); | ||
| 3128 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
| 3129 | } | ||
| 3130 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
| 3309 | 3131 | ||
| 3310 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3132 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
| 3311 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", | 3133 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
| 3312 | (unsigned long long)sh->sector, sh->state, | 3134 | (unsigned long long)sh->sector, sh->state, |
| 3313 | atomic_read(&sh->count), pd_idx, qd_idx, | 3135 | atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, |
| 3314 | sh->check_state, sh->reconstruct_state); | 3136 | sh->check_state, sh->reconstruct_state); |
| 3315 | memset(&s, 0, sizeof(s)); | ||
| 3316 | |||
| 3317 | spin_lock(&sh->lock); | ||
| 3318 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
| 3319 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
| 3320 | |||
| 3321 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
| 3322 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
| 3323 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
| 3324 | /* Now to look around and see what can be done */ | ||
| 3325 | |||
| 3326 | rcu_read_lock(); | ||
| 3327 | for (i=disks; i--; ) { | ||
| 3328 | mdk_rdev_t *rdev; | ||
| 3329 | dev = &sh->dev[i]; | ||
| 3330 | 3137 | ||
| 3331 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3138 | analyse_stripe(sh, &s); |
| 3332 | i, dev->flags, dev->toread, dev->towrite, dev->written); | ||
| 3333 | /* maybe we can reply to a read | ||
| 3334 | * | ||
| 3335 | * new wantfill requests are only permitted while | ||
| 3336 | * ops_complete_biofill is guaranteed to be inactive | ||
| 3337 | */ | ||
| 3338 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && | ||
| 3339 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) | ||
| 3340 | set_bit(R5_Wantfill, &dev->flags); | ||
| 3341 | 3139 | ||
| 3342 | /* now count some things */ | 3140 | if (s.handle_bad_blocks) { |
| 3343 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3141 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3344 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3142 | goto finish; |
| 3345 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
| 3346 | s.compute++; | ||
| 3347 | BUG_ON(s.compute > 2); | ||
| 3348 | } | ||
| 3349 | |||
| 3350 | if (test_bit(R5_Wantfill, &dev->flags)) { | ||
| 3351 | s.to_fill++; | ||
| 3352 | } else if (dev->toread) | ||
| 3353 | s.to_read++; | ||
| 3354 | if (dev->towrite) { | ||
| 3355 | s.to_write++; | ||
| 3356 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
| 3357 | s.non_overwrite++; | ||
| 3358 | } | ||
| 3359 | if (dev->written) | ||
| 3360 | s.written++; | ||
| 3361 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
| 3362 | if (blocked_rdev == NULL && | ||
| 3363 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | ||
| 3364 | blocked_rdev = rdev; | ||
| 3365 | atomic_inc(&rdev->nr_pending); | ||
| 3366 | } | ||
| 3367 | clear_bit(R5_Insync, &dev->flags); | ||
| 3368 | if (!rdev) | ||
| 3369 | /* Not in-sync */; | ||
| 3370 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 3371 | set_bit(R5_Insync, &dev->flags); | ||
| 3372 | else { | ||
| 3373 | /* in sync if before recovery_offset */ | ||
| 3374 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
| 3375 | set_bit(R5_Insync, &dev->flags); | ||
| 3376 | } | ||
| 3377 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3378 | /* The ReadError flag will just be confusing now */ | ||
| 3379 | clear_bit(R5_ReadError, &dev->flags); | ||
| 3380 | clear_bit(R5_ReWrite, &dev->flags); | ||
| 3381 | } | ||
| 3382 | if (test_bit(R5_ReadError, &dev->flags)) | ||
| 3383 | clear_bit(R5_Insync, &dev->flags); | ||
| 3384 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3385 | if (s.failed < 2) | ||
| 3386 | r6s.failed_num[s.failed] = i; | ||
| 3387 | s.failed++; | ||
| 3388 | } | ||
| 3389 | } | 3143 | } |
| 3390 | rcu_read_unlock(); | ||
| 3391 | 3144 | ||
| 3392 | if (unlikely(blocked_rdev)) { | 3145 | if (unlikely(s.blocked_rdev)) { |
| 3393 | if (s.syncing || s.expanding || s.expanded || | 3146 | if (s.syncing || s.expanding || s.expanded || |
| 3394 | s.to_write || s.written) { | 3147 | s.to_write || s.written) { |
| 3395 | set_bit(STRIPE_HANDLE, &sh->state); | 3148 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3396 | goto unlock; | 3149 | goto finish; |
| 3397 | } | 3150 | } |
| 3398 | /* There is nothing for the blocked_rdev to block */ | 3151 | /* There is nothing for the blocked_rdev to block */ |
| 3399 | rdev_dec_pending(blocked_rdev, conf->mddev); | 3152 | rdev_dec_pending(s.blocked_rdev, conf->mddev); |
| 3400 | blocked_rdev = NULL; | 3153 | s.blocked_rdev = NULL; |
| 3401 | } | 3154 | } |
| 3402 | 3155 | ||
| 3403 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | 3156 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { |
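
With the per-stripe spinlock gone, ownership of a stripe during handling is expressed by the STRIPE_ACTIVE bit set at the top of the rewritten handle_stripe() above: test_and_set_bit() either claims the stripe or, if another thread already owns it, simply re-marks it STRIPE_HANDLE so the owner schedules another pass when it finishes. A minimal, runnable model of that claim/release pattern, using C11 atomics in place of the kernel bitops and masks in place of bit numbers:

    /* Minimal model of the STRIPE_ACTIVE ownership pattern.  In the kernel
     * the flags are bit numbers used with test_and_set_bit()/clear_bit();
     * here they are masks on a C11 atomic word. */
    #include <stdatomic.h>
    #include <stdio.h>

    enum { ACTIVE = 1u << 0, HANDLE = 1u << 1 };

    struct stripe { _Atomic unsigned int state; };

    static void model_handle_stripe(struct stripe *sh)
    {
            atomic_fetch_and(&sh->state, ~HANDLE);
            if (atomic_fetch_or(&sh->state, ACTIVE) & ACTIVE) {
                    /* already being handled: ask the owner for another pass */
                    atomic_fetch_or(&sh->state, HANDLE);
                    return;
            }
            /* ... analyse and act on the stripe; this section is effectively
             * single-threaded per stripe ... */
            atomic_fetch_and(&sh->state, ~ACTIVE);
    }

    int main(void)
    {
            struct stripe sh = { HANDLE };

            model_handle_stripe(&sh);
            printf("state after handling: %#x\n", atomic_load(&sh.state));
            return 0;
    }

The point of the pattern is that everything between the claim and the final clear runs single-threaded per stripe, which is what lets the later hunks drop sh->lock entirely.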
| @@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3408 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3161 | pr_debug("locked=%d uptodate=%d to_read=%d" |
| 3409 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3162 | " to_write=%d failed=%d failed_num=%d,%d\n", |
| 3410 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3163 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
| 3411 | r6s.failed_num[0], r6s.failed_num[1]); | 3164 | s.failed_num[0], s.failed_num[1]); |
| 3412 | /* check if the array has lost >2 devices and, if so, some requests | 3165 | /* check if the array has lost more than max_degraded devices and, |
| 3413 | * might need to be failed | 3166 | * if so, some requests might need to be failed. |
| 3414 | */ | 3167 | */ |
| 3415 | if (s.failed > 2 && s.to_read+s.to_write+s.written) | 3168 | if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written) |
| 3416 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | 3169 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); |
| 3417 | if (s.failed > 2 && s.syncing) { | 3170 | if (s.failed > conf->max_degraded && s.syncing) |
| 3418 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 3171 | handle_failed_sync(conf, sh, &s); |
| 3419 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
| 3420 | s.syncing = 0; | ||
| 3421 | } | ||
| 3422 | 3172 | ||
| 3423 | /* | 3173 | /* |
| 3424 | * might be able to return some write requests if the parity blocks | 3174 | * might be able to return some write requests if the parity blocks |
| 3425 | * are safe, or on a failed drive | 3175 | * are safe, or on a failed drive |
| 3426 | */ | 3176 | */ |
| 3427 | pdev = &sh->dev[pd_idx]; | 3177 | pdev = &sh->dev[sh->pd_idx]; |
| 3428 | r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) | 3178 | s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) |
| 3429 | || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); | 3179 | || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); |
| 3430 | qdev = &sh->dev[qd_idx]; | 3180 | qdev = &sh->dev[sh->qd_idx]; |
| 3431 | r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) | 3181 | s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) |
| 3432 | || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); | 3182 | || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) |
| 3433 | 3183 | || conf->level < 6; | |
| 3434 | if ( s.written && | 3184 | |
| 3435 | ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | 3185 | if (s.written && |
| 3186 | (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | ||
| 3436 | && !test_bit(R5_LOCKED, &pdev->flags) | 3187 | && !test_bit(R5_LOCKED, &pdev->flags) |
| 3437 | && test_bit(R5_UPTODATE, &pdev->flags)))) && | 3188 | && test_bit(R5_UPTODATE, &pdev->flags)))) && |
| 3438 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 3189 | (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
| 3439 | && !test_bit(R5_LOCKED, &qdev->flags) | 3190 | && !test_bit(R5_LOCKED, &qdev->flags) |
| 3440 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 3191 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
| 3441 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | 3192 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); |
| 3442 | 3193 | ||
| 3443 | /* Now we might consider reading some blocks, either to check/generate | 3194 | /* Now we might consider reading some blocks, either to check/generate |
| 3444 | * parity, or to satisfy requests | 3195 | * parity, or to satisfy requests |
| 3445 | * or to load a block that is being partially written. | 3196 | * or to load a block that is being partially written. |
| 3446 | */ | 3197 | */ |
| 3447 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3198 | if (s.to_read || s.non_overwrite |
| 3448 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | 3199 | || (conf->level == 6 && s.to_write && s.failed) |
| 3449 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3200 | || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
| 3201 | handle_stripe_fill(sh, &s, disks); | ||
| 3450 | 3202 | ||
| 3451 | /* Now we check to see if any write operations have recently | 3203 | /* Now we check to see if any write operations have recently |
| 3452 | * completed | 3204 | * completed |
| 3453 | */ | 3205 | */ |
| 3454 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | 3206 | prexor = 0; |
| 3455 | 3207 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | |
| 3208 | prexor = 1; | ||
| 3209 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
| 3210 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
| 3456 | sh->reconstruct_state = reconstruct_state_idle; | 3211 | sh->reconstruct_state = reconstruct_state_idle; |
| 3457 | /* All the 'written' buffers and the parity blocks are ready to | 3212 | |
| 3213 | /* All the 'written' buffers and the parity block are ready to | ||
| 3458 | * be written back to disk | 3214 | * be written back to disk |
| 3459 | */ | 3215 | */ |
| 3460 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | 3216 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); |
| 3461 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | 3217 | BUG_ON(sh->qd_idx >= 0 && |
| 3218 | !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); | ||
| 3462 | for (i = disks; i--; ) { | 3219 | for (i = disks; i--; ) { |
| 3463 | dev = &sh->dev[i]; | 3220 | struct r5dev *dev = &sh->dev[i]; |
| 3464 | if (test_bit(R5_LOCKED, &dev->flags) && | 3221 | if (test_bit(R5_LOCKED, &dev->flags) && |
| 3465 | (i == sh->pd_idx || i == qd_idx || | 3222 | (i == sh->pd_idx || i == sh->qd_idx || |
| 3466 | dev->written)) { | 3223 | dev->written)) { |
| 3467 | pr_debug("Writing block %d\n", i); | 3224 | pr_debug("Writing block %d\n", i); |
| 3468 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
| 3469 | set_bit(R5_Wantwrite, &dev->flags); | 3225 | set_bit(R5_Wantwrite, &dev->flags); |
| 3226 | if (prexor) | ||
| 3227 | continue; | ||
| 3470 | if (!test_bit(R5_Insync, &dev->flags) || | 3228 | if (!test_bit(R5_Insync, &dev->flags) || |
| 3471 | ((i == sh->pd_idx || i == qd_idx) && | 3229 | ((i == sh->pd_idx || i == sh->qd_idx) && |
| 3472 | s.failed == 0)) | 3230 | s.failed == 0)) |
| 3473 | set_bit(STRIPE_INSYNC, &sh->state); | 3231 | set_bit(STRIPE_INSYNC, &sh->state); |
| 3474 | } | 3232 | } |
| 3475 | } | 3233 | } |
| 3476 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 3234 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 3477 | dec_preread_active = 1; | 3235 | s.dec_preread_active = 1; |
| 3478 | } | 3236 | } |
| 3479 | 3237 | ||
| 3480 | /* Now to consider new write requests and what else, if anything | 3238 | /* Now to consider new write requests and what else, if anything |
| 3481 | * should be read. We do not handle new writes when: | 3239 | * should be read. We do not handle new writes when: |
| 3482 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | 3240 | * 1/ A 'write' operation (copy+xor) is already in flight. |
| 3483 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 3241 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
| 3484 | * block. | 3242 | * block. |
| 3485 | */ | 3243 | */ |
| 3486 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | 3244 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) |
| 3487 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3245 | handle_stripe_dirtying(conf, sh, &s, disks); |
| 3488 | 3246 | ||
| 3489 | /* maybe we need to check and possibly fix the parity for this stripe | 3247 | /* maybe we need to check and possibly fix the parity for this stripe |
| 3490 | * Any reads will already have been scheduled, so we just see if enough | 3248 | * Any reads will already have been scheduled, so we just see if enough |
| @@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3494 | if (sh->check_state || | 3252 | if (sh->check_state || |
| 3495 | (s.syncing && s.locked == 0 && | 3253 | (s.syncing && s.locked == 0 && |
| 3496 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | 3254 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && |
| 3497 | !test_bit(STRIPE_INSYNC, &sh->state))) | 3255 | !test_bit(STRIPE_INSYNC, &sh->state))) { |
| 3498 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | 3256 | if (conf->level == 6) |
| 3257 | handle_parity_checks6(conf, sh, &s, disks); | ||
| 3258 | else | ||
| 3259 | handle_parity_checks5(conf, sh, &s, disks); | ||
| 3260 | } | ||
| 3499 | 3261 | ||
| 3500 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3262 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
| 3501 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3263 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
| 3502 | clear_bit(STRIPE_SYNCING, &sh->state); | 3264 | clear_bit(STRIPE_SYNCING, &sh->state); |
| 3503 | } | 3265 | } |
| 3504 | 3266 | ||
| 3505 | /* If the failed drives are just a ReadError, then we might need | 3267 | /* If the failed drives are just a ReadError, then we might need |
| 3506 | * to progress the repair/check process | 3268 | * to progress the repair/check process |
| 3507 | */ | 3269 | */ |
| 3508 | if (s.failed <= 2 && !conf->mddev->ro) | 3270 | if (s.failed <= conf->max_degraded && !conf->mddev->ro) |
| 3509 | for (i = 0; i < s.failed; i++) { | 3271 | for (i = 0; i < s.failed; i++) { |
| 3510 | dev = &sh->dev[r6s.failed_num[i]]; | 3272 | struct r5dev *dev = &sh->dev[s.failed_num[i]]; |
| 3511 | if (test_bit(R5_ReadError, &dev->flags) | 3273 | if (test_bit(R5_ReadError, &dev->flags) |
| 3512 | && !test_bit(R5_LOCKED, &dev->flags) | 3274 | && !test_bit(R5_LOCKED, &dev->flags) |
| 3513 | && test_bit(R5_UPTODATE, &dev->flags) | 3275 | && test_bit(R5_UPTODATE, &dev->flags) |
| @@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3526 | } | 3288 | } |
| 3527 | } | 3289 | } |
| 3528 | 3290 | ||
| 3291 | |||
| 3529 | /* Finish reconstruct operations initiated by the expansion process */ | 3292 | /* Finish reconstruct operations initiated by the expansion process */ |
| 3530 | if (sh->reconstruct_state == reconstruct_state_result) { | 3293 | if (sh->reconstruct_state == reconstruct_state_result) { |
| 3294 | struct stripe_head *sh_src | ||
| 3295 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
| 3296 | if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { | ||
| 3297 | /* sh cannot be written until sh_src has been read. | ||
| 3298 | * so arrange for sh to be delayed a little | ||
| 3299 | */ | ||
| 3300 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 3301 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3302 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
| 3303 | &sh_src->state)) | ||
| 3304 | atomic_inc(&conf->preread_active_stripes); | ||
| 3305 | release_stripe(sh_src); | ||
| 3306 | goto finish; | ||
| 3307 | } | ||
| 3308 | if (sh_src) | ||
| 3309 | release_stripe(sh_src); | ||
| 3310 | |||
| 3531 | sh->reconstruct_state = reconstruct_state_idle; | 3311 | sh->reconstruct_state = reconstruct_state_idle; |
| 3532 | clear_bit(STRIPE_EXPANDING, &sh->state); | 3312 | clear_bit(STRIPE_EXPANDING, &sh->state); |
| 3533 | for (i = conf->raid_disks; i--; ) { | 3313 | for (i = conf->raid_disks; i--; ) { |
| @@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3539 | 3319 | ||
| 3540 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | 3320 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && |
| 3541 | !sh->reconstruct_state) { | 3321 | !sh->reconstruct_state) { |
| 3542 | struct stripe_head *sh2 | 3322 | /* Need to write out all blocks after computing parity */ |
| 3543 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
| 3544 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
| 3545 | /* sh cannot be written until sh2 has been read. | ||
| 3546 | * so arrange for sh to be delayed a little | ||
| 3547 | */ | ||
| 3548 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 3549 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3550 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
| 3551 | &sh2->state)) | ||
| 3552 | atomic_inc(&conf->preread_active_stripes); | ||
| 3553 | release_stripe(sh2); | ||
| 3554 | goto unlock; | ||
| 3555 | } | ||
| 3556 | if (sh2) | ||
| 3557 | release_stripe(sh2); | ||
| 3558 | |||
| 3559 | /* Need to write out all blocks after computing P&Q */ | ||
| 3560 | sh->disks = conf->raid_disks; | 3323 | sh->disks = conf->raid_disks; |
| 3561 | stripe_set_idx(sh->sector, conf, 0, sh); | 3324 | stripe_set_idx(sh->sector, conf, 0, sh); |
| 3562 | schedule_reconstruction(sh, &s, 1, 1); | 3325 | schedule_reconstruction(sh, &s, 1, 1); |
| @@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3569 | 3332 | ||
| 3570 | if (s.expanding && s.locked == 0 && | 3333 | if (s.expanding && s.locked == 0 && |
| 3571 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | 3334 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
| 3572 | handle_stripe_expansion(conf, sh, &r6s); | 3335 | handle_stripe_expansion(conf, sh); |
| 3573 | |||
| 3574 | unlock: | ||
| 3575 | spin_unlock(&sh->lock); | ||
| 3576 | 3336 | ||
| 3337 | finish: | ||
| 3577 | /* wait for this device to become unblocked */ | 3338 | /* wait for this device to become unblocked */ |
| 3578 | if (unlikely(blocked_rdev)) | 3339 | if (unlikely(s.blocked_rdev)) |
| 3579 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3340 | md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); |
| 3341 | |||
| 3342 | if (s.handle_bad_blocks) | ||
| 3343 | for (i = disks; i--; ) { | ||
| 3344 | mdk_rdev_t *rdev; | ||
| 3345 | struct r5dev *dev = &sh->dev[i]; | ||
| 3346 | if (test_and_clear_bit(R5_WriteError, &dev->flags)) { | ||
| 3347 | /* We own a safe reference to the rdev */ | ||
| 3348 | rdev = conf->disks[i].rdev; | ||
| 3349 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
| 3350 | STRIPE_SECTORS, 0)) | ||
| 3351 | md_error(conf->mddev, rdev); | ||
| 3352 | rdev_dec_pending(rdev, conf->mddev); | ||
| 3353 | } | ||
| 3354 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { | ||
| 3355 | rdev = conf->disks[i].rdev; | ||
| 3356 | rdev_clear_badblocks(rdev, sh->sector, | ||
| 3357 | STRIPE_SECTORS); | ||
| 3358 | rdev_dec_pending(rdev, conf->mddev); | ||
| 3359 | } | ||
| 3360 | } | ||
| 3580 | 3361 | ||
| 3581 | if (s.ops_request) | 3362 | if (s.ops_request) |
| 3582 | raid_run_ops(sh, s.ops_request); | 3363 | raid_run_ops(sh, s.ops_request); |
| 3583 | 3364 | ||
| 3584 | ops_run_io(sh, &s); | 3365 | ops_run_io(sh, &s); |
| 3585 | 3366 | ||
| 3586 | 3367 | if (s.dec_preread_active) { | |
| 3587 | if (dec_preread_active) { | ||
| 3588 | /* We delay this until after ops_run_io so that if make_request | 3368 | /* We delay this until after ops_run_io so that if make_request |
| 3589 | * is waiting on a flush, it won't continue until the writes | 3369 | * is waiting on a flush, it won't continue until the writes |
| 3590 | * have actually been submitted. | 3370 | * have actually been submitted. |
| @@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3595 | md_wakeup_thread(conf->mddev->thread); | 3375 | md_wakeup_thread(conf->mddev->thread); |
| 3596 | } | 3376 | } |
| 3597 | 3377 | ||
| 3598 | return_io(return_bi); | 3378 | return_io(s.return_bi); |
| 3599 | } | ||
| 3600 | 3379 | ||
| 3601 | static void handle_stripe(struct stripe_head *sh) | 3380 | clear_bit(STRIPE_ACTIVE, &sh->state); |
| 3602 | { | ||
| 3603 | if (sh->raid_conf->level == 6) | ||
| 3604 | handle_stripe6(sh); | ||
| 3605 | else | ||
| 3606 | handle_stripe5(sh); | ||
| 3607 | } | 3381 | } |
| 3608 | 3382 | ||
| 3609 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3383 | static void raid5_activate_delayed(raid5_conf_t *conf) |
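
The new finish: section above holds an extra nr_pending reference on any rdev whose stripe device carries R5_WriteError or R5_MadeGood, then settles the bookkeeping outside the RCU section: a write error is recorded as a bad block, falling back to md_error() (failing the whole device) only if the record cannot be stored, while a successful re-write of a previously bad range clears the entry. The decision table, written out as a runnable sketch with illustrative names:

    /* Decision table for the finish-phase bookkeeping above (illustrative).
     * record_ok models rdev_set_badblocks() succeeding; when the bad block
     * cannot be recorded the only remaining option is md_error(), i.e.
     * failing the whole device. */
    #include <stdio.h>

    enum action { RECORD_BAD_BLOCK, FAIL_DEVICE, CLEAR_BAD_BLOCK, NOTHING };

    static enum action finish_action(int write_error, int made_good, int record_ok)
    {
            if (write_error)
                    return record_ok ? RECORD_BAD_BLOCK : FAIL_DEVICE;
            if (made_good)
                    return CLEAR_BAD_BLOCK;
            return NOTHING;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   finish_action(1, 0, 1),  /* write error, log has room      */
                   finish_action(1, 0, 0),  /* write error, log full          */
                   finish_action(0, 1, 1)); /* bad range successfully re-written */
            return 0;
    }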
| @@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
| 3833 | rcu_read_lock(); | 3607 | rcu_read_lock(); |
| 3834 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 3608 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); |
| 3835 | if (rdev && test_bit(In_sync, &rdev->flags)) { | 3609 | if (rdev && test_bit(In_sync, &rdev->flags)) { |
| 3610 | sector_t first_bad; | ||
| 3611 | int bad_sectors; | ||
| 3612 | |||
| 3836 | atomic_inc(&rdev->nr_pending); | 3613 | atomic_inc(&rdev->nr_pending); |
| 3837 | rcu_read_unlock(); | 3614 | rcu_read_unlock(); |
| 3838 | raid_bio->bi_next = (void*)rdev; | 3615 | raid_bio->bi_next = (void*)rdev; |
| @@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
| 3840 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3617 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
| 3841 | align_bi->bi_sector += rdev->data_offset; | 3618 | align_bi->bi_sector += rdev->data_offset; |
| 3842 | 3619 | ||
| 3843 | if (!bio_fits_rdev(align_bi)) { | 3620 | if (!bio_fits_rdev(align_bi) || |
| 3844 | /* too big in some way */ | 3621 | is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, |
| 3622 | &first_bad, &bad_sectors)) { | ||
| 3623 | /* too big in some way, or has a known bad block */ | ||
| 3845 | bio_put(align_bi); | 3624 | bio_put(align_bi); |
| 3846 | rdev_dec_pending(rdev, mddev); | 3625 | rdev_dec_pending(rdev, mddev); |
| 3847 | return 0; | 3626 | return 0; |
| @@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
| 4016 | } | 3795 | } |
| 4017 | } | 3796 | } |
| 4018 | 3797 | ||
| 4019 | if (bio_data_dir(bi) == WRITE && | 3798 | if (rw == WRITE && |
| 4020 | logical_sector >= mddev->suspend_lo && | 3799 | logical_sector >= mddev->suspend_lo && |
| 4021 | logical_sector < mddev->suspend_hi) { | 3800 | logical_sector < mddev->suspend_hi) { |
| 4022 | release_stripe(sh); | 3801 | release_stripe(sh); |
| @@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
| 4034 | } | 3813 | } |
| 4035 | 3814 | ||
| 4036 | if (test_bit(STRIPE_EXPANDING, &sh->state) || | 3815 | if (test_bit(STRIPE_EXPANDING, &sh->state) || |
| 4037 | !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | 3816 | !add_stripe_bio(sh, bi, dd_idx, rw)) { |
| 4038 | /* Stripe is busy expanding or | 3817 | /* Stripe is busy expanding or |
| 4039 | * add failed due to overlap. Flush everything | 3818 | * add failed due to overlap. Flush everything |
| 4040 | * and wait a while | 3819 | * and wait a while |
| @@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
| 4375 | 4154 | ||
| 4376 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); | 4155 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); |
| 4377 | 4156 | ||
| 4378 | spin_lock(&sh->lock); | 4157 | set_bit(STRIPE_SYNC_REQUESTED, &sh->state); |
| 4379 | set_bit(STRIPE_SYNCING, &sh->state); | ||
| 4380 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
| 4381 | spin_unlock(&sh->lock); | ||
| 4382 | 4158 | ||
| 4383 | handle_stripe(sh); | 4159 | handle_stripe(sh); |
| 4384 | release_stripe(sh); | 4160 | release_stripe(sh); |
| @@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev) | |||
| 4509 | release_stripe(sh); | 4285 | release_stripe(sh); |
| 4510 | cond_resched(); | 4286 | cond_resched(); |
| 4511 | 4287 | ||
| 4288 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
| 4289 | md_check_recovery(mddev); | ||
| 4290 | |||
| 4512 | spin_lock_irq(&conf->device_lock); | 4291 | spin_lock_irq(&conf->device_lock); |
| 4513 | } | 4292 | } |
| 4514 | pr_debug("%d stripes handled\n", handled); | 4293 | pr_debug("%d stripes handled\n", handled); |
| @@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
| 5313 | * isn't possible. | 5092 | * isn't possible. |
| 5314 | */ | 5093 | */ |
| 5315 | if (!test_bit(Faulty, &rdev->flags) && | 5094 | if (!test_bit(Faulty, &rdev->flags) && |
| 5095 | mddev->recovery_disabled != conf->recovery_disabled && | ||
| 5316 | !has_failed(conf) && | 5096 | !has_failed(conf) && |
| 5317 | number < conf->raid_disks) { | 5097 | number < conf->raid_disks) { |
| 5318 | err = -EBUSY; | 5098 | err = -EBUSY; |
| @@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 5341 | int first = 0; | 5121 | int first = 0; |
| 5342 | int last = conf->raid_disks - 1; | 5122 | int last = conf->raid_disks - 1; |
| 5343 | 5123 | ||
| 5124 | if (mddev->recovery_disabled == conf->recovery_disabled) | ||
| 5125 | return -EBUSY; | ||
| 5126 | |||
| 5344 | if (has_failed(conf)) | 5127 | if (has_failed(conf)) |
| 5345 | /* no point adding a device */ | 5128 | /* no point adding a device */ |
| 5346 | return -EINVAL; | 5129 | return -EINVAL; |
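
The two hunks above wire raid5 into the recovery_disabled mechanism: the array-wide counter is assumed to be bumped whenever recovery to a spare must not be retried, and the raid5 conf caches the value at which it gave up, so adding a spare is refused while the two still match, and removing a non-faulty device becomes legal in that window. A tiny model of the handshake, with field names standing in for mddev->recovery_disabled and conf->recovery_disabled:

    /* Tiny model of the recovery_disabled handshake (illustrative only). */
    #include <stdio.h>

    struct model { int mddev_recovery_disabled; int conf_recovery_disabled; };

    static int add_spare_allowed(const struct model *m)
    {
            /* matches the -EBUSY check in raid5_add_disk() above */
            return m->mddev_recovery_disabled != m->conf_recovery_disabled;
    }

    int main(void)
    {
            struct model m = { .mddev_recovery_disabled = 1,
                               .conf_recovery_disabled = 1 };

            printf("spare allowed: %d\n", add_spare_allowed(&m)); /* 0 */
            m.mddev_recovery_disabled++;   /* a later event re-enables recovery */
            printf("spare allowed: %d\n", add_spare_allowed(&m)); /* 1 */
            return 0;
    }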
| @@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5519 | if (rdev->raid_disk < 0 && | 5302 | if (rdev->raid_disk < 0 && |
| 5520 | !test_bit(Faulty, &rdev->flags)) { | 5303 | !test_bit(Faulty, &rdev->flags)) { |
| 5521 | if (raid5_add_disk(mddev, rdev) == 0) { | 5304 | if (raid5_add_disk(mddev, rdev) == 0) { |
| 5522 | char nm[20]; | ||
| 5523 | if (rdev->raid_disk | 5305 | if (rdev->raid_disk |
| 5524 | >= conf->previous_raid_disks) { | 5306 | >= conf->previous_raid_disks) { |
| 5525 | set_bit(In_sync, &rdev->flags); | 5307 | set_bit(In_sync, &rdev->flags); |
| 5526 | added_devices++; | 5308 | added_devices++; |
| 5527 | } else | 5309 | } else |
| 5528 | rdev->recovery_offset = 0; | 5310 | rdev->recovery_offset = 0; |
| 5529 | sprintf(nm, "rd%d", rdev->raid_disk); | 5311 | |
| 5530 | if (sysfs_create_link(&mddev->kobj, | 5312 | if (sysfs_link_rdev(mddev, rdev)) |
| 5531 | &rdev->kobj, nm)) | ||
| 5532 | /* Failure here is OK */; | 5313 | /* Failure here is OK */; |
| 5533 | } | 5314 | } |
| 5534 | } else if (rdev->raid_disk >= conf->previous_raid_disks | 5315 | } else if (rdev->raid_disk >= conf->previous_raid_disks |
| @@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
| 5624 | d++) { | 5405 | d++) { |
| 5625 | mdk_rdev_t *rdev = conf->disks[d].rdev; | 5406 | mdk_rdev_t *rdev = conf->disks[d].rdev; |
| 5626 | if (rdev && raid5_remove_disk(mddev, d) == 0) { | 5407 | if (rdev && raid5_remove_disk(mddev, d) == 0) { |
| 5627 | char nm[20]; | 5408 | sysfs_unlink_rdev(mddev, rdev); |
| 5628 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 5629 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 5630 | rdev->raid_disk = -1; | 5409 | rdev->raid_disk = -1; |
| 5631 | } | 5410 | } |
| 5632 | } | 5411 | } |
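
The reshape hunks above replace the open-coded sprintf(nm, "rd%d", ...) plus sysfs_create_link()/sysfs_remove_link() pairs with sysfs_link_rdev()/sysfs_unlink_rdev(), helpers presumably introduced in md.h earlier in this series with the same behaviour. A runnable sketch of the only non-obvious part, the link-name construction:

    /* The helpers presumably wrap the same "rd%d" naming under the array's
     * kobject; only the name construction is shown here as runnable code. */
    #include <stdio.h>

    static int format_rdev_link_name(char *buf, size_t len, int raid_disk)
    {
            return snprintf(buf, len, "rd%d", raid_disk);
    }

    int main(void)
    {
            char nm[20];

            format_rdev_link_name(nm, sizeof(nm), 3);
            printf("%s\n", nm);   /* -> rd3 */
            return 0;
    }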
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 3ca77a2613ba..11b9566184b2 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -6,11 +6,11 @@ | |||
| 6 | 6 | ||
| 7 | /* | 7 | /* |
| 8 | * | 8 | * |
| 9 | * Each stripe contains one buffer per disc. Each buffer can be in | 9 | * Each stripe contains one buffer per device. Each buffer can be in |
| 10 | * one of a number of states stored in "flags". Changes between | 10 | * one of a number of states stored in "flags". Changes between |
| 11 | * these states happen *almost* exclusively under a per-stripe | 11 | * these states happen *almost* exclusively under the protection of the |
| 12 | * spinlock. Some very specific changes can happen in bi_end_io, and | 12 | * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and |
| 13 | * these are not protected by the spin lock. | 13 | * these are not protected by STRIPE_ACTIVE. |
| 14 | * | 14 | * |
| 15 | * The flag bits that are used to represent these states are: | 15 | * The flag bits that are used to represent these states are: |
| 16 | * R5_UPTODATE and R5_LOCKED | 16 | * R5_UPTODATE and R5_LOCKED |
| @@ -76,12 +76,10 @@ | |||
| 76 | * block and the cached buffer are successfully written, any buffer on | 76 | * block and the cached buffer are successfully written, any buffer on |
| 77 | * a written list can be returned with b_end_io. | 77 | * a written list can be returned with b_end_io. |
| 78 | * | 78 | * |
| 79 | * The write list and read list both act as fifos. The read list is | 79 | * The write list and read list both act as fifos. The read list, |
| 80 | * protected by the device_lock. The write and written lists are | 80 | * write list and written list are protected by the device_lock. |
| 81 | * protected by the stripe lock. The device_lock, which can be | 81 | * The device_lock is only for list manipulations and will only be |
| 82 | * claimed while the stipe lock is held, is only for list | 82 | * held for a very short time. It can be claimed from interrupts. |
| 83 | * manipulations and will only be held for a very short time. It can | ||
| 84 | * be claimed from interrupts. | ||
| 85 | * | 83 | * |
| 86 | * | 84 | * |
| 87 | * Stripes in the stripe cache can be on one of two lists (or on | 85 | * Stripes in the stripe cache can be on one of two lists (or on |
| @@ -96,7 +94,6 @@ | |||
| 96 | * | 94 | * |
| 97 | * The inactive_list, handle_list and hash bucket lists are all protected by the | 95 | * The inactive_list, handle_list and hash bucket lists are all protected by the |
| 98 | * device_lock. | 96 | * device_lock. |
| 99 | * - stripes on the inactive_list never have their stripe_lock held. | ||
| 100 | * - stripes have a reference counter. If count==0, they are on a list. | 97 | * - stripes have a reference counter. If count==0, they are on a list. |
| 101 | * - If a stripe might need handling, STRIPE_HANDLE is set. | 98 | * - If a stripe might need handling, STRIPE_HANDLE is set. |
| 102 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on | 99 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on |
| @@ -116,10 +113,10 @@ | |||
| 116 | * attach a request to an active stripe (add_stripe_bh()) | 113 | * attach a request to an active stripe (add_stripe_bh()) |
| 117 | * lockdev attach-buffer unlockdev | 114 | * lockdev attach-buffer unlockdev |
| 118 | * handle a stripe (handle_stripe()) | 115 | * handle a stripe (handle_stripe()) |
| 119 | * lockstripe clrSTRIPE_HANDLE ... | 116 | * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... |
| 120 | * (lockdev check-buffers unlockdev) .. | 117 | * (lockdev check-buffers unlockdev) .. |
| 121 | * change-state .. | 118 | * change-state .. |
| 122 | * record io/ops needed unlockstripe schedule io/ops | 119 | * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops |
| 123 | * release an active stripe (release_stripe()) | 120 | * release an active stripe (release_stripe()) |
| 124 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev | 121 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev |
| 125 | * | 122 | * |
| @@ -128,8 +125,7 @@ | |||
| 128 | * on a cached buffer, and plus one if the stripe is undergoing stripe | 125 | * on a cached buffer, and plus one if the stripe is undergoing stripe |
| 129 | * operations. | 126 | * operations. |
| 130 | * | 127 | * |
| 131 | * Stripe operations are performed outside the stripe lock, | 128 | * The stripe operations are: |
| 132 | * the stripe operations are: | ||
| 133 | * -copying data between the stripe cache and user application buffers | 129 | * -copying data between the stripe cache and user application buffers |
| 134 | * -computing blocks to save a disk access, or to recover a missing block | 130 | * -computing blocks to save a disk access, or to recover a missing block |
| 135 | * -updating the parity on a write operation (reconstruct write and | 131 | * -updating the parity on a write operation (reconstruct write and |
| @@ -159,7 +155,8 @@ | |||
| 159 | */ | 155 | */ |
| 160 | 156 | ||
| 161 | /* | 157 | /* |
| 162 | * Operations state - intermediate states that are visible outside of sh->lock | 158 | * Operations state - intermediate states that are visible outside of |
| 159 | * STRIPE_ACTIVE. | ||
| 163 | * In general _idle indicates nothing is running, _run indicates a data | 160 | * In general _idle indicates nothing is running, _run indicates a data |
| 164 | * processing operation is active, and _result means the data processing result | 161 | * processing operation is active, and _result means the data processing result |
| 165 | * is stable and can be acted upon. For simple operations like biofill and | 162 | * is stable and can be acted upon. For simple operations like biofill and |
| @@ -209,7 +206,6 @@ struct stripe_head { | |||
| 209 | short ddf_layout;/* use DDF ordering to calculate Q */ | 206 | short ddf_layout;/* use DDF ordering to calculate Q */ |
| 210 | unsigned long state; /* state flags */ | 207 | unsigned long state; /* state flags */ |
| 211 | atomic_t count; /* nr of active thread/requests */ | 208 | atomic_t count; /* nr of active thread/requests */ |
| 212 | spinlock_t lock; | ||
| 213 | int bm_seq; /* sequence number for bitmap flushes */ | 209 | int bm_seq; /* sequence number for bitmap flushes */ |
| 214 | int disks; /* disks in stripe */ | 210 | int disks; /* disks in stripe */ |
| 215 | enum check_states check_state; | 211 | enum check_states check_state; |
| @@ -240,19 +236,20 @@ struct stripe_head { | |||
| 240 | }; | 236 | }; |
| 241 | 237 | ||
| 242 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head | 238 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head |
| 243 | * for handle_stripe. It is only valid under spin_lock(sh->lock); | 239 | * for handle_stripe. |
| 244 | */ | 240 | */ |
| 245 | struct stripe_head_state { | 241 | struct stripe_head_state { |
| 246 | int syncing, expanding, expanded; | 242 | int syncing, expanding, expanded; |
| 247 | int locked, uptodate, to_read, to_write, failed, written; | 243 | int locked, uptodate, to_read, to_write, failed, written; |
| 248 | int to_fill, compute, req_compute, non_overwrite; | 244 | int to_fill, compute, req_compute, non_overwrite; |
| 249 | int failed_num; | 245 | int failed_num[2]; |
| 246 | int p_failed, q_failed; | ||
| 247 | int dec_preread_active; | ||
| 250 | unsigned long ops_request; | 248 | unsigned long ops_request; |
| 251 | }; | ||
| 252 | 249 | ||
| 253 | /* r6_state - extra state data only relevant to r6 */ | 250 | struct bio *return_bi; |
| 254 | struct r6_state { | 251 | mdk_rdev_t *blocked_rdev; |
| 255 | int p_failed, q_failed, failed_num[2]; | 252 | int handle_bad_blocks; |
| 256 | }; | 253 | }; |
| 257 | 254 | ||
| 258 | /* Flags */ | 255 | /* Flags */ |
| @@ -268,14 +265,16 @@ struct r6_state { | |||
| 268 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ | 265 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ |
| 269 | 266 | ||
| 270 | #define R5_Expanded 10 /* This block now has post-expand data */ | 267 | #define R5_Expanded 10 /* This block now has post-expand data */ |
| 271 | #define R5_Wantcompute 11 /* compute_block in progress treat as | 268 | #define R5_Wantcompute 11 /* compute_block in progress treat as |
| 272 | * uptodate | 269 | * uptodate |
| 273 | */ | 270 | */ |
| 274 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | 271 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs |
| 275 | * filling | 272 | * filling |
| 276 | */ | 273 | */ |
| 277 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ | 274 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
| 278 | #define R5_WantFUA 14 /* Write should be FUA */ | 275 | #define R5_WantFUA 14 /* Write should be FUA */ |
| 276 | #define R5_WriteError 15 /* got a write error - need to record it */ | ||
| 277 | #define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ | ||
| 279 | /* | 278 | /* |
| 280 | * Write method | 279 | * Write method |
| 281 | */ | 280 | */ |
| @@ -289,21 +288,25 @@ struct r6_state { | |||
| 289 | /* | 288 | /* |
| 290 | * Stripe state | 289 | * Stripe state |
| 291 | */ | 290 | */ |
| 292 | #define STRIPE_HANDLE 2 | 291 | enum { |
| 293 | #define STRIPE_SYNCING 3 | 292 | STRIPE_ACTIVE, |
| 294 | #define STRIPE_INSYNC 4 | 293 | STRIPE_HANDLE, |
| 295 | #define STRIPE_PREREAD_ACTIVE 5 | 294 | STRIPE_SYNC_REQUESTED, |
| 296 | #define STRIPE_DELAYED 6 | 295 | STRIPE_SYNCING, |
| 297 | #define STRIPE_DEGRADED 7 | 296 | STRIPE_INSYNC, |
| 298 | #define STRIPE_BIT_DELAY 8 | 297 | STRIPE_PREREAD_ACTIVE, |
| 299 | #define STRIPE_EXPANDING 9 | 298 | STRIPE_DELAYED, |
| 300 | #define STRIPE_EXPAND_SOURCE 10 | 299 | STRIPE_DEGRADED, |
| 301 | #define STRIPE_EXPAND_READY 11 | 300 | STRIPE_BIT_DELAY, |
| 302 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | 301 | STRIPE_EXPANDING, |
| 303 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | 302 | STRIPE_EXPAND_SOURCE, |
| 304 | #define STRIPE_BIOFILL_RUN 14 | 303 | STRIPE_EXPAND_READY, |
| 305 | #define STRIPE_COMPUTE_RUN 15 | 304 | STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ |
| 306 | #define STRIPE_OPS_REQ_PENDING 16 | 305 | STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ |
| 306 | STRIPE_BIOFILL_RUN, | ||
| 307 | STRIPE_COMPUTE_RUN, | ||
| 308 | STRIPE_OPS_REQ_PENDING, | ||
| 309 | }; | ||
| 307 | 310 | ||
| 308 | /* | 311 | /* |
| 309 | * Operation request flags | 312 | * Operation request flags |
| @@ -336,7 +339,7 @@ struct r6_state { | |||
| 336 | * PREREAD_ACTIVE. | 339 | * PREREAD_ACTIVE. |
| 337 | * In stripe_handle, if we find pre-reading is necessary, we do it if | 340 | * In stripe_handle, if we find pre-reading is necessary, we do it if |
| 338 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. | 341 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. |
| 339 | * HANDLE gets cleared if stripe_handle leave nothing locked. | 342 | * HANDLE gets cleared if stripe_handle leaves nothing locked. |
| 340 | */ | 343 | */ |
| 341 | 344 | ||
| 342 | 345 | ||
| @@ -399,7 +402,7 @@ struct raid5_private_data { | |||
| 399 | * (fresh device added). | 402 | * (fresh device added). |
| 400 | * Cleared when a sync completes. | 403 | * Cleared when a sync completes. |
| 401 | */ | 404 | */ |
| 402 | 405 | int recovery_disabled; | |
| 403 | /* per cpu variables */ | 406 | /* per cpu variables */ |
| 404 | struct raid5_percpu { | 407 | struct raid5_percpu { |
| 405 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 408 | struct page *spare_page; /* Used when checking P/Q in raid6 */ |
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 75cbf4f62fe8..9e65d9e20662 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
| @@ -245,10 +245,16 @@ struct mdp_superblock_1 { | |||
| 245 | __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ | 245 | __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ |
| 246 | __u8 devflags; /* per-device flags. Only one defined...*/ | 246 | __u8 devflags; /* per-device flags. Only one defined...*/ |
| 247 | #define WriteMostly1 1 /* mask for writemostly flag in above */ | 247 | #define WriteMostly1 1 /* mask for writemostly flag in above */ |
| 248 | __u8 pad2[64-57]; /* set to 0 when writing */ | 248 | /* Bad block log. If there are any bad blocks the feature flag is set. |
| 249 | * If offset and size are non-zero, that space is reserved and available | ||
| 250 | */ | ||
| 251 | __u8 bblog_shift; /* shift from sectors to block size */ | ||
| 252 | __le16 bblog_size; /* number of sectors reserved for list */ | ||
| 253 | __le32 bblog_offset; /* sector offset from superblock to bblog, | ||
| 254 | * signed - not unsigned */ | ||
| 249 | 255 | ||
| 250 | /* array state information - 64 bytes */ | 256 | /* array state information - 64 bytes */ |
| 251 | __le64 utime; /* 40 bits second, 24 btes microseconds */ | 257 | __le64 utime; /* 40 bits second, 24 bits microseconds */ |
| 252 | __le64 events; /* incremented when superblock updated */ | 258 | __le64 events; /* incremented when superblock updated */ |
| 253 | __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ | 259 | __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ |
| 254 | __le32 sb_csum; /* checksum up to devs[max_dev] */ | 260 | __le32 sb_csum; /* checksum up to devs[max_dev] */ |
| @@ -270,8 +276,8 @@ struct mdp_superblock_1 { | |||
| 270 | * must be honoured | 276 | * must be honoured |
| 271 | */ | 277 | */ |
| 272 | #define MD_FEATURE_RESHAPE_ACTIVE 4 | 278 | #define MD_FEATURE_RESHAPE_ACTIVE 4 |
| 279 | #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ | ||
| 273 | 280 | ||
| 274 | #define MD_FEATURE_ALL (1|2|4) | 281 | #define MD_FEATURE_ALL (1|2|4|8) |
| 275 | 282 | ||
| 276 | #endif | 283 | #endif |
| 277 | |||
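
The new superblock fields describe where a per-device bad block log lives: bblog_offset is a signed sector offset from the superblock itself (so the log may sit before or after it), bblog_size is the number of sectors reserved, bblog_shift scales sectors to the log's block size, and the new MD_FEATURE_BAD_BLOCKS bit (8, now included in MD_FEATURE_ALL) marks a non-empty list. A minimal sketch of the location arithmetic, with host-endian stand-ins for the __le fields and a made-up helper name:

    /* Sketch of locating the bad block log from the new fields.  The real
     * fields are little-endian on disk; host-endian integers are used here. */
    #include <stdint.h>
    #include <stdio.h>

    struct bblog_desc {
            uint8_t  bblog_shift;   /* shift from sectors to block size  */
            uint16_t bblog_size;    /* sectors reserved for the list     */
            int32_t  bblog_offset;  /* signed sector offset from the sb  */
    };

    static uint64_t bblog_start(uint64_t sb_sector, const struct bblog_desc *d)
    {
            return sb_sector + (int64_t)d->bblog_offset;
    }

    int main(void)
    {
            struct bblog_desc d = { .bblog_shift = 0,
                                    .bblog_size  = 8,
                                    .bblog_offset = 16 };
            uint64_t start = bblog_start(8, &d);

            printf("log occupies sectors %llu..%llu\n",
                   (unsigned long long)start,
                   (unsigned long long)(start + d.bblog_size - 1));
            return 0;
    }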
